2 # Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for PowerISA 2.07.
20 # This is straightforward KECCAK_1X_ALT SIMD implementation, but with
21 # disjoint Rho and Pi. The module is ABI-bitness- and endian-neutral.
22 # A POWER8 processor spends 9.8 cycles per byte processed out of a large
23 # buffer for r=1088, which matches SHA3-256. This is 17% better than
24 # scalar PPC64 code. It probably should be noted that if POWER8's
25 # successor can achieve a higher scalar instruction issue rate, then
26 # this module will lose... And it does on POWER9 with 12.0 vs. 9.4.
28 # $output is the last argument if it looks like a file (it has an extension)
29 # $flavour is the first argument if it doesn't look like a file
# Both are left undefined when the corresponding argument is absent.
30 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
31 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Dispatch on the ABI bitness named in the flavour; anything that mentions
# neither 64 nor 32 is rejected.
# NOTE(review): the branch bodies are not visible in this excerpt. The
# variables SIZE_T, LRSAVE, STU, PUSH and UCMP are used later without a
# visible assignment, so they are presumably set here - confirm.
33 if ($flavour =~ /64/) {
40 } elsif ($flavour =~ /32/) {
47 } else { die "nonsense $flavour"; }
# Take the directory part of this script's own path (up to and including the
# last slash or backslash) so the translator can be found relative to it.
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for ppc-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it; give up if neither file exists.
50 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52 die "can't locate ppc-xlate.pl";
# Reopen STDOUT as a pipe into the translator, run with the current perl
# interpreter and given the flavour and the output file name, so everything
# printed from here on is fed through ppc-xlate.pl.
54 open STDOUT,"| $^X $xlate $flavour \"$output\""
55 or die "can't call $xlate: $!";
# Stack frame size used by the assembly templates below.
# NOTE(review): v20-v31 is twelve 16-byte vector registers; the thirteenth
# 16-byte unit is presumably slack for aligning the save area - confirm.
57 $FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload
63 ########################################################################
82 # v13..25 rhotates[][]
89 .type KeccakF1600_int,\@function
99 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Theta
100 vxor v26,v0, v5 ; A[0..1][0]^A[2..3][0]
101 vxor v27,v1, v6 ; A[0..1][1]^A[2..3][1]
102 vxor v28,v2, v7 ; A[0..1][2]^A[2..3][2]
103 vxor v29,v3, v8 ; A[0..1][3]^A[2..3][3]
104 vxor v30,v4, v9 ; A[0..1][4]^A[2..3][4]
105 vpermdi v31,v26,v27,0b00 ; A[0][0..1]^A[2][0..1]
106 vpermdi v26,v26,v27,0b11 ; A[1][0..1]^A[3][0..1]
107 vpermdi v27,v28,v29,0b00 ; A[0][2..3]^A[2][2..3]
108 vpermdi v28,v28,v29,0b11 ; A[1][2..3]^A[3][2..3]
109 vpermdi v29,v30,v30,0b10 ; A[1..0][4]^A[3..2][4]
110 vxor v26,v26,v31 ; C[0..1]
111 vxor v27,v27,v28 ; C[2..3]
112 vxor v28,v29,v30 ; C[4..4]
114 vxor v26,v26,v10 ; C[0..1] ^= A[4][0..1]
115 vxor v27,v27,v11 ; C[2..3] ^= A[4][2..3]
116 vxor v28,v28,v12 ; C[4..4] ^= A[4][4..4], low!
118 vrld v29,v26,v31 ; ROL64(C[0..1],1)
119 vrld v30,v27,v31 ; ROL64(C[2..3],1)
120 vrld v31,v28,v31 ; ROL64(C[4..4],1)
121 vpermdi v31,v31,v29,0b10
122 vxor v26,v26,v30 ; C[0..1] ^= ROL64(C[2..3],1)
123 vxor v27,v27,v31 ; C[2..3] ^= ROL64(C[4..0],1)
124 vxor v28,v28,v29 ; C[4..4] ^= ROL64(C[0..1],1), low!
126 vpermdi v29,v26,v26,0b00 ; C[0..0]
127 vpermdi v30,v28,v26,0b10 ; C[4..0]
128 vpermdi v31,v28,v28,0b11 ; C[4..4]
129 vxor v1, v1, v29 ; A[0..1][1] ^= C[0..0]
130 vxor v6, v6, v29 ; A[2..3][1] ^= C[0..0]
131 vxor v10,v10,v30 ; A[4][0..1] ^= C[4..0]
132 vxor v0, v0, v31 ; A[0..1][0] ^= C[4..4]
133 vxor v5, v5, v31 ; A[2..3][0] ^= C[4..4]
135 vpermdi v29,v27,v27,0b00 ; C[2..2]
136 vpermdi v30,v26,v26,0b11 ; C[1..1]
137 vpermdi v31,v26,v27,0b10 ; C[1..2]
138 vxor v3, v3, v29 ; A[0..1][3] ^= C[2..2]
139 vxor v8, v8, v29 ; A[2..3][3] ^= C[2..2]
140 vxor v2, v2, v30 ; A[0..1][2] ^= C[1..1]
141 vxor v7, v7, v30 ; A[2..3][2] ^= C[1..1]
142 vxor v11,v11,v31 ; A[4][2..3] ^= C[1..2]
144 vpermdi v29,v27,v27,0b11 ; C[3..3]
145 vxor v4, v4, v29 ; A[0..1][4] ^= C[3..3]
146 vxor v9, v9, v29 ; A[2..3][4] ^= C[3..3]
147 vxor v12,v12,v29 ; A[4..4][4] ^= C[3..3]
149 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Rho
150 vrld v26,v0, v13 ; v0
152 vrld v27,v2, v15 ; v2
153 vrld v28,v3, v16 ; v3
157 vrld v29,v7, v20 ; v7
161 vrld v30,v11,v24 ; v11
164 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Pi
165 vpermdi v0, v26,v28,0b00 ; [0][0] [1][0] < [0][0] [0][3]
166 vpermdi v2, v29,v5, 0b00 ; [0][2] [1][2] < [2][2] [2][0]
167 vpermdi v11,v9, v5, 0b01 ; [4][2] [4][3] < [2][4] [3][0]
168 vpermdi v5, v1, v4, 0b00 ; [2][0] [3][0] < [0][1] [0][4]
169 vpermdi v1, v1, v4, 0b11 ; [0][1] [1][1] < [1][1] [1][4]
170 vpermdi v3, v8, v6, 0b11 ; [0][3] [1][3] < [3][3] [3][1]
171 vpermdi v4, v12,v30,0b10 ; [0][4] [1][4] < [4][4] [4][2]
172 vpermdi v7, v8, v6, 0b00 ; [2][2] [3][2] < [2][3] [2][1]
173 vpermdi v6, v27,v26,0b11 ; [2][1] [3][1] < [1][2] [1][0]
174 vpermdi v8, v9, v29,0b11 ; [2][3] [3][3] < [3][4] [3][2]
175 vpermdi v12,v10,v10,0b11 ; [4][4] [4][4] < [4][1] [4][1]
176 vpermdi v9, v10,v30,0b01 ; [2][4] [3][4] < [4][0] [4][3]
177 vpermdi v10,v27,v28,0b01 ; [4][0] [4][1] < [0][2] [1][3]
179 ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; Chi + Iota
180 lvx_u v31,$iotas,r0 ; iotas[index]
181 addic r0,r0,16 ; index++
183 vandc v26,v2, v1 ; (~A[0..1][1] & A[0..1][2])
184 vandc v27,v3, v2 ; (~A[0..1][2] & A[0..1][3])
185 vandc v28,v4, v3 ; (~A[0..1][3] & A[0..1][4])
186 vandc v29,v0, v4 ; (~A[0..1][4] & A[0..1][0])
187 vandc v30,v1, v0 ; (~A[0..1][0] & A[0..1][1])
188 vxor v0, v0, v26 ; A[0..1][0] ^= (~A[0..1][1] & A[0..1][2])
189 vxor v1, v1, v27 ; A[0..1][1] ^= (~A[0..1][2] & A[0..1][3])
190 vxor v2, v2, v28 ; A[0..1][2] ^= (~A[0..1][3] & A[0..1][4])
191 vxor v3, v3, v29 ; A[0..1][3] ^= (~A[0..1][4] & A[0..1][0])
192 vxor v4, v4, v30 ; A[0..1][4] ^= (~A[0..1][0] & A[0..1][1])
194 vandc v26,v7, v6 ; (~A[2..3][1] & A[2..3][2])
195 vandc v27,v8, v7 ; (~A[2..3][2] & A[2..3][3])
196 vandc v28,v9, v8 ; (~A[2..3][3] & A[2..3][4])
197 vandc v29,v5, v9 ; (~A[2..3][4] & A[2..3][0])
198 vandc v30,v6, v5 ; (~A[2..3][0] & A[2..3][1])
199 vxor v5, v5, v26 ; A[2..3][0] ^= (~A[2..3][1] & A[2..3][2])
200 vxor v6, v6, v27 ; A[2..3][1] ^= (~A[2..3][2] & A[2..3][3])
201 vxor v7, v7, v28 ; A[2..3][2] ^= (~A[2..3][3] & A[2..3][4])
202 vxor v8, v8, v29 ; A[2..3][3] ^= (~A[2..3][4] & A[2..3][0])
203 vxor v9, v9, v30 ; A[2..3][4] ^= (~A[2..3][0] & A[2..3][1])
205 vxor v0, v0, v31 ; A[0][0] ^= iotas[index++]
207 vpermdi v26,v10,v11,0b10 ; A[4][1..2]
208 vpermdi v27,v12,v10,0b00 ; A[4][4..0]
209 vpermdi v28,v11,v12,0b10 ; A[4][3..4]
210 vpermdi v29,v10,v10,0b10 ; A[4][1..0]
211 vandc v26,v11,v26 ; (~A[4][1..2] & A[4][2..3])
212 vandc v27,v27,v28 ; (~A[4][3..4] & A[4][4..0])
213 vandc v28,v10,v29 ; (~A[4][1..0] & A[4][0..1])
214 vxor v10,v10,v26 ; A[4][0..1] ^= (~A[4][1..2] & A[4][2..3])
215 vxor v11,v11,v27 ; A[4][2..3] ^= (~A[4][3..4] & A[4][4..0])
216 vxor v12,v12,v28 ; A[4][4..4] ^= (~A[4][0..1] & A[4][1..0])
220 vpermdi v12,v12,v12,0b11 ; broadcast A[4][4]
223 .byte 0,12,0x14,0,0,0,0,0
224 .size KeccakF1600_int,.-KeccakF1600_int
226 .type KeccakF1600,\@function
229 $STU $sp,-$FRAME($sp)
230 li r10,`15+6*$SIZE_T`
231 li r11,`31+6*$SIZE_T`
233 mfspr r7, 256 ; save vrsave
256 stw r7,`$FRAME-4`($sp) ; save vrsave
258 $PUSH r8,`$FRAME+$LRSAVE`($sp)
259 mtspr 256, r0 ; preserve all AltiVec registers
262 lvx_4w v0,0,r3 ; load A[5][5]
290 lvx_u v13,0,r12 ; load rhotates
314 addi r12,r12,`16*16` ; points at iotas
319 stvx_4w v0,0,r3 ; return A[5][5]
344 li r10,`15+6*$SIZE_T`
345 li r11,`31+6*$SIZE_T`
347 mtspr 256, r7 ; restore vrsave
373 .byte 0,12,0x04,1,0x80,0,1,0
375 .size KeccakF1600,.-KeccakF1600
# Symbolic names for the registers r3, r4, r5 and r6, in that order; they are
# interpolated into the SHA3_absorb assembly template that follows.
378 my ($A_jagged,$inp,$len,$bsz) = map("r$_",(3..6));
382 .type SHA3_absorb,\@function
385 $STU $sp,-$FRAME($sp)
386 li r10,`15+6*$SIZE_T`
387 li r11,`31+6*$SIZE_T`
389 mfspr r7, 256 ; save vrsave
412 stw r7,`$FRAME-4`($sp) ; save vrsave
414 $PUSH r8,`$FRAME+$LRSAVE`($sp)
415 mtspr 256, r0 ; preserve all AltiVec registers
418 lvx_4w v0,0,$A_jagged ; load A[5][5]
420 lvx_4w v1,r11,$A_jagged
422 lvx_4w v2,r10,$A_jagged
424 lvx_4w v3,r11,$A_jagged
426 lvx_4w v4,r10,$A_jagged
428 lvx_4w v5,r11,$A_jagged
430 lvx_4w v6,r10,$A_jagged
432 lvx_4w v7,r11,$A_jagged
434 lvx_4w v8,r10,$A_jagged
436 lvx_4w v9,r11,$A_jagged
438 lvx_4w v10,r10,$A_jagged
440 lvx_4w v11,r11,$A_jagged
441 lvx_splt v12,r10,$A_jagged
446 lvx_u v13,0,r12 ; load rhotates
472 addi r12,r12,`16*16` ; points at iotas
477 $UCMP $len,$bsz ; len < bsz?
480 sub $len,$len,$bsz ; len -= bsz
484 lvx_u v30,r10,r12 ; permutation masks
486 ?vspltisb v27,7 ; prepare masks for byte swap
487 ?vxor v30,v30,v27 ; on big-endian
490 vxor v27,v27,v27 ; zero
493 vperm v26,v26,v27,v30
498 vperm v26,v26,v27,v30
503 vperm v26,v26,v27,v30
508 vperm v26,v26,v27,v30
513 vperm v26,v26,v27,v30
518 vperm v26,v26,v27,v31
523 vperm v26,v26,v27,v31
528 vperm v26,v26,v27,v31
533 vperm v26,v26,v27,v31
538 vperm v26,v26,v27,v31
543 vperm v26,v26,v27,v30
548 vperm v26,v26,v27,v30
553 vperm v26,v26,v27,v30
558 vperm v26,v26,v27,v30
563 vperm v26,v26,v27,v30
568 vperm v26,v26,v27,v31
573 vperm v26,v26,v27,v31
578 vperm v26,v26,v27,v31
583 vperm v26,v26,v27,v31
588 vperm v26,v26,v27,v31
593 vperm v26,v26,v27,v30
598 vperm v26,v26,v27,v31
603 vperm v26,v26,v27,v30
608 vperm v26,v26,v27,v31
613 vperm v26,v26,v27,v31
624 stvx_4w v0,0,$A_jagged ; return A[5][5]
626 stvx_4w v1,r11,$A_jagged
628 stvx_4w v2,r10,$A_jagged
630 stvx_4w v3,r11,$A_jagged
632 stvx_4w v4,r10,$A_jagged
634 stvx_4w v5,r11,$A_jagged
636 stvx_4w v6,r10,$A_jagged
638 stvx_4w v7,r11,$A_jagged
640 stvx_4w v8,r10,$A_jagged
642 stvx_4w v9,r11,$A_jagged
644 stvx_4w v10,r10,$A_jagged
646 stvx_4w v11,r11,$A_jagged
647 stvdx_u v12,r10,$A_jagged
649 mr r3,$len ; return value
650 li r10,`15+6*$SIZE_T`
651 li r11,`31+6*$SIZE_T`
653 mtspr 256, r7 ; restore vrsave
679 .byte 0,12,0x04,1,0x80,0,4,0
681 .size SHA3_absorb,.-SHA3_absorb
# Symbolic names for the registers r3, r4, r5 and r6, in that order; they are
# interpolated into the SHA3_squeeze assembly template that follows.
685 my ($A_jagged,$out,$len,$bsz) = map("r$_",(3..6));
689 .type SHA3_squeeze,\@function
692 mflr r9 ; r9 is not touched by KeccakF1600
693 subi $out,$out,1 ; prepare for stbu
694 addi r8,$A_jagged,4 ; prepare volatiles
701 lwzx r0,r11,$A_jagged ; hi
705 stbu r7,1($out) ; write lo
712 stbu r0,1($out) ; write hi
721 beqlr ; return if done
726 addi r11,r11,16 ; calculate jagged index
749 addi r8,$A_jagged,4 ; restore volatiles
775 .byte 0,12,0x14,0,0,0,4,0
777 .size SHA3_squeeze,.-SHA3_squeeze
785 mflr r12 ; vvvvvv "distance" between . and 1st data entry
790 .byte 0,12,0x14,0,0,0,0,0
792 .type rhotates,\@object
808 .size rhotates,.-rhotates
810 .quad 0x0001020304050607,0x1011121314151617
811 .quad 0x1011121314151617,0x0001020304050607
814 .quad 0x0000000000000001,0
815 .quad 0x0000000000008082,0
816 .quad 0x800000000000808a,0
817 .quad 0x8000000080008000,0
818 .quad 0x000000000000808b,0
819 .quad 0x0000000080000001,0
820 .quad 0x8000000080008081,0
821 .quad 0x8000000000008009,0
822 .quad 0x000000000000008a,0
823 .quad 0x0000000000000088,0
824 .quad 0x0000000080008009,0
825 .quad 0x000000008000000a,0
826 .quad 0x000000008000808b,0
827 .quad 0x800000000000008b,0
828 .quad 0x8000000000008089,0
829 .quad 0x8000000000008003,0
830 .quad 0x8000000000008002,0
831 .quad 0x8000000000000080,0
832 .quad 0x000000000000800a,0
833 .quad 0x800000008000000a,0
834 .quad 0x8000000080008081,0
835 .quad 0x8000000000008080,0
836 .quad 0x0000000080000001,0
837 .quad 0x8000000080008008,0
839 .asciz "Keccak-1600 absorb and squeeze for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the accumulated assembly text one line at a time.
842 foreach (split("\n",$code)) {
# Replace every backtick-quoted expression on the line with the result of
# evaluating it as Perl, which resolves the arithmetic on constants such as
# FRAME and SIZE_T that the templates embed.
843 s/\`([^\`]*)\`/eval $1/ge;
# Endianness is chosen by whether the flavour name ends in le.
# NOTE(review): the branch bodies are not visible in this excerpt;
# presumably they resolve the instructions marked with a leading question
# mark in the templates - confirm.
845 if ($flavour =~ /le$/) { # little-endian
847 } else { # big-endian
# STDOUT is a pipe into the translator, so a failure reported at close time
# is treated as fatal.
854 close STDOUT or die "error closing STDOUT: $!";