2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
26 ########################################################################
27 # Numbers are cycles per processed byte out of a large message.
# Pick apart the perlasm calling convention: if the first argument
# contains a dot it is actually the output file name, not a flavour.
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 calling convention is selected by flavour or by .asm output suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Pipe all generated code through the translator. The open must be
# checked: an unchecked piped open would silently emit no output at all
# if the translator could not be started.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
# A[y][x] yields the byte offset of 64-bit lane (y,x) within the state.
# Offsets are biased by -100 so that, once the base pointer has been
# advanced by 100 ("lea 100(%rdi),%rdi # size optimization" below),
# every displacement fits in a signed byte and instructions encode
# shorter.
56 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
57 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
# Register bank: C[5] working/column values and D[5] theta deltas are
# kept in registers (see the KECCAK_2X note in the header comment);
# T[2] are temporaries.
59 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
60 my @D = map("%r$_",(8..12));
61 my @T = map("%r$_",(13..14));
# Rotation amounts for the rho step of Keccak-f[1600], indexed as
# rhotates[y][x] (cf. sha/keccak1600.c referenced in the header).
64 my @rhotates = ([ 0, 1, 62, 28, 27 ],
65 [ 36, 44, 6, 55, 20 ],
66 [ 3, 10, 43, 25, 39 ],
67 [ 41, 45, 15, 21, 8 ],
68 [ 18, 2, 61, 56, 14 ]);
73 .type __KeccakF1600,\@function
# __KeccakF1600: Keccak-f[1600] round body with lane complementing (see
# header comment). %rdi points 100 bytes into the current bank A[][],
# %rsi 100 bytes into the alternate bank T[][]; output rows 0-3 are
# stored via (%rsi) and the caller flips the two pointers between
# rounds — with an even round count the result lands back in A[][].
# $iotas walks the round-constant table and is rewound at the end.
# NOTE(review): this listing is a sampled excerpt of the generator;
# heredoc markers ($code.=<<___;), labels and several instruction
# groups fall between the visible lines.
# Theta, step 1: seed the column parities C[x] with row 4...
76 mov $A[4][0](%rdi),@C[0]
77 mov $A[4][1](%rdi),@C[1]
78 mov $A[4][2](%rdi),@C[2]
79 mov $A[4][3](%rdi),@C[3]
80 mov $A[4][4](%rdi),@C[4]
# ...then fold in rows 0 through 3: C[x] = A[0][x]^A[1][x]^...^A[4][x].
85 xor $A[0][0](%rdi),@C[0]
86 xor $A[0][1](%rdi),@C[1]
87 xor $A[0][2](%rdi),@C[2]
88 xor $A[0][3](%rdi),@C[3]
89 xor $A[0][4](%rdi),@C[4]
91 xor $A[1][0](%rdi),@C[0]
92 xor $A[1][1](%rdi),@C[1]
93 xor $A[1][2](%rdi),@C[2]
94 xor $A[1][3](%rdi),@C[3]
95 xor $A[1][4](%rdi),@C[4]
97 xor $A[2][0](%rdi),@C[0]
98 xor $A[2][1](%rdi),@C[1]
99 xor $A[2][2](%rdi),@C[2]
100 xor $A[2][3](%rdi),@C[3]
101 xor $A[2][4](%rdi),@C[4]
103 xor $A[3][0](%rdi),@C[0]
104 xor $A[3][1](%rdi),@C[1]
105 xor $A[3][2](%rdi),@C[2]
106 xor $A[3][3](%rdi),@C[3]
107 xor $A[3][4](%rdi),@C[4]
# Theta, step 2: turn the parities into deltas D[x] = ROL64(C[x+1],1) ^
# C[x-1], interleaved with prefetching the diagonal lanes A[x][x] into
# the @D registers. (The matching 1-bit rol instructions sit between
# the visible lines.)
111 mov $A[0][0](%rdi),@D[0]
112 xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
115 mov $A[1][1](%rdi),@D[1]
116 xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
119 mov $A[2][2](%rdi),@D[2]
120 xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
123 mov $A[3][3](%rdi),@D[3]
124 xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
127 mov $A[4][4](%rdi),@D[4]
128 xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
# Perl-level renaming: the computed D values live in the registers the
# @C names referred to; @D now addresses them in D[0..4] order.
131 @D = (@C[1],@C[2],@C[3],@C[4],@C[0]);
# Rho on the diagonal lanes (the theta xors with D[] and the register
# renaming that restores @C are between the visible lines).
140 rol \$$rhotates[1][1],@C[1]
141 rol \$$rhotates[2][2],@C[2]
142 rol \$$rhotates[3][3],@C[3]
143 rol \$$rhotates[4][4],@C[4]
# Chi (+iota for lane [0][0]) for output row 0, stored into the
# alternate bank via (%rsi); the and/or/not mixing instructions fall
# between the visible lines — the trailing comments record the intended
# complemented-lane expressions.
147 xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
150 mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
154 xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
155 mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
159 xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
160 mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
163 xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
164 mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
167 xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
168 mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
# Gather the five lanes feeding output row 1.
171 mov $A[0][3](%rdi),@C[0]
172 mov $A[1][4](%rdi),@C[1]
173 mov $A[2][0](%rdi),@C[2]
174 mov $A[3][1](%rdi),@C[3]
175 mov $A[4][2](%rdi),@C[4]
# Rho for row 1 (the preceding theta xors with D[] are outside the
# excerpt).
183 rol \$$rhotates[0][3],@C[0]
184 rol \$$rhotates[1][4],@C[1]
185 rol \$$rhotates[2][0],@C[2]
186 rol \$$rhotates[3][1],@C[3]
187 rol \$$rhotates[4][2],@C[4]
# Chi for output row 1.
191 xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
192 mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
196 xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
197 mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
201 xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
202 mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
205 xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
206 mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
209 xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
210 mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
# Gather the five lanes feeding output row 2.
213 mov $A[0][1](%rdi),@C[0]
214 mov $A[1][2](%rdi),@C[1]
215 mov $A[2][3](%rdi),@C[2]
216 mov $A[3][4](%rdi),@C[3]
217 mov $A[4][0](%rdi),@C[4]
# Rho for row 2 (theta xors again outside the excerpt).
225 rol \$$rhotates[0][1],@C[0]
226 rol \$$rhotates[1][2],@C[1]
227 rol \$$rhotates[2][3],@C[2]
228 rol \$$rhotates[3][4],@C[3]
229 rol \$$rhotates[4][0],@C[4]
# Chi for output row 2.
233 xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
234 mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
239 xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
240 mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
243 xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
244 mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
247 xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
248 mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
251 xor @C[3],@C[0] # ~C[3] ^ ( C[0] | C[4])
252 mov @C[0],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
# Gather the five lanes feeding output row 3.
255 mov $A[0][4](%rdi),@C[0]
256 mov $A[1][0](%rdi),@C[1]
257 mov $A[2][1](%rdi),@C[2]
258 mov $A[3][2](%rdi),@C[3]
259 mov $A[4][3](%rdi),@C[4]
# Rho for row 3.
267 rol \$$rhotates[0][4],@C[0]
268 rol \$$rhotates[1][0],@C[1]
269 rol \$$rhotates[2][1],@C[2]
270 rol \$$rhotates[3][2],@C[3]
271 rol \$$rhotates[4][3],@C[4]
# Chi for output row 3.
275 xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
276 mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
281 xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
282 mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
285 xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
286 mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
289 xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
290 mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
293 xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
294 mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
# Last row group: fold each remaining lane into its theta delta
# (D[x] ^= A[y][x]) and rotate — this yields the lanes feeding output
# row 4.
297 xor $A[0][2](%rdi),@D[2]
298 xor $A[1][3](%rdi),@D[3]
299 xor $A[2][4](%rdi),@D[4]
300 xor $A[3][0](%rdi),@D[0]
301 xor $A[4][1](%rdi),@D[1]
304 rol \$$rhotates[0][2],@D[2]
305 rol \$$rhotates[1][3],@D[3]
306 rol \$$rhotates[2][4],@D[4]
307 rol \$$rhotates[3][0],@D[0]
308 rol \$$rhotates[4][1],@D[1]
# Perl-level renaming back to @C, in chi input order for row 4.
310 @C = (@D[2],@D[3],@D[4],@D[0],@D[1]);
# Chi for output row 4 — note the stores go via (%rdi), i.e. over the
# input bank, unlike rows 0-3.
314 xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
315 mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
320 xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
321 mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
324 xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
325 mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
328 xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
329 mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
332 xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
333 mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
335 mov @C[0],@C[1] # harmonize with the loop top
# Rewind by 192 = 24 rounds x 8-byte constants (see the iotas table).
341 lea -192($iotas),$iotas # rewind iotas
343 .size __KeccakF1600,.-__KeccakF1600
346 .type KeccakF1600,\@function
# KeccakF1600: public entry point around __KeccakF1600. Biases the
# state pointer by +100 so the signed-byte displacements in @A apply,
# points %rsi at stack scratch space for the alternate bank, and loads
# the round-constant pointer. NOTE(review): excerpt — the prologue
# (register saves), the call into __KeccakF1600 and the epilogue fall
# between the visible lines.
356 lea 100(%rdi),%rdi # size optimization
366 lea iotas(%rip),$iotas
367 lea 100(%rsp),%rsi # size optimization
# Undo the +100 bias before returning so the caller's A[][] pointer
# semantics are preserved.
377 lea -100(%rdi),%rdi # preserve A[][]
388 .size KeccakF1600,.-KeccakF1600
# SHA3_absorb(A_flat, inp, len, bsz): absorb |inp| into the state in
# bsz-byte blocks, permuting after each full block. Arguments arrive
# per the SysV ABI; A_flat/inp are then re-homed in %r8/%r9.
# NOTE(review): excerpt — the prologue, the xor-absorb loop body and
# the permutation call are largely between the visible lines.
391 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
392 ($A_flat,$inp) = ("%r8","%r9");
395 .type SHA3_absorb,\@function
# Apply the same +100 bias as KeccakF1600 to both the state and the
# stack scratch bank.
405 lea 100(%rdi),%rdi # size optimization
409 lea 100(%rsp),%rsi # size optimization
417 lea iotas(%rip),$iotas
# inp/len/bsz are spilled into the stack frame above the 200-byte
# scratch bank (offsets 200/208/216, pre-bias) across the permutation.
419 mov $bsz,216-100(%rsi) # save bsz
426 lea -100(%rdi),$A_flat
432 lea 8($A_flat),$A_flat
438 mov $inp,200-100(%rsi) # save inp
439 mov $len,208-100(%rsi) # save len
441 mov 200-100(%rsi),$inp # pull inp
442 mov 208-100(%rsi),$len # pull len
443 mov 216-100(%rsi),$bsz # pull bsz
# Return $len in %rax — presumably the count of residual bytes left
# unabsorbed (< bsz); confirm against sha/keccak1600.c's contract.
448 mov $len,%rax # return value
466 .size SHA3_absorb,.-SHA3_absorb
# SHA3_squeeze(A_flat, out, len, bsz): copy len output bytes from the
# state, permuting between bsz-byte blocks. NOTE(review): excerpt —
# the squeeze loop and the KeccakF1600 calls sit between the visible
# lines.
469 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# out/len/bsz are re-homed in callee-saved registers so they survive
# the calls made in the omitted loop body.
470 ($out,$len,$bsz) = ("%r12","%r13","%r14");
474 .type SHA3_squeeze,\@function
# Whole-lane copy path consumes 8 bytes per iteration.
497 sub \$8,$len # len -= 8
# Tail copy of the final partial lane: 0xf3,0xa4 hand-encodes
# "rep movsb", copying %rcx bytes from (%rsi) to (%rdi).
512 .byte 0xf3,0xa4 # rep movsb
519 .size SHA3_squeeze,.-SHA3_squeeze
# Eight zero quads ahead of the round constants — presumably padding/
# alignment before the iotas label, which itself falls between the
# visible lines; TODO confirm against the full source.
524 .quad 0,0,0,0,0,0,0,0
# The 24 Keccak-f[1600] round constants RC[0..23] (iota step), 8 bytes
# each; __KeccakF1600 rewinds $iotas by 192 = 24*8 after a full pass.
527 .quad 0x0000000000000001
528 .quad 0x0000000000008082
529 .quad 0x800000000000808a
530 .quad 0x8000000080008000
531 .quad 0x000000000000808b
532 .quad 0x0000000080000001
533 .quad 0x8000000080008081
534 .quad 0x8000000000008009
535 .quad 0x000000000000008a
536 .quad 0x0000000000000088
537 .quad 0x0000000080008009
538 .quad 0x000000008000000a
539 .quad 0x000000008000808b
540 .quad 0x800000000000008b
541 .quad 0x8000000000008089
542 .quad 0x8000000000008003
543 .quad 0x8000000000008002
544 .quad 0x8000000000000080
545 .quad 0x000000000000800a
546 .quad 0x800000008000000a
547 .quad 0x8000000080008081
548 .quad 0x8000000000008080
549 .quad 0x0000000080000001
550 .quad 0x8000000080008008
# Embedded attribution string (kept byte-for-byte; the \@ escapes the
# sigil inside the Perl heredoc).
552 .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"