# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Keccak-1600 for x86_64.
# The code below is a [lane-complementing] KECCAK_2X implementation (see
# sha/keccak1600.c) with C[5] and D[5] held in the register bank. Though
# instead of actually unrolling the loop pair-wise, I simply flip the
# pointers to T[][] and A[][] at the end of each round. Since the number
# of rounds is even, the last round writes to A[][] and everything works out.
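#
# As an illustration only, a minimal C-style sketch of that pointer flip
# (Round() and the variable names are hypothetical, not part of this module):
#
#	uint64_t A[5][5], T[5][5], (*src)[5] = A, (*dst)[5] = T, (*tmp)[5];
#	for (int i = 0; i < 24; i++) {
#		Round(dst, src, iotas[i]);	/* reads src[][], writes dst[][] */
#		tmp = src; src = dst; dst = tmp;	/* flip the pointers */
#	}
#	/* 24 is even, so the last round has written back into A[][] */
#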
# How does it compare to the assembly module in the Keccak Code Package?
# KCP is faster on a couple of processors, VIA Nano and Goldmont, by 4-6%;
# otherwise this module is either as fast or faster by up to 15%...
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
# Sandy Bridge		12.9(**)
# (*)	Corresponds to SHA3-256. Improvement over compiler-generated code
#	varies a lot; the most common coefficient is 15% in comparison to
#	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
# (**)	Sandy Bridge has a broken rotate instruction. Performance can be
#	improved by 14% by replacing the rotates with a double-precision
#	shift that uses the same register as source and destination.
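#	For instance, "rol $44,%rbx" can be expressed as "shld $44,%rbx,%rbx",
#	which computes the same 64-bit rotation; the commented-out
#	substitution at the bottom of this file performs exactly that rewrite.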
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
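# A[][] lives in memory and is addressed via byte offsets biased by -100
# (hence the "lea 100(%rdi),%rdi" size optimization below); C[5] is held
# in rax..rbp, D[5] in r8..r12, and r13/r14 serve as temporaries.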
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
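# (@rhotates above holds the rho-step rotation offsets, indexed to match
# the A[][] indexing used throughout.)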
.type	__KeccakF1600,\@function
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]
	mov	$A[0][0](%rdi),@D[0]
	mov	$A[1][1](%rdi),@D[1]
	mov	$A[2][2](%rdi),@D[2]
	mov	$A[3][3](%rdi),@D[3]
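	# Theta: the xors below fold the state columns into the column
	# parities C[0..4]; D[0..3] hold the diagonal lanes A[x][x].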
	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][0](%rdi),@C[0]
	xor	$A[0][4](%rdi),@C[4]
	xor	$A[2][0](%rdi),@C[0]
	xor	$A[1][3](%rdi),@C[3]
	xor	$A[1][4](%rdi),@C[4]
	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][0](%rdi),@C[0]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][4](%rdi),@C[4]
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	$A[3][1](%rdi),@C[1]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	$A[3][4](%rdi),@C[4]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
	@D = (@C[1],@C[2],@C[3],@C[4],@C[0]);
	rol	\$$rhotates[1][1],@C[1]
	rol	\$$rhotates[2][2],@C[2]
	rol	\$$rhotates[3][3],@C[3]
	xor	@C[0],@C[1]		# C[0] ^ ( C[1] | C[2])
	rol	\$$rhotates[4][4],@C[4]
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		# C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
	xor	@T[0],@C[2]		# C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
	xor	@T[1],@T[0]		# C[4] ^ ( C[1] & C[0])
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
	xor	@C[3],@T[1]		# C[3] ^ ( C[4] | C[0])
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
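	# Output row 1: gather A[0][3], A[4][2], A[3][1], A[1][4], A[2][0]
	# (pi), rotate them (rho) and combine them (chi) into R[1][*].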
	mov	$A[0][3](%rdi),@C[0]
	mov	$A[4][2](%rdi),@C[4]
	mov	$A[3][1](%rdi),@C[3]
	mov	$A[1][4](%rdi),@C[1]
	mov	$A[2][0](%rdi),@C[2]
	rol	\$$rhotates[0][3],@C[0]
	rol	\$$rhotates[4][2],@C[4]
	rol	\$$rhotates[3][1],@C[3]
	rol	\$$rhotates[1][4],@C[1]
	rol	\$$rhotates[2][0],@C[2]
	xor	@C[3],@C[0]		# C[3] ^ (C[0] | C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] | C[4])
	xor	@C[4],@C[1]		# C[4] ^ (C[1] & C[0])
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] & C[0])
	xor	@C[2],@C[4]		# C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
	xor	@T[1],@C[3]		# C[1] ^ (C[3] & C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] & C[2])
	xor	@T[0],@T[1]		# C[0] ^ (C[1] | C[2])
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] | C[2])
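	# Output row 2: gather A[2][3], A[3][4], A[1][2], A[4][0], A[0][1]
	# (pi), rotate them (rho) and combine them (chi) into R[2][*].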
	mov	$A[2][3](%rdi),@C[2]
	mov	$A[3][4](%rdi),@C[3]
	mov	$A[1][2](%rdi),@C[1]
	mov	$A[4][0](%rdi),@C[4]
	mov	$A[0][1](%rdi),@C[0]
	rol	\$$rhotates[2][3],@C[2]
	rol	\$$rhotates[3][4],@C[3]
	rol	\$$rhotates[1][2],@C[1]
	rol	\$$rhotates[4][0],@C[4]
	rol	\$$rhotates[0][1],@C[0]
	xor	@C[1],@C[2]		# C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])
	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] | C[4])
	mov	@C[0],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
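	# Output row 3: gather A[2][1], A[3][2], A[1][0], A[4][3], A[0][4]
	# (pi), rotate them (rho) and combine them (chi) into R[3][*].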
	mov	$A[2][1](%rdi),@C[2]
	mov	$A[3][2](%rdi),@C[3]
	mov	$A[1][0](%rdi),@C[1]
	mov	$A[4][3](%rdi),@C[4]
	mov	$A[0][4](%rdi),@C[0]
	rol	\$$rhotates[2][1],@C[2]
	rol	\$$rhotates[3][2],@C[3]
	rol	\$$rhotates[1][0],@C[1]
	rol	\$$rhotates[4][3],@C[4]
	rol	\$$rhotates[0][4],@C[0]
	xor	@C[1],@C[2]		# C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])
	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
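	# Output row 4: xor A[0][2], A[1][3], A[4][1], A[2][4], A[3][0] into
	# the D[] values (theta), rotate (rho), then apply chi below.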
	xor	$A[0][2](%rdi),@D[2]
	xor	$A[1][3](%rdi),@D[3]
	rol	\$$rhotates[0][2],@D[2]
	xor	$A[4][1](%rdi),@D[1]
	rol	\$$rhotates[1][3],@D[3]
	xor	$A[2][4](%rdi),@D[4]
	rol	\$$rhotates[4][1],@D[1]
	xor	$A[3][0](%rdi),@D[0]
	rol	\$$rhotates[2][4],@D[4]
	rol	\$$rhotates[3][0],@D[0]
	@C = (@D[2],@D[3],@D[4],@D[0],@D[1]);
	xor	@C[4],@C[0]		# C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])
	xor	@T[0],@C[2]		# C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])
	xor	@C[3],@T[0]		# C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])
	xor	@T[1],@C[4]		# C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])
	xor	@C[1],@C[3]		# ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
	mov	@C[0],@C[1]		# harmonize with the loop top
	lea	-192($iotas),$iotas	# rewind iotas
.size	__KeccakF1600,.-__KeccakF1600
.type	KeccakF1600,\@function
	lea	100(%rdi),%rdi		# size optimization
	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization
	lea	-100(%rdi),%rdi		# preserve A[][]
.size	KeccakF1600,.-KeccakF1600
{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
.type	SHA3_absorb,\@function
	lea	100(%rdi),%rdi		# size optimization
	lea	100(%rsp),%rsi		# size optimization
	lea	iotas(%rip),$iotas
	mov	$bsz,216-100(%rsi)	# save bsz
	lea	-100(%rdi),$A_flat
	lea	8($A_flat),$A_flat
	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	mov	$len,%rax		# return value
.size	SHA3_absorb,.-SHA3_absorb
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");
.type	SHA3_squeeze,\@function
	sub	\$8,$len		# len -= 8
	.byte	0xf3,0xa4		# rep movsb
.size	SHA3_squeeze,.-SHA3_squeeze
	.quad	0,0,0,0,0,0,0,0
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
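	# (The 24 .quad values above are the Keccak-f[1600] round constants
	# added in the iota step.)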
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"

foreach (split("\n",$code)) {
	# The replacement below yields 11.3 on Sandy Bridge and 9.4 on
	# Haswell, but it hurts other processors by up to 2-4x...
	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;