2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to assembly module in Keccak Code Package? KCP
26 # is faster on couple of processors, VIA Nano and Goldmont by 4-6%,
27 # otherwise this module is either as fast or faster by up to 15%...
29 ########################################################################
30 # Numbers are cycles per processed byte out of large message.
37 # Sandy Bridge 12.9(**)
46 # (*) Corresponds to SHA3-256. Improvement over compiler-generated
47 # code varies a lot, most common coefficient is 15% in comparison to
48 # gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
# Standard perlasm preamble (excerpt; $flavour/$output initialization is
# elided here): a $flavour containing a dot is actually the output file
# name with no flavour given.
55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention is selected for nasm/masm/mingw64 flavours or
# an .asm output file.
57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script or in the
# sibling perlasm directory.
59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
62 die "can't locate x86_64-xlate.pl";
# Pipe generated code through the translator, which emits the final
# assembly for the requested flavour.
# NOTE(review): two-arg open with a bareword handle is the established
# perlasm convention (OUT is typically aliased to STDOUT later, not
# visible in this excerpt) — do not "modernize" in isolation.
64 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
# @A[i][j] is the byte offset of Keccak state lane A[i][j] (row-major,
# 8 bytes per lane), biased by -100 so that every displacement fits in
# a signed 8-bit byte — callers add 100 to the state pointer (see the
# "size optimization" lea instructions below).
67 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
68 8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
# Register bank: @C holds the five Theta column parities C[0..4],
# @D the five Theta deltas D[0..4], @T two scratch temporaries.
70 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
71 my @D = map("%r$_",(8..12));
72 my @T = map("%r$_",(13..14));
# Per-lane left-rotation amounts of the Keccak rho step, indexed to
# match @A's [i][j] layout (e.g. lane A[1][1] rotates by 44).
75 my @rhotates = ([ 0, 1, 62, 28, 27 ],
76 [ 36, 44, 6, 55, 20 ],
77 [ 3, 10, 43, 25, 39 ],
78 [ 41, 45, 15, 21, 8 ],
79 [ 18, 2, 61, 56, 14 ]);
# __KeccakF1600: core permutation round body (excerpt — the original
# interleaves further instructions between the lines shown here).
# Reads the current state bank via %rdi, writes round output rows 0-3
# to the other bank via %rsi; the pointers are flipped each round (see
# file header), so row 4 of the last group is stored back through %rdi.
84 .type __KeccakF1600,\@function
# Theta, part 1: seed C[0..4] with row 4 of the state...
87 mov $A[4][0](%rdi),@C[0]
88 mov $A[4][1](%rdi),@C[1]
89 mov $A[4][2](%rdi),@C[2]
90 mov $A[4][3](%rdi),@C[3]
91 mov $A[4][4](%rdi),@C[4]
# ...stash the diagonal lanes A[i][i] in @D for the final output group
# (see the @D reuse near the bottom)...
96 mov $A[0][0](%rdi),@D[0]
97 mov $A[1][1](%rdi),@D[1]
98 mov $A[2][2](%rdi),@D[2]
99 mov $A[3][3](%rdi),@D[3]
# ...and fold the remaining rows into the column parities C[x].
101 xor $A[0][2](%rdi),@C[2]
102 xor $A[0][3](%rdi),@C[3]
104 xor $A[0][1](%rdi),@C[1]
105 xor $A[1][2](%rdi),@C[2]
106 xor $A[1][0](%rdi),@C[0]
108 xor $A[0][4](%rdi),@C[4]
111 xor $A[2][0](%rdi),@C[0]
112 xor $A[1][3](%rdi),@C[3]
114 xor $A[1][4](%rdi),@C[4]
116 xor $A[3][2](%rdi),@C[2]
117 xor $A[3][0](%rdi),@C[0]
118 xor $A[2][3](%rdi),@C[3]
119 xor $A[2][1](%rdi),@C[1]
120 xor $A[2][4](%rdi),@C[4]
# Theta, part 2: combine (rotated) parities into the deltas
# D[x] = ROL64(C[x+1],1) ^ C[x-1]; the rol instructions feeding these
# xors are elided in this excerpt.
124 xor @C[0],@C[2] # D[1] = ROL64(C[2], 1) ^ C[0]
128 xor @C[3],@C[0] # D[4] = ROL64(C[0], 1) ^ C[3]
129 xor $A[3][1](%rdi),@C[1]
132 xor @C[1],@C[3] # D[2] = ROL64(C[3], 1) ^ C[1]
133 xor $A[3][4](%rdi),@C[4]
136 xor @C[4],@C[1] # D[0] = ROL64(C[1], 1) ^ C[4]
139 xor @T[0],@C[4] # D[3] = ROL64(C[4], 1) ^ C[2]
# Perl-level rename: from here on @D refers to the registers now
# holding D[0..4] (no instruction is emitted for this line).
142 @D = (@C[1],@C[2],@C[3],@C[4],@C[0]);
# Output row 0 (diagonal lanes): rho rotations, then chi with the
# lane-complementing variants noted in the inline comments, stored to
# the T bank at %rsi.
147 rol \$$rhotates[1][1],@C[1]
150 rol \$$rhotates[2][2],@C[2]
153 rol \$$rhotates[3][3],@C[3]
155 xor @C[0],@C[1] # C[0] ^ ( C[1] | C[2])
156 rol \$$rhotates[4][4],@C[4]
163 mov @C[1],$A[0][0](%rsi) # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
164 xor @C[2],@C[4] # C[2] ^ ( C[4] & C[3])
166 mov @C[4],$A[0][2](%rsi) # R[0][2] = C[2] ^ ( C[4] & C[3])
169 xor @T[0],@C[2] # C[1] ^ (~C[2] | C[3])
170 mov @C[2],$A[0][1](%rsi) # R[0][1] = C[1] ^ (~C[2] | C[3])
173 xor @T[1],@T[0] # C[4] ^ ( C[1] & C[0])
174 mov @T[0],$A[0][4](%rsi) # R[0][4] = C[4] ^ ( C[1] & C[0])
177 xor @C[3],@T[1] # C[3] ^ ( C[4] | C[0])
178 mov @T[1],$A[0][3](%rsi) # R[0][3] = C[3] ^ ( C[4] | C[0])
# Output row 1: gather the pi-permuted source lanes, apply rho, then
# chi, storing to %rsi (theta xors with D[] are elided here).
181 mov $A[0][3](%rdi),@C[0]
182 mov $A[4][2](%rdi),@C[4]
183 mov $A[3][1](%rdi),@C[3]
184 mov $A[1][4](%rdi),@C[1]
185 mov $A[2][0](%rdi),@C[2]
189 rol \$$rhotates[0][3],@C[0]
192 rol \$$rhotates[4][2],@C[4]
193 rol \$$rhotates[3][1],@C[3]
195 rol \$$rhotates[1][4],@C[1]
198 rol \$$rhotates[2][0],@C[2]
200 xor @C[3],@C[0] # C[3] ^ (C[0] | C[4])
201 mov @C[0],$A[1][3](%rsi) # R[1][3] = C[3] ^ (C[0] | C[4])
205 xor @C[4],@C[1] # C[4] ^ (C[1] & C[0])
207 mov @C[1],$A[1][4](%rsi) # R[1][4] = C[4] ^ (C[1] & C[0])
210 xor @C[2],@C[4] # C[2] ^ (~C[4] | C[3])
211 mov @C[4],$A[1][2](%rsi) # R[1][2] = C[2] ^ (~C[4] | C[3])
214 xor @T[1],@C[3] # C[1] ^ (C[3] & C[2])
215 mov @C[3],$A[1][1](%rsi) # R[1][1] = C[1] ^ (C[3] & C[2])
218 xor @T[0],@T[1] # C[0] ^ (C[1] | C[2])
219 mov @T[1],$A[1][0](%rsi) # R[1][0] = C[0] ^ (C[1] | C[2])
# Output row 2: same gather/rotate/chi/store pattern.
222 mov $A[2][3](%rdi),@C[2]
223 mov $A[3][4](%rdi),@C[3]
224 mov $A[1][2](%rdi),@C[1]
225 mov $A[4][0](%rdi),@C[4]
226 mov $A[0][1](%rdi),@C[0]
230 rol \$$rhotates[2][3],@C[2]
232 rol \$$rhotates[3][4],@C[3]
234 rol \$$rhotates[1][2],@C[1]
236 rol \$$rhotates[4][0],@C[4]
239 rol \$$rhotates[0][1],@C[0]
242 xor @C[1],@C[2] # C[1] ^ ( C[2] & C[3])
243 mov @C[2],$A[2][1](%rsi) # R[2][1] = C[1] ^ ( C[2] & C[3])
247 xor @T[0],@C[4] # C[2] ^ ( C[4] & ~C[3])
248 mov @C[4],$A[2][2](%rsi) # R[2][2] = C[2] ^ ( C[4] & ~C[3])
251 xor @C[0],@T[0] # C[0] ^ ( C[2] | C[1])
252 mov @T[0],$A[2][0](%rsi) # R[2][0] = C[0] ^ ( C[2] | C[1])
255 xor @T[1],@C[1] # C[4] ^ ( C[1] & C[0])
256 mov @C[1],$A[2][4](%rsi) # R[2][4] = C[4] ^ ( C[1] & C[0])
259 xor @C[3],@C[0] # ~C[3] ^ ( C[0] | C[4])
260 mov @C[0],$A[2][3](%rsi) # R[2][3] = ~C[3] ^ ( C[0] | C[4])
# Output row 3: same pattern again with the row-3 pi sources.
263 mov $A[2][1](%rdi),@C[2]
264 mov $A[3][2](%rdi),@C[3]
265 mov $A[1][0](%rdi),@C[1]
266 mov $A[4][3](%rdi),@C[4]
267 mov $A[0][4](%rdi),@C[0]
271 rol \$$rhotates[2][1],@C[2]
273 rol \$$rhotates[3][2],@C[3]
275 rol \$$rhotates[1][0],@C[1]
277 rol \$$rhotates[4][3],@C[4]
280 rol \$$rhotates[0][4],@C[0]
283 xor @C[1],@C[2] # C[1] ^ ( C[2] | C[3])
284 mov @C[2],$A[3][1](%rsi) # R[3][1] = C[1] ^ ( C[2] | C[3])
288 xor @T[0],@C[4] # C[2] ^ ( C[4] | ~C[3])
289 mov @C[4],$A[3][2](%rsi) # R[3][2] = C[2] ^ ( C[4] | ~C[3])
292 xor @C[0],@T[0] # C[0] ^ ( C[2] & C[1])
293 mov @T[0],$A[3][0](%rsi) # R[3][0] = C[0] ^ ( C[2] & C[1])
296 xor @T[1],@C[1] # C[4] ^ ( C[1] | C[0])
297 mov @C[1],$A[3][4](%rsi) # R[3][4] = C[4] ^ ( C[1] | C[0])
300 xor @C[3],@C[0] # ~C[3] ^ ( C[0] & C[4])
301 mov @C[0],$A[3][3](%rsi) # R[3][3] = ~C[3] ^ ( C[0] & C[4])
# Output row 4: reuses the diagonal lanes saved in @D at the top,
# applying theta (xor with D[]) and rho here; results go back through
# %rdi — presumably because the caller's pointer flip makes this the
# correct destination bank for the final group (TODO confirm against
# the elided loop-control lines).
304 xor $A[0][2](%rdi),@D[2]
305 xor $A[1][3](%rdi),@D[3]
306 rol \$$rhotates[0][2],@D[2]
307 xor $A[4][1](%rdi),@D[1]
308 rol \$$rhotates[1][3],@D[3]
309 xor $A[2][4](%rdi),@D[4]
310 rol \$$rhotates[4][1],@D[1]
311 xor $A[3][0](%rdi),@D[0]
313 rol \$$rhotates[2][4],@D[4]
314 rol \$$rhotates[3][0],@D[0]
# Perl-level rename only — @C now aliases the row-4 working registers.
316 @C = (@D[2],@D[3],@D[4],@D[0],@D[1]);
321 xor @C[4],@C[0] # C[4] ^ ( C[0] & C[1])
322 mov @C[0],$A[4][4](%rdi) # R[4][4] = C[4] ^ ( C[0] & C[1])
326 xor @T[0],@C[2] # C[0] ^ ( C[2] & ~C[1])
327 mov @C[2],$A[4][0](%rdi) # R[4][0] = C[0] ^ ( C[2] & ~C[1])
330 xor @C[3],@T[0] # C[3] ^ ( C[0] | C[4])
331 mov @T[0],$A[4][3](%rdi) # R[4][3] = C[3] ^ ( C[0] | C[4])
334 xor @T[1],@C[4] # C[2] ^ ( C[4] & C[3])
335 mov @C[4],$A[4][2](%rdi) # R[4][2] = C[2] ^ ( C[4] & C[3])
338 xor @C[1],@C[3] # ~C[1] ^ ( C[2] | C[3])
339 mov @C[3],$A[4][1](%rdi) # R[4][1] = ~C[1] ^ ( C[2] | C[3])
341 mov @C[0],@C[1] # harmonize with the loop top
# 24 rounds consume 24*8 = 192 bytes of round constants; step the
# iotas pointer back for the next invocation.
347 lea -192($iotas),$iotas # rewind iotas
349 .size __KeccakF1600,.-__KeccakF1600
# KeccakF1600: public wrapper around __KeccakF1600 (excerpt — the
# prologue/epilogue register saves and the call itself are elided).
352 .type KeccakF1600,\@function
# Bias the state pointer by +100 so all @A displacements fit in a
# signed byte.
369 lea 100(%rdi),%rdi # size optimization
# 200 bytes of stack for the scratch bank T[][] (25 lanes * 8 bytes).
371 .cfi_adjust_cfa_offset 200
380 lea iotas(%rip),$iotas
# %rsi gets the same +100 bias for the stack-resident bank.
381 lea 100(%rsp),%rsi # size optimization
# Undo the +100 bias before returning the caller's A[][] pointer.
391 lea -100(%rdi),%rdi # preserve A[][]
394 .cfi_adjust_cfa_offset -200
410 .size KeccakF1600,.-KeccakF1600
# SHA3_absorb(A_flat, inp, len, bsz): absorb input into the state,
# calling the permutation once per bsz-byte block (excerpt — the
# absorb loop body and register saves are elided).
# Argument registers per SysV ABI; $A_flat/$inp are re-homed into
# %r8/%r9 so %rdi/%rsi stay free for __KeccakF1600.
413 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
414 ($A_flat,$inp) = ("%r8","%r9");
417 .type SHA3_absorb,\@function
434 lea 100(%rdi),%rdi # size optimization
# Frame: 200 bytes of T[][] plus slots for inp/len/bsz and alignment.
436 .cfi_adjust_cfa_offset 232
439 lea 100(%rsp),%rsi # size optimization
447 lea iotas(%rip),$iotas
# Spill the loop-control values above the T[][] area (offsets are
# relative to the +100-biased %rsi, hence the -100 correction).
449 mov $bsz,216-100(%rsi) # save bsz
456 lea -100(%rdi),$A_flat
# Walk the flat state one 8-byte lane at a time.
462 lea 8($A_flat),$A_flat
468 mov $inp,200-100(%rsi) # save inp
469 mov $len,208-100(%rsi) # save len
# Restore loop-control values after the permutation clobbered the
# general-purpose registers.
471 mov 200-100(%rsi),$inp # pull inp
472 mov 208-100(%rsi),$len # pull len
473 mov 216-100(%rsi),$bsz # pull bsz
# Returns the number of bytes left unabsorbed (len % bsz remainder).
478 mov $len,%rax # return value
488 .cfi_adjust_cfa_offset -232
504 .size SHA3_absorb,.-SHA3_absorb
# SHA3_squeeze(A_flat, out, len, bsz): emit len output bytes,
# permuting the state between bsz-byte blocks (excerpt — the loop
# structure and tail handling are elided). out/len/bsz move to
# callee-saved %r12-%r14 so they survive the permutation call.
507 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
508 ($out,$len,$bsz) = ("%r12","%r13","%r14");
512 .type SHA3_squeeze,\@function
# Full 8-byte lanes are emitted one quadword at a time.
539 sub \$8,$len # len -= 8
# Sub-lane tail copy; encoded as raw bytes for toolchain portability.
554 .byte 0xf3,0xa4 # rep movsb
565 .size SHA3_squeeze,.-SHA3_squeeze
# Zero padding before the constant table (presumably for alignment —
# the .align/label directives are elided in this excerpt).
570 .quad 0,0,0,0,0,0,0,0
# The 24 Keccak-f[1600] round constants (iota step), consumed 8 bytes
# per round; __KeccakF1600 rewinds the pointer by 24*8 = 192 bytes.
573 .quad 0x0000000000000001
574 .quad 0x0000000000008082
575 .quad 0x800000000000808a
576 .quad 0x8000000080008000
577 .quad 0x000000000000808b
578 .quad 0x0000000080000001
579 .quad 0x8000000080008081
580 .quad 0x8000000000008009
581 .quad 0x000000000000008a
582 .quad 0x0000000000000088
583 .quad 0x0000000080008009
584 .quad 0x000000008000000a
585 .quad 0x000000008000808b
586 .quad 0x800000000000008b
587 .quad 0x8000000000008089
588 .quad 0x8000000000008003
589 .quad 0x8000000000008002
590 .quad 0x8000000000000080
591 .quad 0x000000000000800a
592 .quad 0x800000008000000a
593 .quad 0x8000000080008081
594 .quad 0x8000000000008080
595 .quad 0x0000000080000001
596 .quad 0x8000000080008008
598 .asciz "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
601 foreach (split("\n",$code)) {
602 # Below replacement results in 11.3 on Sandy Bridge, 9.4 on
603 # Haswell, but it hurts other processors by up to 2-3-4x...
604 #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;