2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to x86_64 assembly module in Keccak Code Package?
26 # Depending on processor it's either as fast or faster by up to 15%...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # Sandy Bridge 12.9(**)
46 # (*)	Corresponds to SHA3-256. Improvement over compiler-generated
47 #	code varies a lot; the most common coefficient is 15% in comparison to
48 #	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
53 # $output is the last argument if it looks like a file (it has an extension)
54 # $flavour is the first argument if it doesn't look like a file
55 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
56 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows targets are recognized either from the flavour keyword or from an
# .asm output name; they need a different dialect from the translator.
58 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator: first next to this script, then in the
# shared crypto/perlasm directory.
60 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
61 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
62 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
63 die "can't locate x86_64-xlate.pl";
# Pipe everything we print through the translator, which converts the
# "perlasm" dialect into flavour-specific assembly and writes $output.
65 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
66 or die "can't call $xlate: $!";
# Memory layout of the 5x5 Keccak state: lane A[i][j] lives at byte offset
# 8*(5*i+j)-100 from the state pointer.  The callers bias the pointer by
# +100 so every displacement fits in a signed byte, shrinking the encoding.
my @A = map { my $row_base = $_;
              [ map { 8*($row_base + $_) - 100 } (0 .. 4) ]
            } (0, 5, 10, 15, 20);
# Register banks (see the file header: C[5] and D[5] are kept in registers):
# C[0..4] hold the working row / theta column parities, D[0..4] the theta
# D values, and T[0..1] are scratch temporaries.
72 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
73 my @D = map("%r$_",(8..12));
74 my @T = map("%r$_",(13..14));
# Rho-step rotation amounts, indexed as rhotates[i][j] to match A[i][j].
# These are the standard Keccak-f[1600] rotation offsets (FIPS 202).
77 my @rhotates = ([ 0, 1, 62, 28, 27 ],
78 [ 36, 44, 6, 55, 20 ],
79 [ 3, 10, 43, 25, 39 ],
80 [ 41, 45, 15, 21, 8 ],
81 [ 18, 2, 61, 56, 14 ]);
86 .type	__KeccakF1600,\@abi-omnipotent
# Theta, step 1: accumulate the column parities C[x].  Row 4 seeds the
# sums; rows 0..3 are xor-ed in below, interleaved for scheduling.
89 	mov	$A[4][0](%rdi),@C[0]
90 	mov	$A[4][1](%rdi),@C[1]
91 	mov	$A[4][2](%rdi),@C[2]
92 	mov	$A[4][3](%rdi),@C[3]
93 	mov	$A[4][4](%rdi),@C[4]
# Preload the diagonal lanes A[i][i] (i = 0..3) into the D bank; they are
# consumed after the register renaming further down.
98 	mov	$A[0][0](%rdi),@D[0]
99 	mov	$A[1][1](%rdi),@D[1]
100 	mov	$A[2][2](%rdi),@D[2]
101 	mov	$A[3][3](%rdi),@D[3]
103 	xor	$A[0][2](%rdi),@C[2]
104 	xor	$A[0][3](%rdi),@C[3]
106 	xor	$A[0][1](%rdi),@C[1]
107 	xor	$A[1][2](%rdi),@C[2]
108 	xor	$A[1][0](%rdi),@C[0]
110 	xor	$A[0][4](%rdi),@C[4]
113 	xor	$A[2][0](%rdi),@C[0]
114 	xor	$A[1][3](%rdi),@C[3]
116 	xor	$A[1][4](%rdi),@C[4]
118 	xor	$A[3][2](%rdi),@C[2]
119 	xor	$A[3][0](%rdi),@C[0]
120 	xor	$A[2][3](%rdi),@C[3]
121 	xor	$A[2][1](%rdi),@C[1]
122 	xor	$A[2][4](%rdi),@C[4]
# Theta, step 2: combine rotated parities into the D values; the trailing
# comments give each formula.  (Some rotate instructions are elided in
# this listing.)
126 	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
130 	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
131 	xor	$A[3][1](%rdi),@C[1]
134 	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
135 	xor	$A[3][4](%rdi),@C[4]
138 	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
141 	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
# Generation-time renaming only, no instructions emitted: the old C bank
# (rotated by one position) becomes D, and the old D bank becomes C.
143 (@D[0..4], @C) = (@C[1..4,0], @D);
# Output row 0: rho rotations, then chi.  This is the lane-complementing
# variant, so some operands enter complemented (note the ~C terms in the
# R[..] comments); R[0][0] also receives the iota constant.
147 	rol	\$$rhotates[1][1],@C[1]
150 	rol	\$$rhotates[2][2],@C[2]
153 	rol	\$$rhotates[3][3],@C[3]
155 	xor	@C[0],@C[1]		# C[0] ^ ( C[1] | C[2])
156 	rol	\$$rhotates[4][4],@C[4]
163 	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
164 	xor	@C[2],@C[4]		# C[2] ^ ( C[4] & C[3])
166 	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
169 	mov	$A[4][2](%rdi),@C[4]
170 	xor	@T[0],@C[2]		# C[1] ^ (~C[2] | C[3])
171 	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
174 	mov	$A[1][4](%rdi),@C[1]
175 	xor	@T[1],@T[0]		# C[4] ^ ( C[1] & C[0])
176 	mov	$A[2][0](%rdi),@C[2]
177 	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
180 	mov	$A[0][3](%rdi),@C[0]
181 	xor	@C[3],@T[1]		# C[3] ^ ( C[4] | C[0])
182 	mov	$A[3][1](%rdi),@C[3]
183 	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
# Output row 1: loads of the next five input lanes are interleaved with
# the stores above; now rotate and apply chi.
188 	rol	\$$rhotates[0][3],@C[0]
191 	rol	\$$rhotates[4][2],@C[4]
192 	rol	\$$rhotates[3][1],@C[3]
194 	rol	\$$rhotates[1][4],@C[1]
197 	rol	\$$rhotates[2][0],@C[2]
199 	xor	@C[3],@C[0]		# C[3] ^ (C[0] |  C[4])
200 	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])
204 	mov	$A[0][1](%rdi),@C[0]
205 	xor	@C[4],@C[1]		# C[4] ^ (C[1] &  C[0])
207 	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])
210 	mov	$A[1][2](%rdi),@C[1]
211 	xor	@C[2],@C[4]		# C[2] ^ (~C[4] | C[3])
212 	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
215 	mov	$A[4][0](%rdi),@C[4]
216 	xor	@T[1],@C[3]		# C[1] ^ (C[3] &  C[2])
217 	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])
220 	mov	$A[2][3](%rdi),@C[2]
221 	xor	@T[0],@T[1]		# C[0] ^ (C[1] |  C[2])
222 	mov	$A[3][4](%rdi),@C[3]
223 	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])
# Output row 2.
228 	rol	\$$rhotates[2][3],@C[2]
230 	rol	\$$rhotates[3][4],@C[3]
232 	rol	\$$rhotates[1][2],@C[1]
234 	rol	\$$rhotates[4][0],@C[4]
237 	rol	\$$rhotates[0][1],@C[0]
240 	xor	@C[1],@C[2]		# C[1] ^ ( C[2] & C[3])
241 	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])
245 	mov	$A[2][1](%rdi),@C[2]
246 	xor	@T[0],@C[4]		# C[2] ^ ( C[4] & ~C[3])
247 	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])
250 	mov	$A[4][3](%rdi),@C[4]
251 	xor	@C[0],@T[0]		# C[0] ^ ( C[2] | C[1])
252 	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])
255 	xor	@T[1],@C[1]		# C[4] ^ ( C[1] & C[0])
256 	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])
259 	mov	$A[1][0](%rdi),@C[1]
260 	xor	@C[3],@T[1]		# ~C[3] ^ ( C[0] | C[4])
261 	mov	$A[3][2](%rdi),@C[3]
262 	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
265 	mov	$A[0][4](%rdi),@C[0]
# Output row 3.
269 	rol	\$$rhotates[2][1],@C[2]
271 	rol	\$$rhotates[3][2],@C[3]
273 	rol	\$$rhotates[1][0],@C[1]
275 	rol	\$$rhotates[4][3],@C[4]
278 	rol	\$$rhotates[0][4],@C[0]
281 	xor	@C[1],@C[2]		# C[1] ^ ( C[2] | C[3])
282 	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])
286 	xor	@T[0],@C[4]		# C[2] ^ ( C[4] | ~C[3])
287 	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])
290 	xor	@C[0],@T[0]		# C[0] ^ ( C[2] & C[1])
291 	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])
294 	xor	@T[1],@C[1]		# C[4] ^ ( C[1] | C[0])
295 	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])
298 	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] & C[4])
299 	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
# Output row 4: the D bank still carries partially combined lanes; finish
# the xor accumulation and apply the rho rotations here.
302 	xor	$A[0][2](%rdi),@D[2]
303 	xor	$A[1][3](%rdi),@D[3]
304 	rol	\$$rhotates[0][2],@D[2]
305 	xor	$A[4][1](%rdi),@D[1]
306 	rol	\$$rhotates[1][3],@D[3]
307 	xor	$A[2][4](%rdi),@D[4]
308 	rol	\$$rhotates[4][1],@D[1]
309 	xor	$A[3][0](%rdi),@D[0]
311 	rol	\$$rhotates[2][4],@D[4]
312 	rol	\$$rhotates[3][0],@D[0]
# Chi for row 4.  Note the stores go back through rdi (the input bank),
# not rsi, unlike rows 0..3 above.
319 	xor	@C[4],@C[0]		# C[4] ^ ( C[0] & C[1])
320 	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])
324 	xor	@T[0],@C[2]		# C[0] ^ ( C[2] & ~C[1])
325 	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])
328 	xor	@C[3],@T[0]		# C[3] ^ ( C[0] | C[4])
329 	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])
332 	xor	@T[1],@C[4]		# C[2] ^ ( C[4] & C[3])
333 	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])
336 	xor	@C[1],@C[3]		# ~C[1] ^ ( C[2] | C[3])
337 	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
339 	mov	@C[0],@C[1]		# harmonize with the loop top
# The iotas pointer advances one constant per round; 24 rounds * 8 bytes
# = 192, rewound here so the next invocation starts at round 1 again.
345 	lea	-192($iotas),$iotas	# rewind iotas
347 .size	__KeccakF1600,.-__KeccakF1600
# Public wrapper around __KeccakF1600 (body partially elided in this
# listing): sets up the biased state pointer, a stack scratch area and
# the iotas pointer, then restores everything for the caller.
349 .type	KeccakF1600,\@abi-omnipotent
# +100 bias matches the -100 displacements baked into the A[][] offset
# table, keeping every memory operand in signed-byte range.
366 	lea	100(%rdi),%rdi		# size optimization
# 200 bytes of stack — presumably the 25-lane (25*8) T[][] scratch bank
# the header comment describes; confirm against the elided allocation.
368 .cfi_adjust_cfa_offset	200
377 	lea	iotas(%rip),$iotas
378 	lea	100(%rsp),%rsi		# size optimization
# Undo the +100 bias so the caller's state pointer is returned intact.
388 	lea	-100(%rdi),%rdi		# preserve A[][]
391 .cfi_adjust_cfa_offset	-200
407 .size	KeccakF1600,.-KeccakF1600
# SHA3_absorb: four-argument function (A_flat, inp, len, bsz), System V
# order rdi/rsi/rdx/rcx.  Returns len in rax — per the SHA3_absorb C
# contract this should be the count of bytes left unabsorbed (< bsz);
# confirm against sha/keccak1600.c.
410 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# rdi/rsi are repurposed below (biased state / biased stack scratch), so
# A_flat and inp are re-bound to r8/r9 for the rest of the routine.
411 ($A_flat,$inp) = ("%r8","%r9");
414 .type	SHA3_absorb,\@function,4
431 	lea	100(%rdi),%rdi		# size optimization
# 232-byte frame: 200 bytes of scratch plus spill slots for inp/len/bsz
# at (unbiased) offsets 200/208/216, written and re-read below.
433 .cfi_adjust_cfa_offset	232
436 	lea	100(%rsp),%rsi		# size optimization
444 	lea	iotas(%rip),$iotas
446 	mov	$bsz,216-100(%rsi)	# save bsz
453 	lea	-100(%rdi),$A_flat
459 	lea	8($A_flat),$A_flat
465 	mov	$inp,200-100(%rsi)	# save inp
466 	mov	$len,208-100(%rsi)	# save len
# Re-load the saved arguments after the permutation clobbered registers.
468 	mov	200-100(%rsi),$inp	# pull inp
469 	mov	208-100(%rsi),$len	# pull len
470 	mov	216-100(%rsi),$bsz	# pull bsz
475 	mov	$len,%rax		# return value
485 .cfi_adjust_cfa_offset	-232
501 .size	SHA3_absorb,.-SHA3_absorb
# SHA3_squeeze: four-argument function (A_flat, out, len, bsz), System V
# order rdi/rsi/rdx/rcx.  Emits len output bytes a block at a time.
504 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# out/len/bsz are moved to callee-saved r12-r14 so they survive the
# KeccakF1600 calls made between output blocks.
505 ($out,$len,$bsz) = ("%r12","%r13","%r14");
509 .type	SHA3_squeeze,\@function,4
536 	sub	\$8,$len		# len -= 8
# Raw encoding of "rep movsb": byte-wise copy of the remaining output.
551 	.byte	0xf3,0xa4		# rep movsb
562 .size	SHA3_squeeze,.-SHA3_squeeze
# Zero padding placed ahead of the round-constant table.
567 	.quad	0,0,0,0,0,0,0,0
# The 24 Keccak-f[1600] iota round constants, in round order (these match
# the table in FIPS 202); __KeccakF1600 walks this table one quad per
# round and rewinds by 192 bytes at the end.
570 	.quad	0x0000000000000001
571 	.quad	0x0000000000008082
572 	.quad	0x800000000000808a
573 	.quad	0x8000000080008000
574 	.quad	0x000000000000808b
575 	.quad	0x0000000080000001
576 	.quad	0x8000000080008081
577 	.quad	0x8000000000008009
578 	.quad	0x000000000000008a
579 	.quad	0x0000000000000088
580 	.quad	0x0000000080008009
581 	.quad	0x000000008000000a
582 	.quad	0x000000008000808b
583 	.quad	0x800000000000008b
584 	.quad	0x8000000000008089
585 	.quad	0x8000000000008003
586 	.quad	0x8000000000008002
587 	.quad	0x8000000000000080
588 	.quad	0x000000000000800a
589 	.quad	0x800000008000000a
590 	.quad	0x8000000080008081
591 	.quad	0x8000000000008080
592 	.quad	0x0000000080000001
593 	.quad	0x8000000080008008
595 .asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the generated code line by line before it reaches the
# translator.  The substitutions below are deliberately disabled: they
# trade rotates for shld/rorx, which helps some microarchitectures and
# hurts others (see the measurements in the comments).
598 foreach (split("\n",$code)) {
599 	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
600 	# Haswell, but it hurts other processors by up to 2-3-4x...
601 	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
602 	# Below replacement results in 9.3 on Haswell [as well as
603 	# on Ryzen, i.e. it *hurts* Ryzen]...
604 	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;