# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for x86_64.
#
# The code below is a [lane-complementing] KECCAK_2X implementation
# (see sha/keccak1600.c) with C[5] and D[5] held in the register bank.
# However, instead of actually unrolling the loop pair-wise, I simply
# flip the T[][] and A[][] pointers at the end of each round. Since
# the number of rounds is even, the last round writes to A[][] and
# everything works out.
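#
# A rough C sketch of that pointer flip (Round and T as in the
# KECCAK_2X code in sha/keccak1600.c; src/dst/swp are ad-hoc names,
# and this is an illustration, not the code generated below):
#
#	uint64_t A[5][5], T[5][5];
#	uint64_t (*src)[5] = A, (*dst)[5] = T, (*swp)[5];
#	int i;
#
#	for (i = 0; i < 24; i++) {
#		Round(dst, src, i);	/* one Keccak-f[1600] round */
#		swp = src, src = dst, dst = swp;	/* flip pointers */
#	}
#	/* 24 is even, so the last round wrote its output to A */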
my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
              8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
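# The A[i][j] offsets above are biased by -100 so that, after the
# "lea 100(%rdi),%rdi" size optimization below, every lane reference
# fits in a signed 8-bit displacement (offsets 0..192 become -100..92).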
my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
my @D = map("%r$_",(8..12));
my @T = map("%r$_",(13..14));
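# rho rotation offsets, $rhotates[$i][$j] applying to lane A[$i][$j]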
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);
.type	__KeccakF1600,\@function
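	# theta: C[j] = A[0][j] ^ A[1][j] ^ A[2][j] ^ A[3][j] ^ A[4][j]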
	mov	$A[4][0](%rdi),@C[0]
	mov	$A[4][1](%rdi),@C[1]
	mov	$A[4][2](%rdi),@C[2]
	mov	$A[4][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]

	xor	$A[0][0](%rdi),@C[0]
	xor	$A[0][1](%rdi),@C[1]
	xor	$A[0][2](%rdi),@C[2]
	xor	$A[0][3](%rdi),@C[3]
	xor	$A[0][4](%rdi),@C[4]

	xor	$A[1][0](%rdi),@C[0]
	xor	$A[1][1](%rdi),@C[1]
	xor	$A[1][2](%rdi),@C[2]
	xor	$A[1][3](%rdi),@C[3]
	xor	$A[1][4](%rdi),@C[4]

	xor	$A[2][0](%rdi),@C[0]
	xor	$A[2][1](%rdi),@C[1]
	xor	$A[2][2](%rdi),@C[2]
	xor	$A[2][3](%rdi),@C[3]
	xor	$A[2][4](%rdi),@C[4]

	xor	$A[3][0](%rdi),@C[0]
	xor	$A[3][1](%rdi),@C[1]
	xor	$A[3][2](%rdi),@C[2]
	xor	$A[3][3](%rdi),@C[3]
	xor	$A[3][4](%rdi),@C[4]
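	# D[j] = ROL64(C[j+1], 1) ^ C[j-1], indices mod 5, computed
	# in place in the C registers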
	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
@D = (@C[1],@C[2],@C[3],@C[4],@C[0]);	# @D[0..4] now name the registers holding D[0..4]
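	# produce round output row R[0][*]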
	mov	$A[0][0](%rdi),@C[0]
	mov	$A[1][1](%rdi),@C[1]
	mov	$A[2][2](%rdi),@C[2]
	mov	$A[3][3](%rdi),@C[3]
	mov	$A[4][4](%rdi),@C[4]

	rol	\$$rhotates[1][1],@C[1]
	rol	\$$rhotates[2][2],@C[2]
	rol	\$$rhotates[3][3],@C[3]
	rol	\$$rhotates[4][4],@C[4]

	xor	@C[0],@C[1]		# C[0] ^ ( C[1] | C[2])
	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
	xor	@C[2],@C[4]		# C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
	xor	@T[0],@C[2]		# C[1] ^ (~C[2] | C[3])
	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
	xor	@T[1],@T[0]		# C[4] ^ ( C[1] & C[0])
	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
	xor	@C[3],@T[1]		# C[3] ^ ( C[4] | C[0])
	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
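	# produce round output row R[1][*]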
	mov	$A[0][3](%rdi),@C[0]
	mov	$A[1][4](%rdi),@C[1]
	mov	$A[2][0](%rdi),@C[2]
	mov	$A[3][1](%rdi),@C[3]
	mov	$A[4][2](%rdi),@C[4]

	rol	\$$rhotates[0][3],@C[0]
	rol	\$$rhotates[1][4],@C[1]
	rol	\$$rhotates[2][0],@C[2]
	rol	\$$rhotates[3][1],@C[3]
	rol	\$$rhotates[4][2],@C[4]

	xor	@C[3],@C[0]		# C[3] ^ (C[0] | C[4])
	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] | C[4])
	xor	@C[4],@C[1]		# C[4] ^ (C[1] & C[0])
	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] & C[0])
	xor	@C[2],@C[4]		# C[2] ^ (~C[4] | C[3])
	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
	xor	@T[1],@C[3]		# C[1] ^ (C[3] & C[2])
	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] & C[2])
	xor	@T[0],@T[1]		# C[0] ^ (C[1] | C[2])
	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] | C[2])
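	# produce round output row R[2][*]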
	mov	$A[0][1](%rdi),@C[0]
	mov	$A[1][2](%rdi),@C[1]
	mov	$A[2][3](%rdi),@C[2]
	mov	$A[3][4](%rdi),@C[3]
	mov	$A[4][0](%rdi),@C[4]

	rol	\$$rhotates[0][1],@C[0]
	rol	\$$rhotates[1][2],@C[1]
	rol	\$$rhotates[2][3],@C[2]
	rol	\$$rhotates[3][4],@C[3]
	rol	\$$rhotates[4][0],@C[4]

	xor	@C[1],@C[2]		# C[1] ^ ( C[2] & C[3])
	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] & ~C[3])
	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] | C[1])
	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] & C[0])
	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])
	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] | C[4])
	mov	@C[0],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
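	# produce round output row R[3][*]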
	mov	$A[0][4](%rdi),@C[0]
	mov	$A[1][0](%rdi),@C[1]
	mov	$A[2][1](%rdi),@C[2]
	mov	$A[3][2](%rdi),@C[3]
	mov	$A[4][3](%rdi),@C[4]

	rol	\$$rhotates[0][4],@C[0]
	rol	\$$rhotates[1][0],@C[1]
	rol	\$$rhotates[2][1],@C[2]
	rol	\$$rhotates[3][2],@C[3]
	rol	\$$rhotates[4][3],@C[4]

	xor	@C[1],@C[2]		# C[1] ^ ( C[2] | C[3])
	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])
	xor	@T[0],@C[4]		# C[2] ^ ( C[4] | ~C[3])
	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])
	xor	@C[0],@T[0]		# C[0] ^ ( C[2] & C[1])
	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])
	xor	@T[1],@C[1]		# C[4] ^ ( C[1] | C[0])
	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])
	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] & C[4])
	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
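	# produce round output row R[4][*]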
	mov	$A[0][2](%rdi),@C[0]
	mov	$A[1][3](%rdi),@C[1]
	mov	$A[2][4](%rdi),@C[2]
	mov	$A[3][0](%rdi),@C[3]
	mov	$A[4][1](%rdi),@C[4]

	rol	\$$rhotates[0][2],@C[0]
	rol	\$$rhotates[1][3],@C[1]
	rol	\$$rhotates[2][4],@C[2]
	rol	\$$rhotates[3][0],@C[3]
	rol	\$$rhotates[4][1],@C[4]

	xor	@C[4],@C[0]		# C[4] ^ ( C[0] & C[1])
	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])
	xor	@T[0],@C[2]		# C[0] ^ ( C[2] & ~C[1])
	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])
	xor	@C[3],@T[0]		# C[3] ^ ( C[0] | C[4])
	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])
	xor	@T[1],@C[4]		# C[2] ^ ( C[4] & C[3])
	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])
	xor	@C[1],@C[3]		# ~C[1] ^ ( C[2] | C[3])
	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])

	mov	@C[2],@D[4]		# harmonize with the loop top
	lea	-192($iotas),$iotas	# rewind iotas
.size	__KeccakF1600,.-__KeccakF1600
.type	KeccakF1600,\@function
	lea	100(%rdi),%rdi		# size optimization
	lea	iotas(%rip),$iotas
	lea	100(%rsp),%rsi		# size optimization
	lea	-100(%rdi),%rdi		# preserve A[][]
.size	KeccakF1600,.-KeccakF1600
{ my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($A_flat,$inp) = ("%r8","%r9");
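# A_flat and inp are re-homed to %r8/%r9, leaving %rdi/%rsi free to
# serve as __KeccakF1600's A[][] and T[][] pointers.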
.type	SHA3_absorb,\@function
	lea	100(%rdi),%rdi		# size optimization
	lea	100(%rsp),%rsi		# size optimization
	lea	iotas(%rip),$iotas
	mov	$bsz,216-100(%rsi)	# save bsz
	lea	-100(%rdi),$A_flat
	lea	8($A_flat),$A_flat
	mov	$inp,200-100(%rsi)	# save inp
	mov	$len,208-100(%rsi)	# save len
	mov	200-100(%rsi),$inp	# pull inp
	mov	208-100(%rsi),$len	# pull len
	mov	216-100(%rsi),$bsz	# pull bsz
	mov	$len,%rax		# return value
.size	SHA3_absorb,.-SHA3_absorb
{ my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
     ($out,$len,$bsz) = ("%r12","%r13","%r14");
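# out, len and bsz move to callee-saved registers so that they
# survive the KeccakF1600 calls made while squeezing.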
.type	SHA3_squeeze,\@function
	sub	\$8,$len		# len -= 8
	.byte	0xf3,0xa4		# rep movsb
.size	SHA3_squeeze,.-SHA3_squeeze
	.quad	0,0,0,0,0,0,0,0
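# iotas: the 24 Keccak round constants, one per round, xored into
# lane [0][0] in the iota step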
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"