# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# Keccak-1600 for AVX2.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, the Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations, organized by quadruples, are:
#	[4][4] [3][3] [2][2] [1][1]<-+
#	[0][4] [0][3] [0][2] [0][1]<-+
#	[3][0] [1][0] [4][0] [2][0]  |
#	[4][3] [3][1] [2][4] [1][2]  |
#	[3][4] [1][3] [4][2] [2][1]  |
#	[2][3] [4][1] [1][4] [3][2]  |
#	[2][2] [4][4] [1][1] [3][3] -+
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#	[2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#	[1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#	[4][4] [3][3] [2][2] [1][1]
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded to a register of its own, to all lanes. [A[0][0] is not part of
# the Pi permutation or Rho.] Digits in variables' names denote the
# right-most coordinates:
my ($A00,	# [0][0] [0][0] [0][0] [0][0]	# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]	# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]	# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]	# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]	# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]	# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]	# %ymm6
    map("%ymm$_",(0..6));
# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
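# For illustration only (a sketch, not used by the code below): each
# linear offset encodes (register, lane) as offset = 8*(4*register+lane),
# so the placement of A[i][j], enumerated row-major as above, can be
# recovered like this; e.g. A[0][0] maps to (%ymm0, lane 0) and A[1][1]
# to (%ymm6, lane 0), matching the register comments above.
my @A_reg_lane = map { [ int($_/32), int(($_%32)/8) ] } @A_jagged;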
# But on the other hand, Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle the data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged with
# Pi itself:
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#	[3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#	[3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#	[3][3] [1][1] [4][4] [2][2]
# And the reverse post-Chi permutation:
#
#	[0][4] [0][3] [0][2] [0][1]
#	[3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#	[2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#	[3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#	[1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#	[4][4] [3][3] [2][2] [1][1]
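# The "=" annotations above compose two vpermq immediates into one: with
# dst lane i = src lane (imm >> 2*i) & 3, applying p first and then q is a
# single vpermq whose immediate a hypothetical helper (a sketch, not used
# by this module) would compute as:

sub compose_vpermq {
    my ($p, $q) = @_;			# $p applied first, then $q
    my $imm = 0;
    for my $i (0..3) {
	$imm |= (($p >> 2*(($q >> 2*$i) & 3)) & 3) << 2*$i;
    }
    return $imm;	# e.g. (0b01110010, 0b00011011) -> 0b10001101
}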
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
# (*)	Corresponds to SHA3-256.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
.type	__KeccakF1600,\@function
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	######################################### Theta
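	# Reference for this step (standard Keccak Theta, in this file's
	# A[i][j] notation, second index mod 5):
	#	C[j] = A[0][j] ^ A[1][j] ^ A[2][j] ^ A[3][j] ^ A[4][j]
	#	D[j] = C[j-1] ^ ROL64(C[j+1],1)
	#	A[i][j] ^= D[j]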
	vpxor		$A11,$C14,$C14		# C[1..4]
	vpermq		\$0b10110001,$A20,$C00
	vpermq		\$0b01001110,$C00,@T[0]
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[3]
	vpor		@T[3],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[2]
	vpor		@T[2],@T[0],@T[0]	# ROL64(C[0..0],1)

	vpermq		\$0b00000000,@T[1],$D00
	vpermq		\$0b11111111,$C14,@T[3]
	vpxor		@T[3],$D00,$D00		# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpermq		\$0b00111001,@T[1],$D14
	vpblendd	\$0b11000000,@T[0],$D14,$D14
	vpermq		\$0b10010011,$C14,@T[2]
	vpblendd	\$0b00000011,$C00,@T[2],@T[2]
	vpxor		@T[2],$D14,$D14		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

	vpxor		$D00,$A00,$A00		# ^= D[0..0]
	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D14,$A01,$A01		# ^= D[1..4]
	vpxor		$D14,$A31,$A31		# ^= D[1..4]
	vpxor		$D14,$A21,$A21		# ^= D[1..4]
	vpxor		$D14,$A41,$A41		# ^= D[1..4]
	vpxor		$D14,$A11,$A11		# ^= D[1..4]
	######################################### Rho
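	# Rho rotates each lane: A[i][j] = ROL64(A[i][j], rho[i][j]). AVX2
	# has no 64-bit vector rotate, so each rotation below is a variable
	# left shift, a variable right shift by 64-rho, and an OR, with
	# per-lane counts taken from the rhotates_left/rhotates_right tables.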
	vpsllvq		0*32-96(%r8),$A20,@T[0]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[0],$A20,$A20

	vpsllvq		1*32-96(%r8),$A01,@T[1]
	vpsrlvq		1*32-96(%r9),$A01,$A01
	vpor		@T[1],$A01,$A01

	vpsllvq		2*32-96(%r8),$A31,@T[2]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[2],$A31,$A31

	vpsllvq		3*32-96(%r8),$A21,@T[3]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[3],$A21,$A21

	vpsllvq		4*32-96(%r8),$A41,@T[4]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[4],$A41,$A41

	vpsllvq		5*32-96(%r8),$A11,@T[5]
	vpsrlvq		5*32-96(%r9),$A11,$A11
	vpor		@T[5],$A11,$A11
	######################################### Pi + pre-Chi shuffle
	vpermq		\$0b01110010,$A41,@T[6]	# vpermq \$0b00011011,$A41,$A11
	vpermq		\$0b00011011,$A21,@T[5]	# vpermq \$0b01110010,$A21,$A41
	vpermq		\$0b10001101,$A31,@T[4]	# vpermq \$0b10001101,$A31,$A21
	vpermq		\$0b10001101,$A20,@T[3]	# vpermq \$0b01110010,$A20,$A31
	######################################### Chi
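	# Chi: A[i][j] ^= ~A[i][j+1] & A[i][j+2] (second index mod 5). Each
	# vpandn below computes one ~x & y term in the pre-Chi layout; the
	# final block of vpxor instructions adds the terms to the original
	# lanes.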
	vpermq		\$0b00000000,@T[1],@T[0]	# [0][1] [0][1] [0][1] [0][1]
	vpermq		\$0b01010101,@T[1],@T[7]	# [0][2] [0][2] [0][2] [0][2]
	vpandn		@T[7],@T[0],@T[0]	# targeting [0][0] [0][0] [0][0] [0][0]

	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpermq		\$0b00011110,@T[1],@T[8]	# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	vpblendd	\$0b00110000,$A00,@T[8],@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpandn		@T[8],$A01,$A01		# targeting [0][4] [0][3] [0][2] [0][1]
	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# targeting [3][0] [1][0] [4][0] [2][0]
	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpandn		@T[8],$A31,$A31		# targeting [3][1] [1][2] [4][3] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# targeting [3][4] [1][3] [4][2] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[8]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[6],@T[8],@T[8]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[4],@T[8],@T[8]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A41,$A41		# targeting [3][2] [1][4] [4][1] [2][3]
	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[7]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,@T[4],@T[7],@T[7]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[2],@T[7],@T[7]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[7],$A11,$A11		# targeting [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[2],$A20,$A20
	vpxor		@T[3],$A31,$A31
	vpxor		@T[4],$A21,$A21
	vpxor		@T[5],$A41,$A41
	vpxor		@T[6],$A11,$A11
	vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11
	######################################### Iota
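	# Iota: A[0][0] ^= RC[round]; the 256-bit entry at (%r10) holds the
	# current round constant broadcast to all four lanes.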
	vpxor		(%r10),$A00,$A00
.size	__KeccakF1600,.-__KeccakF1600
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;	# in squeeze

.type	SHA3_absorb,\@function
	lea		96($A_flat),$A_flat
	vpbroadcastq	-96($A_flat),$A00	# load A[0][0] to all lanes, then rest of A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)
	jc		.Ldone_absorb_avx2

	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
for(my $i=5; $i<25; $i++) {
	mov		8*$i-96($inp),%r8
	mov		%r8,$A_jagged[$i]-96(%r10)
	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		32*2-96(%r10),$A20,$A20
	vpxor		32*3-96(%r10),$A31,$A31
	vpxor		32*4-96(%r10),$A21,$A21
	vpxor		32*5-96(%r10),$A41,$A41
	vpxor		32*6-96(%r10),$A11,$A11

	jmp		.Loop_absorb_avx2
	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	lea		($len,$bsz),%rax	# return value
.size	SHA3_absorb,.-SHA3_absorb
.type	SHA3_squeeze,\@function
	lea		96($A_flat),$A_flat

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11
	mov		@A_jagged[$i]-96($A_flat),%r8
for (my $i=0; $i<25; $i++) {
	jc		.Ltail_squeeze_avx2
	je		.Ldone_squeeze_avx2
	je		.Lextend_output_avx2
	mov		@A_jagged[$i+1]-120($A_flat),%r8

.Lextend_output_avx2:
	vmovq		%xmm0,-96($A_flat)
	vmovdqu		$A01,8+32*0-96($A_flat)
	vmovdqu		$A20,8+32*1-96($A_flat)
	vmovdqu		$A31,8+32*2-96($A_flat)
	vmovdqu		$A21,8+32*3-96($A_flat)
	vmovdqu		$A41,8+32*4-96($A_flat)
	vmovdqu		$A11,8+32*5-96($A_flat)

	jmp		.Loop_squeeze_avx2
.size	SHA3_squeeze,.-SHA3_squeeze
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
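	# rhotates_right[i] = 64 - rhotates_left[i], so that the
	# vpsllvq/vpsrlvq/vpor sequence in Rho amounts to a 64-bit rotate of
	# each lane.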
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"