2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for AVX-512F.
20 # The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
21 # Pretty straightforward, the only "magic" is data layout in registers.
22 # It's impossible to have one that is optimal for every step, hence
23 # it's changing as algorithm progresses. Data is saved in order that
24 # benefits Chi, but at the same time is easily convertible to order
25 # that benefits Theta. Conversion from Chi layout to Theta is
26 # explicit and reverse one is kind of fused with Pi...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # (*) Corresponds to SHA3-256.
38 ########################################################################
39 # Coordinates below correspond to those in sha/keccak1600.c. Layout
40 # suitable for Chi is one with y coordinates aligned column-wise. Trick
41 # is to add regular shift to x coordinate, so that Chi can still be
42 # performed with as little as 7 instructions, yet be converted to layout
43 # suitable for Theta with intra-register permutations alone. Here is
44 # "magic" layout for Chi (with pre-Theta shuffle):
46 # [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
47 # [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
48 # [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
49 # [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
50 # [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
52 # Layout suitable to Theta has x coordinates aligned column-wise
53 # [it's interleaved with Pi indices transformation for reference]:
55 # [4][4] [3][3] [2][2] [1][1] [0][0] $A00
56 ##[0][4] [0][3] [0][2] [0][1] [0][0]
57 # [3][4] [2][3] [1][2] [0][1] [4][0] $A01
58 ##[2][3] [2][2] [2][1] [2][0] [2][4]
59 # [2][4] [1][3] [0][2] [4][1] [3][0] $A02
60 ##[4][2] [4][1] [4][0] [4][4] [4][3]
61 # [1][4] [0][3] [4][2] [3][1] [2][0] $A03
62 ##[1][1] [1][0] [1][4] [1][3] [1][2]
63 # [0][4] [4][3] [3][2] [2][1] [1][0] $A04
64 ##[3][0] [3][4] [3][3] [3][2] [3][1]
66 # Pi itself is performed by blending above data and finally shuffling it
67 # to original Chi layout:
69 # [1][1] [2][2] [3][3] [4][4] [0][0]>1.2.3.4.0>[4][4] [3][3] [2][2] [1][1] [0][0]
70 # [2][3] [3][4] [4][0] [0][1] [1][2]>2.3.4.0.1>[4][0] [3][4] [2][3] [1][2] [0][1]
71 # [3][0] [4][1] [0][2] [1][3] [2][4]>3.4.0.1.2>[4][1] [3][0] [2][4] [1][3] [0][2]
72 # [4][2] [0][3] [1][4] [2][0] [3][1]>4.0.1.2.3>[4][2] [3][1] [2][0] [1][4] [0][3]
73 # [0][4] [1][0] [2][1] [3][2] [4][3]>0.1.2.3.4>[4][3] [3][2] [2][1] [1][0] [0][4]
75 # As implied, data is loaded in Chi layout. Digits in variables' names
76 # represent the rightmost coordinates of the loaded data chunk:
# Five zmm registers hold the 5x5 Keccak state, one register per row of
# the Chi-oriented layout documented in the header; only the low five
# qwords of each register carry lane data.
78 my ($A00, # [4][4] [3][3] [2][2] [1][1] [0][0]
79 $A01, # [4][0] [3][4] [2][3] [1][2] [0][1]
80 $A02, # [4][1] [3][0] [2][4] [1][3] [0][2]
81 $A03, # [4][2] [3][1] [2][0] [1][4] [0][3]
82 $A04) = # [4][3] [3][2] [2][1] [1][0] [0][4]
# NOTE(review): the right-hand side of this assignment (the zmm register
# list for A00..A04) falls on lines not visible in this chunk.
85 # We also need to map the magic order into offsets within structure:
87 my @A_jagged = ([0,0], [1,0], [2,0], [3,0], [4,0],
88 [4,1], [0,1], [1,1], [2,1], [3,1],
89 [3,2], [4,2], [0,2], [1,2], [2,2],
90 [2,3], [3,3], [4,3], [0,3], [1,3],
91 [1,4], [2,4], [3,4], [4,4], [0,4]);
# Input offsets use an 8-qword (64-byte) stride per row, matching the
# zmm-aligned transfer area on the stack; output offsets use the flat
# 5x5 state's 40-byte row stride.
# NOTE(review): A_jagged_in/A_jagged_out are not declared with "my" --
# harmless without "use strict", but would fail under strictures.
92 @A_jagged_in = map(8*($$_[0]*8+$$_[1]), @A_jagged); # ... and now linear
93 @A_jagged_out = map(8*($$_[0]*5+$$_[1]), @A_jagged); # ... and now linear
# Scratch registers (T[0]/T[1] double as C00/D00 below) and the
# permutation/rotation constant registers loaded from theta_perm:
95 my @T = map("%zmm$_",(5..7,16..17));
96 my @Chi = map("%zmm$_",(18..22));
97 my @Theta = map("%zmm$_",(33,23..26)); # invalid @Theta[0] is not typo
98 my @Rhotate = map("%zmm$_",(27..31));
100 my ($C00,$D00) = @T[0..1];
# Opmask registers: k1..k5 each select a single qword lane (bits 0..4),
# k6 selects the low five lanes.
101 my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
106 .type __KeccakF1600,\@function
# Body of one Keccak-f[1600] round (Theta, Rho, Pi, Chi, Iota) on the
# five state registers.  NOTE(review): the function label, prologue and
# the 24-round loop control are on lines not visible in this chunk.
115 ######################################### Theta
# Rows 1..4 are permuted from the Chi layout into the Theta layout
# (x coordinates column-aligned); row 0 already has the right order,
# hence the commented-out permutation below.
116 #vpermq $A00,@Theta[0],$A00 # doesn't actually change order
117 vpermq $A01,@Theta[1],$A01
118 vpermq $A02,@Theta[2],$A02
119 vpermq $A03,@Theta[3],$A03
120 vpermq $A04,@Theta[4],$A04
# Column parities C: ternlog immediate 0x96 is a three-input XOR, so
# two instructions fold all five rows into one register.
122 vmovdqa64 $A00,@T[0] # put aside original A00
123 vpternlogq \$0x96,$A02,$A01,$A00 # and use it as "C00"
124 vpternlogq \$0x96,$A04,$A03,$A00
# NOTE(review): the step deriving D00 from C00 (presumably a rotate)
# sits on lines not visible in this chunk.
127 vpermq $A00,@Theta[1],$A00
128 vpermq $D00,@Theta[4],$D00
# Fold the Theta correction (C rotated, plus D) into every row --
# again a three-way XOR per row.
130 vpternlogq \$0x96,$A00,$D00,@T[0] # T[0] is original A00
131 vpternlogq \$0x96,$A00,$D00,$A01
132 vpternlogq \$0x96,$A00,$D00,$A02
133 vpternlogq \$0x96,$A00,$D00,$A03
134 vpternlogq \$0x96,$A00,$D00,$A04
136 ######################################### Rho
# Per-lane variable left rotations; the rotation amounts were preloaded
# into the Rhotate registers from the constant table.
137 vprolvq @Rhotate[0],@T[0],$A00 # T[0] is original A00
138 vprolvq @Rhotate[1],$A01,$A01
139 vprolvq @Rhotate[2],$A02,$A02
140 vprolvq @Rhotate[3],$A03,$A03
141 vprolvq @Rhotate[4],$A04,$A04
143 ######################################### Pi
# Build the Pi-permuted rows in scratch registers T0..T4 by blending one
# qword lane at a time.  The first group fills each T row whole (mask
# bit 1 selects lane 1 from the first source, remaining lanes come from
# the second); the next three groups patch lanes 2, 3 and 4 via
# merge-masked blends.
144 vpblendmq $A02,$A00,@{T[0]}{$k00010}
145 vpblendmq $A00,$A03,@{T[1]}{$k00010}
146 vpblendmq $A03,$A01,@{T[2]}{$k00010}
147 vpblendmq $A01,$A04,@{T[3]}{$k00010}
148 vpblendmq $A04,$A02,@{T[4]}{$k00010}
150 vpblendmq $A04,@T[0],@{T[0]}{$k00100}
151 vpblendmq $A02,@T[1],@{T[1]}{$k00100}
152 vpblendmq $A00,@T[2],@{T[2]}{$k00100}
153 vpblendmq $A03,@T[3],@{T[3]}{$k00100}
154 vpblendmq $A01,@T[4],@{T[4]}{$k00100}
156 vpblendmq $A01,@T[0],@{T[0]}{$k01000}
157 vpblendmq $A04,@T[1],@{T[1]}{$k01000}
158 vpblendmq $A02,@T[2],@{T[2]}{$k01000}
159 vpblendmq $A00,@T[3],@{T[3]}{$k01000}
160 vpblendmq $A03,@T[4],@{T[4]}{$k01000}
162 vpblendmq $A03,@T[0],@{T[0]}{$k10000}
163 vpblendmq $A01,@T[1],@{T[1]}{$k10000}
164 vpblendmq $A04,@T[2],@{T[2]}{$k10000}
165 vpblendmq $A02,@T[3],@{T[3]}{$k10000}
166 vpblendmq $A00,@T[4],@{T[4]}{$k10000}
# Shuffle the blended rows back into the canonical Chi layout
# (the intra-register permutation fused with Pi, per the header notes).
168 vpermq @T[0],@Chi[0],$A00
169 vpermq @T[1],@Chi[1],$A01
170 vpermq @T[2],@Chi[2],$A02
171 vpermq @T[3],@Chi[3],$A03
172 vpermq @T[4],@Chi[4],$A04
174 ######################################### Chi
# Nonlinear step: ternlog immediate 0xD2 computes, per bit,
# dest = dest XOR (NOT middle-source AND outer-source), i.e. the Keccak
# Chi relation a ^= ~b & c, one instruction per row.
# NOTE(review): the copies of the pre-Chi A00/A01 into T0/T1 (consumed
# by the last two rows below) are on lines not visible in this chunk.
176 vpternlogq \$0xD2,$A02,$A01,$A00
178 vpternlogq \$0xD2,$A03,$A02,$A01
179 vpternlogq \$0xD2,$A04,$A03,$A02
180 vpternlogq \$0xD2,@T[0],$A04,$A03
181 vpternlogq \$0xD2,@T[1],@T[0],$A04
183 ######################################### Iota
# XOR the current round constant (addressed by r10) into lane [0][0]
# only, via the single-bit opmask.
184 vpxorq (%r10),$A00,${A00}{$k00001}
191 .size __KeccakF1600,.-__KeccakF1600
# SysV AMD64 argument registers for SHA3_absorb(A_flat, inp, len, bsz).
194 my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
195 my $out = $inp; # in squeeze
199 .type SHA3_absorb,\@function
# Bias the state pointer by 96 so the five 40-byte rows sit at small
# signed displacements (see the 40*n-96 addressing below).
207 lea 96($A_flat),$A_flat
213 lea theta_perm(%rip),%r8
# Build the opmasks: kxnorw of a register with itself sets all 16 bits;
# a right shift by 15 leaves the single low bit (lane-0 mask), by 11 the
# low five bits (all-lanes mask); left shifts of the lane-0 mask produce
# the remaining single-lane masks.
215 kxnorw $k11111,$k11111,$k11111
216 kshiftrw \$15,$k11111,$k00001
217 kshiftrw \$11,$k11111,$k11111
218 kshiftlw \$1,$k00001,$k00010
219 kshiftlw \$2,$k00001,$k00100
220 kshiftlw \$3,$k00001,$k01000
221 kshiftlw \$4,$k00001,$k10000
# Load the Theta/Rho/Chi constant tables, one 64-byte row each, from
# the theta_perm block addressed by r8.
223 #vmovdqa64 64*0(%r8),@Theta[0]
224 vmovdqa64 64*1(%r8),@Theta[1]
225 vmovdqa64 64*2(%r8),@Theta[2]
226 vmovdqa64 64*3(%r8),@Theta[3]
227 vmovdqa64 64*4(%r8),@Theta[4]
229 vmovdqa64 64*5(%r8),@Rhotate[0]
230 vmovdqa64 64*6(%r8),@Rhotate[1]
231 vmovdqa64 64*7(%r8),@Rhotate[2]
232 vmovdqa64 64*8(%r8),@Rhotate[3]
233 vmovdqa64 64*9(%r8),@Rhotate[4]
235 vmovdqa64 64*10(%r8),@Chi[0]
236 vmovdqa64 64*11(%r8),@Chi[1]
237 vmovdqa64 64*12(%r8),@Chi[2]
238 vmovdqa64 64*13(%r8),@Chi[3]
239 vmovdqa64 64*14(%r8),@Chi[4]
# Masked, zero-extending loads of the five 40-byte state rows from the
# flat representation into the zmm state registers.
241 vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
242 vpxorq @T[0],@T[0],@T[0]
243 vmovdqu64 40*1-96($A_flat),${A01}{$k11111}{z}
244 vmovdqu64 40*2-96($A_flat),${A02}{$k11111}{z}
245 vmovdqu64 40*3-96($A_flat),${A03}{$k11111}{z}
246 vmovdqu64 40*4-96($A_flat),${A04}{$k11111}{z}
# Clear the five 64-byte rows of the stack transfer area (at r9-128)
# that each input block is scattered into before being XORed in.
248 vmovdqa64 @T[0],0*64-128(%r9) # zero transfer area on stack
249 vmovdqa64 @T[0],1*64-128(%r9)
250 vmovdqa64 @T[0],2*64-128(%r9)
251 vmovdqa64 @T[0],3*64-128(%r9)
252 vmovdqa64 @T[0],4*64-128(%r9)
253 jmp .Loop_absorb_avx512
# NOTE(review): the loop head (presumably subtracting bsz from len so
# that carry means "less than one block left") is not visible here.
259 jc .Ldone_absorb_avx512
# Perl-generated unrolled copy: scatter the 25 input qwords of the
# block to their jagged offsets in the transfer area.
# NOTE(review): the closing brace of this loop and the following
# heredoc boundary are on lines not visible in this chunk.
263 for(my $i=0; $i<25; $i++) {
265 mov 8*$i-96($inp),%r8
266 mov %r8,$A_jagged_in[$i]-128(%r9)
# XOR the scattered block into the state, one 64-byte row at a time
# (unused lanes of the transfer area are zero, so they are no-ops).
275 vpxorq 64*0-128(%r9),$A00,$A00
276 vpxorq 64*1-128(%r9),$A01,$A01
277 vpxorq 64*2-128(%r9),$A02,$A02
278 vpxorq 64*3-128(%r9),$A03,$A03
279 vpxorq 64*4-128(%r9),$A04,$A04
283 jmp .Loop_absorb_avx512
286 .Ldone_absorb_avx512:
# Write the five state rows back to the flat 5x5 representation.
287 vmovdqu64 $A00,40*0-96($A_flat){$k11111}
288 vmovdqu64 $A01,40*1-96($A_flat){$k11111}
289 vmovdqu64 $A02,40*2-96($A_flat){$k11111}
290 vmovdqu64 $A03,40*3-96($A_flat){$k11111}
291 vmovdqu64 $A04,40*4-96($A_flat){$k11111}
# NOTE(review): presumably len has gone negative by at most bsz at this
# point, making len+bsz the residual byte count -- the loop arithmetic
# establishing that is not visible in this chunk.
296 lea ($len,$bsz),%rax # return value
298 .size SHA3_absorb,.-SHA3_absorb
301 .type SHA3_squeeze,\@function
# Bias the state pointer exactly as in SHA3_absorb.
306 lea 96($A_flat),$A_flat
# Skip the table/state (re)load when no further permutation is needed
# (presumably a comparison of the requested length against the residual
# block precedes this -- that line is not visible in this chunk).
308 jbe .Lno_output_extension_avx512
# Same opmask, constant-table and state setup as in SHA3_absorb.
312 lea theta_perm(%rip),%r8
314 kxnorw $k11111,$k11111,$k11111
315 kshiftrw \$15,$k11111,$k00001
316 kshiftrw \$11,$k11111,$k11111
317 kshiftlw \$1,$k00001,$k00010
318 kshiftlw \$2,$k00001,$k00100
319 kshiftlw \$3,$k00001,$k01000
320 kshiftlw \$4,$k00001,$k10000
322 #vmovdqa64 64*0(%r8),@Theta[0]
323 vmovdqa64 64*1(%r8),@Theta[1]
324 vmovdqa64 64*2(%r8),@Theta[2]
325 vmovdqa64 64*3(%r8),@Theta[3]
326 vmovdqa64 64*4(%r8),@Theta[4]
328 vmovdqa64 64*5(%r8),@Rhotate[0]
329 vmovdqa64 64*6(%r8),@Rhotate[1]
330 vmovdqa64 64*7(%r8),@Rhotate[2]
331 vmovdqa64 64*8(%r8),@Rhotate[3]
332 vmovdqa64 64*9(%r8),@Rhotate[4]
334 vmovdqa64 64*10(%r8),@Chi[0]
335 vmovdqa64 64*11(%r8),@Chi[1]
336 vmovdqa64 64*12(%r8),@Chi[2]
337 vmovdqa64 64*13(%r8),@Chi[3]
338 vmovdqa64 64*14(%r8),@Chi[4]
340 vmovdqu64 40*0-96($A_flat),${A00}{$k11111}{z}
341 vmovdqu64 40*1-96($A_flat),${A01}{$k11111}{z}
342 vmovdqu64 40*2-96($A_flat),${A02}{$k11111}{z}
343 vmovdqu64 40*3-96($A_flat),${A03}{$k11111}{z}
344 vmovdqu64 40*4-96($A_flat),${A04}{$k11111}{z}
346 .Lno_output_extension_avx512:
350 .Loop_squeeze_avx512:
# Read state words in output order via the A_jagged_out offsets.
# NOTE(review): this instruction references the Perl loop variable of
# the for-loop opened on the next visible line; the intervening lines
# (including the heredoc boundaries) are missing from this chunk.
351 mov @A_jagged_out[$i]-96($A_flat),%r8
353 for (my $i=0; $i<25; $i++) {
# Fewer than 8 bytes wanted: finish with the byte-wise tail copy.
356 jc .Ltail_squeeze_avx512
359 je .Ldone_squeeze_avx512
# End of the current block: run another permutation for more output.
361 je .Lextend_output_avx512
362 mov @A_jagged_out[$i+1]-96($A_flat),%r8
366 .Lextend_output_avx512:
# After the extra __KeccakF1600 call (not visible here), write the
# refreshed state back to the flat representation and keep squeezing.
369 vmovdqu64 $A00,40*0-96($A_flat){$k11111}
370 vmovdqu64 $A01,40*1-96($A_flat){$k11111}
371 vmovdqu64 $A02,40*2-96($A_flat){$k11111}
372 vmovdqu64 $A03,40*3-96($A_flat){$k11111}
373 vmovdqu64 $A04,40*4-96($A_flat){$k11111}
376 jmp .Loop_squeeze_avx512
# Byte-by-byte copy of the final partial word.
379 .Ltail_squeeze_avx512:
386 jnz .Loop_tail_avx512
388 .Ldone_squeeze_avx512:
393 .size SHA3_squeeze,.-SHA3_squeeze
# Constant tables.  NOTE(review): the theta_perm label and any
# section/alignment directives precede this on lines not visible in
# this chunk.
# Rows 0..4 (offsets 64*0..64*4): qword permutations for the
# Chi-to-Theta layout conversion; indices 5..7 keep the unused upper
# lanes in place.
397 .quad 0, 1, 2, 3, 4, 5, 6, 7 # [not used]
398 .quad 4, 0, 1, 2, 3, 5, 6, 7
399 .quad 3, 4, 0, 1, 2, 5, 6, 7
400 .quad 2, 3, 4, 0, 1, 5, 6, 7
401 .quad 1, 2, 3, 4, 0, 5, 6, 7
# Rows 5..9 (offsets 64*5..64*9): Rho rotation amounts, one row per
# state register, zeros in the unused lanes.
404 .quad 0, 44, 43, 21, 14, 0, 0, 0 # [0][0] [1][1] [2][2] [3][3] [4][4]
405 .quad 18, 1, 6, 25, 8, 0, 0, 0 # [4][0] [0][1] [1][2] [2][3] [3][4]
406 .quad 41, 2, 62, 55, 39, 0, 0, 0 # [3][0] [4][1] [0][2] [1][3] [2][4]
407 .quad 3, 45, 61, 28, 20, 0, 0, 0 # [2][0] [3][1] [4][2] [0][3] [1][4]
408 .quad 36, 10, 15, 56, 27, 0, 0, 0 # [1][0] [2][1] [3][2] [4][3] [0][4]
# Rows 10..14 (offsets 64*10..64*14): permutations restoring the
# canonical Chi layout after the Pi blends.
411 .quad 0, 4, 3, 2, 1, 5, 6, 7
412 .quad 1, 0, 4, 3, 2, 5, 6, 7
413 .quad 2, 1, 0, 4, 3, 5, 6, 7
414 .quad 3, 2, 1, 0, 4, 5, 6, 7
415 .quad 4, 3, 2, 1, 0, 5, 6, 7
# The 24 Keccak-f[1600] round constants (Iota step), consumed one per
# round through the pointer in r10.
418 .quad 0x0000000000000001
419 .quad 0x0000000000008082
420 .quad 0x800000000000808a
421 .quad 0x8000000080008000
422 .quad 0x000000000000808b
423 .quad 0x0000000080000001
424 .quad 0x8000000080008081
425 .quad 0x8000000000008009
426 .quad 0x000000000000008a
427 .quad 0x0000000000000088
428 .quad 0x0000000080008009
429 .quad 0x000000008000000a
430 .quad 0x000000008000808b
431 .quad 0x800000000000008b
432 .quad 0x8000000000008089
433 .quad 0x8000000000008003
434 .quad 0x8000000000008002
435 .quad 0x8000000000000080
436 .quad 0x000000000000800a
437 .quad 0x800000008000000a
438 .quad 0x8000000080008081
439 .quad 0x8000000000008080
440 .quad 0x0000000080000001
441 .quad 0x8000000080008008
443 .asciz "Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"