# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX-512F.
#
# The code below is a KECCAK_1X_ALT implementation (see sha/keccak1600.c).
# It's pretty straightforward; the only "magic" is the data layout in
# registers. No single layout is optimal for every step, so the layout
# changes as the algorithm progresses. Data is kept in an order that
# benefits Chi, but at the same time is easily convertible to an order
# that benefits Theta. The conversion from the Chi layout to the Theta
# layout is explicit, while the reverse one is essentially fused with Pi...
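#
# For orientation, here is a plain-C sketch of one Keccak-f[1600] round
# in the A[x][y] convention of FIPS 202 (sha/keccak1600.c remains the
# canonical reference; ROL64 is a 64-bit left rotate, rhotate[][] the
# usual rho offsets as tabulated at the bottom of this file):
#
#	for (x = 0; x < 5; x++)				/* Theta */
#		C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
#	for (x = 0; x < 5; x++)
#		D[x] = C[(x+4)%5] ^ ROL64(C[(x+1)%5], 1);
#	for (x = 0; x < 5; x++)
#		for (y = 0; y < 5; y++)
#			A[x][y] ^= D[x];
#	for (x = 0; x < 5; x++)				/* Rho + Pi */
#		for (y = 0; y < 5; y++)
#			B[y][(2*x+3*y)%5] = ROL64(A[x][y], rhotate[x][y]);
#	for (x = 0; x < 5; x++)				/* Chi */
#		for (y = 0; y < 5; y++)
#			A[x][y] = B[x][y] ^ (~B[(x+1)%5][y] & B[(x+2)%5][y]);
#	A[0][0] ^= iotas[round];			/* Iota */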
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
# (*) Corresponds to SHA3-256.
#
########################################################################
# Coordinates below correspond to those in sha/keccak1600.c. The layout
# suitable for Chi is one with the y coordinates aligned column-wise. The
# trick is to add a regular shift to the x coordinate, so that Chi can
# still be performed with as few as 7 instructions, yet be converted to a
# layout suitable for Theta with intra-register permutations alone. Here
# is the "magic" layout for Chi (with the pre-Theta shuffle shown on the
# right):
#
# [4][4] [3][3] [2][2] [1][1] [0][0]>4.3.2.1.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [4][0] [3][4] [2][3] [1][2] [0][1]>3.2.1.0.4>[3][4] [2][3] [1][2] [0][1] [4][0]
# [4][1] [3][0] [2][4] [1][3] [0][2]>2.1.0.4.3>[2][4] [1][3] [0][2] [4][1] [3][0]
# [4][2] [3][1] [2][0] [1][4] [0][3]>1.0.4.3.2>[1][4] [0][3] [4][2] [3][1] [2][0]
# [4][3] [3][2] [2][1] [1][0] [0][4]>0.4.3.2.1>[0][4] [4][3] [3][2] [2][1] [1][0]
#
# The layout suitable for Theta has the x coordinates aligned column-wise
# [interleaved with the Pi index transformation, shown for reference]:
#
# [4][4] [3][3] [2][2] [1][1] [0][0]	$A00
##[0][4] [0][3] [0][2] [0][1] [0][0]
# [3][4] [2][3] [1][2] [0][1] [4][0]	$A01
##[2][3] [2][2] [2][1] [2][0] [2][4]
# [2][4] [1][3] [0][2] [4][1] [3][0]	$A02
##[4][2] [4][1] [4][0] [4][4] [4][3]
# [1][4] [0][3] [4][2] [3][1] [2][0]	$A03
##[1][1] [1][0] [1][4] [1][3] [1][2]
# [0][4] [4][3] [3][2] [2][1] [1][0]	$A04
##[3][0] [3][4] [3][3] [3][2] [3][1]
#
# Pi itself is performed by blending the above data and finally shuffling
# it back to the original Chi layout:
#
# [1][1] [2][2] [3][3] [4][4] [0][0]>1.2.3.4.0>[4][4] [3][3] [2][2] [1][1] [0][0]
# [2][3] [3][4] [4][0] [0][1] [1][2]>2.3.4.0.1>[4][0] [3][4] [2][3] [1][2] [0][1]
# [3][0] [4][1] [0][2] [1][3] [2][4]>3.4.0.1.2>[4][1] [3][0] [2][4] [1][3] [0][2]
# [4][2] [0][3] [1][4] [2][0] [3][1]>4.0.1.2.3>[4][2] [3][1] [2][0] [1][4] [0][3]
# [0][4] [1][0] [2][1] [3][2] [4][3]>0.1.2.3.4>[4][3] [3][2] [2][1] [1][0] [0][4]
#
# As implied, data is loaded in the Chi layout. The digits in the
# variables' names represent the rightmost coordinates of the loaded data
# chunk:
my ($A00,	# [4][4] [3][3] [2][2] [1][1] [0][0]
    $A01,	# [4][0] [3][4] [2][3] [1][2] [0][1]
    $A02,	# [4][1] [3][0] [2][4] [1][3] [0][2]
    $A03,	# [4][2] [3][1] [2][0] [1][4] [0][3]
    $A04) =	# [4][3] [3][2] [2][1] [1][0] [0][4]
    map("%zmm$_",(0..4));
# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [2,0], [3,0], [4,0],
                [4,1], [0,1], [1,1], [2,1], [3,1],
                [3,2], [4,2], [0,2], [1,2], [2,2],
                [2,3], [3,3], [4,3], [0,3], [1,3],
                [1,4], [2,4], [3,4], [4,4], [0,4]);
my @A_jagged_in  = map(8*($$_[0]*8+$$_[1]), @A_jagged);	# ... and now linear, 8-lane rows (stack transfer area)
my @A_jagged_out = map(8*($$_[0]*5+$$_[1]), @A_jagged);	# ... and now linear, 5-lane rows (flat A[5][5])
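#
# A worked example of the two linearizations (illustrative arithmetic
# only): the sixth lane, @A_jagged[5] = [4,1], lands at byte offset
# 8*(4*8+1) = 264 in the stack transfer area, whose rows are padded to
# 8 lanes to match the zmm registers, but at byte offset 8*(4*5+1) = 168
# in the flat state, whose rows are 5 lanes (40 bytes) each.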
my @T       = map("%zmm$_",(5..7,16..17));
my @Chi     = map("%zmm$_",(18..22));
my @Theta   = map("%zmm$_",(33,23..26));	# the invalid @Theta[0] is not a typo, it's never used
my @Rhotate = map("%zmm$_",(27..31));

my ($C00,$D00) = @T[0..1];
my ($k00001,$k00010,$k00100,$k01000,$k10000,$k11111) = map("%k$_",(1..6));
.type	__KeccakF1600,\@function

	######################################### Theta
	#vpermq		$A00,@Theta[0],$A00	# doesn't actually change order
	vpermq		$A01,@Theta[1],$A01
	vpermq		$A02,@Theta[2],$A02
	vpermq		$A03,@Theta[3],$A03
	vpermq		$A04,@Theta[4],$A04

	vpxorq		$A01,$A00,$C00
	vpxorq		$A02,$C00,$C00
	vpternlogq	\$0x96,$A04,$A03,$C00

	vprolq		\$1,$C00,$D00

	vpermq		$C00,@Theta[1],$C00
	vpermq		$D00,@Theta[4],$D00

	vpternlogq	\$0x96,$C00,$D00,$A00
	vpternlogq	\$0x96,$C00,$D00,$A01
	vpternlogq	\$0x96,$C00,$D00,$A02
	vpternlogq	\$0x96,$C00,$D00,$A03
	vpternlogq	\$0x96,$C00,$D00,$A04
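	# A note on the 0x96 immediate (a sketch, not from the original
	# source): vpternlogq's imm8 is the truth table of a three-input
	# boolean function indexed by the (dst,src2,src3) bit triplet, and
	# 0x96 = 0b10010110 is exactly dst ^ src2 ^ src3. In plain C the
	# block above computes, per the FIPS 202 Theta step,
	#
	#	C[x] = A[x][0] ^ A[x][1] ^ A[x][2] ^ A[x][3] ^ A[x][4];
	#	A[x][y] ^= C[(x+4)%5] ^ ROL64(C[(x+1)%5], 1);
	#
	# with the two vpermq aligning $C00 and its rotated copy $D00 so
	# that every lane sees its x-1 and x+1 neighbours.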
	######################################### Rho
	vprolvq		@Rhotate[0],$A00,$A00
	vprolvq		@Rhotate[1],$A01,$A01
	vprolvq		@Rhotate[2],$A02,$A02
	vprolvq		@Rhotate[3],$A03,$A03
	vprolvq		@Rhotate[4],$A04,$A04
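	# vprolvq rotates each 64-bit lane left by the per-lane count
	# loaded into @Rhotate above; roughly, in C (a sketch; a zero
	# count must bypass the shift to avoid undefined behaviour):
	#
	#	#define ROL64(v, n) ((n) ? ((v)<<(n)) | ((v)>>(64-(n))) : (v))
	#
	#	for (i = 0; i < 5; i++)
	#		row[i] = ROL64(row[i], rhotate_row[i]);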
	######################################### Pi
	vpblendmq	$A02,$A00,@{T[0]}{$k00010}
	vpblendmq	$A00,$A03,@{T[1]}{$k00010}
	vpblendmq	$A03,$A01,@{T[2]}{$k00010}
	vpblendmq	$A01,$A04,@{T[3]}{$k00010}
	vpblendmq	$A04,$A02,@{T[4]}{$k00010}

	vpblendmq	$A04,@T[0],@{T[0]}{$k00100}
	vpblendmq	$A02,@T[1],@{T[1]}{$k00100}
	vpblendmq	$A00,@T[2],@{T[2]}{$k00100}
	vpblendmq	$A03,@T[3],@{T[3]}{$k00100}
	vpblendmq	$A01,@T[4],@{T[4]}{$k00100}

	vpblendmq	$A01,@T[0],@{T[0]}{$k01000}
	vpblendmq	$A04,@T[1],@{T[1]}{$k01000}
	vpblendmq	$A02,@T[2],@{T[2]}{$k01000}
	vpblendmq	$A00,@T[3],@{T[3]}{$k01000}
	vpblendmq	$A03,@T[4],@{T[4]}{$k01000}

	vpblendmq	$A03,@T[0],@{T[0]}{$k10000}
	vpblendmq	$A01,@T[1],@{T[1]}{$k10000}
	vpblendmq	$A04,@T[2],@{T[2]}{$k10000}
	vpblendmq	$A02,@T[3],@{T[3]}{$k10000}
	vpblendmq	$A00,@T[4],@{T[4]}{$k10000}

	vpermq		@T[0],@Chi[0],$A00
	vpermq		@T[1],@Chi[1],$A01
	vpermq		@T[2],@Chi[2],$A02
	vpermq		@T[3],@Chi[3],$A03
	vpermq		@T[4],@Chi[4],$A04
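	# The four rounds of masked blends gather, into each @T register,
	# one lane from each of the five state registers, producing exactly
	# the Theta-order rows pictured in the header comment; the final
	# vpermq by @Chi[] then shuffles them back to the Chi layout. Net
	# effect, in the combined rho-pi formulation of the reference code:
	# B[y][(2*x+3*y)%5] = A[x][y], the rotations having been done above.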
	######################################### Chi
	vmovdqa64	$A00,@T[0]
	vpternlogq	\$0xD2,$A02,$A01,$A00
	vmovdqa64	$A01,@T[1]
	vpternlogq	\$0xD2,$A03,$A02,$A01
	vpternlogq	\$0xD2,$A04,$A03,$A02
	vpternlogq	\$0xD2,@T[0],$A04,$A03
	vpternlogq	\$0xD2,@T[1],@T[0],$A04
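	# A note on the 0xD2 immediate (a sketch, not from the original
	# source): 0xD2 = 0b11010010 is the truth table of
	# dst ^ (~src2 & src3), e.g. f(dst=1,src2=0,src3=1) = 1^(1&1) = 0,
	# which is bit 0b101 = 5 of the immediate. With @T[0]/@T[1] holding
	# the pre-Chi copies of the first two rows, each instruction
	# computes one whole row of
	#
	#	A[x][y] = B[x][y] ^ (~B[(x+1)%5][y] & B[(x+2)%5][y]);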
	######################################### Iota
	vpxorq		(%r10),$A00,${A00}{$k00001}
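	# $k00001 confines the XOR to lane 0, i.e. A[0][0], the only lane
	# Iota touches; %r10 points into the iotas table of round constants
	# at the bottom of the file.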
.size	__KeccakF1600,.-__KeccakF1600

my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;	# in squeeze

.type	SHA3_absorb,\@function

	lea	96($A_flat),$A_flat

	lea	theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000
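	# The shift sequence leaves these 16-bit values in the mask
	# registers (bit 0 selects the lowest qword lane of a zmm):
	#
	#	$k11111 = 0x001f	# one full 5-lane row
	#	$k00001 = 0x0001	# lane 0 only, used by Iota
	#	$k00010 = 0x0002
	#	$k00100 = 0x0004
	#	$k01000 = 0x0008
	#	$k10000 = 0x0010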
	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate[0]
	vmovdqa64	64*6(%r8),@Rhotate[1]
	vmovdqa64	64*7(%r8),@Rhotate[2]
	vmovdqa64	64*8(%r8),@Rhotate[3]
	vmovdqa64	64*9(%r8),@Rhotate[4]

	vmovdqa64	64*10(%r8),@Chi[0]
	vmovdqa64	64*11(%r8),@Chi[1]
	vmovdqa64	64*12(%r8),@Chi[2]
	vmovdqa64	64*13(%r8),@Chi[3]
	vmovdqa64	64*14(%r8),@Chi[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vpxorq		@T[0],@T[0],@T[0]
	vmovdqu64	40*1-96($A_flat),${A01}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A02}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A03}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A04}{$k11111}{z}

	vmovdqa64	@T[0],0*64-128(%r9)	# zero transfer area on stack
	vmovdqa64	@T[0],1*64-128(%r9)
	vmovdqa64	@T[0],2*64-128(%r9)
	vmovdqa64	@T[0],3*64-128(%r9)
	vmovdqa64	@T[0],4*64-128(%r9)
	jmp	.Loop_absorb_avx512

	jc	.Ldone_absorb_avx512

	vmovdqu64	-96($inp),@{T[0]}{$k11111}

for(my $i=5; $i<25; $i++) {
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged_in[$i]-128(%r9)
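	# Lanes 5 and up are scattered one qword at a time to their jagged
	# offsets in the zeroed stack transfer area, so that a block of any
	# rate can then be absorbed with the five row-wide vpxorq below;
	# in effect (a sketch): transfer[A_jagged_in[i]] = input[i].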
	vpxorq	@T[0],$A00,$A00
	vpxorq	64*1-128(%r9),$A01,$A01
	vpxorq	64*2-128(%r9),$A02,$A02
	vpxorq	64*3-128(%r9),$A03,$A03
	vpxorq	64*4-128(%r9),$A04,$A04

	jmp	.Loop_absorb_avx512

.Ldone_absorb_avx512:
	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A01,40*1-96($A_flat){$k11111}
	vmovdqu64	$A02,40*2-96($A_flat){$k11111}
	vmovdqu64	$A03,40*3-96($A_flat){$k11111}
	vmovdqu64	$A04,40*4-96($A_flat){$k11111}

	lea	($len,$bsz),%rax	# return value
.size	SHA3_absorb,.-SHA3_absorb

.type	SHA3_squeeze,\@function

	lea	96($A_flat),$A_flat

	jbe	.Lno_output_extension_avx512

	lea	theta_perm(%rip),%r8

	kxnorw		$k11111,$k11111,$k11111
	kshiftrw	\$15,$k11111,$k00001
	kshiftrw	\$11,$k11111,$k11111
	kshiftlw	\$1,$k00001,$k00010
	kshiftlw	\$2,$k00001,$k00100
	kshiftlw	\$3,$k00001,$k01000
	kshiftlw	\$4,$k00001,$k10000

	#vmovdqa64	64*0(%r8),@Theta[0]
	vmovdqa64	64*1(%r8),@Theta[1]
	vmovdqa64	64*2(%r8),@Theta[2]
	vmovdqa64	64*3(%r8),@Theta[3]
	vmovdqa64	64*4(%r8),@Theta[4]

	vmovdqa64	64*5(%r8),@Rhotate[0]
	vmovdqa64	64*6(%r8),@Rhotate[1]
	vmovdqa64	64*7(%r8),@Rhotate[2]
	vmovdqa64	64*8(%r8),@Rhotate[3]
	vmovdqa64	64*9(%r8),@Rhotate[4]

	vmovdqa64	64*10(%r8),@Chi[0]
	vmovdqa64	64*11(%r8),@Chi[1]
	vmovdqa64	64*12(%r8),@Chi[2]
	vmovdqa64	64*13(%r8),@Chi[3]
	vmovdqa64	64*14(%r8),@Chi[4]

	vmovdqu64	40*0-96($A_flat),${A00}{$k11111}{z}
	vmovdqu64	40*1-96($A_flat),${A01}{$k11111}{z}
	vmovdqu64	40*2-96($A_flat),${A02}{$k11111}{z}
	vmovdqu64	40*3-96($A_flat),${A03}{$k11111}{z}
	vmovdqu64	40*4-96($A_flat),${A04}{$k11111}{z}

.Lno_output_extension_avx512:

.Loop_squeeze_avx512:
	mov	@A_jagged_out[$i]-96($A_flat),%r8

for (my $i=0; $i<25; $i++) {

	jc	.Ltail_squeeze_avx512

	je	.Ldone_squeeze_avx512

	je	.Lextend_output_avx512
	mov	@A_jagged_out[$i+1]-96($A_flat),%r8

.Lextend_output_avx512:

	vmovdqu64	$A00,40*0-96($A_flat){$k11111}
	vmovdqu64	$A01,40*1-96($A_flat){$k11111}
	vmovdqu64	$A02,40*2-96($A_flat){$k11111}
	vmovdqu64	$A03,40*3-96($A_flat){$k11111}
	vmovdqu64	$A04,40*4-96($A_flat){$k11111}

	jmp	.Loop_squeeze_avx512

.Ltail_squeeze_avx512:

	jnz	.Loop_tail_avx512

.Ldone_squeeze_avx512:

.size	SHA3_squeeze,.-SHA3_squeeze
.align	64
theta_perm:
	.quad	0, 1, 2, 3, 4, 5, 6, 7		# [not used]
	.quad	4, 0, 1, 2, 3, 5, 6, 7
	.quad	3, 4, 0, 1, 2, 5, 6, 7
	.quad	2, 3, 4, 0, 1, 5, 6, 7
	.quad	1, 2, 3, 4, 0, 5, 6, 7
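# Each 8-qword row of the permutation tables (this one and the one loaded
# into @Chi below) is a vpermq index vector: dst lane i = src lane idx[i],
# i.e., in C terms, for (i = 0; i < 8; i++) dst[i] = src[idx[i]];. Only
# lanes 0-4 carry state; entries 5-7 just keep the unused upper lanes in
# place. The rotation-count rows in between are vprolvq operands, not
# permutations.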
	.quad	0,  44, 43, 21, 14, 0, 0, 0	# [0][0] [1][1] [2][2] [3][3] [4][4]
	.quad	18, 1,  6,  25, 8,  0, 0, 0	# [4][0] [0][1] [1][2] [2][3] [3][4]
	.quad	41, 2,  62, 55, 39, 0, 0, 0	# [3][0] [4][1] [0][2] [1][3] [2][4]
	.quad	3,  45, 61, 28, 20, 0, 0, 0	# [2][0] [3][1] [4][2] [0][3] [1][4]
	.quad	36, 10, 15, 56, 27, 0, 0, 0	# [1][0] [2][1] [3][2] [4][3] [0][4]

	.quad	0, 4, 3, 2, 1, 5, 6, 7
	.quad	1, 0, 4, 3, 2, 5, 6, 7
	.quad	2, 1, 0, 4, 3, 5, 6, 7
	.quad	3, 2, 1, 0, 4, 5, 6, 7
	.quad	4, 3, 2, 1, 0, 5, 6, 7
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX-512F, CRYPTOGAMS by <appro\@openssl.org>"