Add sha/asm/keccak1600-avx512vl.pl.
#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of the AVX2 module that reuses the register data
# layout, but utilizes the new 256-bit AVX512VL instructions. See the AVX2
# module for further information on the layout.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#                       r=1088(*)
#
# Skylake-X             6.4/+47%
#
# (*)   Corresponds to SHA3-256. The percentage after the slash is the
#       improvement over the scalar keccak1600-x86_64.pl module.

# Digits in variables' names denote right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear
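
# A worked example of the mapping above: A[0][1] sits in slot [1,0], i.e.
# lane 0 of $A01 (%ymm1), so its jagged offset is 8*(1*4+0) = 32.  The
# optional self-check below is an illustrative addition to this write-up
# rather than part of the original module; KECCAK1600_SELFCHECK is a
# hypothetical environment variable, and the output goes to STDERR so the
# assembly emitted on STDOUT is unchanged.
if ($ENV{KECCAK1600_SELFCHECK}) {
    for my $row (0..4) {
        for my $col (0..4) {
            my $off  = $A_jagged[$row*5+$col];  # offset used by absorb/squeeze
            my $slot = int($off/32);            # which of the seven %ymm slots
            my $lane = ($off%32)/8;             # 64-bit lane within that slot
            printf STDERR "A[%d][%d] -> slot %d, lane %d (offset %d)\n",
                   $row, $col, $slot, $lane, $off;
        }
    }
}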

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));

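# The main AVX512VL gain over the AVX2 code path is vpternlogq, which
# evaluates an arbitrary 3-input Boolean function selected by an 8-bit
# immediate: bit (a<<2 | b<<1 | c) of the immediate gives the result, with
# `a' taken from the destination register.  This module uses 0x96, the
# three-way XOR that fuses the Theta/Iota XOR chains, and 0xC6, which is
# b ^ (~a & c), i.e. the Chi combination with the destination supplying
# the complemented input.  The check below is an illustrative addition for
# this write-up (not part of the original module) and has no effect on the
# generated code:
for my $i (0..7) {
    my ($a,$b,$c) = (($i>>2)&1, ($i>>1)&1, $i&1);
    die "0x96 is not 3-way XOR" if ((0x96>>$i)&1) != ($a^$b^$c);
    die "0xC6 is not b^(~a&c)"  if ((0xC6>>$i)&1) != (($b ^ (~$a & $c)) & 1);
}
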
$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx512vl

.align  32
.Loop_avx512vl:
        ######################################### Theta
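        # C00 ends up holding the parity of column 0 broadcast across all
        # four lanes, while C14 holds the parities of columns 1..4; a single
        # 3-way-XOR vpternlogq stands in for a pair of vpxor.  D00 and D14
        # are then the usual D[x] = C[x-1] ^ ROL64(C[x+1],1), assembled with
        # vpermq/vpblendd so that each register sees the D word matching its
        # lane order.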
        vpshufd         \$0b01001110,$A20,$C00
        vpxor           $A31,$A41,$C14
        vpxor           $A11,$A21,@T[2]
        vpternlogq      \$0x96,$A01,@T[2],$C14  # C[1..4]

        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]

        vpermq          \$0b10010011,$C14,@T[4]
        vprolq          \$1,$C14,@T[1]          # ROL64(C[1..4],1)

        vpermq          \$0b00111001,@T[1],$D14
        vpxor           @T[4],@T[1],$D00
        vpermq          \$0b00000000,$D00,$D00  # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpternlogq      \$0x96,@T[0],$A00,$C00  # C[0..0]
        vprolq          \$1,$C00,@T[1]          # ROL64(C[0..0],1)

        vpxor           $D00,$A00,$A00          # ^= D[0..0]

        vpblendd        \$0b11000000,@T[1],$D14,$D14
        vpblendd        \$0b00000011,$C00,@T[4],@T[0]

        ######################################### Rho + Pi + pre-Chi shuffle
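        # Rho relies on vprolvq with the per-lane rotation counts preloaded
        # into R20/R01/R31/R21/R41/R11, so the four lanes of each register
        # rotate by different amounts in one instruction, and the Theta XOR
        # with D is folded into the same flow as another 3-way vpternlogq.
        # The interleaved vpermq shuffles begin moving lanes towards the
        # operand order Chi expects (the Pi permutation).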
         vpxor          $D00,$A20,$A20          # ^= D[0..0] from Theta
        vprolvq         $R20,$A20,$A20

         vpternlogq     \$0x96,@T[0],$D14,$A31  # ^= D[1..4] from Theta
        vprolvq         $R31,$A31,$A31

         vpternlogq     \$0x96,@T[0],$D14,$A21  # ^= D[1..4] from Theta
        vprolvq         $R21,$A21,$A21

         vpternlogq     \$0x96,@T[0],$D14,$A41  # ^= D[1..4] from Theta
        vprolvq         $R41,$A41,$A41

         vpermq         \$0b10001101,$A20,@T[3] # $A20 -> future $A31
         vpermq         \$0b10001101,$A31,@T[4] # $A31 -> future $A21
         vpternlogq     \$0x96,@T[0],$D14,$A11  # ^= D[1..4] from Theta
        vprolvq         $R11,$A11,@T[1]         # $A11 -> future $A01

         vpermq         \$0b00011011,$A21,@T[5] # $A21 -> future $A41
         vpermq         \$0b01110010,$A41,@T[6] # $A41 -> future $A11
         vpternlogq     \$0x96,@T[0],$D14,$A01  # ^= D[1..4] from Theta
        vprolvq         $R01,$A01,@T[2]         # $A01 -> future $A20

        ######################################### Chi
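        # Chi needs, for every output lane, the lanes one and two columns to
        # its right.  The vpblendd cascades below assemble those neighbour
        # vectors 64 bits at a time from the rotated registers, and each
        # vpternlogq with immediate 0xC6 then evaluates x ^ (~y & z) for a
        # whole register, so the non-linear step costs one instruction per
        # output register instead of a vpandn/vpxor pair.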
        vpblendd        \$0b00001100,@T[6],@T[2],$A31   #               [4][4] [2][0]
        vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  #               [4][0] [2][1]
         vpblendd       \$0b00001100,@T[4],@T[3],$A41   #               [4][2] [2][4]
         vpblendd       \$0b00001100,@T[3],@T[2],@T[7]  #               [4][3] [2][0]
        vpblendd        \$0b00110000,@T[4],$A31,$A31    #        [1][3] [4][4] [2][0]
        vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  #        [1][4] [4][0] [2][1]
         vpblendd       \$0b00110000,@T[2],$A41,$A41    #        [1][0] [4][2] [2][4]
         vpblendd       \$0b00110000,@T[6],@T[7],@T[7]  #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
         vpblendd       \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
         vpblendd       \$0b11000000,@T[4],@T[7],@T[7]  # [3][4] [1][1] [4][3] [2][0]
        vpternlogq      \$0xC6,@T[8],@T[3],$A31         # [3][1] [1][2] [4][3] [2][4]
         vpternlogq     \$0xC6,@T[7],@T[5],$A41         # [3][2] [1][4] [4][1] [2][3]

        vpsrldq         \$8,@T[1],@T[0]
        vpandn          @T[0],@T[1],@T[0]       # targeting  [0][0] [0][0] [0][0] [0][0]

        vpblendd        \$0b00001100,@T[2],@T[5],$A11   #               [4][0] [2][3]
        vpblendd        \$0b00001100,@T[5],@T[3],@T[8]  #               [4][1] [2][4]
        vpblendd        \$0b00110000,@T[3],$A11,$A11    #        [1][2] [4][0] [2][3]
        vpblendd        \$0b00110000,@T[4],@T[8],@T[8]  #        [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,@T[2],@T[8],@T[8]  # [3][0] [1][3] [4][1] [2][4]
        vpternlogq      \$0xC6,@T[8],@T[6],$A11         # [3][3] [1][1] [4][4] [2][2]

          vpermq        \$0b00011110,@T[1],$A21         # [0][1] [0][2] [0][4] [0][3]
          vpblendd      \$0b00110000,$A00,$A21,@T[8]    # [0][1] [0][0] [0][4] [0][3]
          vpermq        \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
          vpblendd      \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]

        vpblendd        \$0b00001100,@T[5],@T[4],$A20   #               [4][1] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  #               [4][2] [2][2]
        vpblendd        \$0b00110000,@T[6],$A20,$A20    #        [1][1] [4][1] [2][1]
        vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
        vpternlogq      \$0xC6,@T[7],@T[2],$A20         # [3][0] [1][0] [4][0] [2][0]

         vpermq         \$0b00000000,@T[0],@T[0]        # [0][0] [0][0] [0][0] [0][0]
         vpermq         \$0b00011011,$A31,$A31          # post-Chi shuffle
         vpermq         \$0b10001101,$A41,$A41
         vpermq         \$0b01110010,$A11,$A11

        vpblendd        \$0b00001100,@T[3],@T[6],$A21   #               [4][3] [2][2]
        vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  #               [4][4] [2][3]
        vpblendd        \$0b00110000,@T[5],$A21,$A21    #        [1][4] [4][3] [2][2]
        vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  #        [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]

        vpternlogq      \$0xC6,@T[8],@T[1],$A01         # [0][4] [0][3] [0][2] [0][1]
        vpternlogq      \$0xC6,@T[7],@T[4],$A21         # [3][4] [1][3] [4][2] [2][1]

        ######################################### Iota
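        # Iota: the round constant, replicated four times in the table below
        # so it can be applied with a plain 256-bit load, is XORed into
        # A[0][0] together with the pending Chi term held in T[0], again as
        # a single vpternlogq.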
        vpternlogq      \$0x96,(%r10),@T[0],$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx512vl

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my $out = $inp;         # in squeeze

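# Both entry points follow the C contract of the generic implementation in
# crypto/sha/keccak1600.c:
#
#   size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#                      size_t len, size_t r);
#   void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                       size_t len, size_t r);
#
# so in the x86_64 calling convention $A_flat, $inp/$out, $len and $bsz
# arrive in %rdi, %rsi, %rdx and %rcx.  SHA3_absorb consumes whole r-byte
# blocks and returns the number of leftover bytes (less than r), which the
# caller is expected to buffer until the next call.
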
$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10
        lea     rhotates_left(%rip),%r8

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[5][5]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx512vl

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
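
# The first five state words of each block were already picked up above
# (lane 0 broadcast in T[0], lanes 1-4 in T[1]); the loop below scatters
# the remaining words of the block, one at a time, into the on-stack
# transfer area at their jagged offsets.  Words beyond the block size are
# left alone (the area was zeroed once before .Loop_absorb_avx512vl), so
# the vpxor block after .Labsorved_avx512vl always folds a full 25-lane
# image into the register state, whatever r is.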
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorved_avx512vl
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorved_avx512vl:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        lea     rhotates_left(%rip),%r8
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vmovdqa64       0*32(%r8),$R20          # load "rhotate" indices
        vmovdqa64       1*32(%r8),$R01
        vmovdqa64       2*32(%r8),$R31
        vmovdqa64       3*32(%r8),$R21
        vmovdqa64       4*32(%r8),$R41
        vmovdqa64       5*32(%r8),$R11

        mov     $bsz,%rax

.Loop_squeeze_avx512vl:
        mov     @A_jagged[0]-96($A_flat),%r8
___
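# The unrolled loop below walks the 25 state words in canonical
# A[0][0], A[0][1], ... order, pulling each one from its jagged offset in
# the stored register image (hence the -120 bias: the image keeps A[0][0]
# in its leading 8 bytes, followed by six 32-byte register slots).  When a
# full block has been emitted but more output is wanted,
# .Lextend_output_avx512vl runs the permutation again and the walk restarts.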
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx512vl
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx512vl
        dec     %eax
        je      .Lextend_output_avx512vl
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx512vl:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx512vl

.Ltail_squeeze_avx512vl:
        add     \$8,$len
.Loop_tail_avx512vl:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;