#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, the Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations, organized in quadruples, are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would
# help Theta is having the x indices aligned column-wise, or in other
# words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded to a register of its own, to all lanes. [A[0][0] is not part of
# the Pi permutation or Rho.] Digits in variables' names denote the
# right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear

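# A small optional sanity check of the layout above (a sketch; it relies
# only on the values just computed): the jagged map must touch each of the
# 25 register/lane slots exactly once.  For instance A[2][3] lands in
# register 5 ($A41), lane 2, i.e. at byte offset 8*(5*4+2) = 176.  The
# check produces no output unless the invariant is broken.
{
    my %seen; $seen{$_}++ for @A_jagged;
    die "A_jagged is not a permutation of the 25 slots" if keys(%seen) != 25;
}
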
# But on the other hand Chi is much better off with the y indices, not x,
# aligned column-wise. For this reason we have to shuffle the data prior
# to Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And the reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
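# The combined immediates quoted above are easy to get wrong, so here is
# a small optional self-check (a sketch, assuming vpermq's imm8 semantics:
# destination lane i takes source lane (imm8>>(2*i))&3).  It composes two
# immediates applied back to back and dies at generation time if any of
# the identities above does not hold; otherwise it is silent.
sub vpermq_compose {
    my ($first,$second) = @_;           # $first applied first, then $second
    my $combined = 0;
    for my $i (0..3) {
        $combined |= (($first >> 2*(($second >> 2*$i) & 3)) & 3) << 2*$i;
    }
    return $combined;
}
die "pre-Chi shuffle immediates are inconsistent"
    unless vpermq_compose(0b01110010,0b00011011) == 0b10001101
       &&  vpermq_compose(0b10001101,0b11100100) == 0b10001101
       &&  vpermq_compose(0b01110010,0b01110010) == 0b00011011
       &&  vpermq_compose(0b00011011,0b10001101) == 0b01110010;
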
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#                       r=1088(*)
#
# Haswell               8.7/+10%
# Skylake               7.8/+20%
# Ryzen                 17(**)
#
# (*)   Corresponds to SHA3-256. The percentage after the slash is the
#       improvement relative to the scalar keccak1600-x86_64.pl.
# (**)  Ryzen is expected to perform poorly, because its issue rate is
#       limited to two AVX2 instructions per cycle and, in addition,
#       vpblendd is reportedly bound to a specific port. Obviously this
#       code path should not be executed on Ryzen.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

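# Within a round $C14 holds the column parities C[1..4], $C00 holds C[0]
# replicated to all lanes, and $D00/$D14 hold the corresponding Theta
# terms D[0] and D[1..4]; see the comments in the Theta section below.
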
$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             rhotates_left+96(%rip),%r8
        lea             rhotates_right+96(%rip),%r9
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx2

.align  32
.Loop_avx2:
        ######################################### Theta
        vpshufd         \$0b01001110,$A20,$C00
        vpxor           $A31,$A41,$C14
        vpxor           $A11,$A21,@T[2]
        vpxor           $A01,$C14,$C14
        vpxor           @T[2],$C14,$C14         # C[1..4]

        vpermq          \$0b10010011,$C14,@T[4]
        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]

        vpsrlq          \$63,$C14,@T[1]
        vpaddq          $C14,$C14,@T[2]
        vpor            @T[2],@T[1],@T[1]       # ROL64(C[1..4],1)

        vpermq          \$0b00111001,@T[1],$D14
        vpxor           @T[4],@T[1],$D00
        vpermq          \$0b00000000,$D00,$D00  # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpxor           $A00,$C00,$C00
        vpxor           @T[0],$C00,$C00         # C[0..0]

        vpsrlq          \$63,$C00,@T[0]
        vpaddq          $C00,$C00,@T[1]
        vpor            @T[0],@T[1],@T[1]       # ROL64(C[0..0],1)

        vpxor           $D00,$A20,$A20          # ^= D[0..0]
        vpxor           $D00,$A00,$A00          # ^= D[0..0]

        vpblendd        \$0b11000000,@T[1],$D14,$D14
        vpblendd        \$0b00000011,$C00,@T[4],@T[4]
        vpxor           @T[4],$D14,$D14         # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

        ######################################### Rho + Pi + pre-Chi shuffle
        vpsllvq         0*32-96(%r8),$A20,@T[3]
        vpsrlvq         0*32-96(%r9),$A20,$A20
        vpor            @T[3],$A20,$A20

         vpxor          $D14,$A31,$A31          # ^= D[1..4] from Theta
        vpsllvq         2*32-96(%r8),$A31,@T[4]
        vpsrlvq         2*32-96(%r9),$A31,$A31
        vpor            @T[4],$A31,$A31

         vpxor          $D14,$A21,$A21          # ^= D[1..4] from Theta
        vpsllvq         3*32-96(%r8),$A21,@T[5]
        vpsrlvq         3*32-96(%r9),$A21,$A21
        vpor            @T[5],$A21,$A21

         vpxor          $D14,$A41,$A41          # ^= D[1..4] from Theta
        vpsllvq         4*32-96(%r8),$A41,@T[6]
        vpsrlvq         4*32-96(%r9),$A41,$A41
        vpor            @T[6],$A41,$A41

         vpxor          $D14,$A11,$A11          # ^= D[1..4] from Theta
         vpermq         \$0b10001101,$A20,@T[3] # $A20 -> future $A31
         vpermq         \$0b10001101,$A31,@T[4] # $A31 -> future $A21
        vpsllvq         5*32-96(%r8),$A11,@T[7]
        vpsrlvq         5*32-96(%r9),$A11,@T[1]
        vpor            @T[7],@T[1],@T[1]       # $A11 -> future $A01

         vpxor          $D14,$A01,$A01          # ^= D[1..4] from Theta
         vpermq         \$0b00011011,$A21,@T[5] # $A21 -> future $A41
         vpermq         \$0b01110010,$A41,@T[6] # $A41 -> future $A11
        vpsllvq         1*32-96(%r8),$A01,@T[8]
        vpsrlvq         1*32-96(%r9),$A01,@T[2]
        vpor            @T[8],@T[2],@T[2]       # $A01 -> future $A20

        ######################################### Chi
        vpsrldq         \$8,@T[1],@T[7]
        vpandn          @T[7],@T[1],@T[0]       # tgting  [0][0] [0][0] [0][0] [0][0]

        vpblendd        \$0b00001100,@T[6],@T[2],$A31   #               [4][4] [2][0]
        vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  #               [4][0] [2][1]
         vpblendd       \$0b00001100,@T[4],@T[3],$A41   #               [4][2] [2][4]
         vpblendd       \$0b00001100,@T[3],@T[2],@T[7]  #               [4][3] [2][0]
        vpblendd        \$0b00110000,@T[4],$A31,$A31    #        [1][3] [4][4] [2][0]
        vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  #        [1][4] [4][0] [2][1]
         vpblendd       \$0b00110000,@T[2],$A41,$A41    #        [1][0] [4][2] [2][4]
         vpblendd       \$0b00110000,@T[6],@T[7],@T[7]  #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
         vpblendd       \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
         vpblendd       \$0b11000000,@T[4],@T[7],@T[7]  # [3][4] [1][1] [4][3] [2][0]
        vpandn          @T[8],$A31,$A31         # tgting  [3][1] [1][2] [4][3] [2][4]
         vpandn         @T[7],$A41,$A41         # tgting  [3][2] [1][4] [4][1] [2][3]

        vpblendd        \$0b00001100,@T[2],@T[5],$A11   #               [4][0] [2][3]
        vpblendd        \$0b00001100,@T[5],@T[3],@T[8]  #               [4][1] [2][4]
         vpxor          @T[3],$A31,$A31
        vpblendd        \$0b00110000,@T[3],$A11,$A11    #        [1][2] [4][0] [2][3]
        vpblendd        \$0b00110000,@T[4],@T[8],@T[8]  #        [1][3] [4][1] [2][4]
         vpxor          @T[5],$A41,$A41
        vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,@T[2],@T[8],@T[8]  # [3][0] [1][3] [4][1] [2][4]
        vpandn          @T[8],$A11,$A11         # tgting  [3][3] [1][1] [4][4] [2][2]
        vpxor           @T[6],$A11,$A11

          vpermq        \$0b00011110,@T[1],$A21         # [0][1] [0][2] [0][4] [0][3]
          vpblendd      \$0b00110000,$A00,$A21,@T[8]    # [0][1] [0][0] [0][4] [0][3]
          vpermq        \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
          vpblendd      \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]
          vpandn        @T[8],$A01,$A01         # tgting  [0][4] [0][3] [0][2] [0][1]

        vpblendd        \$0b00001100,@T[5],@T[4],$A20   #               [4][1] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  #               [4][2] [2][2]
        vpblendd        \$0b00110000,@T[6],$A20,$A20    #        [1][1] [4][1] [2][1]
        vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
        vpandn          @T[7],$A20,$A20         # tgting  [3][0] [1][0] [4][0] [2][0]
        vpxor           @T[2],$A20,$A20

         vpermq         \$0b00000000,@T[0],@T[0]        # [0][0] [0][0] [0][0] [0][0]
         vpermq         \$0b00011011,$A31,$A31  # post-Chi shuffle
         vpermq         \$0b10001101,$A41,$A41
         vpermq         \$0b01110010,$A11,$A11

        vpblendd        \$0b00001100,@T[3],@T[6],$A21   #               [4][3] [2][2]
        vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  #               [4][4] [2][3]
        vpblendd        \$0b00110000,@T[5],$A21,$A21    #        [1][4] [4][3] [2][2]
        vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  #        [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]
        vpandn          @T[7],$A21,$A21         # tgting  [3][4] [1][3] [4][2] [2][1]

        vpxor           @T[0],$A00,$A00
        vpxor           @T[1],$A01,$A01
        vpxor           @T[4],$A21,$A21

        ######################################### Iota
        vpxor           (%r10),$A00,$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx2

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
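# For reference, each vpsllvq/vpsrlvq pair above, fed from rhotates_left
# and rhotates_right, performs four 64-bit rotations at once, and the
# rotation by 1 in Theta is synthesized as (x+x)|(x>>63).  Below is a
# minimal scalar sketch of the same primitive (illustrative only, it is
# not used by the generator and assumes a perl with 64-bit integers):
sub ROL64 {
    my ($x,$n) = @_;
    $n &= 63;
    return $x unless $n;
    return (($x << $n) | ($x >> (64-$n))) & 0xffffffffffffffff;
}
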
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;        # in squeeze

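# The register assignment above matches the C prototypes these routines
# are expected to satisfy (see the portable keccak1600.c):
#
#   size_t SHA3_absorb (uint64_t A[5][5], const unsigned char *inp,
#                       size_t len, size_t bsz);  # returns residual bytes
#   void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#                       size_t len, size_t bsz);
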
$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[0][0] to all lanes
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx2:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx2

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
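# The first five lanes of the block are consumed above (the broadcast plus
# one 32-byte load); the loop below emits code that scatters the remaining
# words of the block into the on-stack transfer area at their jagged
# offsets, bailing out to .Labsorbed_avx2 once %eax words have been
# consumed.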
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorbed_avx2
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx2:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx2

.Ldone_absorb_avx2:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        mov     $bsz,%rax

.Loop_squeeze_avx2:
        mov     @A_jagged[0]-96($A_flat),%r8
___
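# The loop below emits fully unrolled per-lane code: each step stores the
# previously fetched lane, then either finishes (output exhausted), falls
# through to .Lextend_output_avx2 to run another permutation (block
# exhausted), or prefetches the next lane from its jagged slot.  The -120
# bias maps a lane's jagged offset to its location in the stored state,
# where A[0][0] occupies a single quad at -96($A_flat) and the six
# 32-byte registers follow at 8-96($A_flat).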
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx2
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx2
        dec     %eax
        je      .Lextend_output_avx2
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx2

.Ltail_squeeze_avx2:
        add     \$8,$len
.Loop_tail_avx2:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx2

.Ldone_squeeze_avx2:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
        .quad   64-3,   64-18,  64-36,  64-41
        .quad   64-1,   64-62,  64-28,  64-27
        .quad   64-45,  64-6,   64-56,  64-39
        .quad   64-10,  64-61,  64-55,  64-8
        .quad   64-2,   64-15,  64-25,  64-20
        .quad   64-44,  64-43,  64-21,  64-14
iotas:
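        # Each round constant is replicated into all four lanes because
        # Iota is applied to the register holding A[0][0] in every lane.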
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";