sha/asm/keccak1600-avx2.pl: remodel register usage.
[openssl.git] / crypto / sha / asm / keccak1600-avx2.pl
1 #!/usr/bin/env perl
2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8 #
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
15 #
16 # Keccak-1600 for AVX2.
17 #
18 # July 2017.
19 #
20 # To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
21 # 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
22 # other than A[0][0] in magic order into 6 [256-bit] registers, *each
23 # dedicated to one axis*, Pi permutation is reduced to intra-register
24 # shuffles...
25 #
26 # It makes other steps more intricate, but overall, is it a win? To be
27 # more specific index permutations organized by quadruples are:
28 #
29 #       [4][4] [3][3] [2][2] [1][1]<-+
30 #       [0][4] [0][3] [0][2] [0][1]<-+
31 #       [3][0] [1][0] [4][0] [2][0]  |
32 #       [4][3] [3][1] [2][4] [1][2]  |
33 #       [3][4] [1][3] [4][2] [2][1]  |
34 #       [2][3] [4][1] [1][4] [3][2]  |
35 #       [2][2] [4][4] [1][1] [3][3] -+
36 #
37 # This however is highly impractical for Theta and Chi. What would help
38 # Theta is if x indices were aligned column-wise, or in other words:
39 #
40 #       [0][4] [0][3] [0][2] [0][1]
41 #       [3][0] [1][0] [4][0] [2][0]
42 #vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
43 #       [2][4] [4][3] [1][2] [3][1]
44 #vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
45 #       [3][4] [1][3] [4][2] [2][1]
46 #vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
47 #       [1][4] [2][3] [3][2] [4][1]
48 #vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
49 #       [4][4] [3][3] [2][2] [1][1]
50 #
51 # So here we have it, lines not marked with vpermq() represent the magic
52 # order in which data is to be loaded and maintained. [And lines marked
53 # with vpermq() represent Pi circular permutation in chosen layout. Note
54 # that first step is permutation-free.] A[0][0] is loaded to register of
55 # its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
56 # Digits in variables' names denote right-most coordinates:
57
# State register assignment.  A[0][0] lives alone, broadcast to all four
# 64-bit lanes of %ymm0; the remaining 24 state lanes are packed four per
# register in the "magic" order derived in the header comment above.  The
# bracketed pairs give the [x][y] coordinate held in each lane, leftmost
# pair = most significant lane.
58 my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
59     $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
60     $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
61     $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
62     $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
63     $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
64     $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
65     map("%ymm$_",(0..6));
66
67 # We also need to map the magic order into offsets within structure:
68
# Each pair below is [register index 0..6, lane index 0..3] for state
# element A[x][y]; the second map linearizes it to a byte offset,
# 8*(register*4 + lane), into the 7x4-quadword in-memory layout.
69 my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
70                 [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
71                 [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
72                 [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
73                 [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
74    @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear
75
76 # But on the other hand Chi is much better off if y indices were aligned
77 # column-wise, not x. For this reason we have to shuffle data prior
78 # Chi and revert it afterwards. Prior shuffle is naturally merged with
79 # Pi itself:
80 #
81 #       [0][4] [0][3] [0][2] [0][1]
82 #       [3][0] [1][0] [4][0] [2][0]
83 #vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
84 #vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
85 #       [3][1] [1][2] [4][3] [2][4]
86 #vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
87 #vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
88 #       [3][4] [1][3] [4][2] [2][1]
89 #vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
90 #vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
91 #       [3][2] [1][4] [4][1] [2][3]
92 #vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
93 #vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
94 #       [3][3] [1][1] [4][4] [2][2]
95 #
96 # And reverse post-Chi permutation:
97 #
98 #       [0][4] [0][3] [0][2] [0][1]
99 #       [3][0] [1][0] [4][0] [2][0]
100 #vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
101 #       [2][4] [4][3] [1][2] [3][1]
102 #vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
103 #       [3][4] [1][3] [4][2] [2][1]
104 #vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
105 #       [1][4] [2][3] [3][2] [4][1]
106 #vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
107 #       [4][4] [3][3] [2][2] [1][1]
108 #
109 ########################################################################
110 # Numbers are cycles per processed byte out of large message.
111 #
112 #                       r=1088(*)
113 #
114 # Haswell               9.5
115 # Skylake               8.8
116 #
117 # (*)   Corresponds to SHA3-256.
118
# Scratch registers @T[0..8] = %ymm7..%ymm15.  The Theta intermediates
# C[1..4]/C[0]/D[0]/D[1..4] are aliases for the upper scratch registers
# @T[5..8]; they are only live inside the Theta step, before @T[5..8]
# get reused by the Pi/Chi code.
119 my @T = map("%ymm$_",(7..15));
120 my ($C14,$C00,$D00,$D14) = @T[5..8];
121
# __KeccakF1600: 24 rounds of the Keccak-f[1600] permutation over the
# seven-register layout above.  %r8/%r9 point 96 bytes into the left/right
# rotation tables (matching the -96-biased displacements used by Rho) and
# %r10 walks the lane-replicated iotas table, one 32-byte entry per round.
# The whole body is a single heredoc; its content is emitted verbatim.
122 $code.=<<___;
123 .text
124
125 .type   __KeccakF1600,\@function
126 .align  32
127 __KeccakF1600:
128         lea             rhotates_left+96(%rip),%r8
129         lea             rhotates_right+96(%rip),%r9
130         lea             iotas(%rip),%r10
131         mov             \$24,%eax
132         jmp             .Loop_avx2
133
134 .align  32
135 .Loop_avx2:
136         ######################################### Theta
137         vpxor           $A01,$A31,$C14
138         vpxor           $A21,$C14,$C14
139         vpxor           $A41,$C14,$C14
140         vpxor           $A11,$C14,$C14          # C[1..4]
141         vpermq          \$0b10110001,$A20,$C00
142         vpxor           $A20,$C00,$C00
143         vpermq          \$0b01001110,$C00,@T[0]
144         vpxor           $A00,$C00,$C00
145         vpxor           @T[0],$C00,$C00         # C[0..0]
146
147         vpsrlq          \$63,$C14,@T[1]
148         vpaddq          $C14,$C14,@T[3]
149         vpor            @T[3],@T[1],@T[1]       # ROL64(C[1..4],1)
150
151         vpsrlq          \$63,$C00,@T[0]
152         vpaddq          $C00,$C00,@T[2]
153         vpor            @T[2],@T[0],@T[0]       # ROL64(C[0..0],1)
154
155         vpermq          \$0b00000000,@T[1],$D00 
156         vpermq          \$0b11111111,$C14,@T[3]
157         vpxor           @T[3],$D00,$D00         # D[0..0] = ROL64(C[1],1) ^ C[4]
158
159         vpermq          \$0b00111001,@T[1],$D14
160         vpblendd        \$0b11000000,@T[0],$D14,$D14
161         vpermq          \$0b10010011,$C14,@T[2]
162         vpblendd        \$0b00000011,$C00,@T[2],@T[2]
163         vpxor           @T[2],$D14,$D14         # D[1..4] = ROL64(C[2..4,0),1) ^ C[0..3]
164
165         vpxor           $D00,$A00,$A00          # ^= D[0..0]
166         vpxor           $D00,$A20,$A20          # ^= D[0..0]
167         vpxor           $D14,$A01,$A01          # ^= D[1..4]
168         vpxor           $D14,$A31,$A31          # ^= D[1..4]
169         vpxor           $D14,$A21,$A21          # ^= D[1..4]
170         vpxor           $D14,$A41,$A41          # ^= D[1..4]
171         vpxor           $D14,$A11,$A11          # ^= D[1..4]
172
173         ######################################### Rho
174         vpsllvq         0*32-96(%r8),$A20,@T[0]
175         vpsrlvq         0*32-96(%r9),$A20,$A20
176         vpor            @T[0],$A20,$A20
177
178         vpsllvq         1*32-96(%r8),$A01,@T[1]
179         vpsrlvq         1*32-96(%r9),$A01,$A01
180         vpor            @T[1],$A01,$A01
181
182         vpsllvq         2*32-96(%r8),$A31,@T[2]
183         vpsrlvq         2*32-96(%r9),$A31,$A31
184         vpor            @T[2],$A31,$A31
185
186         vpsllvq         3*32-96(%r8),$A21,@T[3]
187         vpsrlvq         3*32-96(%r9),$A21,$A21
188         vpor            @T[3],$A21,$A21
189
190         vpsllvq         4*32-96(%r8),$A41,@T[4]
191         vpsrlvq         4*32-96(%r9),$A41,$A41
192         vpor            @T[4],$A41,$A41
193
194         vpsllvq         5*32-96(%r8),$A11,@T[5]
195         vpsrlvq         5*32-96(%r9),$A11,$A11
196         vpor            @T[5],$A11,$A11
197
198         ######################################### Pi + pre-Chi shuffle
199         vpermq          \$0b01110010,$A41,@T[6] # vpermq \$0b00011011,$A41,$A11
200         vpermq          \$0b00011011,$A21,@T[5] # vpermq \$0b01110010,$A21,$A41
201         vpermq          \$0b10001101,$A31,@T[4] # vpermq \$0b10001101,$A31,$A21
202         vpermq          \$0b10001101,$A20,@T[3] # vpermq \$0b01110010,$A20,$A31
203         vmovdqa         $A01,@T[2]
204         vmovdqa         $A11,@T[1]
205
206         ######################################### Chi
207         vpermq          \$0b00000000,@T[1],@T[0]        # [0][1] [0][1] [0][1] [0][1]
208         vpermq          \$0b01010101,@T[1],@T[7]        # [0][2] [0][2] [0][2] [0][2]
209         vpandn          @T[7],@T[0],@T[0]       # tgting  [0][0] [0][0] [0][0] [0][0]
210
211         vpermq          \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
212         vpermq          \$0b00011110,@T[1],@T[8]        # [0][1] [0][2] [0][4] [0][3]
213         vpblendd        \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]
214         vpblendd        \$0b00110000,$A00,@T[8],@T[8]   # [0][1] [0][0] [0][4] [0][3]
215         vpandn          @T[8],$A01,$A01         # tgting  [0][4] [0][3] [0][2] [0][1]
216
217         vpblendd        \$0b00001100,@T[5],@T[4],$A20   #               [4][1] [2][1]
218         vpblendd        \$0b00110000,@T[6],$A20,$A20    #        [1][1] [4][1] [2][1]
219         vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
220         vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  #               [4][2] [2][2]
221         vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  #        [1][2] [4][2] [2][2]
222         vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
223         vpandn          @T[7],$A20,$A20         # tgting  [3][0] [1][0] [4][0] [2][0]
224
225         vpblendd        \$0b00001100,@T[6],@T[2],$A31   #               [4][4] [2][0]
226         vpblendd        \$0b00110000,@T[4],$A31,$A31    #        [1][3] [4][4] [2][0]
227         vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
228         vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  #               [4][0] [2][1]
229         vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  #        [1][4] [4][0] [2][1]
230         vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
231         vpandn          @T[8],$A31,$A31         # tgting  [3][1] [1][2] [4][3] [2][4]
232
233         vpblendd        \$0b00001100,@T[3],@T[6],$A21   #               [4][3] [2][2]
234         vpblendd        \$0b00110000,@T[5],$A21,$A21    #        [1][4] [4][3] [2][2]
235         vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
236         vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  #               [4][4] [2][3]
237         vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  #        [1][0] [4][4] [2][3]
238         vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]
239         vpandn          @T[7],$A21,$A21         # tgting  [3][4] [1][3] [4][2] [2][1]
240
241         vpblendd        \$0b00001100,@T[4],@T[3],$A41   #               [4][2] [2][4]
242         vpblendd        \$0b00110000,@T[2],$A41,$A41    #        [1][0] [4][2] [2][4]
243         vpblendd        \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
244         vpblendd        \$0b00001100,@T[3],@T[2],@T[8]  #               [4][3] [2][0]
245         vpblendd        \$0b00110000,@T[6],@T[8],@T[8]  #        [1][1] [4][3] [2][0]
246         vpblendd        \$0b11000000,@T[4],@T[8],@T[8]  # [3][4] [1][1] [4][3] [2][0]
247         vpandn          @T[8],$A41,$A41         # tgting  [3][2] [1][4] [4][1] [2][3]
248
249         vpblendd        \$0b00001100,@T[2],@T[5],$A11   #               [4][0] [2][3]
250         vpblendd        \$0b00110000,@T[3],$A11,$A11    #        [1][2] [4][0] [2][3]
251         vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
252         vpblendd        \$0b00001100,@T[5],@T[3],@T[7]  #               [4][1] [2][4]
253         vpblendd        \$0b00110000,@T[4],@T[7],@T[7]  #        [1][3] [4][1] [2][4]
254         vpblendd        \$0b11000000,@T[2],@T[7],@T[7]  # [3][0] [1][3] [4][1] [2][4]
255         vpandn          @T[7],$A11,$A11         # tgting  [3][3] [1][1] [4][4] [2][2]
256
257         vpxor           @T[0],$A00,$A00
258         vpxor           @T[1],$A01,$A01
259         vpxor           @T[2],$A20,$A20
260         vpxor           @T[3],$A31,$A31
261         vpxor           @T[4],$A21,$A21
262         vpxor           @T[5],$A41,$A41
263         vpxor           @T[6],$A11,$A11
264
265         vpermq          \$0b00011011,$A31,$A31  # post-Chi shuffle
266         vpermq          \$0b10001101,$A41,$A41
267         vpermq          \$0b01110010,$A11,$A11
268
269         ######################################### Iota
270         vpxor           (%r10),$A00,$A00
271         lea             32(%r10),%r10
272
273         dec             %eax
274         jnz             .Loop_avx2
275
276         ret
277 .size   __KeccakF1600,.-__KeccakF1600
278 ___
# SHA3_absorb/SHA3_squeeze argument registers (System V AMD64 ABI):
# rdi = flat state, rsi = input (absorb) / output (squeeze),
# rdx = byte length, rcx = block ("rate") size in bytes.
279 my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
280 my  $out = $inp;        # in squeeze
281
# SHA3_absorb prologue: carve a 32-byte-aligned transfer area off the
# stack (%r10 points 96 bytes into it, mirroring the -96 bias applied to
# the state and input pointers), load the state into the vector layout,
# zero the transfer area used by the scalar scatter loop that follows,
# then per block load lanes 0..4 directly into @T[0]/@T[1].
282 $code.=<<___;
283 .globl  SHA3_absorb
284 .type   SHA3_absorb,\@function
285 .align  32
286 SHA3_absorb:
287         mov     %rsp,%r11
288
289         lea     -240(%rsp),%rsp
290         and     \$-32,%rsp
291
292         lea     96($A_flat),$A_flat
293         lea     96($inp),$inp
294         lea     96(%rsp),%r10
295
296         vzeroupper
297
298         vpbroadcastq    -96($A_flat),$A00       # load A[5][5]
299         vmovdqu         8+32*0-96($A_flat),$A01
300         vmovdqu         8+32*1-96($A_flat),$A20
301         vmovdqu         8+32*2-96($A_flat),$A31
302         vmovdqu         8+32*3-96($A_flat),$A21
303         vmovdqu         8+32*4-96($A_flat),$A41
304         vmovdqu         8+32*5-96($A_flat),$A11
305
306         vpxor           @T[0],@T[0],@T[0]
307         vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
308         vmovdqa         @T[0],32*3-96(%r10)
309         vmovdqa         @T[0],32*4-96(%r10)
310         vmovdqa         @T[0],32*5-96(%r10)
311         vmovdqa         @T[0],32*6-96(%r10)
312
313 .Loop_absorb_avx2:
314         mov             $bsz,%rax
315         sub             $bsz,$len
316         jc              .Ldone_absorb_avx2
317
318         shr             \$3,%eax
319         vpbroadcastq    0-96($inp),@T[0]
320         vmovdqu         8-96($inp),@T[1]
321         sub             \$4,%eax
322 ___
# Emit the scalar scatter sequence for rate lanes 5..24 (lanes 0..4 were
# already loaded into @T[0]/@T[1] above): for each lane, a lane-count
# check ("dec %eax; jz") followed by an 8-byte copy from the input block
# into the stack transfer area at the jagged offset matching the vector
# register layout.
# NOTE(review): the jump target is spelled ".Labsorved_avx2" (sic); it
# must stay in sync with the matching label defined in the next heredoc.
323 for(my $i=5; $i<25; $i++) {
324 $code.=<<___
325         dec     %eax
326         jz      .Labsorved_avx2
327         mov     8*$i-96($inp),%r8
328         mov     %r8,$A_jagged[$i]-96(%r10)
329 ___
330 }
# Absorb epilogue: XOR the gathered block into the state (lanes 0..4 via
# @T[0]/@T[1], lanes 5..24 via the zero-padded stack transfer area),
# permute, and loop; on exit the vector state is flushed back to the
# flat representation and rdx+rcx (residual + block size) is returned.
# The same heredoc also opens SHA3_squeeze.
# NOTE(review): "mov @A_jagged[$i]-96(...)" under .Loop_squeeze_avx2
# interpolates the *global* $i (both earlier loops used a lexical "my
# $i"), which is undef here, so the slice yields $A_jagged[0] == 0 and
# the instruction reads A[0][0] at -96($A_flat) -- consistent with the
# vmovq store above, but presumably intentional rather than accidental;
# worth confirming.
331 $code.=<<___;
332 .Labsorved_avx2:
333         lea     ($inp,$bsz),$inp
334
335         vpxor   @T[0],$A00,$A00
336         vpxor   @T[1],$A01,$A01
337         vpxor   32*2-96(%r10),$A20,$A20
338         vpxor   32*3-96(%r10),$A31,$A31
339         vpxor   32*4-96(%r10),$A21,$A21
340         vpxor   32*5-96(%r10),$A41,$A41
341         vpxor   32*6-96(%r10),$A11,$A11
342
343         call    __KeccakF1600
344
345         lea     96(%rsp),%r10
346         jmp     .Loop_absorb_avx2
347
348 .Ldone_absorb_avx2:
349         vmovq   %xmm0,-96($A_flat)
350         vmovdqu $A01,8+32*0-96($A_flat)
351         vmovdqu $A20,8+32*1-96($A_flat)
352         vmovdqu $A31,8+32*2-96($A_flat)
353         vmovdqu $A21,8+32*3-96($A_flat)
354         vmovdqu $A41,8+32*4-96($A_flat)
355         vmovdqu $A11,8+32*5-96($A_flat)
356
357         vzeroupper
358
359         lea     (%r11),%rsp
360         lea     ($len,$bsz),%rax                # return value
361         ret
362 .size   SHA3_absorb,.-SHA3_absorb
363
364 .globl  SHA3_squeeze
365 .type   SHA3_squeeze,\@function
366 .align  32
367 SHA3_squeeze:
368         mov     %rsp,%r11
369
370         lea     96($A_flat),$A_flat
371         shr     \$3,$bsz
372
373         vzeroupper
374
375         vpbroadcastq    -96($A_flat),$A00
376         vpxor           @T[0],@T[0],@T[0]
377         vmovdqu         8+32*0-96($A_flat),$A01
378         vmovdqu         8+32*1-96($A_flat),$A20
379         vmovdqu         8+32*2-96($A_flat),$A31
380         vmovdqu         8+32*3-96($A_flat),$A21
381         vmovdqu         8+32*4-96($A_flat),$A41
382         vmovdqu         8+32*5-96($A_flat),$A11
383
384         mov     $bsz,%rax
385
386 .Loop_squeeze_avx2:
387         mov     @A_jagged[$i]-96($A_flat),%r8
388 ___
# Emit 25 unrolled copy steps: store the current lane, exit to the tail
# on a sub-8-byte remainder or when the requested length is exhausted,
# re-permute when the rate is used up, otherwise prefetch the next lane.
# The -120 bias (= -96-24) compensates for the jagged-vs-flat mismatch:
# register r (r >= 1) is stored at 8+32*(r-1)-96($A_flat), so lane l of
# register r lives at 32*r-120+8*l, while @A_jagged[] yields 32*r+8*l.
# The prefetch of the final copies indexes past the end of @A_jagged
# (interpolating as empty, i.e. "mov -120($A_flat)"), but those copies
# are presumably unreachable since %eax (= rate/8 < 25) hits zero first
# and branches to .Lextend_output_avx2 -- TODO confirm.
389 for (my $i=0; $i<25; $i++) {
390 $code.=<<___;
391         sub     \$8,$len
392         jc      .Ltail_squeeze_avx2
393         mov     %r8,($out)
394         lea     8($out),$out
395         je      .Ldone_squeeze_avx2
396         dec     %eax
397         je      .Lextend_output_avx2
398         mov     @A_jagged[$i+1]-120($A_flat),%r8
399 ___
400 }
# Squeeze epilogue plus constant tables.  .Lextend_output_avx2 runs the
# permutation again and flushes the state back when more output than one
# rate's worth is requested; .Ltail_squeeze_avx2 emits a sub-8-byte
# remainder one byte at a time out of %r8.  The rhotates tables hold the
# per-lane Rho rotation counts consumed by the vpsllvq/vpsrlvq pairs
# (row order matches the register order of the Rho step); iotas holds
# the 24 round constants, each replicated across all four lanes so Iota
# can use a full-width vpxor.
401 $code.=<<___;
402 .Lextend_output_avx2:
403         call    __KeccakF1600
404
405         vmovq   %xmm0,-96($A_flat)
406         vmovdqu $A01,8+32*0-96($A_flat)
407         vmovdqu $A20,8+32*1-96($A_flat)
408         vmovdqu $A31,8+32*2-96($A_flat)
409         vmovdqu $A21,8+32*3-96($A_flat)
410         vmovdqu $A41,8+32*4-96($A_flat)
411         vmovdqu $A11,8+32*5-96($A_flat)
412
413         mov     $bsz,%rax
414         jmp     .Loop_squeeze_avx2
415
416
417 .Ltail_squeeze_avx2:
418         add     \$8,$len
419 .Loop_tail_avx2:
420         mov     %r8b,($out)
421         lea     1($out),$out
422         shr     \$8,%r8
423         dec     $len
424         jnz     .Loop_tail_avx2
425
426 .Ldone_squeeze_avx2:
427         vzeroupper
428
429         lea     (%r11),%rsp
430         ret
431 .size   SHA3_squeeze,.-SHA3_squeeze
432
433 .align  64
434 rhotates_left:
435         .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
436         .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
437         .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
438         .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
439         .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
440         .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
441 rhotates_right:
442         .quad   64-3,   64-18,  64-36,  64-41
443         .quad   64-1,   64-62,  64-28,  64-27
444         .quad   64-45,  64-6,   64-56,  64-39
445         .quad   64-10,  64-61,  64-55,  64-8
446         .quad   64-2,   64-15,  64-25,  64-20
447         .quad   64-44,  64-43,  64-21,  64-14
448 iotas:
449         .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
450         .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
451         .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
452         .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
453         .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
454         .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
455         .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
456         .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
457         .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
458         .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
459         .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
460         .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
461         .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
462         .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
463         .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
464         .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
465         .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
466         .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
467         .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
468         .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
469         .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
470         .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
471         .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
472         .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008
473
474 .asciz  "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
475 ___
476
# Emit the accumulated assembly on stdout.  Checking close() matters for
# a code generator: buffered write errors (full disk, broken pipe from
# the assembler) are only reported at close time and would otherwise
# produce a silently truncated .s file.
print $code;
close STDOUT or die "error closing STDOUT: $!";