#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized by quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded into a register of its own, to all lanes. [A[0][0] is not part
# of the Pi permutation or Rho.] Digits in variables' names denote the
# right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear

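# A small self-check of the jagged map (illustrative only, guarded out of
# the normal code path, not part of the generated assembly): every A[i][j],
# indexed as 5*i+j, must land on a distinct 8-byte slot within the seven
# 32-byte register images.
if (0) {
    my %seen;
    for my $i (0..4) {
	for my $j (0..4) {
	    my $off = $A_jagged[5*$i+$j];
	    die "A[$i][$j]: offset $off reused" if $seen{$off}++;
	}
    }
}
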
# But Chi, on the other hand, is much better off if y indices are aligned
# column-wise, not x. For this reason we have to shuffle the data prior
# to Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And the reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
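# For cross-checking the merged immediates above: vpermq's imm8 places
# source lane (imm>>(2*i))&3 into result lane i. The ad-hoc helper below
# (illustrative only, never called by the generator) composes two
# immediates the same way the comments do, e.g. 0b01110010 followed by
# 0b00011011 behaves as a single 0b10001101.
sub vpermq_model {
    my ($imm,@lane) = @_;			# @lane = (lane0,lane1,lane2,lane3)
    return map { $lane[($imm>>(2*$_))&3] } (0..3);
}
if (0) {
    my @x = (0..3);
    my @chained = vpermq_model(0b00011011, vpermq_model(0b01110010, @x));
    my @merged  = vpermq_model(0b10001101, @x);
    die "immediate mismatch" unless "@chained" eq "@merged";
}
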
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#                       r=1088(*)
#
# Haswell               9.6
# Skylake               8.8
#
# (*)   Corresponds to SHA3-256.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];	# Theta intermediates, aliased to the tail of @T

my $code = "";				# accumulates the generated assembly

$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             rhotates_left+96(%rip),%r8
        lea             rhotates_right+96(%rip),%r9
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx2

.align  32
.Loop_avx2:
        ######################################### Theta
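        # C[j] = A[0][j] ^ A[1][j] ^ A[2][j] ^ A[3][j] ^ A[4][j]
        # D[j] = C[j-1] ^ ROL64(C[j+1],1),  A[i][j] ^= D[j]   (j mod 5)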
        vpxor           $A01,$A31,$C14
        vpxor           $A21,$C14,$C14
        vpxor           $A41,$C14,$C14
        vpxor           $A11,$C14,$C14          # C[1..4]
        vpermq          \$0b10110001,$A20,$C00
        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]
        vpxor           $A00,$C00,$C00
        vpxor           @T[0],$C00,$C00         # C[0..0]

        vpsrlq          \$63,$C14,@T[1]
        vpaddq          $C14,$C14,@T[3]
        vpor            @T[3],@T[1],@T[1]       # ROL64(C[1..4],1)

        vpsrlq          \$63,$C00,@T[0]
        vpaddq          $C00,$C00,@T[2]
        vpor            @T[2],@T[0],@T[0]       # ROL64(C[0..0],1)

        vpermq          \$0b00000000,@T[1],$D00
        vpermq          \$0b11111111,$C14,@T[3]
        vpxor           @T[3],$D00,$D00         # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpermq          \$0b00111001,@T[1],$D14
        vpblendd        \$0b11000000,@T[0],$D14,$D14
        vpermq          \$0b10010011,$C14,@T[2]
        vpblendd        \$0b00000011,$C00,@T[2],@T[2]
        vpxor           @T[2],$D14,$D14         # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

        vpxor           $D00,$A00,$A00          # ^= D[0..0]
        vpxor           $D00,$A20,$A20          # ^= D[0..0]
        vpxor           $D14,$A01,$A01          # ^= D[1..4]
        vpxor           $D14,$A31,$A31          # ^= D[1..4]
        vpxor           $D14,$A21,$A21          # ^= D[1..4]
        vpxor           $D14,$A41,$A41          # ^= D[1..4]
        vpxor           $D14,$A11,$A11          # ^= D[1..4]

        ######################################### Rho
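        # A[i][j] = ROL64(A[i][j], rho[i][j]); the per-lane rotation amounts
        # come from the rhotates_left/rhotates_right tables below, and
        # ROL64(x,n) is composed as (x<<n)|(x>>(64-n)) with vpsllvq/vpsrlvq/vpor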
        vpsllvq         0*32-96(%r8),$A20,@T[0]
        vpsrlvq         0*32-96(%r9),$A20,$A20
        vpor            @T[0],$A20,$A20

        vpsllvq         1*32-96(%r8),$A01,@T[1]
        vpsrlvq         1*32-96(%r9),$A01,$A01
        vpor            @T[1],$A01,$A01

        vpsllvq         2*32-96(%r8),$A31,@T[2]
        vpsrlvq         2*32-96(%r9),$A31,$A31
        vpor            @T[2],$A31,$A31

        vpsllvq         3*32-96(%r8),$A21,@T[3]
        vpsrlvq         3*32-96(%r9),$A21,$A21
        vpor            @T[3],$A21,$A21

        vpsllvq         4*32-96(%r8),$A41,@T[4]
        vpsrlvq         4*32-96(%r9),$A41,$A41
        vpor            @T[4],$A41,$A41

        vpsllvq         5*32-96(%r8),$A11,@T[5]
        vpsrlvq         5*32-96(%r9),$A11,$A11
        vpor            @T[5],$A11,$A11

        ######################################### Pi + pre-Chi shuffle
        vpermq          \$0b01110010,$A41,@T[0] # vpermq \$0b00011011,$A41,@T[0]
        vpermq          \$0b00011011,$A21,$A41  # vpermq \$0b01110010,$A21,$A41
        vpermq          \$0b10001101,$A31,$A21
        vpermq          \$0b10001101,$A20,$A31  # vpermq \$0b01110010,$A20,$A31
        vmovdqa         $A01,$A20
        vmovdqa         $A11,$A01
        vmovdqa         @T[0],$A11

        ######################################### Chi
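        # A[i][j] ^= ~A[i][j+1] & A[i][j+2]   (j mod 5), computed in the
        # y-aligned pre-Chi layout; vpandn supplies the ~x & y term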
        vpermq          \$0b00000000,$A01,@T[0]         # [0][1] [0][1] [0][1] [0][1]
        vpermq          \$0b01010101,$A01,@T[2]         # [0][2] [0][2] [0][2] [0][2]
        vpandn          @T[2],@T[0],@T[0]       # tgting  [0][0] [0][0] [0][0] [0][0]

        vpermq          \$0b00111001,$A01,@T[1]         # [0][1] [0][4] [0][3] [0][2]
        vpermq          \$0b00011110,$A01,@T[3]         # [0][1] [0][2] [0][4] [0][3]
        vpblendd        \$0b11000000,$A00,@T[1],@T[1]   # [0][0] [0][4] [0][3] [0][2]
        vpblendd        \$0b00110000,$A00,@T[3],@T[3]   # [0][1] [0][0] [0][4] [0][3]
        vpandn          @T[3],@T[1],@T[1]       # tgting  [0][4] [0][3] [0][2] [0][1]

        vpblendd        \$0b00001100,$A41,$A21, @T[2]   #               [4][1] [2][1]
        vpblendd        \$0b00110000,$A11,@T[2],@T[2]   #        [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,$A31,@T[2],@T[2]   # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b00001100,$A21,$A11, @T[4]   #               [4][2] [2][2]
        vpblendd        \$0b00110000,$A31,@T[4],@T[4]   #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,$A41,@T[4],@T[4]   # [3][2] [1][2] [4][2] [2][2]
        vpandn          @T[4],@T[2],@T[2]       # tgting  [3][0] [1][0] [4][0] [2][0]

        vpblendd        \$0b00001100,$A11,$A20, @T[3]   #               [4][4] [2][0]
        vpblendd        \$0b00110000,$A21,@T[3],@T[3]   #        [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,$A41,@T[3],@T[3]   # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b00001100,$A20,$A21, @T[5]   #               [4][0] [2][1]
        vpblendd        \$0b00110000,$A41,@T[5],@T[5]   #        [1][4] [4][0] [2][1]
        vpblendd        \$0b11000000,$A11,@T[5],@T[5]   # [3][3] [1][4] [4][0] [2][1]
        vpandn          @T[5],@T[3],@T[3]       # tgting  [3][1] [1][2] [4][3] [2][4]

        vpblendd        \$0b00001100,$A31,$A11, @T[4]   #               [4][3] [2][2]
        vpblendd        \$0b00110000,$A41,@T[4],@T[4]   #        [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,$A20,@T[4],@T[4]   # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b00001100,$A11,$A41, @T[6]   #               [4][4] [2][3]
        vpblendd        \$0b00110000,$A20,@T[6],@T[6]   #        [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,$A31,@T[6],@T[6]   # [3][1] [1][0] [4][4] [2][3]
        vpandn          @T[6],@T[4],@T[4]       # tgting  [3][4] [1][3] [4][2] [2][1]

        vpblendd        \$0b00001100,$A21,$A31, @T[5]   #               [4][2] [2][4]
        vpblendd        \$0b00110000,$A20,@T[5],@T[5]   #        [1][0] [4][2] [2][4]
        vpblendd        \$0b11000000,$A11,@T[5],@T[5]   # [3][3] [1][0] [4][2] [2][4]
        vpblendd        \$0b00001100,$A31,$A20, @T[7]   #               [4][3] [2][0]
        vpblendd        \$0b00110000,$A11,@T[7],@T[7]   #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,$A21,@T[7],@T[7]   # [3][4] [1][1] [4][3] [2][0]
        vpandn          @T[7],@T[5],@T[5]       # tgting  [3][2] [1][4] [4][1] [2][3]

        vpblendd        \$0b00001100,$A20,$A41, @T[6]   #               [4][0] [2][3]
        vpblendd        \$0b00110000,$A31,@T[6],@T[6]   #        [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,$A21,@T[6],@T[6]   # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b00001100,$A41,$A31, @T[8]   #               [4][1] [2][4]
        vpblendd        \$0b00110000,$A21,@T[8],@T[8]   #        [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,$A20,@T[8],@T[8]   # [3][0] [1][3] [4][1] [2][4]
        vpandn          @T[8],@T[6],@T[6]       # tgting  [3][3] [1][1] [4][4] [2][2]

        vpxor           @T[0],$A00,$A00
        vpxor           @T[1],$A01,$A01
        vpxor           @T[2],$A20,$A20
        vpxor           @T[3],$A31,$A31
        vpxor           @T[4],$A21,$A21
        vpxor           @T[5],$A41,$A41
        vpxor           @T[6],$A11,$A11

        vpermq          \$0b00011011,$A31,$A31  # post-Chi shuffle
        vpermq          \$0b10001101,$A41,$A41
        vpermq          \$0b01110010,$A11,$A11

        ######################################### Iota
        vpxor           (%r10),$A00,$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx2

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;        # in squeeze
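
# The C-level interface implemented below is, in effect (cf. the generic
# C code in crypto/sha/keccak1600.c):
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t bsz);
#	void   SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#	                    size_t len, size_t bsz);
#
# SHA3_absorb returns the number of trailing bytes (less than bsz) it did
# not consume. Note that this module keeps A[][] in the "magic" jagged
# order described above, so the 200-byte state should be treated as opaque
# between calls.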

$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[0][0], broadcast to all lanes
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx2:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx2

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorbed_avx2
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx2:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx2

.Ldone_absorb_avx2:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        mov     $bsz,%rax

.Loop_squeeze_avx2:
        mov     $A_jagged[0]-96($A_flat),%r8    # A[0][0]
___
# Note: the flat state is packed (A[0][0] occupies 8 bytes, not 32), so
# every later offset is 24 bytes smaller than on the stack, hence the
# -120 = -96-24 bias below.
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx2
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx2
        dec     %eax
        je      .Lextend_output_avx2
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx2

.Ltail_squeeze_avx2:
        add     \$8,$len
.Loop_tail_avx2:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx2

.Ldone_squeeze_avx2:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
        .quad   64-3,   64-18,  64-36,  64-41
        .quad   64-1,   64-62,  64-28,  64-27
        .quad   64-45,  64-6,   64-56,  64-39
        .quad   64-10,  64-61,  64-55,  64-8
        .quad   64-2,   64-15,  64-25,  64-20
        .quad   64-44,  64-43,  64-21,  64-14
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;