#!/usr/bin/env perl
# Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized by quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This however is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it, lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent Pi circular permutation in chosen layout. Note
# that first step is permutation-free.] A[0][0] is loaded to register of
# its own, to all lanes. [A[0][0] is not part of Pi permutation or Rho.]
# Digits in variables' names denote right-most coordinates:

my ($A00,       # [0][0] [0][0] [0][0] [0][0]           # %ymm0
    $A01,       # [0][4] [0][3] [0][2] [0][1]           # %ymm1
    $A20,       # [3][0] [1][0] [4][0] [2][0]           # %ymm2
    $A31,       # [2][4] [4][3] [1][2] [3][1]           # %ymm3
    $A21,       # [3][4] [1][3] [4][2] [2][1]           # %ymm4
    $A41,       # [1][4] [2][3] [3][2] [4][1]           # %ymm5
    $A11) =     # [4][4] [3][3] [2][2] [1][1]           # %ymm6
    map("%ymm$_",(0..6));

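# The "magic order" claim above can be spot-checked without running any
# assembly.  Below is a minimal, illustrative self-test (the
# KECCAK_AVX2_SELFTEST guard and the %layout table are editorial additions,
# not part of the module): in this file's [y][x] notation Pi sends [i][j]
# to [(2*(j-i))%5][i], which matches the quadruple table above, and the
# check verifies that each of the six registers maps onto exactly one
# other register, i.e. that Pi really is a register renaming plus an
# intra-register lane shuffle.  It prints to STDERR only, so the emitted
# code is unaffected; run e.g. KECCAK_AVX2_SELFTEST=1 perl
# keccak1600-avx2.pl >/dev/null to execute it.

if ($ENV{KECCAK_AVX2_SELFTEST}) {
    my %layout = (                      # lanes 3..0, left to right
        A01 => [[0,4],[0,3],[0,2],[0,1]],
        A20 => [[3,0],[1,0],[4,0],[2,0]],
        A31 => [[2,4],[4,3],[1,2],[3,1]],
        A21 => [[3,4],[1,3],[4,2],[2,1]],
        A41 => [[1,4],[2,3],[3,2],[4,1]],
        A11 => [[4,4],[3,3],[2,2],[1,1]],
    );
    my %owner;                          # which register holds lane [i][j]
    for my $reg (keys %layout) {
        $owner{"$$_[0],$$_[1]"} = $reg for @{$layout{$reg}};
    }
    for my $reg (sort keys %layout) {
        my %dst = map { my ($i,$j) = @$_;
                        $owner{((2*($j-$i))%5).",$i"} => 1 } @{$layout{$reg}};
        die "$reg does not stay within one register under Pi\n"
            if keys(%dst) != 1;
        print STDERR "Pi: $reg -> ",(keys %dst)[0],"\n";
    }
}
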
# We also need to map the magic order into offsets within structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],      # [0][0..4]
                [2,2], [6,0], [3,1], [4,2], [5,3],      # [1][0..4]
                [2,0], [4,0], [6,1], [5,2], [3,3],      # [2][0..4]
                [2,3], [3,0], [5,1], [6,2], [4,3],      # [3][0..4]
                [2,1], [5,0], [4,1], [3,2], [6,3]);     # [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);     # ... and now linear

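# A tiny illustrative sanity check of the linearized offsets (editorial
# addition, guarded by the same hypothetical KECCAK_AVX2_SELFTEST variable,
# STDERR only): every A[i][j] must land on a distinct 8-byte slot inside
# the 7x32-byte jagged area.  For example A[3][2] sits in register row 5,
# lane 1, i.e. at byte offset 8*(5*4+1) == 168.

if ($ENV{KECCAK_AVX2_SELFTEST}) {
    my %seen;
    for my $off (@A_jagged) {
        die "offset $off misaligned or out of range\n"
            if $off % 8 || $off < 0 || $off >= 7*32;
        die "offset $off used twice\n" if $seen{$off}++;
    }
    die "unexpected offset for A[3][2]\n" if $A_jagged[3*5+2] != 8*(5*4+1);
    print STDERR "A_jagged: ",scalar(@A_jagged)," distinct in-range offsets\n";
}
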
# But on the other hand Chi is much better off if y indices were aligned
# column-wise, not x. For this reason we have to shuffle data prior to
# Chi and revert it afterwards. The prior shuffle is naturally merged
# with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
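# The merged immediates quoted above follow from composing two vpermq
# controls: vpermq with immediate $imm writes source lane ($imm>>2*$i)&3
# into destination lane $i, so doing vpermq($a) and then vpermq($b) is one
# vpermq whose control is compose($a,$b) below.  This is an illustrative,
# editorial self-check (same hypothetical KECCAK_AVX2_SELFTEST guard,
# STDERR only) of the four "=" lines in the pre-Chi table.

if ($ENV{KECCAK_AVX2_SELFTEST}) {
    my $compose = sub {                 # vpermq $a first, then vpermq $b
        my ($a,$b) = @_;
        my $c = 0;
        $c |= (($a >> 2*(($b >> 2*$_) & 3)) & 3) << 2*$_ for (0..3);
        $c;
    };
    my @claims = (                      # [ Pi step, pre-Chi step, merged ]
        [ 0b01110010, 0b00011011, 0b10001101 ],
        [ 0b10001101, 0b11100100, 0b10001101 ],
        [ 0b01110010, 0b01110010, 0b00011011 ],
        [ 0b00011011, 0b10001101, 0b01110010 ],
    );
    for (@claims) {
        my $got = $compose->($$_[0],$$_[1]);
        die sprintf("compose(%08b,%08b) is %08b, table says %08b\n",
                    $$_[0],$$_[1],$got,$$_[2])  if $got != $$_[2];
    }
    print STDERR "pre-Chi vpermq merges check out\n";
}
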
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#                       r=1088(*)
#
# Haswell               9.5
# Skylake               8.8
#
# (*)   Corresponds to SHA3-256.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type   __KeccakF1600,\@function
.align  32
__KeccakF1600:
        lea             rhotates_left+96(%rip),%r8
        lea             rhotates_right+96(%rip),%r9
        lea             iotas(%rip),%r10
        mov             \$24,%eax
        jmp             .Loop_avx2

.align  32
.Loop_avx2:
        ######################################### Theta
        vpxor           $A01,$A31,$C14
        vpxor           $A21,$C14,$C14
        vpxor           $A41,$C14,$C14
        vpxor           $A11,$C14,$C14          # C[1..4]
        vpermq          \$0b10110001,$A20,$C00
        vpxor           $A20,$C00,$C00
        vpermq          \$0b01001110,$C00,@T[0]
        vpxor           $A00,$C00,$C00
        vpxor           @T[0],$C00,$C00         # C[0..0]

        vpsrlq          \$63,$C14,@T[1]
        vpaddq          $C14,$C14,@T[3]
        vpor            @T[3],@T[1],@T[1]       # ROL64(C[1..4],1)

        vpsrlq          \$63,$C00,@T[0]
        vpaddq          $C00,$C00,@T[2]
        vpor            @T[2],@T[0],@T[0]       # ROL64(C[0..0],1)

        vpermq          \$0b00000000,@T[1],$D00
        vpermq          \$0b11111111,$C14,@T[3]
        vpxor           @T[3],$D00,$D00         # D[0..0] = ROL64(C[1],1) ^ C[4]

        vpermq          \$0b00111001,@T[1],$D14
        vpblendd        \$0b11000000,@T[0],$D14,$D14
        vpermq          \$0b10010011,$C14,@T[2]
        vpblendd        \$0b00000011,$C00,@T[2],@T[2]
        vpxor           @T[2],$D14,$D14         # D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

        vpxor           $D00,$A00,$A00          # ^= D[0..0]
        vpxor           $D00,$A20,$A20          # ^= D[0..0]
        vpxor           $D14,$A01,$A01          # ^= D[1..4]
        vpxor           $D14,$A31,$A31          # ^= D[1..4]
        vpxor           $D14,$A21,$A21          # ^= D[1..4]
        vpxor           $D14,$A41,$A41          # ^= D[1..4]
        vpxor           $D14,$A11,$A11          # ^= D[1..4]

        ######################################### Rho
        vpsllvq         0*32-96(%r8),$A20,@T[0]
        vpsrlvq         0*32-96(%r9),$A20,$A20
        vpor            @T[0],$A20,$A20

        vpsllvq         1*32-96(%r8),$A01,@T[1]
        vpsrlvq         1*32-96(%r9),$A01,$A01
        vpor            @T[1],$A01,$A01

        vpsllvq         2*32-96(%r8),$A31,@T[2]
        vpsrlvq         2*32-96(%r9),$A31,$A31
        vpor            @T[2],$A31,$A31

        vpsllvq         3*32-96(%r8),$A21,@T[3]
        vpsrlvq         3*32-96(%r9),$A21,$A21
        vpor            @T[3],$A21,$A21

        vpsllvq         4*32-96(%r8),$A41,@T[4]
        vpsrlvq         4*32-96(%r9),$A41,$A41
        vpor            @T[4],$A41,$A41

        vpsllvq         5*32-96(%r8),$A11,@T[5]
        vpsrlvq         5*32-96(%r9),$A11,$A11
        vpor            @T[5],$A11,$A11

        ######################################### Pi + pre-Chi shuffle
        vpermq          \$0b01110010,$A41,@T[6] # vpermq \$0b00011011,$A41,$A11
        vpermq          \$0b00011011,$A21,@T[5] # vpermq \$0b01110010,$A21,$A41
        vpermq          \$0b10001101,$A31,@T[4] # vpermq \$0b10001101,$A31,$A21
        vpermq          \$0b10001101,$A20,@T[3] # vpermq \$0b01110010,$A20,$A31
        vmovdqa         $A01,@T[2]
        vmovdqa         $A11,@T[1]

        ######################################### Chi
        vpermq          \$0b00000000,@T[1],@T[0]        # [0][1] [0][1] [0][1] [0][1]
        vpermq          \$0b01010101,@T[1],@T[7]        # [0][2] [0][2] [0][2] [0][2]
        vpandn          @T[7],@T[0],@T[0]       # tgting  [0][0] [0][0] [0][0] [0][0]

        vpermq          \$0b00111001,@T[1],$A01         # [0][1] [0][4] [0][3] [0][2]
        vpermq          \$0b00011110,@T[1],@T[8]        # [0][1] [0][2] [0][4] [0][3]
        vpblendd        \$0b11000000,$A00,$A01,$A01     # [0][0] [0][4] [0][3] [0][2]
        vpblendd        \$0b00110000,$A00,@T[8],@T[8]   # [0][1] [0][0] [0][4] [0][3]
        vpandn          @T[8],$A01,$A01         # tgting  [0][4] [0][3] [0][2] [0][1]

        vpblendd        \$0b00001100,@T[5],@T[4],$A20   #               [4][1] [2][1]
        vpblendd        \$0b00110000,@T[6],$A20,$A20    #        [1][1] [4][1] [2][1]
        vpblendd        \$0b11000000,@T[3],$A20,$A20    # [3][1] [1][1] [4][1] [2][1]
        vpblendd        \$0b00001100,@T[4],@T[6],@T[7]  #               [4][2] [2][2]
        vpblendd        \$0b00110000,@T[3],@T[7],@T[7]  #        [1][2] [4][2] [2][2]
        vpblendd        \$0b11000000,@T[5],@T[7],@T[7]  # [3][2] [1][2] [4][2] [2][2]
        vpandn          @T[7],$A20,$A20         # tgting  [3][0] [1][0] [4][0] [2][0]

        vpblendd        \$0b00001100,@T[6],@T[2],$A31   #               [4][4] [2][0]
        vpblendd        \$0b00110000,@T[4],$A31,$A31    #        [1][3] [4][4] [2][0]
        vpblendd        \$0b11000000,@T[5],$A31,$A31    # [3][2] [1][3] [4][4] [2][0]
        vpblendd        \$0b00001100,@T[2],@T[4],@T[8]  #               [4][0] [2][1]
        vpblendd        \$0b00110000,@T[5],@T[8],@T[8]  #        [1][4] [4][0] [2][1]
        vpblendd        \$0b11000000,@T[6],@T[8],@T[8]  # [3][3] [1][4] [4][0] [2][1]
        vpandn          @T[8],$A31,$A31         # tgting  [3][1] [1][2] [4][3] [2][4]

        vpblendd        \$0b00001100,@T[3],@T[6],$A21   #               [4][3] [2][2]
        vpblendd        \$0b00110000,@T[5],$A21,$A21    #        [1][4] [4][3] [2][2]
        vpblendd        \$0b11000000,@T[2],$A21,$A21    # [3][0] [1][4] [4][3] [2][2]
        vpblendd        \$0b00001100,@T[6],@T[5],@T[7]  #               [4][4] [2][3]
        vpblendd        \$0b00110000,@T[2],@T[7],@T[7]  #        [1][0] [4][4] [2][3]
        vpblendd        \$0b11000000,@T[3],@T[7],@T[7]  # [3][1] [1][0] [4][4] [2][3]
        vpandn          @T[7],$A21,$A21         # tgting  [3][4] [1][3] [4][2] [2][1]

        vpblendd        \$0b00001100,@T[4],@T[3],$A41   #               [4][2] [2][4]
        vpblendd        \$0b00110000,@T[2],$A41,$A41    #        [1][0] [4][2] [2][4]
        vpblendd        \$0b11000000,@T[6],$A41,$A41    # [3][3] [1][0] [4][2] [2][4]
        vpblendd        \$0b00001100,@T[3],@T[2],@T[8]  #               [4][3] [2][0]
        vpblendd        \$0b00110000,@T[6],@T[8],@T[8]  #        [1][1] [4][3] [2][0]
        vpblendd        \$0b11000000,@T[4],@T[8],@T[8]  # [3][4] [1][1] [4][3] [2][0]
        vpandn          @T[8],$A41,$A41         # tgting  [3][2] [1][4] [4][1] [2][3]

        vpblendd        \$0b00001100,@T[2],@T[5],$A11   #               [4][0] [2][3]
        vpblendd        \$0b00110000,@T[3],$A11,$A11    #        [1][2] [4][0] [2][3]
        vpblendd        \$0b11000000,@T[4],$A11,$A11    # [3][4] [1][2] [4][0] [2][3]
        vpblendd        \$0b00001100,@T[5],@T[3],@T[7]  #               [4][1] [2][4]
        vpblendd        \$0b00110000,@T[4],@T[7],@T[7]  #        [1][3] [4][1] [2][4]
        vpblendd        \$0b11000000,@T[2],@T[7],@T[7]  # [3][0] [1][3] [4][1] [2][4]
        vpandn          @T[7],$A11,$A11         # tgting  [3][3] [1][1] [4][4] [2][2]

        vpxor           @T[0],$A00,$A00
        vpxor           @T[1],$A01,$A01
        vpxor           @T[2],$A20,$A20
        vpxor           @T[3],$A31,$A31
        vpxor           @T[4],$A21,$A21
        vpxor           @T[5],$A41,$A41
        vpxor           @T[6],$A11,$A11

        vpermq          \$0b00011011,$A31,$A31  # post-Chi shuffle
        vpermq          \$0b10001101,$A41,$A41
        vpermq          \$0b01110010,$A11,$A11

        ######################################### Iota
        vpxor           (%r10),$A00,$A00
        lea             32(%r10),%r10

        dec             %eax
        jnz             .Loop_avx2

        ret
.size   __KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;        # in squeeze

$code.=<<___;
.globl  SHA3_absorb
.type   SHA3_absorb,\@function
.align  32
SHA3_absorb:
        mov     %rsp,%r11

        lea     -240(%rsp),%rsp
        and     \$-32,%rsp

        lea     96($A_flat),$A_flat
        lea     96($inp),$inp
        lea     96(%rsp),%r10

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00       # load A[5][5]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        vpxor           @T[0],@T[0],@T[0]
        vmovdqa         @T[0],32*2-96(%r10)     # zero transfer area on stack
        vmovdqa         @T[0],32*3-96(%r10)
        vmovdqa         @T[0],32*4-96(%r10)
        vmovdqa         @T[0],32*5-96(%r10)
        vmovdqa         @T[0],32*6-96(%r10)

.Loop_absorb_avx2:
        mov             $bsz,%rax
        sub             $bsz,$len
        jc              .Ldone_absorb_avx2

        shr             \$3,%eax
        vpbroadcastq    0-96($inp),@T[0]
        vmovdqu         8-96($inp),@T[1]
        sub             \$4,%eax
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
        dec     %eax
        jz      .Labsorbed_avx2
        mov     8*$i-96($inp),%r8
        mov     %r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx2:
        lea     ($inp,$bsz),$inp

        vpxor   @T[0],$A00,$A00
        vpxor   @T[1],$A01,$A01
        vpxor   32*2-96(%r10),$A20,$A20
        vpxor   32*3-96(%r10),$A31,$A31
        vpxor   32*4-96(%r10),$A21,$A21
        vpxor   32*5-96(%r10),$A41,$A41
        vpxor   32*6-96(%r10),$A11,$A11

        call    __KeccakF1600

        lea     96(%rsp),%r10
        jmp     .Loop_absorb_avx2

.Ldone_absorb_avx2:
        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        vzeroupper

        lea     (%r11),%rsp
        lea     ($len,$bsz),%rax                # return value
        ret
.size   SHA3_absorb,.-SHA3_absorb

.globl  SHA3_squeeze
.type   SHA3_squeeze,\@function
.align  32
SHA3_squeeze:
        mov     %rsp,%r11

        lea     96($A_flat),$A_flat
        shr     \$3,$bsz

        vzeroupper

        vpbroadcastq    -96($A_flat),$A00
        vpxor           @T[0],@T[0],@T[0]
        vmovdqu         8+32*0-96($A_flat),$A01
        vmovdqu         8+32*1-96($A_flat),$A20
        vmovdqu         8+32*2-96($A_flat),$A31
        vmovdqu         8+32*3-96($A_flat),$A21
        vmovdqu         8+32*4-96($A_flat),$A41
        vmovdqu         8+32*5-96($A_flat),$A11

        mov     $bsz,%rax

.Loop_squeeze_avx2:
        mov     @A_jagged[$i]-96($A_flat),%r8           # A[0][0]
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
        sub     \$8,$len
        jc      .Ltail_squeeze_avx2
        mov     %r8,($out)
        lea     8($out),$out
        je      .Ldone_squeeze_avx2
        dec     %eax
        je      .Lextend_output_avx2
        mov     @A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
        call    __KeccakF1600

        vmovq   %xmm0,-96($A_flat)
        vmovdqu $A01,8+32*0-96($A_flat)
        vmovdqu $A20,8+32*1-96($A_flat)
        vmovdqu $A31,8+32*2-96($A_flat)
        vmovdqu $A21,8+32*3-96($A_flat)
        vmovdqu $A41,8+32*4-96($A_flat)
        vmovdqu $A11,8+32*5-96($A_flat)

        mov     $bsz,%rax
        jmp     .Loop_squeeze_avx2


.Ltail_squeeze_avx2:
        add     \$8,$len
.Loop_tail_avx2:
        mov     %r8b,($out)
        lea     1($out),$out
        shr     \$8,%r8
        dec     $len
        jnz     .Loop_tail_avx2

.Ldone_squeeze_avx2:
        vzeroupper

        lea     (%r11),%rsp
        ret
.size   SHA3_squeeze,.-SHA3_squeeze

.align  64
rhotates_left:
        .quad   3,      18,     36,     41      # [2][0] [4][0] [1][0] [3][0]
        .quad   1,      62,     28,     27      # [0][1] [0][2] [0][3] [0][4]
        .quad   45,     6,      56,     39      # [3][1] [1][2] [4][3] [2][4]
        .quad   10,     61,     55,     8       # [2][1] [4][2] [1][3] [3][4]
        .quad   2,      15,     25,     20      # [4][1] [3][2] [2][3] [1][4]
        .quad   44,     43,     21,     14      # [1][1] [2][2] [3][3] [4][4]
rhotates_right:
        .quad   64-3,   64-18,  64-36,  64-41
        .quad   64-1,   64-62,  64-28,  64-27
        .quad   64-45,  64-6,   64-56,  64-39
        .quad   64-10,  64-61,  64-55,  64-8
        .quad   64-2,   64-15,  64-25,  64-20
        .quad   64-44,  64-43,  64-21,  64-14
iotas:
        .quad   0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
        .quad   0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
        .quad   0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
        .quad   0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
        .quad   0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
        .quad   0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
        .quad   0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
        .quad   0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
        .quad   0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
        .quad   0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
        .quad   0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
        .quad   0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
        .quad   0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
        .quad   0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
        .quad   0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
        .quad   0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
        .quad   0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
        .quad   0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
        .quad   0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
        .quad   0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
        .quad   0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz  "Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

print $code;
close STDOUT;