2 # Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # ====================================================================
10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11 # project. The module is, however, dual licensed under OpenSSL and
12 # CRYPTOGAMS licenses depending on where you obtain it. For further
13 # details see http://www.openssl.org/~appro/cryptogams/.
14 # ====================================================================
16 # Keccak-1600 for x86_64.
20 # Below code is [lane complementing] KECCAK_2X implementation (see
21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
22 # instead of actually unrolling the loop pair-wise I simply flip
23 # pointers to T[][] and A[][] at the end of round. Since number of
24 # rounds is even, last round writes to A[][] and everything works out.
25 # How does it compare to x86_64 assembly module in Keccak Code Package?
26 # Depending on processor it's either as fast or faster by up to 15%...
28 ########################################################################
29 # Numbers are cycles per processed byte out of large message.
36 # Sandy Bridge 12.9(**)
46 # (*)	Corresponds to SHA3-256. Improvement over compiler-generated
47 #	code varies a lot; the most common coefficient is 15% in comparison to
48 #	gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
49 # (**) Sandy Bridge has broken rotate instruction. Performance can be
50 # improved by 14% by replacing rotates with double-precision
51 # shift with same register as source and destination.
53 # $output is the last argument if it looks like a file (it has an extension)
54 # $flavour is the first argument if it doesn't look like a file
55 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
56 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows targets are recognized either from the flavour keyword or from an
# .asm output name; they need a different dialect from the translator.
58 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator: first next to this script, then in the
# shared crypto/perlasm directory.
60 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
61 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
62 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
63 die "can't locate x86_64-xlate.pl";
# Pipe everything we print through the translator, which converts the
# "perlasm" dialect into flavour-specific assembly and writes $output.
65 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
66 or die "can't call $xlate: $!";
# Memory layout of the 5x5 Keccak state: lane A[i][j] lives at byte offset
# 8*(5*i+j)-100 from the state pointer.  The callers bias the pointer by
# +100 so every displacement fits in a signed byte, shrinking the encoding.
my @A = map { my $row_base = $_;
              [ map { 8*($row_base + $_) - 100 } (0 .. 4) ]
            } (0, 5, 10, 15, 20);
# Register banks (see the file header: C[5] and D[5] are kept in registers):
# C[0..4] hold the working row / theta column parities, D[0..4] the theta
# D values, and T[0..1] are scratch temporaries.
72 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
73 my @D = map("%r$_",(8..12));
74 my @T = map("%r$_",(13..14));
# Rho-step rotation amounts, indexed as rhotates[i][j] to match A[i][j].
# These are the standard Keccak-f[1600] rotation offsets (FIPS 202).
77 my @rhotates = ([ 0, 1, 62, 28, 27 ],
78 [ 36, 44, 6, 55, 20 ],
79 [ 3, 10, 43, 25, 39 ],
80 [ 41, 45, 15, 21, 8 ],
81 [ 18, 2, 61, 56, 14 ]);
86 .type	__KeccakF1600,\@abi-omnipotent
# Theta, step 1: accumulate the column parities C[x].  Row 4 seeds the
# sums; rows 0..3 are xor-ed in below, interleaved for scheduling.
89 	mov	$A[4][0](%rdi),@C[0]
90 	mov	$A[4][1](%rdi),@C[1]
91 	mov	$A[4][2](%rdi),@C[2]
92 	mov	$A[4][3](%rdi),@C[3]
93 	mov	$A[4][4](%rdi),@C[4]
# Preload the diagonal lanes A[i][i] (i = 0..3) into the D bank; they are
# consumed after the register renaming further down.
98 	mov	$A[0][0](%rdi),@D[0]
99 	mov	$A[1][1](%rdi),@D[1]
100 	mov	$A[2][2](%rdi),@D[2]
101 	mov	$A[3][3](%rdi),@D[3]
103 	xor	$A[0][2](%rdi),@C[2]
104 	xor	$A[0][3](%rdi),@C[3]
106 	xor	$A[0][1](%rdi),@C[1]
107 	xor	$A[1][2](%rdi),@C[2]
108 	xor	$A[1][0](%rdi),@C[0]
110 	xor	$A[0][4](%rdi),@C[4]
113 	xor	$A[2][0](%rdi),@C[0]
114 	xor	$A[1][3](%rdi),@C[3]
116 	xor	$A[1][4](%rdi),@C[4]
118 	xor	$A[3][2](%rdi),@C[2]
119 	xor	$A[3][0](%rdi),@C[0]
120 	xor	$A[2][3](%rdi),@C[3]
121 	xor	$A[2][1](%rdi),@C[1]
122 	xor	$A[2][4](%rdi),@C[4]
# Theta, step 2: combine rotated parities into the D values; the trailing
# comments give each formula.  (Some rotate instructions are elided in
# this listing.)
126 	xor	@C[0],@C[2]		# D[1] = ROL64(C[2], 1) ^ C[0]
130 	xor	@C[3],@C[0]		# D[4] = ROL64(C[0], 1) ^ C[3]
131 	xor	$A[3][1](%rdi),@C[1]
134 	xor	@C[1],@C[3]		# D[2] = ROL64(C[3], 1) ^ C[1]
135 	xor	$A[3][4](%rdi),@C[4]
138 	xor	@C[4],@C[1]		# D[0] = ROL64(C[1], 1) ^ C[4]
141 	xor	@T[0],@C[4]		# D[3] = ROL64(C[4], 1) ^ C[2]
# Generation-time renaming only, no instructions emitted: the old C bank
# (rotated by one position) becomes D, and the old D bank becomes C.
143 (@D[0..4], @C) = (@C[1..4,0], @D);
# Output row 0: rho rotations, then chi.  This is the lane-complementing
# variant, so some operands enter complemented (note the ~C terms in the
# R[..] comments); R[0][0] also receives the iota constant.
147 	rol	\$$rhotates[1][1],@C[1]
150 	rol	\$$rhotates[2][2],@C[2]
153 	rol	\$$rhotates[3][3],@C[3]
155 	xor	@C[0],@C[1]		# C[0] ^ ( C[1] | C[2])
156 	rol	\$$rhotates[4][4],@C[4]
163 	mov	@C[1],$A[0][0](%rsi)	# R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
164 	xor	@C[2],@C[4]		# C[2] ^ ( C[4] & C[3])
166 	mov	@C[4],$A[0][2](%rsi)	# R[0][2] = C[2] ^ ( C[4] & C[3])
169 	mov	$A[4][2](%rdi),@C[4]
170 	xor	@T[0],@C[2]		# C[1] ^ (~C[2] | C[3])
171 	mov	@C[2],$A[0][1](%rsi)	# R[0][1] = C[1] ^ (~C[2] | C[3])
174 	mov	$A[1][4](%rdi),@C[1]
175 	xor	@T[1],@T[0]		# C[4] ^ ( C[1] & C[0])
176 	mov	$A[2][0](%rdi),@C[2]
177 	mov	@T[0],$A[0][4](%rsi)	# R[0][4] = C[4] ^ ( C[1] & C[0])
180 	mov	$A[0][3](%rdi),@C[0]
181 	xor	@C[3],@T[1]		# C[3] ^ ( C[4] | C[0])
182 	mov	$A[3][1](%rdi),@C[3]
183 	mov	@T[1],$A[0][3](%rsi)	# R[0][3] = C[3] ^ ( C[4] | C[0])
# Output row 1: loads of the next five input lanes are interleaved with
# the stores above; now rotate and apply chi.
188 	rol	\$$rhotates[0][3],@C[0]
191 	rol	\$$rhotates[4][2],@C[4]
192 	rol	\$$rhotates[3][1],@C[3]
194 	rol	\$$rhotates[1][4],@C[1]
197 	rol	\$$rhotates[2][0],@C[2]
199 	xor	@C[3],@C[0]		# C[3] ^ (C[0] |  C[4])
200 	mov	@C[0],$A[1][3](%rsi)	# R[1][3] = C[3] ^ (C[0] |  C[4])
204 	mov	$A[0][1](%rdi),@C[0]
205 	xor	@C[4],@C[1]		# C[4] ^ (C[1] &  C[0])
207 	mov	@C[1],$A[1][4](%rsi)	# R[1][4] = C[4] ^ (C[1] &  C[0])
210 	mov	$A[1][2](%rdi),@C[1]
211 	xor	@C[2],@C[4]		# C[2] ^ (~C[4] | C[3])
212 	mov	@C[4],$A[1][2](%rsi)	# R[1][2] = C[2] ^ (~C[4] | C[3])
215 	mov	$A[4][0](%rdi),@C[4]
216 	xor	@T[1],@C[3]		# C[1] ^ (C[3] &  C[2])
217 	mov	@C[3],$A[1][1](%rsi)	# R[1][1] = C[1] ^ (C[3] &  C[2])
220 	mov	$A[2][3](%rdi),@C[2]
221 	xor	@T[0],@T[1]		# C[0] ^ (C[1] |  C[2])
222 	mov	$A[3][4](%rdi),@C[3]
223 	mov	@T[1],$A[1][0](%rsi)	# R[1][0] = C[0] ^ (C[1] |  C[2])
# Output row 2.
228 	rol	\$$rhotates[2][3],@C[2]
230 	rol	\$$rhotates[3][4],@C[3]
232 	rol	\$$rhotates[1][2],@C[1]
234 	rol	\$$rhotates[4][0],@C[4]
237 	rol	\$$rhotates[0][1],@C[0]
240 	xor	@C[1],@C[2]		# C[1] ^ ( C[2] & C[3])
241 	mov	@C[2],$A[2][1](%rsi)	# R[2][1] = C[1] ^ ( C[2] & C[3])
245 	mov	$A[2][1](%rdi),@C[2]
246 	xor	@T[0],@C[4]		# C[2] ^ ( C[4] & ~C[3])
247 	mov	@C[4],$A[2][2](%rsi)	# R[2][2] = C[2] ^ ( C[4] & ~C[3])
250 	mov	$A[4][3](%rdi),@C[4]
251 	xor	@C[0],@T[0]		# C[0] ^ ( C[2] | C[1])
252 	mov	@T[0],$A[2][0](%rsi)	# R[2][0] = C[0] ^ ( C[2] | C[1])
255 	xor	@T[1],@C[1]		# C[4] ^ ( C[1] & C[0])
256 	mov	@C[1],$A[2][4](%rsi)	# R[2][4] = C[4] ^ ( C[1] & C[0])
259 	mov	$A[1][0](%rdi),@C[1]
260 	xor	@C[3],@T[1]		# ~C[3] ^ ( C[0] | C[4])
261 	mov	$A[3][2](%rdi),@C[3]
262 	mov	@T[1],$A[2][3](%rsi)	# R[2][3] = ~C[3] ^ ( C[0] | C[4])
265 	mov	$A[0][4](%rdi),@C[0]
# Output row 3.
269 	rol	\$$rhotates[2][1],@C[2]
271 	rol	\$$rhotates[3][2],@C[3]
273 	rol	\$$rhotates[1][0],@C[1]
275 	rol	\$$rhotates[4][3],@C[4]
278 	rol	\$$rhotates[0][4],@C[0]
281 	xor	@C[1],@C[2]		# C[1] ^ ( C[2] | C[3])
282 	mov	@C[2],$A[3][1](%rsi)	# R[3][1] = C[1] ^ ( C[2] | C[3])
286 	xor	@T[0],@C[4]		# C[2] ^ ( C[4] | ~C[3])
287 	mov	@C[4],$A[3][2](%rsi)	# R[3][2] = C[2] ^ ( C[4] | ~C[3])
290 	xor	@C[0],@T[0]		# C[0] ^ ( C[2] & C[1])
291 	mov	@T[0],$A[3][0](%rsi)	# R[3][0] = C[0] ^ ( C[2] & C[1])
294 	xor	@T[1],@C[1]		# C[4] ^ ( C[1] | C[0])
295 	mov	@C[1],$A[3][4](%rsi)	# R[3][4] = C[4] ^ ( C[1] | C[0])
298 	xor	@C[3],@C[0]		# ~C[3] ^ ( C[0] & C[4])
299 	mov	@C[0],$A[3][3](%rsi)	# R[3][3] = ~C[3] ^ ( C[0] & C[4])
# Output row 4: the D bank still carries partially combined lanes; finish
# the xor accumulation and apply the rho rotations here.
302 	xor	$A[0][2](%rdi),@D[2]
303 	xor	$A[1][3](%rdi),@D[3]
304 	rol	\$$rhotates[0][2],@D[2]
305 	xor	$A[4][1](%rdi),@D[1]
306 	rol	\$$rhotates[1][3],@D[3]
307 	xor	$A[2][4](%rdi),@D[4]
308 	rol	\$$rhotates[4][1],@D[1]
309 	xor	$A[3][0](%rdi),@D[0]
311 	rol	\$$rhotates[2][4],@D[4]
312 	rol	\$$rhotates[3][0],@D[0]
# Chi for row 4.  Note the stores go back through rdi (the input bank),
# not rsi, unlike rows 0..3 above.
319 	xor	@C[4],@C[0]		# C[4] ^ ( C[0] & C[1])
320 	mov	@C[0],$A[4][4](%rdi)	# R[4][4] = C[4] ^ ( C[0] & C[1])
324 	xor	@T[0],@C[2]		# C[0] ^ ( C[2] & ~C[1])
325 	mov	@C[2],$A[4][0](%rdi)	# R[4][0] = C[0] ^ ( C[2] & ~C[1])
328 	xor	@C[3],@T[0]		# C[3] ^ ( C[0] | C[4])
329 	mov	@T[0],$A[4][3](%rdi)	# R[4][3] = C[3] ^ ( C[0] | C[4])
332 	xor	@T[1],@C[4]		# C[2] ^ ( C[4] & C[3])
333 	mov	@C[4],$A[4][2](%rdi)	# R[4][2] = C[2] ^ ( C[4] & C[3])
336 	xor	@C[1],@C[3]		# ~C[1] ^ ( C[2] | C[3])
337 	mov	@C[3],$A[4][1](%rdi)	# R[4][1] = ~C[1] ^ ( C[2] | C[3])
339 	mov	@C[0],@C[1]		# harmonize with the loop top
# The iotas pointer advances one constant per round; 24 rounds * 8 bytes
# = 192, rewound here so the next invocation starts at round 1 again.
345 	lea	-192($iotas),$iotas	# rewind iotas
347 .size	__KeccakF1600,.-__KeccakF1600
# Public wrapper around __KeccakF1600 (body partially elided in this
# listing): sets up the biased state pointer, a stack scratch area and
# the iotas pointer, then restores everything for the caller.
349 .type	KeccakF1600,\@abi-omnipotent
# +100 bias matches the -100 displacements baked into the A[][] offset
# table, keeping every memory operand in signed-byte range.
366 	lea	100(%rdi),%rdi		# size optimization
# 200 bytes of stack — presumably the 25-lane (25*8) T[][] scratch bank
# the header comment describes; confirm against the elided allocation.
368 .cfi_adjust_cfa_offset	200
377 	lea	iotas(%rip),$iotas
378 	lea	100(%rsp),%rsi		# size optimization
# Undo the +100 bias so the caller's state pointer is returned intact.
388 	lea	-100(%rdi),%rdi		# preserve A[][]
391 .cfi_adjust_cfa_offset	-200
407 .size	KeccakF1600,.-KeccakF1600
# SHA3_absorb: four-argument function (A_flat, inp, len, bsz), System V
# order rdi/rsi/rdx/rcx.  Returns len in rax — per the SHA3_absorb C
# contract this should be the count of bytes left unabsorbed (< bsz);
# confirm against sha/keccak1600.c.
410 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# rdi/rsi are repurposed below (biased state / biased stack scratch), so
# A_flat and inp are re-bound to r8/r9 for the rest of the routine.
411 ($A_flat,$inp) = ("%r8","%r9");
414 .type	SHA3_absorb,\@function,4
431 	lea	100(%rdi),%rdi		# size optimization
# 232-byte frame: 200 bytes of scratch plus spill slots for inp/len/bsz
# at (unbiased) offsets 200/208/216, written and re-read below.
433 .cfi_adjust_cfa_offset	232
436 	lea	100(%rsp),%rsi		# size optimization
444 	lea	iotas(%rip),$iotas
446 	mov	$bsz,216-100(%rsi)	# save bsz
453 	lea	-100(%rdi),$A_flat
459 	lea	8($A_flat),$A_flat
465 	mov	$inp,200-100(%rsi)	# save inp
466 	mov	$len,208-100(%rsi)	# save len
# Re-load the saved arguments after the permutation clobbered registers.
468 	mov	200-100(%rsi),$inp	# pull inp
469 	mov	208-100(%rsi),$len	# pull len
470 	mov	216-100(%rsi),$bsz	# pull bsz
475 	mov	$len,%rax		# return value
485 .cfi_adjust_cfa_offset	-232
501 .size	SHA3_absorb,.-SHA3_absorb
# SHA3_squeeze: four-argument function (A_flat, out, len, bsz), System V
# order rdi/rsi/rdx/rcx.  Emits len output bytes a block at a time.
504 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
# out/len/bsz are moved to callee-saved r12-r14 so they survive the
# KeccakF1600 calls made between output blocks.
505 ($out,$len,$bsz) = ("%r12","%r13","%r14");
509 .type	SHA3_squeeze,\@function,4
536 	sub	\$8,$len		# len -= 8
# Raw encoding of "rep movsb": byte-wise copy of the remaining output.
551 	.byte	0xf3,0xa4		# rep movsb
562 .size	SHA3_squeeze,.-SHA3_squeeze
# Zero padding placed ahead of the round-constant table.
567 	.quad	0,0,0,0,0,0,0,0
# The 24 Keccak-f[1600] iota round constants, in round order (these match
# the table in FIPS 202); __KeccakF1600 walks this table one quad per
# round and rewinds by 192 bytes at the end.
570 	.quad	0x0000000000000001
571 	.quad	0x0000000000008082
572 	.quad	0x800000000000808a
573 	.quad	0x8000000080008000
574 	.quad	0x000000000000808b
575 	.quad	0x0000000080000001
576 	.quad	0x8000000080008081
577 	.quad	0x8000000000008009
578 	.quad	0x000000000000008a
579 	.quad	0x0000000000000088
580 	.quad	0x0000000080008009
581 	.quad	0x000000008000000a
582 	.quad	0x000000008000808b
583 	.quad	0x800000000000008b
584 	.quad	0x8000000000008089
585 	.quad	0x8000000000008003
586 	.quad	0x8000000000008002
587 	.quad	0x8000000000000080
588 	.quad	0x000000000000800a
589 	.quad	0x800000008000000a
590 	.quad	0x8000000080008081
591 	.quad	0x8000000000008080
592 	.quad	0x0000000080000001
593 	.quad	0x8000000080008008
595 .asciz	"Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the generated code line by line before it reaches the
# translator.  The substitutions below are deliberately disabled: they
# trade rotates for shld/rorx, which helps some microarchitectures and
# hurts others (see the measurements in the comments).
598 foreach (split("\n",$code)) {
599 	# Below replacement results in 11.2 on Sandy Bridge, 9.4 on
600 	# Haswell, but it hurts other processors by up to 2-3-4x...
601 	#s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
602 	# Below replacement results in 9.3 on Haswell [as well as
603 	# on Ryzen, i.e. it *hurts* Ryzen]...
604 	#s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;