crypto/sha/asm/keccak1600-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for x86_64.
  17 #
  18 # June 2017.
  19 #
  20 # Below code is [lane complementing] KECCAK_2X implementation (see
  21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22 # instead of actually unrolling the loop pair-wise I simply flip
  23 # pointers to T[][] and A[][] at the end of round. Since number of
  24 # rounds is even, last round writes to A[][] and everything works out.
  25 # How does it compare to x86_64 assembly module in Keccak Code Package?
  26 # Depending on processor it's either as fast or faster by up to 15%...
  27 #
  28 ########################################################################
  29 # Numbers are cycles per processed byte out of large message.
  30 #
  31 #                       r=1088(*)
  32 #
  33 # P4                    25.8
  34 # Core 2                12.9
  35 # Westmere              13.7
  36 # Sandy Bridge          12.9(**)
  37 # Haswell               9.6
  38 # Skylake               9.4
  39 # Silvermont            22.8
  40 # Goldmont              15.8
  41 # VIA Nano              17.3
  42 # Sledgehammer          13.3
  43 # Bulldozer             16.5
  44 # Ryzen                 8.8
  45 #
  46 # (*)   Corresponds to SHA3-256. Improvement over compiler-generated
  47 #       code varies a lot, most common coefficient is 15% in comparison
  48 #       to gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49 # (**)  Sandy Bridge has broken rotate instruction. Performance can be
  50 #       improved by 14% by replacing rotates with double-precision
  51 #       shift with same register as source and destination.
  52
  53 $flavour = shift;               # first argument: perlasm flavour, or the output file if it contains a dot
  54 $output  = shift;               # second argument: output file
  55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  56
     # Win64 output is selected by a [nm]asm/mingw64 flavour or by a .asm output name.
  57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  58
     # Look for the x86_64-xlate.pl translator next to this script first, then
     # in ../../perlasm relative to it.
  59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  62 die "can't locate x86_64-xlate.pl";
  63
     # Everything printed to STDOUT is piped through the translator.  Fail
     # loudly if the pipe cannot be started rather than silently emitting nothing.
  64 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
  65 *STDOUT=*OUT;
  66
  67 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  68               8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  69
  70 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  71 my @D = map("%r$_",(8..12));
  72 my @T = map("%r$_",(13..14));
  73 my $iotas = "%r15";
  74
  75 my @rhotates = ([  0,  1, 62, 28, 27 ],
  76                 [ 36, 44,  6, 55, 20 ],
  77                 [  3, 10, 43, 25, 39 ],
  78                 [ 41, 45, 15, 21,  8 ],
  79                 [ 18,  2, 61, 56, 14 ]);
  80
     # __KeccakF1600 -- internal permutation routine; its body is emitted by this
     # heredoc and the two that follow.  The callers in this file pass %rdi and
     # %rsi each pointing 100 bytes into a 25-lane buffer (the @A displacements
     # are biased by -100) and $iotas pointing at the round-constant table.
     #
     # This first part is the Theta step.  Before the loop, @C is preloaded with
     # row 4 of the input.  Inside the loop @D[0..3] temporarily hold the diagonal
     # lanes A[0][0], A[1][1], A[2][2], A[3][3] and @D[4] a copy of A[4][4], while
     # the remaining lanes are xor-ed into @C to form the five column parities.
     # The rol/xor pairs then turn @C in place into the Theta D[] values, as
     # annotated on each line (@T[0] keeps the unrotated C[2] for the last one).
  81 $code.=<<___;
  82 .text
  83
  84 .type   __KeccakF1600,\@function
  85 .align  32
  86 __KeccakF1600:
  87         mov     $A[4][0](%rdi),@C[0]
  88         mov     $A[4][1](%rdi),@C[1]
  89         mov     $A[4][2](%rdi),@C[2]
  90         mov     $A[4][3](%rdi),@C[3]
  91         mov     $A[4][4](%rdi),@C[4]
  92         jmp     .Loop
  93
  94 .align  32
  95 .Loop:
  96         mov     $A[0][0](%rdi),@D[0]
  97         mov     $A[1][1](%rdi),@D[1]
  98         mov     $A[2][2](%rdi),@D[2]
  99         mov     $A[3][3](%rdi),@D[3]
 100
 101         xor     $A[0][2](%rdi),@C[2]
 102         xor     $A[0][3](%rdi),@C[3]
 103         xor     @D[0],         @C[0]
 104         xor     $A[0][1](%rdi),@C[1]
 105          xor    $A[1][2](%rdi),@C[2]
 106          xor    $A[1][0](%rdi),@C[0]
 107         mov     @C[4],@D[4]
 108         xor     $A[0][4](%rdi),@C[4]
 109
 110         xor     @D[2],         @C[2]
 111         xor     $A[2][0](%rdi),@C[0]
 112          xor    $A[1][3](%rdi),@C[3]
 113          xor    @D[1],         @C[1]
 114          xor    $A[1][4](%rdi),@C[4]
 115
 116         xor     $A[3][2](%rdi),@C[2]
 117         xor     $A[3][0](%rdi),@C[0]
 118          xor    $A[2][3](%rdi),@C[3]
 119          xor    $A[2][1](%rdi),@C[1]
 120          xor    $A[2][4](%rdi),@C[4]
 121
 122         mov     @C[2],@T[0]
 123         rol     \$1,@C[2]
 124         xor     @C[0],@C[2]             # D[1] = ROL64(C[2], 1) ^ C[0]
 125          xor    @D[3],         @C[3]
 126
 127         rol     \$1,@C[0]
 128         xor     @C[3],@C[0]             # D[4] = ROL64(C[0], 1) ^ C[3]
 129          xor    $A[3][1](%rdi),@C[1]
 130
 131         rol     \$1,@C[3]
 132         xor     @C[1],@C[3]             # D[2] = ROL64(C[3], 1) ^ C[1]
 133          xor    $A[3][4](%rdi),@C[4]
 134
 135         rol     \$1,@C[1]
 136         xor     @C[4],@C[1]             # D[0] = ROL64(C[1], 1) ^ C[4]
 137
 138         rol     \$1,@C[4]
 139         xor     @T[0],@C[4]             # D[3] = ROL64(C[4], 1) ^ C[2]
 140 ___
 141         (@D[0..4], @C) = (@C[1..4,0], @D);
     # The assignment above only renames registers on the Perl side (no
     # instruction is emitted): the registers that now hold the Theta values
     # D[0..4] become @D, and the registers holding the diagonal lanes
     # A[0][0], A[1][1], A[2][2], A[3][3], A[4][4] become @C.
     #
     # The heredoc below emits output rows 0..3.  For each row, five input lanes
     # are xor-ed with the matching D[] value, rotated by the @rhotates entry of
     # the lane they came from, combined as annotated on each line and stored
     # through %rsi.  Row 0 additionally xors in the round constant at ($iotas)
     # and advances $iotas by 8.  Loads for the next row are interleaved with
     # the stores of the current one.  The closing lines start row 4 by
     # accumulating directly in @D, and exchange %rsi with %rdi so that the
     # buffer just written becomes the input of the next round.
 142 $code.=<<___;
 143         xor     @D[1],@C[1]
 144         xor     @D[2],@C[2]
 145         rol     \$$rhotates[1][1],@C[1]
 146         xor     @D[3],@C[3]
 147         xor     @D[4],@C[4]
 148         rol     \$$rhotates[2][2],@C[2]
 149         xor     @D[0],@C[0]
 150          mov    @C[1],@T[0]
 151         rol     \$$rhotates[3][3],@C[3]
 152          or     @C[2],@C[1]
 153          xor    @C[0],@C[1]             #           C[0] ^ ( C[1] | C[2])
 154         rol     \$$rhotates[4][4],@C[4]
 155
 156          xor    ($iotas),@C[1]
 157          lea    8($iotas),$iotas
 158
 159         mov     @C[4],@T[1]
 160         and     @C[3],@C[4]
 161          mov    @C[1],$A[0][0](%rsi)    # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
 162         xor     @C[2],@C[4]             #           C[2] ^ ( C[4] & C[3])
 163         not     @C[2]
 164         mov     @C[4],$A[0][2](%rsi)    # R[0][2] = C[2] ^ ( C[4] & C[3])
 165
 166         or      @C[3],@C[2]
 167           mov   $A[4][2](%rdi),@C[4]
 168         xor     @T[0],@C[2]             #           C[1] ^ (~C[2] | C[3])
 169         mov     @C[2],$A[0][1](%rsi)    # R[0][1] = C[1] ^ (~C[2] | C[3])
 170
 171         and     @C[0],@T[0]
 172           mov   $A[1][4](%rdi),@C[1]
 173         xor     @T[1],@T[0]             #           C[4] ^ ( C[1] & C[0])
 174           mov   $A[2][0](%rdi),@C[2]
 175         mov     @T[0],$A[0][4](%rsi)    # R[0][4] = C[4] ^ ( C[1] & C[0])
 176
 177         or      @C[0],@T[1]
 178           mov   $A[0][3](%rdi),@C[0]
 179         xor     @C[3],@T[1]             #           C[3] ^ ( C[4] | C[0])
 180           mov   $A[3][1](%rdi),@C[3]
 181         mov     @T[1],$A[0][3](%rsi)    # R[0][3] = C[3] ^ ( C[4] | C[0])
 182
 183
 184         xor     @D[3],@C[0]
 185         xor     @D[2],@C[4]
 186         rol     \$$rhotates[0][3],@C[0]
 187         xor     @D[1],@C[3]
 188         xor     @D[4],@C[1]
 189         rol     \$$rhotates[4][2],@C[4]
 190         rol     \$$rhotates[3][1],@C[3]
 191         xor     @D[0],@C[2]
 192         rol     \$$rhotates[1][4],@C[1]
 193          mov    @C[0],@T[0]
 194          or     @C[4],@C[0]
 195         rol     \$$rhotates[2][0],@C[2]
 196
 197         xor     @C[3],@C[0]             #           C[3] ^ (C[0] |  C[4])
 198         mov     @C[0],$A[1][3](%rsi)    # R[1][3] = C[3] ^ (C[0] |  C[4])
 199
 200         mov     @C[1],@T[1]
 201         and     @T[0],@C[1]
 202           mov   $A[0][1](%rdi),@C[0]
 203         xor     @C[4],@C[1]             #           C[4] ^ (C[1] &  C[0])
 204         not     @C[4]
 205         mov     @C[1],$A[1][4](%rsi)    # R[1][4] = C[4] ^ (C[1] &  C[0])
 206
 207         or      @C[3],@C[4]
 208           mov   $A[1][2](%rdi),@C[1]
 209         xor     @C[2],@C[4]             #           C[2] ^ (~C[4] | C[3])
 210         mov     @C[4],$A[1][2](%rsi)    # R[1][2] = C[2] ^ (~C[4] | C[3])
 211
 212         and     @C[2],@C[3]
 213           mov   $A[4][0](%rdi),@C[4]
 214         xor     @T[1],@C[3]             #           C[1] ^ (C[3] &  C[2])
 215         mov     @C[3],$A[1][1](%rsi)    # R[1][1] = C[1] ^ (C[3] &  C[2])
 216
 217         or      @C[2],@T[1]
 218           mov   $A[2][3](%rdi),@C[2]
 219         xor     @T[0],@T[1]             #           C[0] ^ (C[1] |  C[2])
 220           mov   $A[3][4](%rdi),@C[3]
 221         mov     @T[1],$A[1][0](%rsi)    # R[1][0] = C[0] ^ (C[1] |  C[2])
 222
 223
 224         xor     @D[3],@C[2]
 225         xor     @D[4],@C[3]
 226         rol     \$$rhotates[2][3],@C[2]
 227         xor     @D[2],@C[1]
 228         rol     \$$rhotates[3][4],@C[3]
 229         xor     @D[0],@C[4]
 230         rol     \$$rhotates[1][2],@C[1]
 231         xor     @D[1],@C[0]
 232         rol     \$$rhotates[4][0],@C[4]
 233          mov    @C[2],@T[0]
 234          and    @C[3],@C[2]
 235         rol     \$$rhotates[0][1],@C[0]
 236
 237         not     @C[3]
 238         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] & C[3])
 239         mov     @C[2],$A[2][1](%rsi)    # R[2][1] =  C[1] ^ ( C[2] & C[3])
 240
 241         mov     @C[4],@T[1]
 242         and     @C[3],@C[4]
 243           mov   $A[2][1](%rdi),@C[2]
 244         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] & ~C[3])
 245         mov     @C[4],$A[2][2](%rsi)    # R[2][2] =  C[2] ^ ( C[4] & ~C[3])
 246
 247         or      @C[1],@T[0]
 248           mov   $A[4][3](%rdi),@C[4]
 249         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] | C[1])
 250         mov     @T[0],$A[2][0](%rsi)    # R[2][0] =  C[0] ^ ( C[2] | C[1])
 251
 252         and     @C[0],@C[1]
 253         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] & C[0])
 254         mov     @C[1],$A[2][4](%rsi)    # R[2][4] =  C[4] ^ ( C[1] & C[0])
 255
 256         or      @C[0],@T[1]
 257           mov   $A[1][0](%rdi),@C[1]
 258         xor     @C[3],@T[1]             #           ~C[3] ^ ( C[0] | C[4])
 259           mov   $A[3][2](%rdi),@C[3]
 260         mov     @T[1],$A[2][3](%rsi)    # R[2][3] = ~C[3] ^ ( C[0] | C[4])
 261
 262
 263         mov     $A[0][4](%rdi),@C[0]
 264
 265         xor     @D[1],@C[2]
 266         xor     @D[2],@C[3]
 267         rol     \$$rhotates[2][1],@C[2]
 268         xor     @D[0],@C[1]
 269         rol     \$$rhotates[3][2],@C[3]
 270         xor     @D[3],@C[4]
 271         rol     \$$rhotates[1][0],@C[1]
 272         xor     @D[4],@C[0]
 273         rol     \$$rhotates[4][3],@C[4]
 274          mov    @C[2],@T[0]
 275          or     @C[3],@C[2]
 276         rol     \$$rhotates[0][4],@C[0]
 277
 278         not     @C[3]
 279         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] | C[3])
 280         mov     @C[2],$A[3][1](%rsi)    # R[3][1] =  C[1] ^ ( C[2] | C[3])
 281
 282         mov     @C[4],@T[1]
 283         or      @C[3],@C[4]
 284         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] | ~C[3])
 285         mov     @C[4],$A[3][2](%rsi)    # R[3][2] =  C[2] ^ ( C[4] | ~C[3])
 286
 287         and     @C[1],@T[0]
 288         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] & C[1])
 289         mov     @T[0],$A[3][0](%rsi)    # R[3][0] =  C[0] ^ ( C[2] & C[1])
 290
 291         or      @C[0],@C[1]
 292         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] | C[0])
 293         mov     @C[1],$A[3][4](%rsi)    # R[3][4] =  C[4] ^ ( C[1] | C[0])
 294
 295         and     @T[1],@C[0]
 296         xor     @C[3],@C[0]             #           ~C[3] ^ ( C[0] & C[4])
 297         mov     @C[0],$A[3][3](%rsi)    # R[3][3] = ~C[3] ^ ( C[0] & C[4])
 298
 299
 300         xor     $A[0][2](%rdi),@D[2]
 301         xor     $A[1][3](%rdi),@D[3]
 302         rol     \$$rhotates[0][2],@D[2]
 303         xor     $A[4][1](%rdi),@D[1]
 304         rol     \$$rhotates[1][3],@D[3]
 305         xor     $A[2][4](%rdi),@D[4]
 306         rol     \$$rhotates[4][1],@D[1]
 307         xor     $A[3][0](%rdi),@D[0]
 308         xchg    %rsi,%rdi
 309         rol     \$$rhotates[2][4],@D[4]
 310         rol     \$$rhotates[3][0],@D[0]
 311 ___
 312         @C = @D[2..4,0,1];
     # Perl-side renaming for the last row: row 4 was accumulated in the @D
     # registers, so they are aliased as @C in the order used by the row-4
     # formulas annotated below.
     #
     # The heredoc below finishes __KeccakF1600 and emits the public KeccakF1600:
     #  - row 4 is stored through %rdi, which after the preceding xchg is the
     #    buffer written during this round;
     #  - the two "harmonize" moves put R[4][4] and R[4][3] into the registers
     #    that the loop top reads as C[4] and C[3] (the other three row-4
     #    results are already in place);
     #  - the loop ends once the low 8 bits of the $iotas pointer are zero, i.e.
     #    at the end of the constant table, and $iotas is then rewound by 192
     #    bytes (24 constants of 8 bytes each);
     #  - KeccakF1600 pushes %rbx, %rbp and %r12-%r15, biases %rdi by 100,
     #    reserves 200 bytes of stack as the second buffer (%rsi), complements
     #    six lanes before and after the permutation (the lane-complementing
     #    representation mentioned in the file header) and finally restores %rdi.
 313 $code.=<<___;
 314         mov     @C[0],@T[0]
 315         and     @C[1],@C[0]
 316         not     @C[1]
 317         xor     @C[4],@C[0]             #            C[4] ^ ( C[0] & C[1])
 318         mov     @C[0],$A[4][4](%rdi)    # R[4][4] =  C[4] ^ ( C[0] & C[1])
 319
 320         mov     @C[2],@T[1]
 321         and     @C[1],@C[2]
 322         xor     @T[0],@C[2]             #            C[0] ^ ( C[2] & ~C[1])
 323         mov     @C[2],$A[4][0](%rdi)    # R[4][0] =  C[0] ^ ( C[2] & ~C[1])
 324
 325         or      @C[4],@T[0]
 326         xor     @C[3],@T[0]             #            C[3] ^ ( C[0] | C[4])
 327         mov     @T[0],$A[4][3](%rdi)    # R[4][3] =  C[3] ^ ( C[0] | C[4])
 328
 329         and     @C[3],@C[4]
 330         xor     @T[1],@C[4]             #            C[2] ^ ( C[4] & C[3])
 331         mov     @C[4],$A[4][2](%rdi)    # R[4][2] =  C[2] ^ ( C[4] & C[3])
 332
 333         or      @T[1],@C[3]
 334         xor     @C[1],@C[3]             #           ~C[1] ^ ( C[2] | C[3])
 335         mov     @C[3],$A[4][1](%rdi)    # R[4][1] = ~C[1] ^ ( C[2] | C[3])
 336
 337         mov     @C[0],@C[1]             # harmonize with the loop top
 338         mov     @T[0],@C[0]
 339
 340         test    \$255,$iotas
 341         jnz     .Loop
 342
 343         lea     -192($iotas),$iotas     # rewind iotas
 344         ret
 345 .size   __KeccakF1600,.-__KeccakF1600
 346
 347 .globl  KeccakF1600
 348 .type   KeccakF1600,\@function
 349 .align  32
 350 KeccakF1600:
 351 .cfi_startproc
 352         push    %rbx
 353 .cfi_push       %rbx
 354         push    %rbp
 355 .cfi_push       %rbp
 356         push    %r12
 357 .cfi_push       %r12
 358         push    %r13
 359 .cfi_push       %r13
 360         push    %r14
 361 .cfi_push       %r14
 362         push    %r15
 363 .cfi_push       %r15
 364
 365         lea     100(%rdi),%rdi          # size optimization
 366         sub     \$200,%rsp
 367 .cfi_adjust_cfa_offset  200
 368
 369         notq    $A[0][1](%rdi)
 370         notq    $A[0][2](%rdi)
 371         notq    $A[1][3](%rdi)
 372         notq    $A[2][2](%rdi)
 373         notq    $A[3][2](%rdi)
 374         notq    $A[4][0](%rdi)
 375
 376         lea     iotas(%rip),$iotas
 377         lea     100(%rsp),%rsi          # size optimization
 378
 379         call    __KeccakF1600
 380
 381         notq    $A[0][1](%rdi)
 382         notq    $A[0][2](%rdi)
 383         notq    $A[1][3](%rdi)
 384         notq    $A[2][2](%rdi)
 385         notq    $A[3][2](%rdi)
 386         notq    $A[4][0](%rdi)
 387         lea     -100(%rdi),%rdi         # preserve A[][]
 388
 389         add     \$200,%rsp
 390 .cfi_adjust_cfa_offset  -200
 391
 392         pop     %r15
 393 .cfi_pop        %r15
 394         pop     %r14
 395 .cfi_pop        %r14
 396         pop     %r13
 397 .cfi_pop        %r13
 398         pop     %r12
 399 .cfi_pop        %r12
 400         pop     %rbp
 401 .cfi_pop        %rbp
 402         pop     %rbx
 403 .cfi_pop        %rbx
 404         ret
 405 .cfi_endproc
 406 .size   KeccakF1600,.-KeccakF1600
 407 ___
 408
 409 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");   # registers the code below reads the four arguments from
 410      ($A_flat,$inp) = ("%r8","%r9");                              # working copies; %rdi/%rsi are the buffer pointers for __KeccakF1600
     # SHA3_absorb: while at least bsz bytes of input remain, xor the next bsz
     # bytes (one 8-byte word at a time) into the start of the state and run
     # __KeccakF1600 on it.  The number of bytes left unprocessed is returned
     # in %rax.
     #
     # inp, len and bsz are kept in stack slots 200, 208 and 216 bytes above
     # %rsp across each call, because __KeccakF1600 uses %r9, %rdx and %rcx as
     # part of its register bank.  The first 200 bytes of the 232-byte frame
     # are the second buffer passed in %rsi.  The same six lanes as in
     # KeccakF1600 are complemented on entry and again before returning.
     #
     # NOTE(review): the word loop presumably relies on bsz being a non-zero
     # multiple of 8 -- confirm against the callers.
 411 $code.=<<___;
 412 .globl  SHA3_absorb
 413 .type   SHA3_absorb,\@function
 414 .align  32
 415 SHA3_absorb:
 416 .cfi_startproc
 417         push    %rbx
 418 .cfi_push       %rbx
 419         push    %rbp
 420 .cfi_push       %rbp
 421         push    %r12
 422 .cfi_push       %r12
 423         push    %r13
 424 .cfi_push       %r13
 425         push    %r14
 426 .cfi_push       %r14
 427         push    %r15
 428 .cfi_push       %r15
 429
 430         lea     100(%rdi),%rdi          # size optimization
 431         sub     \$232,%rsp
 432 .cfi_adjust_cfa_offset  232
 433
 434         mov     %rsi,$inp
 435         lea     100(%rsp),%rsi          # size optimization
 436
 437         notq    $A[0][1](%rdi)
 438         notq    $A[0][2](%rdi)
 439         notq    $A[1][3](%rdi)
 440         notq    $A[2][2](%rdi)
 441         notq    $A[3][2](%rdi)
 442         notq    $A[4][0](%rdi)
 443         lea     iotas(%rip),$iotas
 444
 445         mov     $bsz,216-100(%rsi)      # save bsz
 446
 447 .Loop_absorb:
 448         cmp     $bsz,$len
 449         jc      .Ldone_absorb
 450
 451         shr     \$3,$bsz
 452         lea     -100(%rdi),$A_flat
 453
 454 .Lblock_absorb:
 455         mov     ($inp),%rax
 456         lea     8($inp),$inp
 457         xor     ($A_flat),%rax
 458         lea     8($A_flat),$A_flat
 459         sub     \$8,$len
 460         mov     %rax,-8($A_flat)
 461         sub     \$1,$bsz
 462         jnz     .Lblock_absorb
 463
 464         mov     $inp,200-100(%rsi)      # save inp
 465         mov     $len,208-100(%rsi)      # save len
 466         call    __KeccakF1600
 467         mov     200-100(%rsi),$inp      # pull inp
 468         mov     208-100(%rsi),$len      # pull len
 469         mov     216-100(%rsi),$bsz      # pull bsz
 470         jmp     .Loop_absorb
 471
 472 .align  32
 473 .Ldone_absorb:
 474         mov     $len,%rax               # return value
 475
 476         notq    $A[0][1](%rdi)
 477         notq    $A[0][2](%rdi)
 478         notq    $A[1][3](%rdi)
 479         notq    $A[2][2](%rdi)
 480         notq    $A[3][2](%rdi)
 481         notq    $A[4][0](%rdi)
 482
 483         add     \$232,%rsp
 484 .cfi_adjust_cfa_offset  -232
 485
 486         pop     %r15
 487 .cfi_pop        %r15
 488         pop     %r14
 489 .cfi_pop        %r14
 490         pop     %r13
 491 .cfi_pop        %r13
 492         pop     %r12
 493 .cfi_pop        %r12
 494         pop     %rbp
 495 .cfi_pop        %rbp
 496         pop     %rbx
 497 .cfi_pop        %rbx
 498         ret
 499 .cfi_endproc
 500 .size   SHA3_absorb,.-SHA3_absorb
 501 ___
 502 }
 503 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");   # registers the code below reads the four arguments from
 504      ($out,$len,$bsz) = ("%r12","%r13","%r14");                   # working copies that survive the KeccakF1600 calls
 505
     # SHA3_squeeze: copies len bytes of output from the state to out, one
     # 8-byte word at a time.  %r8 walks the state and %rcx counts the words
     # left in the current block of bsz bytes; when a block is exhausted and
     # more output is still wanted, KeccakF1600 is called and both are reset.
     # A final remainder of fewer than 8 bytes is copied with rep movsb.
     #
     # out, len and bsz live in %r12-%r14, which KeccakF1600 saves and restores.
     # Each .cfi_pop below names the register popped by the instruction
     # before it, so that the unwind information matches the actual epilogue.
 506 $code.=<<___;
 507 .globl  SHA3_squeeze
 508 .type   SHA3_squeeze,\@function
 509 .align  32
 510 SHA3_squeeze:
 511 .cfi_startproc
 512         push    %r12
 513 .cfi_push       %r12
 514         push    %r13
 515 .cfi_push       %r13
 516         push    %r14
 517 .cfi_push       %r14
 518
 519         shr     \$3,%rcx
 520         mov     $A_flat,%r8
 521         mov     %rsi,$out
 522         mov     %rdx,$len
 523         mov     %rcx,$bsz
 524         jmp     .Loop_squeeze
 525
 526 .align  32
 527 .Loop_squeeze:
 528         cmp     \$8,$len
 529         jb      .Ltail_squeeze
 530
 531         mov     (%r8),%rax
 532         lea     8(%r8),%r8
 533         mov     %rax,($out)
 534         lea     8($out),$out
 535         sub     \$8,$len                # len -= 8
 536         jz      .Ldone_squeeze
 537
 538         sub     \$1,%rcx                # bsz--
 539         jnz     .Loop_squeeze
 540
 541         call    KeccakF1600
 542         mov     $A_flat,%r8
 543         mov     $bsz,%rcx
 544         jmp     .Loop_squeeze
 545
 546 .Ltail_squeeze:
 547         mov     %r8, %rsi
 548         mov     $out,%rdi
 549         mov     $len,%rcx
 550         .byte   0xf3,0xa4               # rep   movsb
 551
 552 .Ldone_squeeze:
 553         pop     %r14
 554 .cfi_pop        %r14
 555         pop     %r13
 556 .cfi_pop        %r13
 557         pop     %r12
 558 .cfi_pop        %r12
 559         ret
 560 .cfi_endproc
 561 .size   SHA3_squeeze,.-SHA3_squeeze
 562 ___
 563 }
 564 $code.=<<___;     # 24 round constants; .align 256 plus the 64 bytes of zero padding put the end of the table on a 256-byte boundary, which is what the "test 255" loop-exit check in __KeccakF1600 relies on
 565 .align  256
 566         .quad   0,0,0,0,0,0,0,0
 567 .type   iotas,\@object
 568 iotas:
 569         .quad   0x0000000000000001
 570         .quad   0x0000000000008082
 571         .quad   0x800000000000808a
 572         .quad   0x8000000080008000
 573         .quad   0x000000000000808b
 574         .quad   0x0000000080000001
 575         .quad   0x8000000080008081
 576         .quad   0x8000000000008009
 577         .quad   0x000000000000008a
 578         .quad   0x0000000000000088
 579         .quad   0x0000000080008009
 580         .quad   0x000000008000000a
 581         .quad   0x000000008000808b
 582         .quad   0x800000000000008b
 583         .quad   0x8000000000008089
 584         .quad   0x8000000000008003
 585         .quad   0x8000000000008002
 586         .quad   0x8000000000000080
 587         .quad   0x000000000000800a
 588         .quad   0x800000008000000a
 589         .quad   0x8000000080008081
 590         .quad   0x8000000000008080
 591         .quad   0x0000000080000001
 592         .quad   0x8000000080008008
 593 .size   iotas,.-iotas
 594 .asciz  "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 595 ___
 596
 597 foreach (split("\n",$code)) {
     # Emit the accumulated assembly line by line.  The two commented-out
     # substitutions are optional per-line rewrites of the rotate instructions.
 598         # Below replacement results in 11.2 on Sandy Bridge, 9.4 on
 599         # Haswell, but it hurts other processors by up to 2-3-4x...
 600         #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
 601         # Below replacement results in 9.3 on Haswell [as well as
 602         # on Ryzen, i.e. it *hurts* Ryzen]...
 603         #s/rol\s+\$([0-9]+),(%[a-z][a-z0-9]+)/rorx\t\$64-$1,$2,$2/;
 604
 605         print $_, "\n";
 606 }
 607
     # STDOUT is the pipe to the x86_64-xlate.pl translator; closing it waits for
     # the translator and fails if it reported an error or output was lost,
     # so do not let that pass silently.
 608 close STDOUT or die "error closing STDOUT: $!";