crypto/sha/asm/keccak1600-x86_64.pl

   1 #!/usr/bin/env perl
   2 # Copyright 2017 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8 #
   9 # ====================================================================
  10 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  11 # project. The module is, however, dual licensed under OpenSSL and
  12 # CRYPTOGAMS licenses depending on where you obtain it. For further
  13 # details see http://www.openssl.org/~appro/cryptogams/.
  14 # ====================================================================
  15 #
  16 # Keccak-1600 for x86_86.
  17 #
  18 # June 2017.
  19 #
  20 # Below code is [lane complementing] KECCAK_2X implementation (see
  21 # sha/keccak1600.c) with C[5] and D[5] held in register bank. Though
  22 # instead of actually unrolling the loop pair-wise I simply flip
  23 # pointers to T[][] and A[][] at the end of round. Since number of
  24 # rounds is even, last round writes to A[][] and everything works out.
  25 # How does it compare to assembly module in Keccak Code Package? KCP
  26 # is faster on couple of processors, VIA Nano and Goldmont by 4-6%,
  27 # otherwise this module is either as fast or faster by up to 15%...
  28 #
  29 ########################################################################
  30 # Numbers are cycles per processed byte out of large message.
  31 #
  32 #                       r=1088(*)
  33 #
  34 # P4                    25.8
  35 # Core 2                13.0
  36 # Westmere              13.7
  37 # Sandy Bridge          12.9(**)
  38 # Haswell               9.7
  39 # Skylake               9.4
  40 # Silvermont            22.8
  41 # Goldmont              16.4
  42 # VIA Nano              18.0
  43 # Sledgehammer          13.3
  44 # Bulldozer             16.5
  45 #
  46 # (*)   Corresponds to SHA3-256. Improvement over compiler-generate
  47 #       varies a lot, most commont coefficient is 15% in comparison to
  48 #       gcc-5.x, 50% for gcc-4.x, 90% for gcc-3.x.
  49 # (**)  Sandy Bridge has broken rotate instruction. Performance can be
  50 #       improved by 14% by replacing rotates with double-precision
  51 #       shift with same register as source and destination.
  52
  53 $flavour = shift;
  54 $output  = shift;
  55 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  56
  57 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  58
  59 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  60 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  61 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  62 die "can't locate x86_64-xlate.pl";
  63
  64 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  65 *STDOUT=*OUT;
  66
  67 my @A = map([ 8*$_-100, 8*($_+1)-100, 8*($_+2)-100,
  68               8*($_+3)-100, 8*($_+4)-100 ], (0,5,10,15,20));
  69
  70 my @C = ("%rax","%rbx","%rcx","%rdx","%rbp");
  71 my @D = map("%r$_",(8..12));
  72 my @T = map("%r$_",(13..14));
  73 my $iotas = "%r15";
  74
  75 my @rhotates = ([  0,  1, 62, 28, 27 ],
  76                 [ 36, 44,  6, 55, 20 ],
  77                 [  3, 10, 43, 25, 39 ],
  78                 [ 41, 45, 15, 21,  8 ],
  79                 [ 18,  2, 61, 56, 14 ]);
  80
  81 $code.=<<___;
  82 .text
  83
  84 .type   __KeccakF1600,\@function
  85 .align  32
  86 __KeccakF1600:
  87         mov     $A[4][0](%rdi),@C[0]
  88         mov     $A[4][1](%rdi),@C[1]
  89         mov     $A[4][2](%rdi),@C[2]
  90         mov     $A[4][3](%rdi),@C[3]
  91         mov     $A[4][4](%rdi),@C[4]
  92         jmp     .Loop
  93
  94 .align  32
  95 .Loop:
  96         mov     $A[0][0](%rdi),@D[0]
  97         mov     $A[1][1](%rdi),@D[1]
  98         mov     $A[2][2](%rdi),@D[2]
  99         mov     $A[3][3](%rdi),@D[3]
 100
 101         xor     $A[0][2](%rdi),@C[2]
 102         xor     $A[0][3](%rdi),@C[3]
 103         xor     @D[0],         @C[0]
 104         xor     $A[0][1](%rdi),@C[1]
 105          xor    $A[1][2](%rdi),@C[2]
 106          xor    $A[1][0](%rdi),@C[0]
 107         mov     @C[4],@D[4]
 108         xor     $A[0][4](%rdi),@C[4]
 109
 110         xor     @D[2],         @C[2]
 111         xor     $A[2][0](%rdi),@C[0]
 112          xor    $A[1][3](%rdi),@C[3]
 113          xor    @D[1],         @C[1]
 114          xor    $A[1][4](%rdi),@C[4]
 115
 116         xor     $A[3][2](%rdi),@C[2]
 117         xor     $A[3][0](%rdi),@C[0]
 118          xor    $A[2][3](%rdi),@C[3]
 119          xor    $A[2][1](%rdi),@C[1]
 120          xor    $A[2][4](%rdi),@C[4]
 121
 122         mov     @C[2],@T[0]
 123         rol     \$1,@C[2]
 124         xor     @C[0],@C[2]             # D[1] = ROL64(C[2], 1) ^ C[0]
 125          xor    @D[3],         @C[3]
 126
 127         rol     \$1,@C[0]
 128         xor     @C[3],@C[0]             # D[4] = ROL64(C[0], 1) ^ C[3]
 129          xor    $A[3][1](%rdi),@C[1]
 130
 131         rol     \$1,@C[3]
 132         xor     @C[1],@C[3]             # D[2] = ROL64(C[3], 1) ^ C[1]
 133          xor    $A[3][4](%rdi),@C[4]
 134
 135         rol     \$1,@C[1]
 136         xor     @C[4],@C[1]             # D[0] = ROL64(C[1], 1) ^ C[4]
 137
 138         rol     \$1,@C[4]
 139         xor     @T[0],@C[4]             # D[3] = ROL64(C[4], 1) ^ C[2]
 140 ___
 141         my @E = @D;
 142         @D = (@C[1],@C[2],@C[3],@C[4],@C[0]);
 143         @C = @E;
 144 $code.=<<___;
 145         xor     @D[1],@C[1]
 146         xor     @D[2],@C[2]
 147         rol     \$$rhotates[1][1],@C[1]
 148         xor     @D[3],@C[3]
 149         xor     @D[4],@C[4]
 150         rol     \$$rhotates[2][2],@C[2]
 151         xor     @D[0],@C[0]
 152          mov    @C[1],@T[0]
 153         rol     \$$rhotates[3][3],@C[3]
 154          or     @C[2],@C[1]
 155          xor    @C[0],@C[1]             #           C[0] ^ ( C[1] | C[2])
 156         rol     \$$rhotates[4][4],@C[4]
 157
 158          xor    ($iotas),@C[1]
 159          lea    8($iotas),$iotas
 160
 161         mov     @C[4],@T[1]
 162         and     @C[3],@C[4]
 163          mov    @C[1],$A[0][0](%rsi)    # R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i]
 164         xor     @C[2],@C[4]             #           C[2] ^ ( C[4] & C[3])
 165         not     @C[2]
 166         mov     @C[4],$A[0][2](%rsi)    # R[0][2] = C[2] ^ ( C[4] & C[3])
 167
 168         or      @C[3],@C[2]
 169         xor     @T[0],@C[2]             #           C[1] ^ (~C[2] | C[3])
 170         mov     @C[2],$A[0][1](%rsi)    # R[0][1] = C[1] ^ (~C[2] | C[3])
 171
 172         and     @C[0],@T[0]
 173         xor     @T[1],@T[0]             #           C[4] ^ ( C[1] & C[0])
 174         mov     @T[0],$A[0][4](%rsi)    # R[0][4] = C[4] ^ ( C[1] & C[0])
 175
 176         or      @C[0],@T[1]
 177         xor     @C[3],@T[1]             #           C[3] ^ ( C[4] | C[0])
 178         mov     @T[1],$A[0][3](%rsi)    # R[0][3] = C[3] ^ ( C[4] | C[0])
 179
 180
 181         mov     $A[0][3](%rdi),@C[0]
 182         mov     $A[4][2](%rdi),@C[4]
 183         mov     $A[3][1](%rdi),@C[3]
 184         mov     $A[1][4](%rdi),@C[1]
 185         mov     $A[2][0](%rdi),@C[2]
 186
 187         xor     @D[3],@C[0]
 188         xor     @D[2],@C[4]
 189         rol     \$$rhotates[0][3],@C[0]
 190         xor     @D[1],@C[3]
 191         xor     @D[4],@C[1]
 192         rol     \$$rhotates[4][2],@C[4]
 193         rol     \$$rhotates[3][1],@C[3]
 194         xor     @D[0],@C[2]
 195         rol     \$$rhotates[1][4],@C[1]
 196          mov    @C[0],@T[0]
 197          or     @C[4],@C[0]
 198         rol     \$$rhotates[2][0],@C[2]
 199
 200         xor     @C[3],@C[0]             #           C[3] ^ (C[0] |  C[4])
 201         mov     @C[0],$A[1][3](%rsi)    # R[1][3] = C[3] ^ (C[0] |  C[4])
 202
 203         mov     @C[1],@T[1]
 204         and     @T[0],@C[1]
 205         xor     @C[4],@C[1]             #           C[4] ^ (C[1] &  C[0])
 206         not     @C[4]
 207         mov     @C[1],$A[1][4](%rsi)    # R[1][4] = C[4] ^ (C[1] &  C[0])
 208
 209         or      @C[3],@C[4]
 210         xor     @C[2],@C[4]             #           C[2] ^ (~C[4] | C[3])
 211         mov     @C[4],$A[1][2](%rsi)    # R[1][2] = C[2] ^ (~C[4] | C[3])
 212
 213         and     @C[2],@C[3]
 214         xor     @T[1],@C[3]             #           C[1] ^ (C[3] &  C[2])
 215         mov     @C[3],$A[1][1](%rsi)    # R[1][1] = C[1] ^ (C[3] &  C[2])
 216
 217         or      @C[2],@T[1]
 218         xor     @T[0],@T[1]             #           C[0] ^ (C[1] |  C[2])
 219         mov     @T[1],$A[1][0](%rsi)    # R[1][0] = C[0] ^ (C[1] |  C[2])
 220
 221
 222         mov     $A[2][3](%rdi),@C[2]
 223         mov     $A[3][4](%rdi),@C[3]
 224         mov     $A[1][2](%rdi),@C[1]
 225         mov     $A[4][0](%rdi),@C[4]
 226         mov     $A[0][1](%rdi),@C[0]
 227
 228         xor     @D[3],@C[2]
 229         xor     @D[4],@C[3]
 230         rol     \$$rhotates[2][3],@C[2]
 231         xor     @D[2],@C[1]
 232         rol     \$$rhotates[3][4],@C[3]
 233         xor     @D[0],@C[4]
 234         rol     \$$rhotates[1][2],@C[1]
 235         xor     @D[1],@C[0]
 236         rol     \$$rhotates[4][0],@C[4]
 237          mov    @C[2],@T[0]
 238          and    @C[3],@C[2]
 239         rol     \$$rhotates[0][1],@C[0]
 240
 241         not     @C[3]
 242         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] & C[3])
 243         mov     @C[2],$A[2][1](%rsi)    # R[2][1] =  C[1] ^ ( C[2] & C[3])
 244
 245         mov     @C[4],@T[1]
 246         and     @C[3],@C[4]
 247         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] & ~C[3])
 248         mov     @C[4],$A[2][2](%rsi)    # R[2][2] =  C[2] ^ ( C[4] & ~C[3])
 249
 250         or      @C[1],@T[0]
 251         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] | C[1])
 252         mov     @T[0],$A[2][0](%rsi)    # R[2][0] =  C[0] ^ ( C[2] | C[1])
 253
 254         and     @C[0],@C[1]
 255         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] & C[0])
 256         mov     @C[1],$A[2][4](%rsi)    # R[2][4] =  C[4] ^ ( C[1] & C[0])
 257
 258         or      @T[1],@C[0]
 259         xor     @C[3],@C[0]             #           ~C[3] ^ ( C[0] | C[4])
 260         mov     @C[0],$A[2][3](%rsi)    # R[2][3] = ~C[3] ^ ( C[0] | C[4])
 261
 262
 263         mov     $A[2][1](%rdi),@C[2]
 264         mov     $A[3][2](%rdi),@C[3]
 265         mov     $A[1][0](%rdi),@C[1]
 266         mov     $A[4][3](%rdi),@C[4]
 267         mov     $A[0][4](%rdi),@C[0]
 268
 269         xor     @D[1],@C[2]
 270         xor     @D[2],@C[3]
 271         rol     \$$rhotates[2][1],@C[2]
 272         xor     @D[0],@C[1]
 273         rol     \$$rhotates[3][2],@C[3]
 274         xor     @D[3],@C[4]
 275         rol     \$$rhotates[1][0],@C[1]
 276         xor     @D[4],@C[0]
 277         rol     \$$rhotates[4][3],@C[4]
 278          mov    @C[2],@T[0]
 279          or     @C[3],@C[2]
 280         rol     \$$rhotates[0][4],@C[0]
 281
 282         not     @C[3]
 283         xor     @C[1],@C[2]             #            C[1] ^ ( C[2] | C[3])
 284         mov     @C[2],$A[3][1](%rsi)    # R[3][1] =  C[1] ^ ( C[2] | C[3])
 285
 286         mov     @C[4],@T[1]
 287         or      @C[3],@C[4]
 288         xor     @T[0],@C[4]             #            C[2] ^ ( C[4] | ~C[3])
 289         mov     @C[4],$A[3][2](%rsi)    # R[3][2] =  C[2] ^ ( C[4] | ~C[3])
 290
 291         and     @C[1],@T[0]
 292         xor     @C[0],@T[0]             #            C[0] ^ ( C[2] & C[1])
 293         mov     @T[0],$A[3][0](%rsi)    # R[3][0] =  C[0] ^ ( C[2] & C[1])
 294
 295         or      @C[0],@C[1]
 296         xor     @T[1],@C[1]             #            C[4] ^ ( C[1] | C[0])
 297         mov     @C[1],$A[3][4](%rsi)    # R[3][4] =  C[4] ^ ( C[1] | C[0])
 298
 299         and     @T[1],@C[0]
 300         xor     @C[3],@C[0]             #           ~C[3] ^ ( C[0] & C[4])
 301         mov     @C[0],$A[3][3](%rsi)    # R[3][3] = ~C[3] ^ ( C[0] & C[4])
 302
 303
 304         xor     $A[0][2](%rdi),@D[2]
 305         xor     $A[1][3](%rdi),@D[3]
 306         rol     \$$rhotates[0][2],@D[2]
 307         xor     $A[4][1](%rdi),@D[1]
 308         rol     \$$rhotates[1][3],@D[3]
 309         xor     $A[2][4](%rdi),@D[4]
 310         rol     \$$rhotates[4][1],@D[1]
 311         xor     $A[3][0](%rdi),@D[0]
 312         xchg    %rsi,%rdi
 313         rol     \$$rhotates[2][4],@D[4]
 314         rol     \$$rhotates[3][0],@D[0]
 315 ___
 316         @C = (@D[2],@D[3],@D[4],@D[0],@D[1]);
 317 $code.=<<___;
 318         mov     @C[0],@T[0]
 319         and     @C[1],@C[0]
 320         not     @C[1]
 321         xor     @C[4],@C[0]             #            C[4] ^ ( C[0] & C[1])
 322         mov     @C[0],$A[4][4](%rdi)    # R[4][4] =  C[4] ^ ( C[0] & C[1])
 323
 324         mov     @C[2],@T[1]
 325         and     @C[1],@C[2]
 326         xor     @T[0],@C[2]             #            C[0] ^ ( C[2] & ~C[1])
 327         mov     @C[2],$A[4][0](%rdi)    # R[4][0] =  C[0] ^ ( C[2] & ~C[1])
 328
 329         or      @C[4],@T[0]
 330         xor     @C[3],@T[0]             #            C[3] ^ ( C[0] | C[4])
 331         mov     @T[0],$A[4][3](%rdi)    # R[4][3] =  C[3] ^ ( C[0] | C[4])
 332
 333         and     @C[3],@C[4]
 334         xor     @T[1],@C[4]             #            C[2] ^ ( C[4] & C[3])
 335         mov     @C[4],$A[4][2](%rdi)    # R[4][2] =  C[2] ^ ( C[4] & C[3])
 336
 337         or      @T[1],@C[3]
 338         xor     @C[1],@C[3]             #           ~C[1] ^ ( C[2] | C[3])
 339         mov     @C[3],$A[4][1](%rdi)    # R[4][1] = ~C[1] ^ ( C[2] | C[3])
 340
 341         mov     @C[0],@C[1]             # harmonize with the loop top
 342         mov     @T[0],@C[0]
 343
 344         test    \$255,$iotas
 345         jnz     .Loop
 346
 347         lea     -192($iotas),$iotas     # rewind iotas
 348         ret
 349 .size   __KeccakF1600,.-__KeccakF1600
 350
 351 .globl  KeccakF1600
 352 .type   KeccakF1600,\@function
 353 .align  32
 354 KeccakF1600:
 355 .cfi_startproc
 356         push    %rbx
 357 .cfi_push       %rbx
 358         push    %rbp
 359 .cfi_push       %rbp
 360         push    %r12
 361 .cfi_push       %r12
 362         push    %r13
 363 .cfi_push       %r13
 364         push    %r14
 365 .cfi_push       %r14
 366         push    %r15
 367 .cfi_push       %r15
 368
 369         lea     100(%rdi),%rdi          # size optimization
 370         sub     \$200,%rsp
 371 .cfi_adjust_cfa_offset  200
 372
 373         notq    $A[0][1](%rdi)
 374         notq    $A[0][2](%rdi)
 375         notq    $A[1][3](%rdi)
 376         notq    $A[2][2](%rdi)
 377         notq    $A[3][2](%rdi)
 378         notq    $A[4][0](%rdi)
 379
 380         lea     iotas(%rip),$iotas
 381         lea     100(%rsp),%rsi          # size optimization
 382
 383         call    __KeccakF1600
 384
 385         notq    $A[0][1](%rdi)
 386         notq    $A[0][2](%rdi)
 387         notq    $A[1][3](%rdi)
 388         notq    $A[2][2](%rdi)
 389         notq    $A[3][2](%rdi)
 390         notq    $A[4][0](%rdi)
 391         lea     -100(%rdi),%rdi         # preserve A[][]
 392
 393         add     \$200,%rsp
 394 .cfi_adjust_cfa_offset  -200
 395
 396         pop     %r15
 397 .cfi_pop        %r15
 398         pop     %r14
 399 .cfi_pop        %r14
 400         pop     %r13
 401 .cfi_pop        %r13
 402         pop     %r12
 403 .cfi_pop        %r12
 404         pop     %rbp
 405 .cfi_pop        %rbp
 406         pop     %rbx
 407 .cfi_pop        %rbx
 408         ret
 409 .cfi_endproc
 410 .size   KeccakF1600,.-KeccakF1600
 411 ___
 412
 413 { my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
 414      ($A_flat,$inp) = ("%r8","%r9");
 415 $code.=<<___;
 416 .globl  SHA3_absorb
 417 .type   SHA3_absorb,\@function
 418 .align  32
 419 SHA3_absorb:
 420 .cfi_startproc
 421         push    %rbx
 422 .cfi_push       %rbx
 423         push    %rbp
 424 .cfi_push       %rbp
 425         push    %r12
 426 .cfi_push       %r12
 427         push    %r13
 428 .cfi_push       %r13
 429         push    %r14
 430 .cfi_push       %r14
 431         push    %r15
 432 .cfi_push       %r15
 433
 434         lea     100(%rdi),%rdi          # size optimization
 435         sub     \$232,%rsp
 436 .cfi_adjust_cfa_offset  232
 437
 438         mov     %rsi,$inp
 439         lea     100(%rsp),%rsi          # size optimization
 440
 441         notq    $A[0][1](%rdi)
 442         notq    $A[0][2](%rdi)
 443         notq    $A[1][3](%rdi)
 444         notq    $A[2][2](%rdi)
 445         notq    $A[3][2](%rdi)
 446         notq    $A[4][0](%rdi)
 447         lea     iotas(%rip),$iotas
 448
 449         mov     $bsz,216-100(%rsi)      # save bsz
 450
 451 .Loop_absorb:
 452         cmp     $bsz,$len
 453         jc      .Ldone_absorb
 454
 455         shr     \$3,$bsz
 456         lea     -100(%rdi),$A_flat
 457
 458 .Lblock_absorb:
 459         mov     ($inp),%rax
 460         lea     8($inp),$inp
 461         xor     ($A_flat),%rax
 462         lea     8($A_flat),$A_flat
 463         sub     \$8,$len
 464         mov     %rax,-8($A_flat)
 465         sub     \$1,$bsz
 466         jnz     .Lblock_absorb
 467
 468         mov     $inp,200-100(%rsi)      # save inp
 469         mov     $len,208-100(%rsi)      # save len
 470         call    __KeccakF1600
 471         mov     200-100(%rsi),$inp      # pull inp
 472         mov     208-100(%rsi),$len      # pull len
 473         mov     216-100(%rsi),$bsz      # pull bsz
 474         jmp     .Loop_absorb
 475
 476 .align  32
 477 .Ldone_absorb:
 478         mov     $len,%rax               # return value
 479
 480         notq    $A[0][1](%rdi)
 481         notq    $A[0][2](%rdi)
 482         notq    $A[1][3](%rdi)
 483         notq    $A[2][2](%rdi)
 484         notq    $A[3][2](%rdi)
 485         notq    $A[4][0](%rdi)
 486
 487         add     \$232,%rsp
 488 .cfi_adjust_cfa_offset  -232
 489
 490         pop     %r15
 491 .cfi_pop        %r15
 492         pop     %r14
 493 .cfi_pop        %r14
 494         pop     %r13
 495 .cfi_pop        %r13
 496         pop     %r12
 497 .cfi_pop        %r12
 498         pop     %rbp
 499 .cfi_pop        %rbp
 500         pop     %rbx
 501 .cfi_pop        %rbx
 502         ret
 503 .cfi_endproc
 504 .size   SHA3_absorb,.-SHA3_absorb
 505 ___
 506 }
 507 { my ($A_flat,$out,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
 508      ($out,$len,$bsz) = ("%r12","%r13","%r14");
 509
 510 $code.=<<___;
 511 .globl  SHA3_squeeze
 512 .type   SHA3_squeeze,\@function
 513 .align  32
 514 SHA3_squeeze:
 515 .cfi_startproc
 516         push    %r12
 517 .cfi_push       %r12
 518         push    %r13
 519 .cfi_push       %r13
 520         push    %r14
 521 .cfi_push       %r14
 522
 523         shr     \$3,%rcx
 524         mov     $A_flat,%r8
 525         mov     %rsi,$out
 526         mov     %rdx,$len
 527         mov     %rcx,$bsz
 528         jmp     .Loop_squeeze
 529
 530 .align  32
 531 .Loop_squeeze:
 532         cmp     \$8,$len
 533         jb      .Ltail_squeeze
 534
 535         mov     (%r8),%rax
 536         lea     8(%r8),%r8
 537         mov     %rax,($out)
 538         lea     8($out),$out
 539         sub     \$8,$len                # len -= 8
 540         jz      .Ldone_squeeze
 541
 542         sub     \$1,%rcx                # bsz--
 543         jnz     .Loop_squeeze
 544
 545         call    KeccakF1600
 546         mov     $A_flat,%r8
 547         mov     $bsz,%rcx
 548         jmp     .Loop_squeeze
 549
 550 .Ltail_squeeze:
 551         mov     %r8, %rsi
 552         mov     $out,%rdi
 553         mov     $len,%rcx
 554         .byte   0xf3,0xa4               # rep   movsb
 555
 556 .Ldone_squeeze:
 557         pop     %r14
 558 .cfi_pop        %r14
 559         pop     %r13
 560 .cfi_pop        %r13
 561         pop     %r12
 562 .cfi_pop        %r13
 563         ret
 564 .cfi_endproc
 565 .size   SHA3_squeeze,.-SHA3_squeeze
 566 ___
 567 }
 568 $code.=<<___;
 569 .align  256
 570         .quad   0,0,0,0,0,0,0,0
 571 .type   iotas,\@object
 572 iotas:
 573         .quad   0x0000000000000001
 574         .quad   0x0000000000008082
 575         .quad   0x800000000000808a
 576         .quad   0x8000000080008000
 577         .quad   0x000000000000808b
 578         .quad   0x0000000080000001
 579         .quad   0x8000000080008081
 580         .quad   0x8000000000008009
 581         .quad   0x000000000000008a
 582         .quad   0x0000000000000088
 583         .quad   0x0000000080008009
 584         .quad   0x000000008000000a
 585         .quad   0x000000008000808b
 586         .quad   0x800000000000008b
 587         .quad   0x8000000000008089
 588         .quad   0x8000000000008003
 589         .quad   0x8000000000008002
 590         .quad   0x8000000000000080
 591         .quad   0x000000000000800a
 592         .quad   0x800000008000000a
 593         .quad   0x8000000080008081
 594         .quad   0x8000000000008080
 595         .quad   0x0000000080000001
 596         .quad   0x8000000080008008
 597 .size   iotas,.-iotas
 598 .asciz  "Keccak-1600 absorb and squeeze for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 599 ___
 600
 601 foreach (split("\n",$code)) {
 602         # Below replacement results in 11.3 on Sandy Bridge, 9.4 on
 603         # Haswell, but it hurts other processors by up to 2-3-4x...
 604         #s/rol\s+(\$[0-9]+),(%[a-z][a-z0-9]+)/shld\t$1,$2,$2/;
 605
 606         print $_, "\n";
 607 }
 608
 609 close STDOUT;