crypto/sha/asm/sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # sha1_block procedure for x86_64.
  11 #
  12 # It was brought to my attention that on EM64T compiler-generated code
  13 # was far behind 32-bit assembler implementation. This is unlike on
  14 # Opteron where compiler-generated code was only 15% behind 32-bit
  15 # assembler, which originally made it hard to motivate the effort.
  16 # There was a suggestion to mechanically translate 32-bit code, but I
  17 # dismissed it, reasoning that x86_64 offers enough register bank
  18 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
  19 # implementation:-) However! While 64-bit code does perform better
  20 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
  21 # x86_64 does offer larger *addressable* bank, but out-of-order core
  22 # reaches for even more registers through dynamic aliasing, and EM64T
  23 # core must have managed to run-time optimize even 32-bit code just as
  24 # well as the 64-bit one. Performance improvement is summarized in the
  25 # following table:
  26 #
  27 #               gcc 3.4         32-bit asm      cycles/byte
  28 # Opteron       +45%            +20%            6.8
  29 # Xeon P4       +65%            +0%             9.9
  30 # Core2         +60%            +10%            7.0
  31
  32 # August 2009.
  33 #
  34 # The code was revised to minimize code size and to maximize
  35 # "distance" between instructions producing input to 'lea'
  36 # instruction and the 'lea' instruction itself, which is essential
  37 # for Intel Atom core.
  38
  39 # October 2010.
  40 #
  41 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
  42 # is to offload message schedule denoted by Wt in NIST specification,
  43 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
  44 # for background and implementation details. The only difference from
  45 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
  46 # to free temporary registers.
  47
  48 # April 2011.
  49 #
  50 # Add AVX code path. See sha1-586.pl for further information.
  51
  52 ######################################################################
  53 # Current performance is summarized in the following table. Numbers are
  54 # CPU clock cycles spent to process single byte (less is better).
  55 #
  56 #               x86_64          SSSE3           AVX
  57 # P4            9.8             -
  58 # Opteron       6.65            -
  59 # Core2         6.70            6.05/+11%       -
  60 # Westmere      7.08            5.49/+29%       -
  61 # Sandy Bridge  7.93            6.16/+28%       4.99/+59%
  62 # Ivy Bridge    6.30            4.63/+36%       4.60/+37%
  63 # Bulldozer     10.9            5.95/+82%
  64 # VIA Nano      10.2            7.46/+37%
  65 # Atom          11.0            9.61/+14%
  66
  67 $flavour = shift;
     # Command line is "[flavour] output".  If the first argument contains
     # a dot it is taken to be the output file name and no flavour is set.
  68 $output  = shift;
  69 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  70
     # Win64 is assumed for nasm/masm/mingw64 flavours or a *.asm output file.
  71 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  72
     # Locate the x86_64-xlate.pl translator: first in the directory part of
     # $0, then in ../../perlasm/ relative to it.
  73 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  74 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  75 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  76 die "can't locate x86_64-xlate.pl";
  77
     # Enable generation of the AVX code path only when the assembler
     # reports a sufficiently recent version: GNU as >= 2.19, or on Win64
     # nasm >= 2.09 or ml64 >= 10.
     # NOTE(review): the captured version is compared as a decimal number,
     # so e.g. "2.9" compares greater than 2.19 -- confirm this is acceptable.
  78 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  79                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  80            $1>=2.19);
  81 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  82            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  83            $1>=2.09);
  84 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  85            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  86            $1>=10);
  87
     # Everything printed to STDOUT is piped through the translator, which
     # is run with the same perl binary ($^X).  The translator path is
     # quoted so that directories containing spaces work, and a failure to
     # start the pipe is fatal instead of silently producing no output.
  88 open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
         or die "can't call $xlate: $!";
  89 *STDOUT=*OUT;
  90
  91 $ctx="%rdi";    # 1st arg
  92 $inp="%rsi";    # 2nd arg
  93 $num="%rdx";    # 3rd arg
  94
  95 # reassign arguments in order to produce more compact code
     # (the emitted prologues copy %rdi/%rsi/%rdx into these registers)
  96 $ctx="%r8";
  97 $inp="%r9";
  98 $num="%r10";
  99
     # Scratch registers used inside the scalar round bodies.
 100 $t0="%eax";
 101 $t1="%ebx";
 102 $t2="%ecx";
     # Two registers holding the current and the next message word; the
     # BODY_* subs rotate this array after every round.
 103 @xi=("%edx","%ebp");
     # Registers for the five SHA-1 working variables.
 104 $A="%esi";
 105 $B="%edi";
 106 $C="%r11d";
 107 $D="%r12d";
 108 $E="%r13d";
 109
     # Passed to the BODY_* subs as ($a,$b,$c,$d,$e); the unrolling loops
     # rotate it after each round.
 110 @V=($A,$B,$C,$D,$E);
 111
 112 sub BODY_00_19 {
     # Append the scalar (integer-only) code for round $i, 0..19, to $code.
     #   $i       round number
     #   $a..$e   registers holding the five working variables in this round
     # The emitted round adds rol($a,5), (($c^$d)&$b)^$d, the current
     # message word in $xi[0] and the constant 0x5a827999 to $e, and rotates
     # $b left by 30.  The word for round $j=$i+1 is prepared in $xi[1];
     # @xi is rotated at the end so it becomes $xi[0] for the next call.
 113 my ($i,$a,$b,$c,$d,$e)=@_;
 114 my $j=$i+1;
     # Round 0 only: load the first input word, byte-swap it and store it
     # in stack slot 0.
 115 $code.=<<___ if ($i==0);
 116         mov     `4*$i`($inp),$xi[0]
 117         bswap   $xi[0]
 118         mov     $xi[0],`4*$i`(%rsp)
 119 ___
     # Rounds 0..14: the next word is loaded from the input block,
     # byte-swapped and saved in its stack slot.
 120 $code.=<<___ if ($i<15);
 121         mov     $c,$t0
 122         mov     `4*$j`($inp),$xi[1]
 123         mov     $a,$t2
 124         xor     $d,$t0
 125         bswap   $xi[1]
 126         rol     \$5,$t2
 127         lea     0x5a827999($xi[0],$e),$e
 128         and     $b,$t0
 129         mov     $xi[1],`4*$j`(%rsp)
 130         add     $t2,$e
 131         xor     $d,$t0
 132         rol     \$30,$b
 133         add     $t0,$e
 134 ___
     # Rounds 15..19: the next word is the XOR of four stack slots (slot
     # indices are taken modulo 16) rotated left by one bit; it is written
     # back to slot $j%16.
 135 $code.=<<___ if ($i>=15);
 136         mov     `4*($j%16)`(%rsp),$xi[1]
 137         mov     $c,$t0
 138         mov     $a,$t2
 139         xor     `4*(($j+2)%16)`(%rsp),$xi[1]
 140         xor     $d,$t0
 141         rol     \$5,$t2
 142         xor     `4*(($j+8)%16)`(%rsp),$xi[1]
 143         and     $b,$t0
 144         lea     0x5a827999($xi[0],$e),$e
 145         xor     `4*(($j+13)%16)`(%rsp),$xi[1]
 146         xor     $d,$t0
 147         rol     \$1,$xi[1]
 148         add     $t2,$e
 149         rol     \$30,$b
 150         mov     $xi[1],`4*($j%16)`(%rsp)
 151         add     $t0,$e
 152 ___
     # Swap the roles of the two message-word registers.
 153 unshift(@xi,pop(@xi));
 154 }
 155
 156 sub BODY_20_39 {
     # Append the scalar code for round $i to $code; the unrolling loops
     # call it for rounds 20..39 and again for rounds 60..79.
     #   $i       round number
     #   $a..$e   registers holding the five working variables in this round
     # The round function computed in $t0 is $b^$c^$d.  The constant is
     # 0x6ed9eba1 for $i<40 and 0xca62c1d6 otherwise.
 157 my ($i,$a,$b,$c,$d,$e)=@_;
 158 my $j=$i+1;
 159 my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
     # Every round but the last also computes the message word for round
     # $j in $xi[1] from four stack slots.
 160 $code.=<<___ if ($i<79);
 161         mov     `4*($j%16)`(%rsp),$xi[1]
 162         mov     $c,$t0
 163         mov     $a,$t2
 164         xor     `4*(($j+2)%16)`(%rsp),$xi[1]
 165         xor     $b,$t0
 166         rol     \$5,$t2
 167         lea     $K($xi[0],$e),$e
 168         xor     `4*(($j+8)%16)`(%rsp),$xi[1]
 169         xor     $d,$t0
 170         add     $t2,$e
 171         xor     `4*(($j+13)%16)`(%rsp),$xi[1]
 172         rol     \$30,$b
 173         add     $t0,$e
 174         rol     \$1,$xi[1]
 175 ___
     # The new word is written back to the stack only while $i<76; words
     # computed after that are not loaded from the stack by any later round.
 176 $code.=<<___ if ($i<76);
 177         mov     $xi[1],`4*($j%16)`(%rsp)
 178 ___
     # Last round: no further message word has to be prepared.
 179 $code.=<<___ if ($i==79);
 180         mov     $c,$t0
 181         mov     $a,$t2
 182         xor     $b,$t0
 183         lea     $K($xi[0],$e),$e
 184         rol     \$5,$t2
 185         xor     $d,$t0
 186         add     $t2,$e
 187         rol     \$30,$b
 188         add     $t0,$e
 189 ___
     # Swap the roles of the two message-word registers.
 190 unshift(@xi,pop(@xi));
 191 }
 192
 193 sub BODY_40_59 {
     # Append the scalar code for round $i, 40..59, to $code.
     #   $i       round number
     #   $a..$e   registers holding the five working variables in this round
     # The constant is 0x8f1bbcdc.  The round function is added to $e in
     # two parts: $c&$d (built in $t0) and ($c^$d)&$b (built in $t1).
     # The message word for round $j=$i+1 is computed in $xi[1] and stored
     # in stack slot $j%16.
 194 my ($i,$a,$b,$c,$d,$e)=@_;
 195 my $j=$i+1;
 196 $code.=<<___;
 197         mov     `4*($j%16)`(%rsp),$xi[1]
 198         mov     $c,$t0
 199         mov     $c,$t1
 200         xor     `4*(($j+2)%16)`(%rsp),$xi[1]
 201         and     $d,$t0
 202         mov     $a,$t2
 203         xor     `4*(($j+8)%16)`(%rsp),$xi[1]
 204         xor     $d,$t1
 205         lea     0x8f1bbcdc($xi[0],$e),$e
 206         rol     \$5,$t2
 207         xor     `4*(($j+13)%16)`(%rsp),$xi[1]
 208         add     $t0,$e
 209         and     $b,$t1
 210         rol     \$1,$xi[1]
 211         add     $t1,$e
 212         rol     \$30,$b
 213         mov     $xi[1],`4*($j%16)`(%rsp)
 214         add     $t2,$e
 215 ___
     # Swap the roles of the two message-word registers.
 216 unshift(@xi,pop(@xi));
 217 }
 218
 219 $code.=<<___;
 220 .text
 221 .extern OPENSSL_ia32cap_P
 222
 223 .globl  sha1_block_data_order
 224 .type   sha1_block_data_order,\@function,3
 225 .align  16
 226 sha1_block_data_order:
 227         mov     OPENSSL_ia32cap_P+0(%rip),%r9d
 228         mov     OPENSSL_ia32cap_P+4(%rip),%r8d
 229         test    \$`1<<9`,%r8d           # check SSSE3 bit
 230         jz      .Lialu
 231 ___
     # The entry point emitted above reads OPENSSL_ia32cap_P and branches
     # to the integer-only code at .Lialu when the SSSE3 bit is clear.
     # If an AVX-capable assembler was detected, additionally emit a test
     # that selects _avx_shortcut when both the AVX bit and the "Intel CPU"
     # bit are set.
 232 $code.=<<___ if ($avx);
 233         and     \$`1<<28`,%r8d          # mask AVX bit
 234         and     \$`1<<30`,%r9d          # mask "Intel CPU" bit
 235         or      %r9d,%r8d
 236         cmp     \$`1<<28|1<<30`,%r8d
 237         je      _avx_shortcut
 238 ___
     # Otherwise jump to the SSSE3 code.  Then emit the integer-only path:
     # it pushes %rbx, %rbp, %r12 and %r13, reserves a 64-byte aligned
     # frame holding 16 dwords of message schedule, keeps the pre-alignment
     # %rsp at 64(%rsp), and loads the five context words.
 239 $code.=<<___;
 240         jmp     _ssse3_shortcut
 241
 242 .align  16
 243 .Lialu:
 244         push    %rbx
 245         push    %rbp
 246         push    %r12
 247         push    %r13
 248         mov     %rsp,%r11
 249         mov     %rdi,$ctx       # reassigned argument
 250         sub     \$`8+16*4`,%rsp
 251         mov     %rsi,$inp       # reassigned argument
 252         and     \$-64,%rsp
 253         mov     %rdx,$num       # reassigned argument
 254         mov     %r11,`16*4`(%rsp)
 255 .Lprologue:
 256
 257         mov     0($ctx),$A
 258         mov     4($ctx),$B
 259         mov     8($ctx),$C
 260         mov     12($ctx),$D
 261         mov     16($ctx),$E
 262         jmp     .Lloop
 263
 264 .align  16
 265 .Lloop:
 266 ___
     # Fully unroll the 80 rounds.  @V is rotated after every round, so the
     # registers change roles instead of values being moved between them.
 267 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 268 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 269 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 270 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
     # Add the working variables into the context, advance the input
     # pointer by one 64-byte block and loop while $num is non-zero; then
     # restore the pushed registers and the stack pointer and return.
 271 $code.=<<___;
 272         add     0($ctx),$A
 273         add     4($ctx),$B
 274         add     8($ctx),$C
 275         add     12($ctx),$D
 276         add     16($ctx),$E
 277         mov     $A,0($ctx)
 278         mov     $B,4($ctx)
 279         mov     $C,8($ctx)
 280         mov     $D,12($ctx)
 281         mov     $E,16($ctx)
 282
 283         sub     \$1,$num
 284         lea     `16*4`($inp),$inp
 285         jnz     .Lloop
 286
 287         mov     `16*4`(%rsp),%rsi
 288         mov     (%rsi),%r13
 289         mov     8(%rsi),%r12
 290         mov     16(%rsi),%rbp
 291         mov     24(%rsi),%rbx
 292         lea     32(%rsi),%rsp
 293 .Lepilogue:
 294         ret
 295 .size   sha1_block_data_order,.-sha1_block_data_order
 296 ___
 297 {{{
     # SIMD code paths.  The lexicals declared here are shared by the
     # generator subs below and by the instruction strings they eval.
     # $Xi counts the 4-word message vectors; it starts at 4 because the
     # first four vectors are loaded straight from the input.
 298 my $Xi=4;
     # Rotating window of eight message-schedule vectors.
 299 my @X=map("%xmm$_",(4..7,0..3));
     # Rotating set of three temporary vectors.
 300 my @Tx=map("%xmm$_",(8..10));
     # Register holding the round-constant vector (used by the AVX code).
 301 my $Kx="%xmm11";
     # Note that this also re-assigns the global $A..$E to the new registers.
 302 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
     # Two scalar temporaries, rotated after every round by the body_* strings.
 303 my @T=("%esi","%edi");
     # $j is the number of the round being generated; $rx counts calls to
     # the body_* subs and triggers their hand-over to the next round type.
 304 my $j=0;
 305 my $rx=0;
     # Register pointing at the K_XX_XX constant table.
 306 my $K_XX_XX="%r11";
 307
     # Rotate instructions are emitted through these closures so that
     # another code path can substitute different instructions.
 308 my $_rol=sub { &rol(@_) };
 309 my $_ror=sub { &ror(@_) };
 310
     # Function entry: push %rbx, %rbp and %r12 and reserve 64 bytes of
     # stack, plus room for six xmm registers on Win64.
 311 $code.=<<___;
 312 .type   sha1_block_data_order_ssse3,\@function,3
 313 .align  16
 314 sha1_block_data_order_ssse3:
 315 _ssse3_shortcut:
 316         push    %rbx
 317         push    %rbp
 318         push    %r12
 319         lea     `-64-($win64?6*16:0)`(%rsp),%rsp
 320 ___
     # Win64 only: save %xmm6-%xmm11 in the extra frame space.
 321 $code.=<<___ if ($win64);
 322         movaps  %xmm6,64+0(%rsp)
 323         movaps  %xmm7,64+16(%rsp)
 324         movaps  %xmm8,64+32(%rsp)
 325         movaps  %xmm9,64+48(%rsp)
 326         movaps  %xmm10,64+64(%rsp)
 327         movaps  %xmm11,64+80(%rsp)
 328 .Lprologue_ssse3:
 329 ___
     # Copy the arguments, turn $num into the end-of-input pointer
     # ($inp + 64*$num), load the context, seed @T[0] with $B&($C^$D),
     # load and byte-swap the first block, and store the first three
     # vectors plus K_00_19 on the stack for the scalar rounds.
 330 $code.=<<___;
 331         mov     %rdi,$ctx       # reassigned argument
 332         mov     %rsi,$inp       # reassigned argument
 333         mov     %rdx,$num       # reassigned argument
 334
 335         shl     \$6,$num
 336         add     $inp,$num
 337         lea     K_XX_XX(%rip),$K_XX_XX
 338
 339         mov     0($ctx),$A              # load context
 340         mov     4($ctx),$B
 341         mov     8($ctx),$C
 342         mov     12($ctx),$D
 343         mov     $B,@T[0]                # magic seed
 344         mov     16($ctx),$E
 345         mov     $C,@T[1]
 346         xor     $D,@T[1]
 347         and     @T[1],@T[0]
 348
 349         movdqa  64($K_XX_XX),@X[2]      # pbswap mask
 350         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
 351         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
 352         movdqu  16($inp),@X[-3&7]
 353         movdqu  32($inp),@X[-2&7]
 354         movdqu  48($inp),@X[-1&7]
 355         pshufb  @X[2],@X[-4&7]          # byte swap
 356         add     \$64,$inp
 357         pshufb  @X[2],@X[-3&7]
 358         pshufb  @X[2],@X[-2&7]
 359         pshufb  @X[2],@X[-1&7]
 360         paddd   @Tx[1],@X[-4&7]         # add K_00_19
 361         paddd   @Tx[1],@X[-3&7]
 362         paddd   @Tx[1],@X[-2&7]
 363         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
 364         psubd   @Tx[1],@X[-4&7]         # restore X[]
 365         movdqa  @X[-3&7],16(%rsp)
 366         psubd   @Tx[1],@X[-3&7]
 367         movdqa  @X[-2&7],32(%rsp)
 368         psubd   @Tx[1],@X[-2&7]
 369         jmp     .Loop_ssse3
 370 ___
 371
 372 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
     # Catch-all for calls to undefined subs such as &movdqa(...): the name
     # of the called sub (package prefix stripped) becomes the opcode and
     # one line of assembly is appended to $code.
     # Arguments are passed destination first and are emitted in reverse
     # order.  The last argument is prefixed with '$' (immediate operand)
     # when it round-trips unchanged through numeric conversion.
 373 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 374   my $arg = pop;
 375     $arg = "\$$arg" if ($arg*1 eq $arg);
 376     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 377 }
 378
 379 sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
     # Emit the SSSE3 message-schedule update that produces the next vector
     # of four X[] words in @X[0], interleaved with four scalar rounds.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # Along the way the previous vector plus the round constant is stored
     # on the stack for the scalar rounds, and the constant vector for the
     # next call is loaded into @Tx[2].  Finally $Xi is incremented and
     # @X and @Tx are rotated.
 380 { use integer;
 381   my $body = shift;
 382   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
     # Declared here because the eval'ed round strings assign to them.
 383   my ($a,$b,$c,$d,$e);
 384
     # Each eval below emits one scalar instruction between the SIMD ones.
 385         &movdqa (@X[0],@X[-3&7]);
 386          eval(shift(@insns));
 387          eval(shift(@insns));
 388         &movdqa (@Tx[0],@X[-1&7]);
 389         &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
 390          eval(shift(@insns));
 391          eval(shift(@insns));
 392
 393           &paddd        (@Tx[1],@X[-1&7]);
 394          eval(shift(@insns));
 395          eval(shift(@insns));
 396         &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
 397          eval(shift(@insns));
 398          eval(shift(@insns));
 399         &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
 400          eval(shift(@insns));
 401          eval(shift(@insns));
 402
 403         &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
 404          eval(shift(@insns));
 405          eval(shift(@insns));
 406          eval(shift(@insns));
 407          eval(shift(@insns));
 408
 409         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
 410          eval(shift(@insns));
 411          eval(shift(@insns));
 412           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 413          eval(shift(@insns));
 414          eval(shift(@insns));
 415
 416         &movdqa (@Tx[2],@X[0]);
 417         &movdqa (@Tx[0],@X[0]);
 418          eval(shift(@insns));
 419          eval(shift(@insns));
 420          eval(shift(@insns));
 421          eval(shift(@insns));
 422
 423         &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
 424         &paddd  (@X[0],@X[0]);
 425          eval(shift(@insns));
 426          eval(shift(@insns));
 427          eval(shift(@insns));
 428          eval(shift(@insns));
 429
 430         &psrld  (@Tx[0],31);
 431          eval(shift(@insns));
 432          eval(shift(@insns));
 433         &movdqa (@Tx[1],@Tx[2]);
 434          eval(shift(@insns));
 435          eval(shift(@insns));
 436
 437         &psrld  (@Tx[2],30);
 438         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
 439          eval(shift(@insns));
 440          eval(shift(@insns));
 441          eval(shift(@insns));
 442          eval(shift(@insns));
 443
 444         &pslld  (@Tx[1],2);
 445         &pxor   (@X[0],@Tx[2]);
 446          eval(shift(@insns));
 447          eval(shift(@insns));
 448           &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
 449          eval(shift(@insns));
 450          eval(shift(@insns));
 451
 452         &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
 453
 454          foreach (@insns) { eval; }     # remaining instructions [if any]
 455
 456   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 457                 push(@Tx,shift(@Tx));
 458 }
 459
 460 sub Xupdate_ssse3_32_79()
     # Emit the SSSE3 message-schedule update used from vector $Xi==8
     # onwards, interleaved with four scalar rounds.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # The new vector is built in @X[0] and rotated left by 2.  The
     # previous vector plus the round constant is stored on the stack, and
     # the constant vector is either copied or, when $Xi is a multiple of
     # 5, reloaded from the K_XX_XX table.  Finally $Xi is incremented and
     # @X and @Tx are rotated.
 461 { use integer;
 462   my $body = shift;
 463   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 44 instructions
     # Declared here because the eval'ed round strings assign to them.
 464   my ($a,$b,$c,$d,$e);
 465
     # Only the first call has to copy "X[-1]" itself; later calls get it
     # from the conditional movdqa near the end of the previous call.
 466         &movdqa (@Tx[0],@X[-1&7])       if ($Xi==8);
 467          eval(shift(@insns));           # body_20_39
 468         &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
 469         &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
 470          eval(shift(@insns));
 471          eval(shift(@insns));
 472          eval(shift(@insns));           # rol
 473
 474         &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
 475          eval(shift(@insns));
     # Emit one more scalar instruction here unless it is a rotate.
 476          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
 477         if ($Xi%5) {
 478           &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
 479         } else {                        # ... or load next one
 480           &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
 481         }
 482           &paddd        (@Tx[1],@X[-1&7]);
 483          eval(shift(@insns));           # ror
 484          eval(shift(@insns));
 485
 486         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
 487          eval(shift(@insns));           # body_20_39
 488          eval(shift(@insns));
 489          eval(shift(@insns));
 490          eval(shift(@insns));           # rol
 491
 492         &movdqa (@Tx[0],@X[0]);
 493           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 494          eval(shift(@insns));
 495          eval(shift(@insns));
 496          eval(shift(@insns));           # ror
 497          eval(shift(@insns));
 498
 499         &pslld  (@X[0],2);
 500          eval(shift(@insns));           # body_20_39
 501          eval(shift(@insns));
 502         &psrld  (@Tx[0],30);
 503          eval(shift(@insns));
 504          eval(shift(@insns));           # rol
 505          eval(shift(@insns));
 506          eval(shift(@insns));
 507          eval(shift(@insns));           # ror
 508          eval(shift(@insns));
 509
 510         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
 511          eval(shift(@insns));           # body_20_39
 512          eval(shift(@insns));
     # Keep a copy of the new vector for the next call (skipped on the
     # last call, where $Xi is 19).
 513           &movdqa       (@Tx[1],@X[0])  if ($Xi<19);
 514          eval(shift(@insns));
 515          eval(shift(@insns));           # rol
 516          eval(shift(@insns));
 517          eval(shift(@insns));
 518          eval(shift(@insns));           # rol
 519          eval(shift(@insns));
 520
 521          foreach (@insns) { eval; }     # remaining instructions
 522
 523   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 524                 push(@Tx,shift(@Tx));
 525 }
 526
 527 sub Xuplast_ssse3_80()
     # Emit four scalar rounds together with the store of the last X[]+K
     # vector of the current block; no new schedule vector is computed.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # After the rounds the input pointer is compared with the end pointer
     # and the code branches to .Ldone_ssse3 when they are equal.
     # Otherwise the next block is loaded, byte-swapping of it is started
     # and $Xi is reset to 0.
 528 { use integer;
 529   my $body = shift;
 530   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Declared here because the eval'ed round strings assign to them.
 531   my ($a,$b,$c,$d,$e);
 532
 533          eval(shift(@insns));
 534           &paddd        (@Tx[1],@X[-1&7]);
 535          eval(shift(@insns));
 536          eval(shift(@insns));
 537          eval(shift(@insns));
 538          eval(shift(@insns));
 539
 540           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 541
 542          foreach (@insns) { eval; }             # remaining instructions
 543
 544         &cmp    ($inp,$num);
 545         &je     (".Ldone_ssse3");
 546
     # Rotate @Tx in the direction opposite to the one used by the
     # Xupdate subs.
 547         unshift(@Tx,pop(@Tx));
 548
 549         &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
 550         &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
 551         &movdqu (@X[-4&7],"0($inp)");           # load input
 552         &movdqu (@X[-3&7],"16($inp)");
 553         &movdqu (@X[-2&7],"32($inp)");
 554         &movdqu (@X[-1&7],"48($inp)");
 555         &pshufb (@X[-4&7],@X[2]);               # byte swap
 556         &add    ($inp,64);
 557
 558   $Xi=0;
 559 }
 560
 561 sub Xloop_ssse3()
     # Emit four scalar rounds of the current block interleaved with the
     # preparation of one vector of the next block.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # Vector ($Xi-3)&7 is byte-swapped; the constant in @Tx[1] is added to
     # vector ($Xi-4)&7, the sum is stored at 16*$Xi(%rsp) and the constant
     # is subtracted again.  $Xi is incremented at the end.
 562 { use integer;
 563   my $body = shift;
 564   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Declared here because the eval'ed round strings assign to them.
 565   my ($a,$b,$c,$d,$e);
 566
 567          eval(shift(@insns));
 568          eval(shift(@insns));
 569         &pshufb (@X[($Xi-3)&7],@X[2]);
 570          eval(shift(@insns));
 571          eval(shift(@insns));
 572         &paddd  (@X[($Xi-4)&7],@Tx[1]);
 573          eval(shift(@insns));
 574          eval(shift(@insns));
 575          eval(shift(@insns));
 576          eval(shift(@insns));
 577         &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
 578          eval(shift(@insns));
 579          eval(shift(@insns));
 580         &psubd  (@X[($Xi-4)&7],@Tx[1]);
 581
 582         foreach (@insns) { eval; }
 583   $Xi++;
 584 }
 585
 586 sub Xtail_ssse3()
     # Emit four scalar rounds with no SIMD instructions in between.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
 587 { use integer;
 588   my $body = shift;
 589   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Declared here because the eval'ed round strings assign to them.
 590   my ($a,$b,$c,$d,$e);
 591
 592         foreach (@insns) { eval; }
 593 }
 594
 595 sub body_00_19 () {     # ((c^d)&b)^d
     # Return the instructions of one scalar round of the first round
     # type as a list of strings; the callers eval them one at a time so
     # that SIMD instructions can be placed in between.
     # On its 20th call ($rx==19) the list of body_20_39 is returned
     # instead.  The last string also increments $j and rotates @V and @T.
 596         # on start @T[0]=(c^d)&b
 597         return &body_20_39() if ($rx==19); $rx++;
 598         (
 599         '($a,$b,$c,$d,$e)=@V;'.
 600         '&$_ror ($b,$j?7:2)',   # $b>>>2
 601         '&xor   (@T[0],$d)',
 602         '&mov   (@T[1],$a)',    # $b for next round
 603
 604         '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
 605         '&xor   ($b,$c)',       # $c^$d for next round
 606
 607         '&$_rol ($a,5)',
 608         '&add   ($e,@T[0])',
 609         '&and   (@T[1],$b)',    # ($b&($c^$d)) for next round
 610
 611         '&xor   ($b,$c)',       # restore $b
 612         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 613         );
 614 }
 615
 616 sub body_20_39 () {     # b^d^c
     # Return the instructions of one scalar round of the second round
     # type as a list of strings to be eval'ed by the callers.
     # When $rx has reached 39 the list of body_40_59 is returned instead.
     # Some strings are conditional on the round number $j at eval time.
     # The last string also increments $j and rotates @V and @T.
 617         # on entry @T[0]=b^d
 618         return &body_40_59() if ($rx==39); $rx++;
 619         (
 620         '($a,$b,$c,$d,$e)=@V;'.
 621         '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
 622         '&xor   (@T[0],$d)      if($j==19);'.
 623         '&xor   (@T[0],$c)      if($j> 19)',    # ($b^$d^$c)
 624         '&mov   (@T[1],$a)',    # $b for next round
 625
 626         '&$_rol ($a,5)',
 627         '&add   ($e,@T[0])',
 628         '&xor   (@T[1],$c)      if ($j< 79)',   # $b^$d for next round
 629
 630         '&$_ror ($b,7)',        # $b>>>2
 631         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 632         );
 633 }
 634
 635 sub body_40_59 () {     # ((b^c)&(c^d))^c
     # Return the instructions of one scalar round of the third round
     # type as a list of strings to be eval'ed by the callers.
     # Several strings are conditional on the round number $j at eval
     # time, which covers the first and the last round of this type.
     # The last string also increments $j and rotates @V and @T.
 636         # on entry @T[0]=(b^c), (c^=d)
 637         $rx++;
 638         (
 639         '($a,$b,$c,$d,$e)=@V;'.
 640         '&add   ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
 641         '&and   (@T[0],$c)      if ($j>=40)',   # (b^c)&(c^d)
 642         '&xor   ($c,$d)         if ($j>=40)',   # restore $c
 643
 644         '&$_ror ($b,7)',        # $b>>>2
 645         '&mov   (@T[1],$a)',    # $b for next round
 646         '&xor   (@T[0],$c)',
 647
 648         '&$_rol ($a,5)',
 649         '&add   ($e,@T[0])',
 650         '&xor   (@T[1],$c)      if ($j==59);'.
 651         '&xor   (@T[1],$b)      if ($j< 59)',   # b^c for next round
 652
 653         '&xor   ($b,$c)         if ($j< 59)',   # c^d for next round
 654         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 655         );
 656 }
 657 $code.=<<___;
 658 .align  16
 659 .Loop_ssse3:
 660 ___
     # Generate the 80 rounds of one block, four rounds per call:
     # 16 calls with a message-schedule update (64 rounds), one call that
     # may leave the loop (4 rounds) and three calls that overlap the last
     # 12 rounds with the preparation of the next block.
 661         &Xupdate_ssse3_16_31(\&body_00_19);
 662         &Xupdate_ssse3_16_31(\&body_00_19);
 663         &Xupdate_ssse3_16_31(\&body_00_19);
 664         &Xupdate_ssse3_16_31(\&body_00_19);
 665         &Xupdate_ssse3_32_79(\&body_00_19);
 666         &Xupdate_ssse3_32_79(\&body_20_39);
 667         &Xupdate_ssse3_32_79(\&body_20_39);
 668         &Xupdate_ssse3_32_79(\&body_20_39);
 669         &Xupdate_ssse3_32_79(\&body_20_39);
 670         &Xupdate_ssse3_32_79(\&body_20_39);
 671         &Xupdate_ssse3_32_79(\&body_40_59);
 672         &Xupdate_ssse3_32_79(\&body_40_59);
 673         &Xupdate_ssse3_32_79(\&body_40_59);
 674         &Xupdate_ssse3_32_79(\&body_40_59);
 675         &Xupdate_ssse3_32_79(\&body_40_59);
 676         &Xupdate_ssse3_32_79(\&body_20_39);
 677         &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"
 678
     # Remember the generator state so that the code after .Ldone_ssse3
     # can be generated from the same round and register assignment.
 679                                 $saved_j=$j; @saved_V=@V;
 680
 681         &Xloop_ssse3(\&body_20_39);
 682         &Xloop_ssse3(\&body_20_39);
 683         &Xloop_ssse3(\&body_20_39);
 684
     # Add the working variables into the context, re-create the seed in
     # @T[0] for the first round of the next block and loop.
 685 $code.=<<___;
 686         add     0($ctx),$A                      # update context
 687         add     4($ctx),@T[0]
 688         add     8($ctx),$C
 689         add     12($ctx),$D
 690         mov     $A,0($ctx)
 691         add     16($ctx),$E
 692         mov     @T[0],4($ctx)
 693         mov     @T[0],$B                        # magic seed
 694         mov     $C,8($ctx)
 695         mov     $C,@T[1]
 696         mov     $D,12($ctx)
 697         xor     $D,@T[1]
 698         mov     $E,16($ctx)
 699         and     @T[1],@T[0]
 700         jmp     .Loop_ssse3
 701
 702 .align  16
 703 .Ldone_ssse3:
 704 ___
     # Restore the generator state saved before the Xloop calls and emit
     # the last 12 rounds again, this time without next-block preparation.
 705                                 $j=$saved_j; @V=@saved_V;
 706
 707         &Xtail_ssse3(\&body_20_39);
 708         &Xtail_ssse3(\&body_20_39);
 709         &Xtail_ssse3(\&body_20_39);
 710
     # Final context update.
 711 $code.=<<___;
 712         add     0($ctx),$A                      # update context
 713         add     4($ctx),@T[0]
 714         add     8($ctx),$C
 715         mov     $A,0($ctx)
 716         add     12($ctx),$D
 717         mov     @T[0],4($ctx)
 718         add     16($ctx),$E
 719         mov     $C,8($ctx)
 720         mov     $D,12($ctx)
 721         mov     $E,16($ctx)
 722 ___
     # Win64 only: restore the xmm registers saved in the prologue.
 723 $code.=<<___ if ($win64);
 724         movaps  64+0(%rsp),%xmm6
 725         movaps  64+16(%rsp),%xmm7
 726         movaps  64+32(%rsp),%xmm8
 727         movaps  64+48(%rsp),%xmm9
 728         movaps  64+64(%rsp),%xmm10
 729         movaps  64+80(%rsp),%xmm11
 730 ___
     # Release the frame, restore %r12, %rbp and %rbx and return.
 731 $code.=<<___;
 732         lea     `64+($win64?6*16:0)`(%rsp),%rsi
 733         mov     0(%rsi),%r12
 734         mov     8(%rsi),%rbp
 735         mov     16(%rsi),%rbx
 736         lea     24(%rsi),%rsp
 737 .Lepilogue_ssse3:
 738         ret
 739 .size   sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
 740 ___
 741
 742 if ($avx) {
     # AVX code path, generated only when an AVX-capable assembler was
     # detected.  It reuses the lexicals of the SSSE3 section, so the
     # generator state is reset first.
 743 $Xi=4;                          # reset variables
 744 @X=map("%xmm$_",(4..7,0..3));
 745 @Tx=map("%xmm$_",(8..10));
 746 $j=0;
 747 $rx=0;
 748
 749 my $done_avx_label=".Ldone_avx";
 750
     # In this section rotates are emitted as shld/shrd with the register
     # passed as both register operands (the first argument is duplicated).
 751 my $_rol=sub { &shld(@_[0],@_) };
 752 my $_ror=sub { &shrd(@_[0],@_) };
 753
     # Function entry: push %rbx, %rbp and %r12 and reserve 64 bytes of
     # stack, plus room for six xmm registers on Win64.
 754 $code.=<<___;
 755 .type   sha1_block_data_order_avx,\@function,3
 756 .align  16
 757 sha1_block_data_order_avx:
 758 _avx_shortcut:
 759         push    %rbx
 760         push    %rbp
 761         push    %r12
 762         lea     `-64-($win64?6*16:0)`(%rsp),%rsp
 763 ___
     # Win64 only: save %xmm6-%xmm11 in the extra frame space.
 764 $code.=<<___ if ($win64);
 765         movaps  %xmm6,64+0(%rsp)
 766         movaps  %xmm7,64+16(%rsp)
 767         movaps  %xmm8,64+32(%rsp)
 768         movaps  %xmm9,64+48(%rsp)
 769         movaps  %xmm10,64+64(%rsp)
 770         movaps  %xmm11,64+80(%rsp)
 771 .Lprologue_avx:
 772 ___
     # Copy the arguments, turn $num into the end-of-input pointer, load
     # the context, seed @T[0] with $B&($C^$D), load and byte-swap the
     # first block and store the first three vectors plus K_00_19 on the
     # stack.  Unlike the SSSE3 prologue, the sums go to separate registers
     # (@X[0..2]), so the input vectors need not be restored afterwards.
     # NOTE(review): vzeroall clears every vector register, including
     # %xmm12-%xmm15, while only %xmm6-%xmm11 are saved above on Win64 --
     # confirm against the Win64 ABI whether vzeroupper should be used.
 773 $code.=<<___;
 774         mov     %rdi,$ctx       # reassigned argument
 775         mov     %rsi,$inp       # reassigned argument
 776         mov     %rdx,$num       # reassigned argument
 777         vzeroall
 778
 779         shl     \$6,$num
 780         add     $inp,$num
 781         lea     K_XX_XX(%rip),$K_XX_XX
 782
 783         mov     0($ctx),$A              # load context
 784         mov     4($ctx),$B
 785         mov     8($ctx),$C
 786         mov     12($ctx),$D
 787         mov     $B,@T[0]                # magic seed
 788         mov     16($ctx),$E
 789         mov     $C,@T[1]
 790         xor     $D,@T[1]
 791         and     @T[1],@T[0]
 792
 793         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
 794         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
 795         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
 796         vmovdqu 16($inp),@X[-3&7]
 797         vmovdqu 32($inp),@X[-2&7]
 798         vmovdqu 48($inp),@X[-1&7]
 799         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
 800         add     \$64,$inp
 801         vpshufb @X[2],@X[-3&7],@X[-3&7]
 802         vpshufb @X[2],@X[-2&7],@X[-2&7]
 803         vpshufb @X[2],@X[-1&7],@X[-1&7]
 804         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
 805         vpaddd  $Kx,@X[-3&7],@X[1]
 806         vpaddd  $Kx,@X[-2&7],@X[2]
 807         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
 808         vmovdqa @X[1],16(%rsp)
 809         vmovdqa @X[2],32(%rsp)
 810         jmp     .Loop_avx
 811 ___
 812
 813 sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
     # AVX counterpart of Xupdate_ssse3_16_31: emit the message-schedule
     # update that produces the next vector of four X[] words in @X[0],
     # interleaved with four scalar rounds.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # The three-operand instructions write to a separate destination, so
     # no register copies are emitted.  The round constant stays in $Kx
     # and is reloaded from the K_XX_XX table only when $Xi is a multiple
     # of 5.  At the end $Xi is incremented and only @X is rotated.
 814 { use integer;
 815   my $body = shift;
 816   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
     # Declared here because the eval'ed round strings assign to them.
 817   my ($a,$b,$c,$d,$e);
 818
 819          eval(shift(@insns));
 820          eval(shift(@insns));
 821         &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
 822          eval(shift(@insns));
 823          eval(shift(@insns));
 824
 825           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
 826          eval(shift(@insns));
 827          eval(shift(@insns));
 828         &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
 829          eval(shift(@insns));
 830          eval(shift(@insns));
 831         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
 832          eval(shift(@insns));
 833          eval(shift(@insns));
 834
 835         &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
 836          eval(shift(@insns));
 837          eval(shift(@insns));
 838          eval(shift(@insns));
 839          eval(shift(@insns));
 840
 841         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
 842          eval(shift(@insns));
 843          eval(shift(@insns));
 844           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 845          eval(shift(@insns));
 846          eval(shift(@insns));
 847
 848         &vpsrld (@Tx[0],@X[0],31);
 849          eval(shift(@insns));
 850          eval(shift(@insns));
 851          eval(shift(@insns));
 852          eval(shift(@insns));
 853
 854         &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
 855         &vpaddd (@X[0],@X[0],@X[0]);
 856          eval(shift(@insns));
 857          eval(shift(@insns));
 858          eval(shift(@insns));
 859          eval(shift(@insns));
 860
 861         &vpsrld (@Tx[1],@Tx[2],30);
 862         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
 863          eval(shift(@insns));
 864          eval(shift(@insns));
 865          eval(shift(@insns));
 866          eval(shift(@insns));
 867
 868         &vpslld (@Tx[2],@Tx[2],2);
 869         &vpxor  (@X[0],@X[0],@Tx[1]);
 870          eval(shift(@insns));
 871          eval(shift(@insns));
 872          eval(shift(@insns));
 873          eval(shift(@insns));
 874
 875         &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
 876          eval(shift(@insns));
 877          eval(shift(@insns));
 878           &vmovdqa      ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
 879          eval(shift(@insns));
 880          eval(shift(@insns));
 881
 882
 883          foreach (@insns) { eval; }     # remaining instructions [if any]
 884
 885   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 886 }
 887
 888 sub Xupdate_avx_32_79()
     # AVX counterpart of Xupdate_ssse3_32_79: emit the message-schedule
     # update for the later vectors, interleaved with four scalar rounds.
     #   $body   code ref; it is called four times and each call returns
     #           the instructions of one round as strings to be eval'ed
     # The new vector is built in @X[0] and rotated left by 2.  The
     # previous vector plus the constant in $Kx is stored on the stack,
     # and $Kx is reloaded from the K_XX_XX table when $Xi is a multiple
     # of 5.  At the end $Xi is incremented and only @X is rotated.
 889 { use integer;
 890   my $body = shift;
 891   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 44 instructions
     # Declared here because the eval'ed round strings assign to them.
 892   my ($a,$b,$c,$d,$e);
 893
 894         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
 895         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
 896          eval(shift(@insns));           # body_20_39
 897          eval(shift(@insns));
 898          eval(shift(@insns));
 899          eval(shift(@insns));           # rol
 900
 901         &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
 902          eval(shift(@insns));
     # Emit one more scalar instruction here unless it is a rotate.
 903          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
 904           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
 905           &vmovdqa      ($Kx,eval(16*($Xi/5))."($K_XX_XX)")     if ($Xi%5==0);
 906          eval(shift(@insns));           # ror
 907          eval(shift(@insns));
 908
 909         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
 910          eval(shift(@insns));           # body_20_39
 911          eval(shift(@insns));
 912          eval(shift(@insns));
 913          eval(shift(@insns));           # rol
 914
 915         &vpsrld (@Tx[0],@X[0],30);
 916           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 917          eval(shift(@insns));
 918          eval(shift(@insns));
 919          eval(shift(@insns));           # ror
 920          eval(shift(@insns));
 921
 922         &vpslld (@X[0],@X[0],2);
 923          eval(shift(@insns));           # body_20_39
 924          eval(shift(@insns));
 925          eval(shift(@insns));
 926          eval(shift(@insns));           # rol
 927          eval(shift(@insns));
 928          eval(shift(@insns));
 929          eval(shift(@insns));           # ror
 930          eval(shift(@insns));
 931
 932         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
 933          eval(shift(@insns));           # body_20_39
 934          eval(shift(@insns));
 935          eval(shift(@insns));
 936          eval(shift(@insns));           # rol
 937          eval(shift(@insns));
 938          eval(shift(@insns));
 939          eval(shift(@insns));           # rol
 940          eval(shift(@insns));
 941
 942          foreach (@insns) { eval; }     # remaining instructions
 943
 944   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 945 }
 946
 947 sub Xuplast_avx_80()
 948 { use integer;
 949   my $body = shift;
 950   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 951   my ($a,$b,$c,$d,$e);
 952
 953          eval(shift(@insns));
 954           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
 955          eval(shift(@insns));
 956          eval(shift(@insns));
 957          eval(shift(@insns));
 958          eval(shift(@insns));
 959
 960           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 961
 962          foreach (@insns) { eval; }             # remaining instructions
 963
 964         &cmp    ($inp,$num);
 965         &je     ($done_avx_label);
 966
 967         &vmovdqa(@X[2],"64($K_XX_XX)");         # pbswap mask
 968         &vmovdqa($Kx,"0($K_XX_XX)");            # K_00_19
 969         &vmovdqu(@X[-4&7],"0($inp)");           # load input
 970         &vmovdqu(@X[-3&7],"16($inp)");
 971         &vmovdqu(@X[-2&7],"32($inp)");
 972         &vmovdqu(@X[-1&7],"48($inp)");
 973         &vpshufb(@X[-4&7],@X[-4&7],@X[2]);      # byte swap
 974         &add    ($inp,64);
 975
 976   $Xi=0;
 977 }
 978
 979 sub Xloop_avx()
 980 { use integer;
 981   my $body = shift;
 982   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 983   my ($a,$b,$c,$d,$e);
 984
 985          eval(shift(@insns));
 986          eval(shift(@insns));
 987         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
 988          eval(shift(@insns));
 989          eval(shift(@insns));
 990         &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
 991          eval(shift(@insns));
 992          eval(shift(@insns));
 993          eval(shift(@insns));
 994          eval(shift(@insns));
 995         &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
 996          eval(shift(@insns));
 997          eval(shift(@insns));
 998
 999         foreach (@insns) { eval; }
1000   $Xi++;
1001 }
1002
# Xtail_avx($body)
#
# Integer-only variant used on the "done" path: calls $body four times and
# eval's every returned snippet in order, with no vector instructions mixed
# in.
sub Xtail_avx()
{ use integer;
  my $body  = shift;
  my @insns = (&$body,&$body,&$body,&$body);    # 32 snippets
  my ($a,$b,$c,$d,$e);                          # working set of the eval'ed snippets

        eval($_) foreach (@insns);
}
1011
# ------------------------------------------------------------------
# Body of the AVX main loop, the "done" tail and the function epilogue.
# ------------------------------------------------------------------
$code .= <<___;
.align  16
.Loop_avx:
___

# Emission schedule up to the point where the next block gets pre-loaded:
#       [ emitter, round body, number of calls ]
# Every call consumes four invocations of the round body, so the rows
# below add up to 17 calls; the three Xloop_avx calls after them make 20.
foreach my $stage (
        [ \&Xupdate_avx_16_31, \&body_00_19, 4 ],
        [ \&Xupdate_avx_32_79, \&body_00_19, 1 ],
        [ \&Xupdate_avx_32_79, \&body_20_39, 5 ],
        [ \&Xupdate_avx_32_79, \&body_40_59, 5 ],
        [ \&Xupdate_avx_32_79, \&body_20_39, 1 ],
        [ \&Xuplast_avx_80,    \&body_20_39, 1 ],      # can jump to "done"
) {
        my ($emit,$round_body,$calls) = @$stage;
        for my $pass (1..$calls) { &$emit($round_body); }
}

# The code after the "done" label has to continue from exactly this
# generator state, so remember it before the loop path is emitted.
($saved_j,@saved_V) = ($j,@V);

for my $pass (1..3) { &Xloop_avx(\&body_20_39); }

$code .= <<___;
        add     0($ctx),$A                      # update context
        add     4($ctx),@T[0]
        add     8($ctx),$C
        add     12($ctx),$D
        mov     $A,0($ctx)
        add     16($ctx),$E
        mov     @T[0],4($ctx)
        mov     @T[0],$B                        # magic seed
        mov     $C,8($ctx)
        mov     $C,@T[1]
        mov     $D,12($ctx)
        xor     $D,@T[1]
        mov     $E,16($ctx)
        and     @T[1],@T[0]
        jmp     .Loop_avx

.align  16
$done_avx_label:
___

# Rewind the generator to the state saved above.
($j,@V) = ($saved_j,@saved_V);

for my $pass (1..3) { &Xtail_avx(\&body_20_39); }

# NOTE(review): vzeroall clears every vector register, but the Win64 path
# below reloads only %xmm6-%xmm11.  Confirm that %xmm12-%xmm15 need no
# preservation here (or that vzeroupper would not be the better choice).
$code .= <<___;
        vzeroall

        add     0($ctx),$A                      # update context
        add     4($ctx),@T[0]
        add     8($ctx),$C
        mov     $A,0($ctx)
        add     12($ctx),$D
        mov     @T[0],4($ctx)
        add     16($ctx),$E
        mov     $C,8($ctx)
        mov     $D,12($ctx)
        mov     $E,16($ctx)
___
# Win64 only: reload the six XMM registers kept above the 64-byte X[]+K area.
$code .= <<___ if ($win64);
        movaps  64+0(%rsp),%xmm6
        movaps  64+16(%rsp),%xmm7
        movaps  64+32(%rsp),%xmm8
        movaps  64+48(%rsp),%xmm9
        movaps  64+64(%rsp),%xmm10
        movaps  64+80(%rsp),%xmm11
___
# Epilogue: step over the scratch area, pop %r12/%rbp/%rbx and return.
$code .= <<___;
        lea     `64+($win64?6*16:0)`(%rsp),%rsi
        mov     0(%rsi),%r12
        mov     8(%rsi),%rbp
        mov     16(%rsi),%rbx
        lea     24(%rsi),%rsp
.Lepilogue_avx:
        ret
.size   sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
1098 }
# Constant pool shared by the SIMD code paths: four 16-byte rows holding
# the round constants (each replicated into all four lanes) followed by
# the byte-swap shuffle mask at offset 64.  Nothing in this text needs
# interpolation, hence the single-quoted here-document.
$code .= <<'___';
.align  64
K_XX_XX:
.long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
.long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
.long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
.long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
.long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
___
1108 }}}
# Identification string embedded in the object file, followed by padding
# to a 64-byte boundary.  The "\@" keeps Perl from interpolating an array.
$code .= <<"___";
.asciz  "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___
1113
# Win64 structured-exception-handling support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two language-specific handlers are emitted:
#
#   se_handler     for sha1_block_data_order.  If context->Rip lies between
#                  .Lprologue and .Lepilogue it fetches the saved stack
#                  pointer from 16*4(%rsp) and restores Rbx, Rbp, R12 and
#                  R13 in the CONTEXT record from the four slots below it.
#
#   ssse3_handler  for the SSSE3 and AVX entry points.  The prologue and
#                  epilogue labels are taken from HandlerData[0..1] (see
#                  the .xdata records at the bottom).  Inside that range it
#                  copies 12 quadwords (six XMM registers) from 64(%rsp) to
#                  context->Xmm6 onwards and restores Rbx, Rbp and R12.
#
# Both handlers finish in .Lcommon_seh_tail, which restores Rsp, Rsi and
# Rdi, copies the CONTEXT (154 quadwords) to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
#
# The .pdata/.xdata sections emitted afterwards register the handlers for
# the generated functions; the AVX records are only emitted when $avx is
# set.
if ($win64) {
($rec,$frame,$context,$disp) = ("%rcx","%rdx","%r8","%r9");

$code .= <<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        lea     .Lprologue(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<.Lprologue
        jb      .Lcommon_seh_tail

        mov     152($context),%rax      # pull context->Rsp

        lea     .Lepilogue(%rip),%r10
        cmp     %r10,%rbx               # context->Rip>=.Lepilogue
        jae     .Lcommon_seh_tail

        mov     `16*4`(%rax),%rax       # pull saved stack pointer
        lea     32(%rax),%rax

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13

        jmp     .Lcommon_seh_tail
.size   se_handler,.-se_handler

.type   ssse3_handler,\@abi-omnipotent
.align  16
ssse3_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lcommon_seh_tail

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail

        lea     64(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$12,%ecx
        .long   0xa548f3fc              # cld; rep movsq
        lea     `24+64+6*16`(%rax),%rax # adjust stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12

.Lcommon_seh_tail:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   ssse3_handler,.-ssse3_handler

.section        .pdata
.align  4
        .rva    .LSEH_begin_sha1_block_data_order
        .rva    .LSEH_end_sha1_block_data_order
        .rva    .LSEH_info_sha1_block_data_order
        .rva    .LSEH_begin_sha1_block_data_order_ssse3
        .rva    .LSEH_end_sha1_block_data_order_ssse3
        .rva    .LSEH_info_sha1_block_data_order_ssse3
___
# Function-table entry for the AVX entry point.
$code .= <<___ if ($avx);
        .rva    .LSEH_begin_sha1_block_data_order_avx
        .rva    .LSEH_end_sha1_block_data_order_avx
        .rva    .LSEH_info_sha1_block_data_order_avx
___
# Unwind-info records; the two .rva values after ssse3_handler are the
# HandlerData[] words that handler reads.
$code .= <<___;
.section        .xdata
.align  8
.LSEH_info_sha1_block_data_order:
        .byte   9,0,0,0
        .rva    se_handler
.LSEH_info_sha1_block_data_order_ssse3:
        .byte   9,0,0,0
        .rva    ssse3_handler
        .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
___
# The AVX code path reuses ssse3_handler with its own label pair.
$code .= <<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
        .byte   9,0,0,0
        .rva    ssse3_handler
        .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
___
}
1283
1284 ####################################################################
1285
# Final pass: every `...` span in the generated text is replaced by the
# value of the Perl expression it contains (used above for constant
# arithmetic in operands), then the result is written out.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# close() is where buffered write errors (and, for a piped STDOUT, a failing
# downstream filter) are reported, so do not let a failure pass silently and
# leave a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";