1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and therefore better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
31 #       Westmere is left out of the loop; the gain was not estimated
32 #       to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
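#
# The gain column is simply (standalone AES + standalone SHA256) divided by
# the stitched figure; e.g. for Sandy Bridge with AES-128 it is
# (5.05+11.6)/13.0 = 1.28, i.e. +28%, and with AES-256 it is
# (7.05+11.6)/13.0 = 1.43, i.e. +43%.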
35
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=11);
60 }
61
62 $shaext=1;      ### set to zero if compiling for 1.0.1
63 $avx=1          if (!$shaext && $avx);
64
65 open OUT,"| \"$^X\" $xlate $flavour $output";
66 *STDOUT=*OUT;
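# A typical stand-alone invocation of this generator looks as follows (an
# illustrative sketch; the OpenSSL build system normally drives it through
# its perlasm rules, with CC set so that the assembler probe above can
# enable the AVX paths):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
#
# where the first argument is the perlasm flavour (elf, macosx, mingw64,
# nasm, masm, ...) and the second the output file handed to x86_64-xlate.pl.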
67
68 $func="aesni_cbc_sha256_enc";
69 $TABLE="K256";
70 $SZ=4;
71 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72                                 "%r8d","%r9d","%r10d","%r11d");
73 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
74 @Sigma0=( 2,13,22);
75 @Sigma1=( 6,11,25);
76 @sigma0=( 7,18, 3);
77 @sigma1=(17,19,10);
78 $rounds=64;
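# For reference, the rotation/shift counts above encode the standard SHA-256
# functions of FIPS 180-4, with $SZ=4-byte words and $rounds=64 rounds; a
# minimal scalar sketch, assuming a 32-bit rotate-right helper ror32():
#
#	sub Sigma0 { my $x=shift; ror32($x, 2)^ror32($x,13)^ror32($x,22) }
#	sub Sigma1 { my $x=shift; ror32($x, 6)^ror32($x,11)^ror32($x,25) }
#	sub sigma0 { my $x=shift; ror32($x, 7)^ror32($x,18)^($x>> 3)     }
#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10)     }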
79
80 ########################################################################
81 # void aesni_cbc_sha256_enc(const void *inp,
82 #                       void *out,
83 #                       size_t length,
84 #                       const AES_KEY *key,
85 #                       unsigned char *iv,
86 #                       SHA256_CTX *ctx,
87 #                       const void *in0);
88 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
89 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
90
91 $Tbl="%rbp";
92
93 $_inp="16*$SZ+0*8(%rsp)";
94 $_out="16*$SZ+1*8(%rsp)";
95 $_end="16*$SZ+2*8(%rsp)";
96 $_key="16*$SZ+3*8(%rsp)";
97 $_ivp="16*$SZ+4*8(%rsp)";
98 $_ctx="16*$SZ+5*8(%rsp)";
99 $_in0="16*$SZ+6*8(%rsp)";
100 $_rsp="16*$SZ+7*8(%rsp)";
101 $framesz=16*$SZ+8*8;
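# With $SZ=4 the frame works out to 16*4+8*8 = 128 bytes: 64 bytes at the
# bottom for the 16 pre-scheduled message words stored at 0x00-0x30(%rsp)
# in the loops below, plus the eight 8-byte save slots $_inp..$_rsp above.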
102
103 $code=<<___;
104 .text
105
106 .extern OPENSSL_ia32cap_P
107 .globl  $func
108 .type   $func,\@abi-omnipotent
109 .align  16
110 $func:
111 ___
112 $code.=<<___ if ($avx);
113         lea     OPENSSL_ia32cap_P(%rip),%r11
114         mov     \$1,%eax
115         cmp     \$0,`$win64?"%rcx":"%rdi"`
116         je      .Lprobe
117         mov     0(%r11),%eax
118         mov     4(%r11),%r10
119 ___
120 $code.=<<___ if ($shaext);
121         bt      \$61,%r10                       # check for SHA
122         jc      ${func}_shaext
123 ___
124 $code.=<<___;
125         mov     %r10,%r11
126         shr     \$32,%r11
127
128         test    \$`1<<11`,%r10d                 # check for XOP
129         jnz     ${func}_xop
130 ___
131 $code.=<<___ if ($avx>1);
132         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
133         cmp     \$`1<<8|1<<5|1<<3`,%r11d
134         je      ${func}_avx2
135 ___
136 $code.=<<___ if ($avx);
137         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
138         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
139         or      %eax,%r10d
140         cmp     \$`1<<28|1<<9|1<<30`,%r10d
141         je      ${func}_avx
142         ud2
143 ___
144 $code.=<<___;
145         xor     %eax,%eax
146         cmp     \$0,`$win64?"%rcx":"%rdi"`
147         je      .Lprobe
148         ud2
149 .Lprobe:
150         ret
151 .size   $func,.-$func
152
153 .align  64
154 .type   $TABLE,\@object
155 $TABLE:
156         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
157         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
158         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
159         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
162         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
163         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
164         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
165         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
166         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
167         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
170         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
171         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
172         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
173         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
174         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
175         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
176         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
177         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
178         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
179         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
180         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
181         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
182         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
183         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
184         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
185         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
186         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
187         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188
189         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
190         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
191         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
192         .long   0,0,0,0,   0,0,0,0
193         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
194 .align  64
195 ___
196
197 ######################################################################
198 # SIMD code paths
199 #
200 {{{
201 ($iv,$inout,$roundkey,$temp,
202  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
203
204 $aesni_cbc_idx=0;
205 @aesni_cbc_block = (
206 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
207 ##      &vmovdqu        ($inout,($inp));
208 ##      &mov            ($_inp,$inp);
209
210         '&vpxor         ($inout,$inout,$roundkey);'.
211         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
212
213         '&vpxor         ($inout,$inout,$iv);',
214
215         '&vaesenc       ($inout,$inout,$roundkey);'.
216         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
217
218         '&vaesenc       ($inout,$inout,$roundkey);'.
219         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
220
221         '&vaesenc       ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
223
224         '&vaesenc       ($inout,$inout,$roundkey);'.
225         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
226
227         '&vaesenc       ($inout,$inout,$roundkey);'.
228         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
229
230         '&vaesenc       ($inout,$inout,$roundkey);'.
231         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
232
233         '&vaesenc       ($inout,$inout,$roundkey);'.
234         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
235
236         '&vaesenc       ($inout,$inout,$roundkey);'.
237         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
238
239         '&vaesenc       ($inout,$inout,$roundkey);'.
240         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
241
242         '&vaesenclast   ($temp,$inout,$roundkey);'.
243         ' &vaesenc      ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
245
246         '&vpand         ($iv,$temp,$mask10);'.
247         ' &vaesenc      ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
249
250         '&vaesenclast   ($temp,$inout,$roundkey);'.
251         ' &vaesenc      ($inout,$inout,$roundkey);'.
252         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
253
254         '&vpand         ($temp,$temp,$mask12);'.
255         ' &vaesenc      ($inout,$inout,$roundkey);'.
256          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
257
258         '&vpor          ($iv,$iv,$temp);'.
259         ' &vaesenclast  ($temp,$inout,$roundkey);'.
260         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
261
262 ##      &mov            ($inp,$_inp);
263 ##      &mov            ($out,$_out);
264 ##      &vpand          ($temp,$temp,$mask14);
265 ##      &vpor           ($iv,$iv,$temp);
266 ##      &vmovdqu        ("($out,$inp)",$iv);    # write output
267 ##      &lea            ($inp,"16($inp)");      # inp++
268 );
269
270 my $a4=$T1;
271 my ($a,$b,$c,$d,$e,$f,$g,$h);
272
273 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
274 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
275   my $arg = pop;
276     $arg = "\$$arg" if ($arg*1 eq $arg);
277     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
278 }
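# For example (illustrative only, using the xmm assignments made above,
# $inout=%xmm9 and $roundkey=%xmm10), a call such as
#
#	&vpxor	($inout,$inout,$roundkey);
#
# is caught by AUTOLOAD and appends "\tvpxor\t%xmm10,%xmm9,%xmm9\n" to $code:
# the Perl argument list is destination-first and gets reversed into AT&T
# operand order, with bare numeric arguments turned into \$immediates.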
279
280 sub body_00_15 () {
281         (
282         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
283
284         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
285         '&mov   ($a,$a1)',
286         '&mov   ($a4,$f)',
287
288         '&xor   ($a0,$e)',
289         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
290         '&xor   ($a4,$g)',                      # f^g
291
292         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
293         '&xor   ($a1,$a)',
294         '&and   ($a4,$e)',                      # (f^g)&e
295
296         @aesni_cbc_block[$aesni_cbc_idx++].
297         '&xor   ($a0,$e)',
298         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
299         '&mov   ($a2,$a)',
300
301         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
302         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
303         '&xor   ($a2,$b)',                      # a^b, b^c in next round
304
305         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
306         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
307         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
308
309         '&xor   ($a1,$a)',
310         '&add   ($h,$a0)',                      # h+=Sigma1(e)
311         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
312
313         '&add   ($d,$h)',                       # d+=h
314         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
315         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
316
317         '&mov   ($a0,$d)',
318         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
319         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
320         );
321 }
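# Written out as a plain scalar sketch (illustrative, matching the per-line
# comments above), one body_00_15 iteration performs the standard SHA-256
# round
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1
#	h  = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g) computed as ((f^g)&e)^g and Maj(a,b,c) as ((b^c)&(a^b))^b,
# while one @aesni_cbc_block fragment per round keeps the AES unit busy.
# Note the final h+=Sigma0(a) is accumulated in $a1 and picked up at the
# start of the following round.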
322
323 if ($avx) {{
324 ######################################################################
325 # XOP code path
326 #
327 $code.=<<___;
328 .type   ${func}_xop,\@function,6
329 .align  64
330 ${func}_xop:
331 .Lxop_shortcut:
332         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
333         push    %rbx
334         push    %rbp
335         push    %r12
336         push    %r13
337         push    %r14
338         push    %r15
339         mov     %rsp,%r11               # copy %rsp
340         sub     \$`$framesz+$win64*16*10`,%rsp
341         and     \$-64,%rsp              # align stack frame
342
343         shl     \$6,$len
344         sub     $inp,$out               # re-bias
345         sub     $inp,$in0
346         add     $inp,$len               # end of input
347
348         #mov    $inp,$_inp              # saved later
349         mov     $out,$_out
350         mov     $len,$_end
351         #mov    $key,$_key              # remains resident in $inp register
352         mov     $ivp,$_ivp
353         mov     $ctx,$_ctx
354         mov     $in0,$_in0
355         mov     %r11,$_rsp
356 ___
357 $code.=<<___ if ($win64);
358         movaps  %xmm6,`$framesz+16*0`(%rsp)
359         movaps  %xmm7,`$framesz+16*1`(%rsp)
360         movaps  %xmm8,`$framesz+16*2`(%rsp)
361         movaps  %xmm9,`$framesz+16*3`(%rsp)
362         movaps  %xmm10,`$framesz+16*4`(%rsp)
363         movaps  %xmm11,`$framesz+16*5`(%rsp)
364         movaps  %xmm12,`$framesz+16*6`(%rsp)
365         movaps  %xmm13,`$framesz+16*7`(%rsp)
366         movaps  %xmm14,`$framesz+16*8`(%rsp)
367         movaps  %xmm15,`$framesz+16*9`(%rsp)
368 ___
369 $code.=<<___;
370 .Lprologue_xop:
371         vzeroall
372
373         mov     $inp,%r12               # borrow $a4
374         lea     0x80($key),$inp         # size optimization, reassign
375         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
376         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
377         mov     $ctx,%r15               # borrow $a2
378         mov     $in0,%rsi               # borrow $a3
379         vmovdqu ($ivp),$iv              # load IV
380         sub     \$9,%r14
381
382         mov     $SZ*0(%r15),$A
383         mov     $SZ*1(%r15),$B
384         mov     $SZ*2(%r15),$C
385         mov     $SZ*3(%r15),$D
386         mov     $SZ*4(%r15),$E
387         mov     $SZ*5(%r15),$F
388         mov     $SZ*6(%r15),$G
389         mov     $SZ*7(%r15),$H
390
391         vmovdqa 0x00(%r13,%r14,8),$mask14
392         vmovdqa 0x10(%r13,%r14,8),$mask12
393         vmovdqa 0x20(%r13,%r14,8),$mask10
394         vmovdqu 0x00-0x80($inp),$roundkey
395         jmp     .Lloop_xop
396 ___
397                                         if ($SZ==4) {   # SHA256
398     my @X = map("%xmm$_",(0..3));
399     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
400
401 $code.=<<___;
402 .align  16
403 .Lloop_xop:
404         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
405         vmovdqu 0x00(%rsi,%r12),@X[0]
406         vmovdqu 0x10(%rsi,%r12),@X[1]
407         vmovdqu 0x20(%rsi,%r12),@X[2]
408         vmovdqu 0x30(%rsi,%r12),@X[3]
409         vpshufb $t3,@X[0],@X[0]
410         lea     $TABLE(%rip),$Tbl
411         vpshufb $t3,@X[1],@X[1]
412         vpshufb $t3,@X[2],@X[2]
413         vpaddd  0x00($Tbl),@X[0],$t0
414         vpshufb $t3,@X[3],@X[3]
415         vpaddd  0x20($Tbl),@X[1],$t1
416         vpaddd  0x40($Tbl),@X[2],$t2
417         vpaddd  0x60($Tbl),@X[3],$t3
418         vmovdqa $t0,0x00(%rsp)
419         mov     $A,$a1
420         vmovdqa $t1,0x10(%rsp)
421         mov     $B,$a3
422         vmovdqa $t2,0x20(%rsp)
423         xor     $C,$a3                  # magic
424         vmovdqa $t3,0x30(%rsp)
425         mov     $E,$a0
426         jmp     .Lxop_00_47
427
428 .align  16
429 .Lxop_00_47:
430         sub     \$-16*2*$SZ,$Tbl        # size optimization
431         vmovdqu (%r12),$inout           # $a4
432         mov     %r12,$_inp              # $a4
433 ___
434 sub XOP_256_00_47 () {
435 my $j = shift;
436 my $body = shift;
437 my @X = @_;
438 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
439
440         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
441           eval(shift(@insns));
442           eval(shift(@insns));
443          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
444           eval(shift(@insns));
445           eval(shift(@insns));
446         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
447           eval(shift(@insns));
448           eval(shift(@insns));
449         &vpsrld         ($t0,$t0,$sigma0[2]);
450           eval(shift(@insns));
451           eval(shift(@insns));
452          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
453           eval(shift(@insns));
454           eval(shift(@insns));
455           eval(shift(@insns));
456           eval(shift(@insns));
457         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
458           eval(shift(@insns));
459           eval(shift(@insns));
460         &vpxor          ($t0,$t0,$t1);
461           eval(shift(@insns));
462           eval(shift(@insns));
463           eval(shift(@insns));
464           eval(shift(@insns));
465          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
466           eval(shift(@insns));
467           eval(shift(@insns));
468         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
469           eval(shift(@insns));
470           eval(shift(@insns));
471          &vpsrld        ($t2,@X[3],$sigma1[2]);
472           eval(shift(@insns));
473           eval(shift(@insns));
474         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
475           eval(shift(@insns));
476           eval(shift(@insns));
477          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
478           eval(shift(@insns));
479           eval(shift(@insns));
480          &vpxor         ($t3,$t3,$t2);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490         &vpsrldq        ($t3,$t3,8);
491           eval(shift(@insns));
492           eval(shift(@insns));
493           eval(shift(@insns));
494           eval(shift(@insns));
495         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
496           eval(shift(@insns));
497           eval(shift(@insns));
498           eval(shift(@insns));
499           eval(shift(@insns));
500          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
501           eval(shift(@insns));
502           eval(shift(@insns));
503          &vpsrld        ($t2,@X[0],$sigma1[2]);
504           eval(shift(@insns));
505           eval(shift(@insns));
506          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
507           eval(shift(@insns));
508           eval(shift(@insns));
509          &vpxor         ($t3,$t3,$t2);
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519         &vpslldq        ($t3,$t3,8);            # 22 instructions
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523           eval(shift(@insns));
524         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
525           eval(shift(@insns));
526           eval(shift(@insns));
527           eval(shift(@insns));
528           eval(shift(@insns));
529         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
530           foreach (@insns) { eval; }            # remaining instructions
531         &vmovdqa        (16*$j."(%rsp)",$t2);
532 }
533
534     $aesni_cbc_idx=0;
535     for ($i=0,$j=0; $j<4; $j++) {
536         &XOP_256_00_47($j,\&body_00_15,@X);
537         push(@X,shift(@X));                     # rotate(@X)
538     }
539         &mov            ("%r12",$_inp);         # borrow $a4
540         &vpand          ($temp,$temp,$mask14);
541         &mov            ("%r15",$_out);         # borrow $a2
542         &vpor           ($iv,$iv,$temp);
543         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
544         &lea            ("%r12","16(%r12)");    # inp++
545
546         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
547         &jne    (".Lxop_00_47");
548
549         &vmovdqu        ($inout,"(%r12)");
550         &mov            ($_inp,"%r12");
551
552     $aesni_cbc_idx=0;
553     for ($i=0; $i<16; ) {
554         foreach(body_00_15()) { eval; }
555     }
556                                         }
557 $code.=<<___;
558         mov     $_inp,%r12              # borrow $a4
559         mov     $_out,%r13              # borrow $a0
560         mov     $_ctx,%r15              # borrow $a2
561         mov     $_in0,%rsi              # borrow $a3
562
563         vpand   $mask14,$temp,$temp
564         mov     $a1,$A
565         vpor    $temp,$iv,$iv
566         vmovdqu $iv,(%r13,%r12)         # write output
567         lea     16(%r12),%r12           # inp++
568
569         add     $SZ*0(%r15),$A
570         add     $SZ*1(%r15),$B
571         add     $SZ*2(%r15),$C
572         add     $SZ*3(%r15),$D
573         add     $SZ*4(%r15),$E
574         add     $SZ*5(%r15),$F
575         add     $SZ*6(%r15),$G
576         add     $SZ*7(%r15),$H
577
578         cmp     $_end,%r12
579
580         mov     $A,$SZ*0(%r15)
581         mov     $B,$SZ*1(%r15)
582         mov     $C,$SZ*2(%r15)
583         mov     $D,$SZ*3(%r15)
584         mov     $E,$SZ*4(%r15)
585         mov     $F,$SZ*5(%r15)
586         mov     $G,$SZ*6(%r15)
587         mov     $H,$SZ*7(%r15)
588
589         jb      .Lloop_xop
590
591         mov     $_ivp,$ivp
592         mov     $_rsp,%rsi
593         vmovdqu $iv,($ivp)              # output IV
594         vzeroall
595 ___
596 $code.=<<___ if ($win64);
597         movaps  `$framesz+16*0`(%rsp),%xmm6
598         movaps  `$framesz+16*1`(%rsp),%xmm7
599         movaps  `$framesz+16*2`(%rsp),%xmm8
600         movaps  `$framesz+16*3`(%rsp),%xmm9
601         movaps  `$framesz+16*4`(%rsp),%xmm10
602         movaps  `$framesz+16*5`(%rsp),%xmm11
603         movaps  `$framesz+16*6`(%rsp),%xmm12
604         movaps  `$framesz+16*7`(%rsp),%xmm13
605         movaps  `$framesz+16*8`(%rsp),%xmm14
606         movaps  `$framesz+16*9`(%rsp),%xmm15
607 ___
608 $code.=<<___;
609         mov     (%rsi),%r15
610         mov     8(%rsi),%r14
611         mov     16(%rsi),%r13
612         mov     24(%rsi),%r12
613         mov     32(%rsi),%rbp
614         mov     40(%rsi),%rbx
615         lea     48(%rsi),%rsp
616 .Lepilogue_xop:
617         ret
618 .size   ${func}_xop,.-${func}_xop
619 ___
620 ######################################################################
621 # AVX+shrd code path
622 #
623 local *ror = sub { &shrd(@_[0],@_) };
624
625 $code.=<<___;
626 .type   ${func}_avx,\@function,6
627 .align  64
628 ${func}_avx:
629 .Lavx_shortcut:
630         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
631         push    %rbx
632         push    %rbp
633         push    %r12
634         push    %r13
635         push    %r14
636         push    %r15
637         mov     %rsp,%r11               # copy %rsp
638         sub     \$`$framesz+$win64*16*10`,%rsp
639         and     \$-64,%rsp              # align stack frame
640
641         shl     \$6,$len
642         sub     $inp,$out               # re-bias
643         sub     $inp,$in0
644         add     $inp,$len               # end of input
645
646         #mov    $inp,$_inp              # saved later
647         mov     $out,$_out
648         mov     $len,$_end
649         #mov    $key,$_key              # remains resident in $inp register
650         mov     $ivp,$_ivp
651         mov     $ctx,$_ctx
652         mov     $in0,$_in0
653         mov     %r11,$_rsp
654 ___
655 $code.=<<___ if ($win64);
656         movaps  %xmm6,`$framesz+16*0`(%rsp)
657         movaps  %xmm7,`$framesz+16*1`(%rsp)
658         movaps  %xmm8,`$framesz+16*2`(%rsp)
659         movaps  %xmm9,`$framesz+16*3`(%rsp)
660         movaps  %xmm10,`$framesz+16*4`(%rsp)
661         movaps  %xmm11,`$framesz+16*5`(%rsp)
662         movaps  %xmm12,`$framesz+16*6`(%rsp)
663         movaps  %xmm13,`$framesz+16*7`(%rsp)
664         movaps  %xmm14,`$framesz+16*8`(%rsp)
665         movaps  %xmm15,`$framesz+16*9`(%rsp)
666 ___
667 $code.=<<___;
668 .Lprologue_avx:
669         vzeroall
670
671         mov     $inp,%r12               # borrow $a4
672         lea     0x80($key),$inp         # size optimization, reassign
673         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
674         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
675         mov     $ctx,%r15               # borrow $a2
676         mov     $in0,%rsi               # borrow $a3
677         vmovdqu ($ivp),$iv              # load IV
678         sub     \$9,%r14
679
680         mov     $SZ*0(%r15),$A
681         mov     $SZ*1(%r15),$B
682         mov     $SZ*2(%r15),$C
683         mov     $SZ*3(%r15),$D
684         mov     $SZ*4(%r15),$E
685         mov     $SZ*5(%r15),$F
686         mov     $SZ*6(%r15),$G
687         mov     $SZ*7(%r15),$H
688
689         vmovdqa 0x00(%r13,%r14,8),$mask14
690         vmovdqa 0x10(%r13,%r14,8),$mask12
691         vmovdqa 0x20(%r13,%r14,8),$mask10
692         vmovdqu 0x00-0x80($inp),$roundkey
693 ___
694                                         if ($SZ==4) {   # SHA256
695     my @X = map("%xmm$_",(0..3));
696     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
697
698 $code.=<<___;
699         jmp     .Lloop_avx
700 .align  16
701 .Lloop_avx:
702         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
703         vmovdqu 0x00(%rsi,%r12),@X[0]
704         vmovdqu 0x10(%rsi,%r12),@X[1]
705         vmovdqu 0x20(%rsi,%r12),@X[2]
706         vmovdqu 0x30(%rsi,%r12),@X[3]
707         vpshufb $t3,@X[0],@X[0]
708         lea     $TABLE(%rip),$Tbl
709         vpshufb $t3,@X[1],@X[1]
710         vpshufb $t3,@X[2],@X[2]
711         vpaddd  0x00($Tbl),@X[0],$t0
712         vpshufb $t3,@X[3],@X[3]
713         vpaddd  0x20($Tbl),@X[1],$t1
714         vpaddd  0x40($Tbl),@X[2],$t2
715         vpaddd  0x60($Tbl),@X[3],$t3
716         vmovdqa $t0,0x00(%rsp)
717         mov     $A,$a1
718         vmovdqa $t1,0x10(%rsp)
719         mov     $B,$a3
720         vmovdqa $t2,0x20(%rsp)
721         xor     $C,$a3                  # magic
722         vmovdqa $t3,0x30(%rsp)
723         mov     $E,$a0
724         jmp     .Lavx_00_47
725
726 .align  16
727 .Lavx_00_47:
728         sub     \$-16*2*$SZ,$Tbl        # size optimization
729         vmovdqu (%r12),$inout           # $a4
730         mov     %r12,$_inp              # $a4
731 ___
732 sub Xupdate_256_AVX () {
733         (
734         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
735          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
736         '&vpsrld        ($t2,$t0,$sigma0[0]);',
737          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
738         '&vpsrld        ($t3,$t0,$sigma0[2])',
739         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
740         '&vpxor         ($t0,$t3,$t2)',
741          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
742         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
743         '&vpxor         ($t0,$t0,$t1)',
744         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
745         '&vpxor         ($t0,$t0,$t2)',
746          '&vpsrld       ($t2,$t3,$sigma1[2]);',
747         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
748          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
749         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
750          '&vpxor        ($t2,$t2,$t3);',
751          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
752          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
753          '&vpshufd      ($t2,$t2,0b10000100)',
754          '&vpsrldq      ($t2,$t2,8)',
755         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
756          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
757          '&vpsrld       ($t2,$t3,$sigma1[2])',
758          '&vpsrlq       ($t3,$t3,$sigma1[0])',
759          '&vpxor        ($t2,$t2,$t3);',
760          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
761          '&vpxor        ($t2,$t2,$t3)',
762          '&vpshufd      ($t2,$t2,0b11101000)',
763          '&vpslldq      ($t2,$t2,8)',
764         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
765         );
766 }
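# The fragment above is the SHA-256 message schedule computed four words at
# a time; as a scalar sketch (illustrative), for i >= 16
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# where vpalignr extracts the X[1..4] and X[9..12] windows from adjacent
# registers and the shift/xor pairs assemble sigma0/sigma1 from the counts
# in @sigma0/@sigma1.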
767
768 sub AVX_256_00_47 () {
769 my $j = shift;
770 my $body = shift;
771 my @X = @_;
772 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
773
774         foreach (Xupdate_256_AVX()) {           # 29 instructions
775             eval;
776             eval(shift(@insns));
777             eval(shift(@insns));
778             eval(shift(@insns));
779         }
780         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
781           foreach (@insns) { eval; }            # remaining instructions
782         &vmovdqa        (16*$j."(%rsp)",$t2);
783 }
784
785     $aesni_cbc_idx=0;
786     for ($i=0,$j=0; $j<4; $j++) {
787         &AVX_256_00_47($j,\&body_00_15,@X);
788         push(@X,shift(@X));                     # rotate(@X)
789     }
790         &mov            ("%r12",$_inp);         # borrow $a4
791         &vpand          ($temp,$temp,$mask14);
792         &mov            ("%r15",$_out);         # borrow $a2
793         &vpor           ($iv,$iv,$temp);
794         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
795         &lea            ("%r12","16(%r12)");    # inp++
796
797         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
798         &jne    (".Lavx_00_47");
799
800         &vmovdqu        ($inout,"(%r12)");
801         &mov            ($_inp,"%r12");
802
803     $aesni_cbc_idx=0;
804     for ($i=0; $i<16; ) {
805         foreach(body_00_15()) { eval; }
806     }
807
808                                         }
809 $code.=<<___;
810         mov     $_inp,%r12              # borrow $a4
811         mov     $_out,%r13              # borrow $a0
812         mov     $_ctx,%r15              # borrow $a2
813         mov     $_in0,%rsi              # borrow $a3
814
815         vpand   $mask14,$temp,$temp
816         mov     $a1,$A
817         vpor    $temp,$iv,$iv
818         vmovdqu $iv,(%r13,%r12)         # write output
819         lea     16(%r12),%r12           # inp++
820
821         add     $SZ*0(%r15),$A
822         add     $SZ*1(%r15),$B
823         add     $SZ*2(%r15),$C
824         add     $SZ*3(%r15),$D
825         add     $SZ*4(%r15),$E
826         add     $SZ*5(%r15),$F
827         add     $SZ*6(%r15),$G
828         add     $SZ*7(%r15),$H
829
830         cmp     $_end,%r12
831
832         mov     $A,$SZ*0(%r15)
833         mov     $B,$SZ*1(%r15)
834         mov     $C,$SZ*2(%r15)
835         mov     $D,$SZ*3(%r15)
836         mov     $E,$SZ*4(%r15)
837         mov     $F,$SZ*5(%r15)
838         mov     $G,$SZ*6(%r15)
839         mov     $H,$SZ*7(%r15)
840         jb      .Lloop_avx
841
842         mov     $_ivp,$ivp
843         mov     $_rsp,%rsi
844         vmovdqu $iv,($ivp)              # output IV
845         vzeroall
846 ___
847 $code.=<<___ if ($win64);
848         movaps  `$framesz+16*0`(%rsp),%xmm6
849         movaps  `$framesz+16*1`(%rsp),%xmm7
850         movaps  `$framesz+16*2`(%rsp),%xmm8
851         movaps  `$framesz+16*3`(%rsp),%xmm9
852         movaps  `$framesz+16*4`(%rsp),%xmm10
853         movaps  `$framesz+16*5`(%rsp),%xmm11
854         movaps  `$framesz+16*6`(%rsp),%xmm12
855         movaps  `$framesz+16*7`(%rsp),%xmm13
856         movaps  `$framesz+16*8`(%rsp),%xmm14
857         movaps  `$framesz+16*9`(%rsp),%xmm15
858 ___
859 $code.=<<___;
860         mov     (%rsi),%r15
861         mov     8(%rsi),%r14
862         mov     16(%rsi),%r13
863         mov     24(%rsi),%r12
864         mov     32(%rsi),%rbp
865         mov     40(%rsi),%rbx
866         lea     48(%rsi),%rsp
867 .Lepilogue_avx:
868         ret
869 .size   ${func}_avx,.-${func}_avx
870 ___
871
872 if ($avx>1) {{
873 ######################################################################
874 # AVX2+BMI code path
875 #
876 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
877 my $PUSH8=8*2*$SZ;
878 use integer;
879
880 sub bodyx_00_15 () {
881         # on entry $a1 must be zero, $a3 must hold $b^$c and $a4 a copy of $f
882         (
883         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
884
885         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
886         '&and   ($a4,$e)',              # f&e
887         '&rorx  ($a0,$e,$Sigma1[2])',
888         '&rorx  ($a2,$e,$Sigma1[1])',
889
890         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
891         '&lea   ($h,"($h,$a4)")',
892         '&andn  ($a4,$e,$g)',           # ~e&g
893         '&xor   ($a0,$a2)',
894
895         '&rorx  ($a1,$e,$Sigma1[0])',
896         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
897         '&xor   ($a0,$a1)',             # Sigma1(e)
898         '&mov   ($a2,$a)',
899
900         '&rorx  ($a4,$a,$Sigma0[2])',
901         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
902         '&xor   ($a2,$b)',              # a^b, b^c in next round
903         '&rorx  ($a1,$a,$Sigma0[1])',
904
905         '&rorx  ($a0,$a,$Sigma0[0])',
906         '&lea   ($d,"($d,$h)")',        # d+=h
907         '&and   ($a3,$a2)',             # (b^c)&(a^b)
908         @aesni_cbc_block[$aesni_cbc_idx++].
909         '&xor   ($a1,$a4)',
910
911         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
912         '&xor   ($a1,$a0)',             # Sigma0(a)
913         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
914         '&mov   ($a4,$e)',              # copy of f in future
915
916         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
917         );
918         # and at the finish the caller has to do $a+=$a1
919 }
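# Note on the Ch() form used above: "(e&f)+(~e&g)" is exact because the two
# terms can never have a set bit in common, (e&f)&(~e&g) = 0, so addition
# coincides with XOR; expressing it as an addition lets it be folded into
# the chain of lea/add instructions together with the other h+=... terms.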
920
921 $code.=<<___;
922 .type   ${func}_avx2,\@function,6
923 .align  64
924 ${func}_avx2:
925 .Lavx2_shortcut:
926         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
927         push    %rbx
928         push    %rbp
929         push    %r12
930         push    %r13
931         push    %r14
932         push    %r15
933         mov     %rsp,%r11               # copy %rsp
934         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
935         and     \$-256*$SZ,%rsp         # align stack frame
936         add     \$`2*$SZ*($rounds-8)`,%rsp
937
938         shl     \$6,$len
939         sub     $inp,$out               # re-bias
940         sub     $inp,$in0
941         add     $inp,$len               # end of input
942
943         #mov    $inp,$_inp              # saved later
944         #mov    $out,$_out              # kept in $offload
945         mov     $len,$_end
946         #mov    $key,$_key              # remains resident in $inp register
947         mov     $ivp,$_ivp
948         mov     $ctx,$_ctx
949         mov     $in0,$_in0
950         mov     %r11,$_rsp
951 ___
952 $code.=<<___ if ($win64);
953         movaps  %xmm6,`$framesz+16*0`(%rsp)
954         movaps  %xmm7,`$framesz+16*1`(%rsp)
955         movaps  %xmm8,`$framesz+16*2`(%rsp)
956         movaps  %xmm9,`$framesz+16*3`(%rsp)
957         movaps  %xmm10,`$framesz+16*4`(%rsp)
958         movaps  %xmm11,`$framesz+16*5`(%rsp)
959         movaps  %xmm12,`$framesz+16*6`(%rsp)
960         movaps  %xmm13,`$framesz+16*7`(%rsp)
961         movaps  %xmm14,`$framesz+16*8`(%rsp)
962         movaps  %xmm15,`$framesz+16*9`(%rsp)
963 ___
964 $code.=<<___;
965 .Lprologue_avx2:
966         vzeroall
967
968         mov     $inp,%r13               # borrow $a0
969         vpinsrq \$1,$out,$offload,$offload
970         lea     0x80($key),$inp         # size optimization, reassign
971         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
972         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
973         mov     $ctx,%r15               # borrow $a2
974         mov     $in0,%rsi               # borrow $a3
975         vmovdqu ($ivp),$iv              # load IV
976         lea     -9(%r14),%r14
977
978         vmovdqa 0x00(%r12,%r14,8),$mask14
979         vmovdqa 0x10(%r12,%r14,8),$mask12
980         vmovdqa 0x20(%r12,%r14,8),$mask10
981
982         sub     \$-16*$SZ,%r13          # inp++, size optimization
983         mov     $SZ*0(%r15),$A
984         lea     (%rsi,%r13),%r12        # borrow $a0
985         mov     $SZ*1(%r15),$B
986         cmp     $len,%r13               # $_end
987         mov     $SZ*2(%r15),$C
988         cmove   %rsp,%r12               # next block or random data
989         mov     $SZ*3(%r15),$D
990         mov     $SZ*4(%r15),$E
991         mov     $SZ*5(%r15),$F
992         mov     $SZ*6(%r15),$G
993         mov     $SZ*7(%r15),$H
994         vmovdqu 0x00-0x80($inp),$roundkey
995 ___
996                                         if ($SZ==4) {   # SHA256
997     my @X = map("%ymm$_",(0..3));
998     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
999
1000 $code.=<<___;
1001         jmp     .Loop_avx2
1002 .align  16
1003 .Loop_avx2:
1004         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1005         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1006         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1007         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1008         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1009
1010         vinserti128     \$1,(%r12),@X[0],@X[0]
1011         vinserti128     \$1,16(%r12),@X[1],@X[1]
1012          vpshufb        $t3,@X[0],@X[0]
1013         vinserti128     \$1,32(%r12),@X[2],@X[2]
1014          vpshufb        $t3,@X[1],@X[1]
1015         vinserti128     \$1,48(%r12),@X[3],@X[3]
1016
1017         lea     $TABLE(%rip),$Tbl
1018         vpshufb $t3,@X[2],@X[2]
1019         lea     -16*$SZ(%r13),%r13
1020         vpaddd  0x00($Tbl),@X[0],$t0
1021         vpshufb $t3,@X[3],@X[3]
1022         vpaddd  0x20($Tbl),@X[1],$t1
1023         vpaddd  0x40($Tbl),@X[2],$t2
1024         vpaddd  0x60($Tbl),@X[3],$t3
1025         vmovdqa $t0,0x00(%rsp)
1026         xor     $a1,$a1
1027         vmovdqa $t1,0x20(%rsp)
1028         lea     -$PUSH8(%rsp),%rsp
1029         mov     $B,$a3
1030         vmovdqa $t2,0x00(%rsp)
1031         xor     $C,$a3                  # magic
1032         vmovdqa $t3,0x20(%rsp)
1033         mov     $F,$a4
1034         sub     \$-16*2*$SZ,$Tbl        # size optimization
1035         jmp     .Lavx2_00_47
1036
1037 .align  16
1038 .Lavx2_00_47:
1039         vmovdqu (%r13),$inout
1040         vpinsrq \$0,%r13,$offload,$offload
1041 ___
1042
1043 sub AVX2_256_00_47 () {
1044 my $j = shift;
1045 my $body = shift;
1046 my @X = @_;
1047 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1048 my $base = "+2*$PUSH8(%rsp)";
1049
1050         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1051         foreach (Xupdate_256_AVX()) {           # 29 instructions
1052             eval;
1053             eval(shift(@insns));
1054             eval(shift(@insns));
1055             eval(shift(@insns));
1056         }
1057         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1058           foreach (@insns) { eval; }            # remaining instructions
1059         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1060 }
1061     $aesni_cbc_idx=0;
1062     for ($i=0,$j=0; $j<4; $j++) {
1063         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1064         push(@X,shift(@X));                     # rotate(@X)
1065     }
1066         &vmovq          ("%r13",$offload);      # borrow $a0
1067         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1068         &vpand          ($temp,$temp,$mask14);
1069         &vpor           ($iv,$iv,$temp);
1070         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1071         &lea            ("%r13","16(%r13)");    # inp++
1072
1073         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1074         &cmpb   (($SZ-1)."($Tbl)",0);
1075         &jne    (".Lavx2_00_47");
1076
1077         &vmovdqu        ($inout,"(%r13)");
1078         &vpinsrq        ($offload,$offload,"%r13",0);
1079
1080     $aesni_cbc_idx=0;
1081     for ($i=0; $i<16; ) {
1082         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1083         foreach(bodyx_00_15()) { eval; }
1084     }
1085                                         }
1086 $code.=<<___;
1087         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1088         vmovq   $offload,%r13                   # $_inp, borrow $a0
1089         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1090         add     $a1,$A
1091         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1092
1093         vpand   $mask14,$temp,$temp
1094         vpor    $temp,$iv,$iv
1095         vmovdqu $iv,(%r12,%r13)                 # write output
1096         lea     16(%r13),%r13
1097
1098         add     $SZ*0(%r15),$A
1099         add     $SZ*1(%r15),$B
1100         add     $SZ*2(%r15),$C
1101         add     $SZ*3(%r15),$D
1102         add     $SZ*4(%r15),$E
1103         add     $SZ*5(%r15),$F
1104         add     $SZ*6(%r15),$G
1105         add     $SZ*7(%r15),$H
1106
1107         mov     $A,$SZ*0(%r15)
1108         mov     $B,$SZ*1(%r15)
1109         mov     $C,$SZ*2(%r15)
1110         mov     $D,$SZ*3(%r15)
1111         mov     $E,$SZ*4(%r15)
1112         mov     $F,$SZ*5(%r15)
1113         mov     $G,$SZ*6(%r15)
1114         mov     $H,$SZ*7(%r15)
1115
1116         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1117         je      .Ldone_avx2
1118
1119         xor     $a1,$a1
1120         mov     $B,$a3
1121         mov     $F,$a4
1122         xor     $C,$a3                  # magic
1123         jmp     .Lower_avx2
1124 .align  16
1125 .Lower_avx2:
1126         vmovdqu (%r13),$inout
1127         vpinsrq \$0,%r13,$offload,$offload
1128 ___
1129     $aesni_cbc_idx=0;
1130     for ($i=0; $i<16; ) {
1131         my $base="+16($Tbl)";
1132         foreach(bodyx_00_15()) { eval; }
1133         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1134     }
1135 $code.=<<___;
1136         vmovq   $offload,%r13                   # borrow $a0
1137         vpextrq \$1,$offload,%r15               # borrow $a2
1138         vpand   $mask14,$temp,$temp
1139         vpor    $temp,$iv,$iv
1140         lea     -$PUSH8($Tbl),$Tbl
1141         vmovdqu $iv,(%r15,%r13)                 # write output
1142         lea     16(%r13),%r13                   # inp++
1143         cmp     %rsp,$Tbl
1144         jae     .Lower_avx2
1145
1146         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1147         lea     16*$SZ(%r13),%r13
1148         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1149         add     $a1,$A
1150         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1151
1152         add     $SZ*0(%r15),$A
1153         add     $SZ*1(%r15),$B
1154         add     $SZ*2(%r15),$C
1155         add     $SZ*3(%r15),$D
1156         add     $SZ*4(%r15),$E
1157         add     $SZ*5(%r15),$F
1158         add     $SZ*6(%r15),$G
1159         lea     (%rsi,%r13),%r12
1160         add     $SZ*7(%r15),$H
1161
1162         cmp     $_end,%r13
1163
1164         mov     $A,$SZ*0(%r15)
1165         cmove   %rsp,%r12               # next block or stale data
1166         mov     $B,$SZ*1(%r15)
1167         mov     $C,$SZ*2(%r15)
1168         mov     $D,$SZ*3(%r15)
1169         mov     $E,$SZ*4(%r15)
1170         mov     $F,$SZ*5(%r15)
1171         mov     $G,$SZ*6(%r15)
1172         mov     $H,$SZ*7(%r15)
1173
1174         jbe     .Loop_avx2
1175         lea     (%rsp),$Tbl
1176
1177 .Ldone_avx2:
1178         lea     ($Tbl),%rsp
1179         mov     $_ivp,$ivp
1180         mov     $_rsp,%rsi
1181         vmovdqu $iv,($ivp)              # output IV
1182         vzeroall
1183 ___
1184 $code.=<<___ if ($win64);
1185         movaps  `$framesz+16*0`(%rsp),%xmm6
1186         movaps  `$framesz+16*1`(%rsp),%xmm7
1187         movaps  `$framesz+16*2`(%rsp),%xmm8
1188         movaps  `$framesz+16*3`(%rsp),%xmm9
1189         movaps  `$framesz+16*4`(%rsp),%xmm10
1190         movaps  `$framesz+16*5`(%rsp),%xmm11
1191         movaps  `$framesz+16*6`(%rsp),%xmm12
1192         movaps  `$framesz+16*7`(%rsp),%xmm13
1193         movaps  `$framesz+16*8`(%rsp),%xmm14
1194         movaps  `$framesz+16*9`(%rsp),%xmm15
1195 ___
1196 $code.=<<___;
1197         mov     (%rsi),%r15
1198         mov     8(%rsi),%r14
1199         mov     16(%rsi),%r13
1200         mov     24(%rsi),%r12
1201         mov     32(%rsi),%rbp
1202         mov     40(%rsi),%rbx
1203         lea     48(%rsi),%rsp
1204 .Lepilogue_avx2:
1205         ret
1206 .size   ${func}_avx2,.-${func}_avx2
1207 ___
1208 }}
1209 }}
1210 {{
1211 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1212
1213 my ($rounds,$Tbl)=("%r11d","%rbx");
1214
1215 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1216 my @rndkey=("%xmm4","%xmm5");
1217 my $r=0;
1218 my $sn=0;
1219
1220 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1221 my @MSG=map("%xmm$_",(10..13));
1222
1223 my $aesenc=sub {
1224   use integer;
1225   my ($n,$k)=($r/10,$r%10);
1226     if ($k==0) {
1227       $code.=<<___;
1228         movups          `16*$n`($in0),$in               # load input
1229         xorps           $rndkey0,$in
1230 ___
1231       $code.=<<___ if ($n);
1232         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1233 ___
1234       $code.=<<___;
1235         xorps           $in,$iv
1236         movups          `32+16*$k-112`($key),$rndkey[1]
1237         aesenc          $rndkey[0],$iv
1238 ___
1239     } elsif ($k==9) {
1240       $sn++;
1241       $code.=<<___;
1242         cmp             \$11,$rounds
1243         jb              .Laesenclast$sn
1244         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1245         aesenc          $rndkey[0],$iv
1246         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1247         aesenc          $rndkey[1],$iv
1248         je              .Laesenclast$sn
1249         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1250         aesenc          $rndkey[0],$iv
1251         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1252         aesenc          $rndkey[1],$iv
1253 .Laesenclast$sn:
1254         aesenclast      $rndkey[0],$iv
1255         movups          16-112($key),$rndkey[1]         # forward reference
1256         nop
1257 ___
1258     } else {
1259       $code.=<<___;
1260         movups          `32+16*$k-112`($key),$rndkey[1]
1261         aesenc          $rndkey[0],$iv
1262 ___
1263     }
1264     $r++;       unshift(@rndkey,pop(@rndkey));
1265 };
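# $aesenc emits exactly one AES round per invocation: $r counts invocations,
# $n=$r/10 is the data block being processed and $k=$r%10 the round position
# within it. $k==0 loads and whitens a fresh input block (and stores the
# previous ciphertext block), $k==9 finishes the block, branching on the
# key's round count to cover 10/12/14-round (AES-128/-192/-256) schedules;
# $sn merely gives each .Laesenclast label a unique suffix.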
1266
1267 if ($shaext) {
1268 my $Tbl="%rax";
1269
1270 $code.=<<___;
1271 .type   ${func}_shaext,\@function,6
1272 .align  32
1273 ${func}_shaext:
1274         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1275 ___
1276 $code.=<<___ if ($win64);
1277         lea     `-8-10*16`(%rsp),%rsp
1278         movaps  %xmm6,-8-10*16(%rax)
1279         movaps  %xmm7,-8-9*16(%rax)
1280         movaps  %xmm8,-8-8*16(%rax)
1281         movaps  %xmm9,-8-7*16(%rax)
1282         movaps  %xmm10,-8-6*16(%rax)
1283         movaps  %xmm11,-8-5*16(%rax)
1284         movaps  %xmm12,-8-4*16(%rax)
1285         movaps  %xmm13,-8-3*16(%rax)
1286         movaps  %xmm14,-8-2*16(%rax)
1287         movaps  %xmm15,-8-1*16(%rax)
1288 .Lprologue_shaext:
1289 ___
1290 $code.=<<___;
1291         lea             K256+0x80(%rip),$Tbl
1292         movdqu          ($ctx),$ABEF            # DCBA
1293         movdqu          16($ctx),$CDGH          # HGFE
1294         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1295
1296         mov             240($key),$rounds
1297         sub             $in0,$out
1298         movups          ($key),$rndkey0         # $key[0]
1299         movups          16($key),$rndkey[0]     # forward reference
1300         lea             112($key),$key          # size optimization
1301
1302         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1303         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1304         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1305         movdqa          $TMP,$BSWAP             # offload
1306         palignr         \$8,$CDGH,$ABEF         # ABEF
1307         punpcklqdq      $Wi,$CDGH               # CDGH
1308
1309         jmp     .Loop_shaext
1310
1311 .align  16
1312 .Loop_shaext:
1313         movdqu          ($inp),@MSG[0]
1314         movdqu          0x10($inp),@MSG[1]
1315         movdqu          0x20($inp),@MSG[2]
1316         pshufb          $TMP,@MSG[0]
1317         movdqu          0x30($inp),@MSG[3]
1318
1319         movdqa          0*32-0x80($Tbl),$Wi
1320         paddd           @MSG[0],$Wi
1321         pshufb          $TMP,@MSG[1]
1322         movdqa          $CDGH,$CDGH_SAVE        # offload
1323         movdqa          $ABEF,$ABEF_SAVE        # offload
1324 ___
1325         &$aesenc();
1326 $code.=<<___;
1327         sha256rnds2     $ABEF,$CDGH             # 0-3
1328         pshufd          \$0x0e,$Wi,$Wi
1329 ___
1330         &$aesenc();
1331 $code.=<<___;
1332         sha256rnds2     $CDGH,$ABEF
1333
1334         movdqa          1*32-0x80($Tbl),$Wi
1335         paddd           @MSG[1],$Wi
1336         pshufb          $TMP,@MSG[2]
1337         lea             0x40($inp),$inp
1338 ___
1339         &$aesenc();
1340 $code.=<<___;
1341         sha256rnds2     $ABEF,$CDGH             # 4-7
1342         pshufd          \$0x0e,$Wi,$Wi
1343 ___
1344         &$aesenc();
1345 $code.=<<___;
1346         sha256rnds2     $CDGH,$ABEF
1347
1348         movdqa          2*32-0x80($Tbl),$Wi
1349         paddd           @MSG[2],$Wi
1350         pshufb          $TMP,@MSG[3]
1351         sha256msg1      @MSG[1],@MSG[0]
1352 ___
1353         &$aesenc();
1354 $code.=<<___;
1355         sha256rnds2     $ABEF,$CDGH             # 8-11
1356         pshufd          \$0x0e,$Wi,$Wi
1357         movdqa          @MSG[3],$TMP
1358         palignr         \$4,@MSG[2],$TMP
1359         paddd           $TMP,@MSG[0]
1360 ___
1361         &$aesenc();
1362 $code.=<<___;
1363         sha256rnds2     $CDGH,$ABEF
1364
1365         movdqa          3*32-0x80($Tbl),$Wi
1366         paddd           @MSG[3],$Wi
1367         sha256msg2      @MSG[3],@MSG[0]
1368         sha256msg1      @MSG[2],@MSG[1]
1369 ___
1370         &$aesenc();
1371 $code.=<<___;
1372         sha256rnds2     $ABEF,$CDGH             # 12-15
1373         pshufd          \$0x0e,$Wi,$Wi
1374 ___
1375         &$aesenc();
1376 $code.=<<___;
1377         movdqa          @MSG[0],$TMP
1378         palignr         \$4,@MSG[3],$TMP
1379         paddd           $TMP,@MSG[1]
1380         sha256rnds2     $CDGH,$ABEF
1381 ___
1382 for($i=4;$i<16-3;$i++) {
1383         &$aesenc()      if (($r%10)==0);
1384 $code.=<<___;
1385         movdqa          $i*32-0x80($Tbl),$Wi
1386         paddd           @MSG[0],$Wi
1387         sha256msg2      @MSG[0],@MSG[1]
1388         sha256msg1      @MSG[3],@MSG[2]
1389 ___
1390         &$aesenc();
1391 $code.=<<___;
1392         sha256rnds2     $ABEF,$CDGH             # 16-19...
1393         pshufd          \$0x0e,$Wi,$Wi
1394         movdqa          @MSG[1],$TMP
1395         palignr         \$4,@MSG[0],$TMP
1396         paddd           $TMP,@MSG[2]
1397 ___
1398         &$aesenc();
1399         &$aesenc()      if ($r==19);
1400 $code.=<<___;
1401         sha256rnds2     $CDGH,$ABEF
1402 ___
1403         push(@MSG,shift(@MSG));
1404 }
1405 $code.=<<___;
1406         movdqa          13*32-0x80($Tbl),$Wi
1407         paddd           @MSG[0],$Wi
1408         sha256msg2      @MSG[0],@MSG[1]
1409         sha256msg1      @MSG[3],@MSG[2]
1410 ___
1411         &$aesenc();
1412 $code.=<<___;
1413         sha256rnds2     $ABEF,$CDGH             # 52-55
1414         pshufd          \$0x0e,$Wi,$Wi
1415         movdqa          @MSG[1],$TMP
1416         palignr         \$4,@MSG[0],$TMP
1417         paddd           $TMP,@MSG[2]
1418 ___
1419         &$aesenc();
1420         &$aesenc();
1421 $code.=<<___;
1422         sha256rnds2     $CDGH,$ABEF
1423
1424         movdqa          14*32-0x80($Tbl),$Wi
1425         paddd           @MSG[1],$Wi
1426         sha256msg2      @MSG[1],@MSG[2]
1427         movdqa          $BSWAP,$TMP
1428 ___
1429         &$aesenc();
1430 $code.=<<___;
1431         sha256rnds2     $ABEF,$CDGH             # 56-59
1432         pshufd          \$0x0e,$Wi,$Wi
1433 ___
1434         &$aesenc();
1435 $code.=<<___;
1436         sha256rnds2     $CDGH,$ABEF
1437
1438         movdqa          15*32-0x80($Tbl),$Wi
1439         paddd           @MSG[2],$Wi
1440 ___
1441         &$aesenc();
1442         &$aesenc();
1443 $code.=<<___;
1444         sha256rnds2     $ABEF,$CDGH             # 60-63
1445         pshufd          \$0x0e,$Wi,$Wi
1446 ___
1447         &$aesenc();
1448 $code.=<<___;
1449         sha256rnds2     $CDGH,$ABEF
1450         #pxor           $CDGH,$rndkey0          # black magic
1451 ___
1452         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1453 $code.=<<___;
1454         #xorps          $CDGH,$rndkey0          # black magic
1455         paddd           $CDGH_SAVE,$CDGH
1456         paddd           $ABEF_SAVE,$ABEF
1457
1458         dec             $len
1459         movups          $iv,48($out,$in0)       # write output
1460         lea             64($in0),$in0
1461         jnz             .Loop_shaext
1462
1463         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1464         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1465         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1466         punpckhqdq      $CDGH,$ABEF             # DCBA
1467         palignr         \$8,$TMP,$CDGH          # HGFE
1468
1469         movups          $iv,($ivp)              # write IV
1470         movdqu          $ABEF,($ctx)
1471         movdqu          $CDGH,16($ctx)
1472 ___
1473 $code.=<<___ if ($win64);
1474         movaps  0*16(%rsp),%xmm6
1475         movaps  1*16(%rsp),%xmm7
1476         movaps  2*16(%rsp),%xmm8
1477         movaps  3*16(%rsp),%xmm9
1478         movaps  4*16(%rsp),%xmm10
1479         movaps  5*16(%rsp),%xmm11
1480         movaps  6*16(%rsp),%xmm12
1481         movaps  7*16(%rsp),%xmm13
1482         movaps  8*16(%rsp),%xmm14
1483         movaps  9*16(%rsp),%xmm15
1484         lea     8+10*16(%rsp),%rsp
1485 .Lepilogue_shaext:
1486 ___
1487 $code.=<<___;
1488         ret
1489 .size   ${func}_shaext,.-${func}_shaext
1490 ___
1491 }
1492 }}}}}
1493
1494 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1495 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1496 if ($win64) {
1497 $rec="%rcx";
1498 $frame="%rdx";
1499 $context="%r8";
1500 $disp="%r9";
1501
1502 $code.=<<___ if ($avx);
1503 .extern __imp_RtlVirtualUnwind
1504 .type   se_handler,\@abi-omnipotent
1505 .align  16
1506 se_handler:
1507         push    %rsi
1508         push    %rdi
1509         push    %rbx
1510         push    %rbp
1511         push    %r12
1512         push    %r13
1513         push    %r14
1514         push    %r15
1515         pushfq
1516         sub     \$64,%rsp
1517
1518         mov     120($context),%rax      # pull context->Rax
1519         mov     248($context),%rbx      # pull context->Rip
1520
1521         mov     8($disp),%rsi           # disp->ImageBase
1522         mov     56($disp),%r11          # disp->HandlerData
1523
1524         mov     0(%r11),%r10d           # HandlerData[0]
1525         lea     (%rsi,%r10),%r10        # prologue label
1526         cmp     %r10,%rbx               # context->Rip<prologue label
1527         jb      .Lin_prologue
1528
1529         mov     152($context),%rax      # pull context->Rsp
1530
1531         mov     4(%r11),%r10d           # HandlerData[1]
1532         lea     (%rsi,%r10),%r10        # epilogue label
1533         cmp     %r10,%rbx               # context->Rip>=epilogue label
1534         jae     .Lin_prologue
1535 ___
1536 $code.=<<___ if ($shaext);
1537         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1538         cmp     %r10,%rbx
1539         jb      .Lnot_in_shaext
1540
1541         lea     (%rax),%rsi
1542         lea     512($context),%rdi      # &context.Xmm6
1543         mov     \$20,%ecx
1544         .long   0xa548f3fc              # cld; rep movsq
1545         lea     168(%rax),%rax          # adjust stack pointer
1546         jmp     .Lin_prologue
1547 .Lnot_in_shaext:
1548 ___
1549 $code.=<<___ if ($avx>1);
1550         lea     .Lavx2_shortcut(%rip),%r10
1551         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1552         jb      .Lnot_in_avx2
1553
1554         and     \$-256*$SZ,%rax
1555         add     \$`2*$SZ*($rounds-8)`,%rax
1556 .Lnot_in_avx2:
1557 ___
1558 $code.=<<___;
1559         mov     %rax,%rsi               # put aside Rsp
1560         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1561         lea     48(%rax),%rax
1562
1563         mov     -8(%rax),%rbx
1564         mov     -16(%rax),%rbp
1565         mov     -24(%rax),%r12
1566         mov     -32(%rax),%r13
1567         mov     -40(%rax),%r14
1568         mov     -48(%rax),%r15
1569         mov     %rbx,144($context)      # restore context->Rbx
1570         mov     %rbp,160($context)      # restore context->Rbp
1571         mov     %r12,216($context)      # restore context->R12
1572         mov     %r13,224($context)      # restore context->R13
1573         mov     %r14,232($context)      # restore context->R14
1574         mov     %r15,240($context)      # restore context->R15
1575
1576         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1577         lea     512($context),%rdi      # &context.Xmm6
1578         mov     \$20,%ecx
1579         .long   0xa548f3fc              # cld; rep movsq
1580
1581 .Lin_prologue:
1582         mov     8(%rax),%rdi
1583         mov     16(%rax),%rsi
1584         mov     %rax,152($context)      # restore context->Rsp
1585         mov     %rsi,168($context)      # restore context->Rsi
1586         mov     %rdi,176($context)      # restore context->Rdi
1587
1588         mov     40($disp),%rdi          # disp->ContextRecord
1589         mov     $context,%rsi           # context
1590         mov     \$154,%ecx              # sizeof(CONTEXT)
1591         .long   0xa548f3fc              # cld; rep movsq
1592
1593         mov     $disp,%rsi
1594         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1595         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1596         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1597         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1598         mov     40(%rsi),%r10           # disp->ContextRecord
1599         lea     56(%rsi),%r11           # &disp->HandlerData
1600         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1601         mov     %r10,32(%rsp)           # arg5
1602         mov     %r11,40(%rsp)           # arg6
1603         mov     %r12,48(%rsp)           # arg7
1604         mov     %rcx,56(%rsp)           # arg8, (NULL)
1605         call    *__imp_RtlVirtualUnwind(%rip)
1606
1607         mov     \$1,%eax                # ExceptionContinueSearch
1608         add     \$64,%rsp
1609         popfq
1610         pop     %r15
1611         pop     %r14
1612         pop     %r13
1613         pop     %r12
1614         pop     %rbp
1615         pop     %rbx
1616         pop     %rdi
1617         pop     %rsi
1618         ret
1619 .size   se_handler,.-se_handler
1620
1621 .section        .pdata
1622         .rva    .LSEH_begin_${func}_xop
1623         .rva    .LSEH_end_${func}_xop
1624         .rva    .LSEH_info_${func}_xop
1625
1626         .rva    .LSEH_begin_${func}_avx
1627         .rva    .LSEH_end_${func}_avx
1628         .rva    .LSEH_info_${func}_avx
1629 ___
1630 $code.=<<___ if ($avx>1);
1631         .rva    .LSEH_begin_${func}_avx2
1632         .rva    .LSEH_end_${func}_avx2
1633         .rva    .LSEH_info_${func}_avx2
1634 ___
1635 $code.=<<___ if ($shaext);
1636         .rva    .LSEH_begin_${func}_shaext
1637         .rva    .LSEH_end_${func}_shaext
1638         .rva    .LSEH_info_${func}_shaext
1639 ___
1640 $code.=<<___ if ($avx);
1641 .section        .xdata
1642 .align  8
1643 .LSEH_info_${func}_xop:
1644         .byte   9,0,0,0
1645         .rva    se_handler
1646         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1647
1648 .LSEH_info_${func}_avx:
1649         .byte   9,0,0,0
1650         .rva    se_handler
1651         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1652 ___
1653 $code.=<<___ if ($avx>1);
1654 .LSEH_info_${func}_avx2:
1655         .byte   9,0,0,0
1656         .rva    se_handler
1657         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1658 ___
1659 $code.=<<___ if ($shaext);
1660 .LSEH_info_${func}_shaext:
1661         .byte   9,0,0,0
1662         .rva    se_handler
1663         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1664 ___
1665 }
1666
1667 ####################################################################
1668 sub rex {
1669   local *opcode=shift;
1670   my ($dst,$src)=@_;
1671   my $rex=0;
1672
1673     $rex|=0x04                  if($dst>=8);
1674     $rex|=0x01                  if($src>=8);
1675     unshift @opcode,$rex|0x40   if($rex);
1676 }
1677
1678 {
1679   my %opcodelet = (
1680                 "sha256rnds2" => 0xcb,
1681                 "sha256msg1"  => 0xcc,
1682                 "sha256msg2"  => 0xcd   );
1683
1684   sub sha256op38 {
1685     my $instr = shift;
1686
1687     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1688       my @opcode=(0x0f,0x38);
1689         rex(\@opcode,$2,$1);
1690         push @opcode,$opcodelet{$instr};
1691         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1692         return ".byte\t".join(',',@opcode);
1693     } else {
1694         return $instr."\t".@_[0];
1695     }
1696   }
1697 }
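# Example of what the substitution below produces (an illustrative sketch;
# byte values follow the NP 0F 38 CB/CC/CD encodings of the SHA extensions,
# with rex() above supplying a REX prefix when %xmm8-15 are involved): the
# AT&T form
#
#	sha256rnds2	%xmm1,%xmm2
#
# is rewritten by sha256op38() to ".byte 0x0f,0x38,0xcb,0xd1" (ModR/M =
# 0xc0|1|2<<3), so the module still assembles with toolchains that do not
# know the SHA mnemonics.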
1698
1699 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1700 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1701 print $code;
1702 close STDOUT;