x86_64 assembly pack: tune clang version detection.
crypto/aes/asm/aesni-sha256-x86_64.pl (openssl.git)
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and hence better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
29 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
30 #
31 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
32 #       Westmere is omitted from the loop; this is because the gain
33 #       was not estimated to be high enough to justify the effort;
34 # (**)  these are EVP-free results; results obtained with 'speed
35 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
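#
# (The "gain" column is simply the ratio of the summed standalone costs
# to the stitched result, e.g. for AES-128 on Haswell:
# (4.43+7.80)/8.79 = 1.39, i.e. +39%.)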
36
37 $flavour = shift;
38 $output  = shift;
39 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
40
41 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
42
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
46 die "can't locate x86_64-xlate.pl";
47
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50         $avx = ($1>=2.19) + ($1>=2.22);
51 }
52
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55         $avx = ($1>=2.09) + ($1>=2.10);
56 }
57
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60         $avx = ($1>=10) + ($1>=12);
61 }
62
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/) {
64         $avx = ($2>=3.0) + ($2>3.0);
65 }
66
67 $shaext=$avx;   ### set to zero if compiling for 1.0.1
68 $avx=1          if (!$shaext && $avx);
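
# Capability summary (descriptive note): $avx ends up 0, 1 or 2 -- no AVX
# code, XOP/AVX code, or additionally the AVX2+BMI code path -- depending
# on whether the assembler in use is deemed recent enough (GNU as
# 2.19/2.22, nasm 2.09/2.10, ml64 10/12, or a clang-based driver
# reporting version 3.0/newer than 3.0).  $shaext additionally gates the
# SHA-NI code path and its Win64 SEH entries.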
69
70 open OUT,"| \"$^X\" $xlate $flavour $output";
71 *STDOUT=*OUT;
72
73 $func="aesni_cbc_sha256_enc";
74 $TABLE="K256";
75 $SZ=4;
76 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
77                                 "%r8d","%r9d","%r10d","%r11d");
78 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84
85 ########################################################################
86 # void aesni_cbc_sha256_enc(const void *inp,
87 #                       void *out,
88 #                       size_t length,
89 #                       const AES_KEY *key,
90 #                       unsigned char *iv,
91 #                       SHA256_CTX *ctx,
92 #                       const void *in0);
93 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
94 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
95
96 $Tbl="%rbp";
97
98 $_inp="16*$SZ+0*8(%rsp)";
99 $_out="16*$SZ+1*8(%rsp)";
100 $_end="16*$SZ+2*8(%rsp)";
101 $_key="16*$SZ+3*8(%rsp)";
102 $_ivp="16*$SZ+4*8(%rsp)";
103 $_ctx="16*$SZ+5*8(%rsp)";
104 $_in0="16*$SZ+6*8(%rsp)";
105 $_rsp="16*$SZ+7*8(%rsp)";
106 $framesz=16*$SZ+8*8;
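
# Frame layout recap (descriptive comment): the bottom 16*$SZ bytes hold
# the pre-scheduled message words X[i]+K[i], followed by eight 8-byte
# slots for the saved inp/out/end/key/ivp/ctx/in0 pointers and the
# caller's %rsp; the Win64 paths reserve another 10*16 bytes above
# $framesz for the non-volatile %xmm6-%xmm15.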
107
108 $code=<<___;
109 .text
110
111 .extern OPENSSL_ia32cap_P
112 .globl  $func
113 .type   $func,\@abi-omnipotent
114 .align  16
115 $func:
116 ___
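# What follows is a small dispatcher: it consults OPENSSL_ia32cap_P and
# tail-jumps to the SHAEXT, XOP, AVX2 or AVX flavour of the routine.
# Called with a NULL first argument it acts as a probe instead, returning
# non-zero in %eax only if the SIMD code paths were compiled in.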
117                                                 if ($avx) {
118 $code.=<<___;
119         lea     OPENSSL_ia32cap_P(%rip),%r11
120         mov     \$1,%eax
121         cmp     \$0,`$win64?"%rcx":"%rdi"`
122         je      .Lprobe
123         mov     0(%r11),%eax
124         mov     4(%r11),%r10
125 ___
126 $code.=<<___ if ($shaext);
127         bt      \$61,%r10                       # check for SHA
128         jc      ${func}_shaext
129 ___
130 $code.=<<___;
131         mov     %r10,%r11
132         shr     \$32,%r11
133
134         test    \$`1<<11`,%r10d                 # check for XOP
135         jnz     ${func}_xop
136 ___
137 $code.=<<___ if ($avx>1);
138         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
139         cmp     \$`1<<8|1<<5|1<<3`,%r11d
140         je      ${func}_avx2
141 ___
142 $code.=<<___;
143         and     \$`1<<28`,%r10d                 # check for AVX
144         jnz     ${func}_avx
145         ud2
146 ___
147                                                 }
148 $code.=<<___;
149         xor     %eax,%eax
150         cmp     \$0,`$win64?"%rcx":"%rdi"`
151         je      .Lprobe
152         ud2
153 .Lprobe:
154         ret
155 .size   $func,.-$func
156
157 .align  64
158 .type   $TABLE,\@object
159 $TABLE:
160         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
161         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
162         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
163         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
164         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
166         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
167         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
168         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
169         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
170         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
171         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
172         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
173         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
174         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
175         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
176         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
177         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
178         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
179         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
180         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
181         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
182         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
183         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
184         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
185         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
186         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
187         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
188         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
189         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
190         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
191         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
192
193         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
194         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
195         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
196         .long   0,0,0,0,   0,0,0,0
197         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
198 .align  64
199 ___
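
# A note on the table above (descriptive): every K256 constant and the
# byte-swap mask are stored twice so that the 256-bit loads in the AVX2
# path see the same value in both 128-bit lanes; the trailing zero/-1
# words are the raw material for the $mask10/$mask12/$mask14 selectors
# loaded in the prologues.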
200
201 ######################################################################
202 # SIMD code paths
203 #
204 {{{
205 ($iv,$inout,$roundkey,$temp,
206  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
207
208 $aesni_cbc_idx=0;
209 @aesni_cbc_block = (
210 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
211 ##      &vmovdqu        ($inout,($inp));
212 ##      &mov            ($_inp,$inp);
213
214         '&vpxor         ($inout,$inout,$roundkey);'.
215         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
216
217         '&vpxor         ($inout,$inout,$iv);',
218
219         '&vaesenc       ($inout,$inout,$roundkey);'.
220         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
230
231         '&vaesenc       ($inout,$inout,$roundkey);'.
232         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
233
234         '&vaesenc       ($inout,$inout,$roundkey);'.
235         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
236
237         '&vaesenc       ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
239
240         '&vaesenc       ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
242
243         '&vaesenc       ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
245
246         '&vaesenclast   ($temp,$inout,$roundkey);'.
247         ' &vaesenc      ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
249
250         '&vpand         ($iv,$temp,$mask10);'.
251         ' &vaesenc      ($inout,$inout,$roundkey);'.
252         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
253
254         '&vaesenclast   ($temp,$inout,$roundkey);'.
255         ' &vaesenc      ($inout,$inout,$roundkey);'.
256         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
257
258         '&vpand         ($temp,$temp,$mask12);'.
259         ' &vaesenc      ($inout,$inout,$roundkey);'.
260          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
261
262         '&vpor          ($iv,$iv,$temp);'.
263         ' &vaesenclast  ($temp,$inout,$roundkey);'.
264         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
265
266 ##      &mov            ($inp,$_inp);
267 ##      &mov            ($out,$_out);
268 ##      &vpand          ($temp,$temp,$mask14);
269 ##      &vpor           ($iv,$iv,$temp);
270 ##      &vmovdqu        (($out,$inp),$iv);
271 ##      &lea            ($inp,16($inp));
272 );
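
# (Descriptive note: the 16 fragments above are doled out one per SHA256
# round via $aesni_cbc_idx and together CBC-encrypt a single 16-byte
# block -- i.e. four AES blocks, 64 bytes, per 64-byte SHA256 block.
# Enough rounds are always executed for a 256-bit key; the
# $mask10/$mask12/$mask14 blend then keeps the aesenclast result that
# matches the actual key length.)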
273
274 my $a4=$T1;
275 my ($a,$b,$c,$d,$e,$f,$g,$h);
276
277 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
278 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
279   my $arg = pop;
280     $arg = "\$$arg" if ($arg*1 eq $arg);
281     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
282 }
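
# (AUTOLOAD turns any undefined &mnemonic(dst,src,...) call into a line
# of assembly appended to $code: the last perlasm argument is moved to
# the front and the rest reversed, which converts the dst-first calling
# convention into AT&T operand order, and bare numbers gain a '$' prefix.)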
283
284 sub body_00_15 () {
285         (
286         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
287
288         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
289         '&mov   ($a,$a1)',
290         '&mov   ($a4,$f)',
291
292         '&xor   ($a0,$e)',
293         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
294         '&xor   ($a4,$g)',                      # f^g
295
296         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
297         '&xor   ($a1,$a)',
298         '&and   ($a4,$e)',                      # (f^g)&e
299
300         @aesni_cbc_block[$aesni_cbc_idx++].
301         '&xor   ($a0,$e)',
302         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
303         '&mov   ($a2,$a)',
304
305         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
306         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
307         '&xor   ($a2,$b)',                      # a^b, b^c in next round
308
309         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
310         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
311         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
312
313         '&xor   ($a1,$a)',
314         '&add   ($h,$a0)',                      # h+=Sigma1(e)
315         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
316
317         '&add   ($d,$h)',                       # d+=h
318         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
319         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
320
321         '&mov   ($a0,$d)',
322         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
323         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
324         );
325 }
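
# (body_00_15 returns one SHA256 round broken into small fragments so the
# SIMD message-schedule code can be interleaved between them; note the
# @aesni_cbc_block[$aesni_cbc_idx++] splice, which threads exactly one
# AES fragment into every round.)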
326
327 if ($avx) {{
328 ######################################################################
329 # XOP code path
330 #
331 $code.=<<___;
332 .type   ${func}_xop,\@function,6
333 .align  64
334 ${func}_xop:
335 .Lxop_shortcut:
336         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
337         push    %rbx
338         push    %rbp
339         push    %r12
340         push    %r13
341         push    %r14
342         push    %r15
343         mov     %rsp,%r11               # copy %rsp
344         sub     \$`$framesz+$win64*16*10`,%rsp
345         and     \$-64,%rsp              # align stack frame
346
347         shl     \$6,$len
348         sub     $inp,$out               # re-bias
349         sub     $inp,$in0
350         add     $inp,$len               # end of input
351
352         #mov    $inp,$_inp              # saved later
353         mov     $out,$_out
354         mov     $len,$_end
355         #mov    $key,$_key              # remains resident in $inp register
356         mov     $ivp,$_ivp
357         mov     $ctx,$_ctx
358         mov     $in0,$_in0
359         mov     %r11,$_rsp
360 ___
361 $code.=<<___ if ($win64);
362         movaps  %xmm6,`$framesz+16*0`(%rsp)
363         movaps  %xmm7,`$framesz+16*1`(%rsp)
364         movaps  %xmm8,`$framesz+16*2`(%rsp)
365         movaps  %xmm9,`$framesz+16*3`(%rsp)
366         movaps  %xmm10,`$framesz+16*4`(%rsp)
367         movaps  %xmm11,`$framesz+16*5`(%rsp)
368         movaps  %xmm12,`$framesz+16*6`(%rsp)
369         movaps  %xmm13,`$framesz+16*7`(%rsp)
370         movaps  %xmm14,`$framesz+16*8`(%rsp)
371         movaps  %xmm15,`$framesz+16*9`(%rsp)
372 ___
373 $code.=<<___;
374 .Lprologue_xop:
375         vzeroall
376
377         mov     $inp,%r12               # borrow $a4
378         lea     0x80($key),$inp         # size optimization, reassign
379         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
380         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
381         mov     $ctx,%r15               # borrow $a2
382         mov     $in0,%rsi               # borrow $a3
383         vmovdqu ($ivp),$iv              # load IV
384         sub     \$9,%r14
385
386         mov     $SZ*0(%r15),$A
387         mov     $SZ*1(%r15),$B
388         mov     $SZ*2(%r15),$C
389         mov     $SZ*3(%r15),$D
390         mov     $SZ*4(%r15),$E
391         mov     $SZ*5(%r15),$F
392         mov     $SZ*6(%r15),$G
393         mov     $SZ*7(%r15),$H
394
395         vmovdqa 0x00(%r13,%r14,8),$mask14
396         vmovdqa 0x10(%r13,%r14,8),$mask12
397         vmovdqa 0x20(%r13,%r14,8),$mask10
398         vmovdqu 0x00-0x80($inp),$roundkey
399         jmp     .Lloop_xop
400 ___
401                                         if ($SZ==4) {   # SHA256
402     my @X = map("%xmm$_",(0..3));
403     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
404
405 $code.=<<___;
406 .align  16
407 .Lloop_xop:
408         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
409         vmovdqu 0x00(%rsi,%r12),@X[0]
410         vmovdqu 0x10(%rsi,%r12),@X[1]
411         vmovdqu 0x20(%rsi,%r12),@X[2]
412         vmovdqu 0x30(%rsi,%r12),@X[3]
413         vpshufb $t3,@X[0],@X[0]
414         lea     $TABLE(%rip),$Tbl
415         vpshufb $t3,@X[1],@X[1]
416         vpshufb $t3,@X[2],@X[2]
417         vpaddd  0x00($Tbl),@X[0],$t0
418         vpshufb $t3,@X[3],@X[3]
419         vpaddd  0x20($Tbl),@X[1],$t1
420         vpaddd  0x40($Tbl),@X[2],$t2
421         vpaddd  0x60($Tbl),@X[3],$t3
422         vmovdqa $t0,0x00(%rsp)
423         mov     $A,$a1
424         vmovdqa $t1,0x10(%rsp)
425         mov     $B,$a3
426         vmovdqa $t2,0x20(%rsp)
427         xor     $C,$a3                  # magic
428         vmovdqa $t3,0x30(%rsp)
429         mov     $E,$a0
430         jmp     .Lxop_00_47
431
432 .align  16
433 .Lxop_00_47:
434         sub     \$-16*2*$SZ,$Tbl        # size optimization
435         vmovdqu (%r12),$inout           # $a4
436         mov     %r12,$_inp              # $a4
437 ___
438 sub XOP_256_00_47 () {
439 my $j = shift;
440 my $body = shift;
441 my @X = @_;
442 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
443
444         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
445           eval(shift(@insns));
446           eval(shift(@insns));
447          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
448           eval(shift(@insns));
449           eval(shift(@insns));
450         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
451           eval(shift(@insns));
452           eval(shift(@insns));
453         &vpsrld         ($t0,$t0,$sigma0[2]);
454           eval(shift(@insns));
455           eval(shift(@insns));
456          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
457           eval(shift(@insns));
458           eval(shift(@insns));
459           eval(shift(@insns));
460           eval(shift(@insns));
461         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vpxor          ($t0,$t0,$t1);
465           eval(shift(@insns));
466           eval(shift(@insns));
467           eval(shift(@insns));
468           eval(shift(@insns));
469          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
470           eval(shift(@insns));
471           eval(shift(@insns));
472         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpsrld        ($t2,@X[3],$sigma1[2]);
476           eval(shift(@insns));
477           eval(shift(@insns));
478         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
479           eval(shift(@insns));
480           eval(shift(@insns));
481          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
482           eval(shift(@insns));
483           eval(shift(@insns));
484          &vpxor         ($t3,$t3,$t2);
485           eval(shift(@insns));
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
490           eval(shift(@insns));
491           eval(shift(@insns));
492           eval(shift(@insns));
493           eval(shift(@insns));
494         &vpsrldq        ($t3,$t3,8);
495           eval(shift(@insns));
496           eval(shift(@insns));
497           eval(shift(@insns));
498           eval(shift(@insns));
499         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
505           eval(shift(@insns));
506           eval(shift(@insns));
507          &vpsrld        ($t2,@X[0],$sigma1[2]);
508           eval(shift(@insns));
509           eval(shift(@insns));
510          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
511           eval(shift(@insns));
512           eval(shift(@insns));
513          &vpxor         ($t3,$t3,$t2);
514           eval(shift(@insns));
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
519           eval(shift(@insns));
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523         &vpslldq        ($t3,$t3,8);            # 22 instructions
524           eval(shift(@insns));
525           eval(shift(@insns));
526           eval(shift(@insns));
527           eval(shift(@insns));
528         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
529           eval(shift(@insns));
530           eval(shift(@insns));
531           eval(shift(@insns));
532           eval(shift(@insns));
533         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
534           foreach (@insns) { eval; }            # remaining instructions
535         &vmovdqa        (16*$j."(%rsp)",$t2);
536 }
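# (XOP_256_00_47 emits the XOP message-schedule update -- vprotd in place
# of the shift/xor rotations -- for four X[] words and interleaves it
# with four rounds' worth of body fragments, then adds the round
# constants and parks the W+K values on the stack for the coming rounds.)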
537
538     $aesni_cbc_idx=0;
539     for ($i=0,$j=0; $j<4; $j++) {
540         &XOP_256_00_47($j,\&body_00_15,@X);
541         push(@X,shift(@X));                     # rotate(@X)
542     }
543         &mov            ("%r12",$_inp);         # borrow $a4
544         &vpand          ($temp,$temp,$mask14);
545         &mov            ("%r15",$_out);         # borrow $a2
546         &vpor           ($iv,$iv,$temp);
547         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
548         &lea            ("%r12","16(%r12)");    # inp++
549
550         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
551         &jne    (".Lxop_00_47");
552
553         &vmovdqu        ($inout,"(%r12)");
554         &mov            ($_inp,"%r12");
555
556     $aesni_cbc_idx=0;
557     for ($i=0; $i<16; ) {
558         foreach(body_00_15()) { eval; }
559     }
560                                         }
561 $code.=<<___;
562         mov     $_inp,%r12              # borrow $a4
563         mov     $_out,%r13              # borrow $a0
564         mov     $_ctx,%r15              # borrow $a2
565         mov     $_in0,%rsi              # borrow $a3
566
567         vpand   $mask14,$temp,$temp
568         mov     $a1,$A
569         vpor    $temp,$iv,$iv
570         vmovdqu $iv,(%r13,%r12)         # write output
571         lea     16(%r12),%r12           # inp++
572
573         add     $SZ*0(%r15),$A
574         add     $SZ*1(%r15),$B
575         add     $SZ*2(%r15),$C
576         add     $SZ*3(%r15),$D
577         add     $SZ*4(%r15),$E
578         add     $SZ*5(%r15),$F
579         add     $SZ*6(%r15),$G
580         add     $SZ*7(%r15),$H
581
582         cmp     $_end,%r12
583
584         mov     $A,$SZ*0(%r15)
585         mov     $B,$SZ*1(%r15)
586         mov     $C,$SZ*2(%r15)
587         mov     $D,$SZ*3(%r15)
588         mov     $E,$SZ*4(%r15)
589         mov     $F,$SZ*5(%r15)
590         mov     $G,$SZ*6(%r15)
591         mov     $H,$SZ*7(%r15)
592
593         jb      .Lloop_xop
594
595         mov     $_ivp,$ivp
596         mov     $_rsp,%rsi
597         vmovdqu $iv,($ivp)              # output IV
598         vzeroall
599 ___
600 $code.=<<___ if ($win64);
601         movaps  `$framesz+16*0`(%rsp),%xmm6
602         movaps  `$framesz+16*1`(%rsp),%xmm7
603         movaps  `$framesz+16*2`(%rsp),%xmm8
604         movaps  `$framesz+16*3`(%rsp),%xmm9
605         movaps  `$framesz+16*4`(%rsp),%xmm10
606         movaps  `$framesz+16*5`(%rsp),%xmm11
607         movaps  `$framesz+16*6`(%rsp),%xmm12
608         movaps  `$framesz+16*7`(%rsp),%xmm13
609         movaps  `$framesz+16*8`(%rsp),%xmm14
610         movaps  `$framesz+16*9`(%rsp),%xmm15
611 ___
612 $code.=<<___;
613         mov     (%rsi),%r15
614         mov     8(%rsi),%r14
615         mov     16(%rsi),%r13
616         mov     24(%rsi),%r12
617         mov     32(%rsi),%rbp
618         mov     40(%rsi),%rbx
619         lea     48(%rsi),%rsp
620 .Lepilogue_xop:
621         ret
622 .size   ${func}_xop,.-${func}_xop
623 ___
624 ######################################################################
625 # AVX+shrd code path
626 #
627 local *ror = sub { &shrd(@_[0],@_) };
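
# From here on &ror is rerouted to shrd: &ror($reg,$imm) now emits
# "shrd \$imm,$reg,$reg", and shrd with identical source and destination
# registers is a right rotate.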
628
629 $code.=<<___;
630 .type   ${func}_avx,\@function,6
631 .align  64
632 ${func}_avx:
633 .Lavx_shortcut:
634         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
635         push    %rbx
636         push    %rbp
637         push    %r12
638         push    %r13
639         push    %r14
640         push    %r15
641         mov     %rsp,%r11               # copy %rsp
642         sub     \$`$framesz+$win64*16*10`,%rsp
643         and     \$-64,%rsp              # align stack frame
644
645         shl     \$6,$len
646         sub     $inp,$out               # re-bias
647         sub     $inp,$in0
648         add     $inp,$len               # end of input
649
650         #mov    $inp,$_inp              # saved later
651         mov     $out,$_out
652         mov     $len,$_end
653         #mov    $key,$_key              # remains resident in $inp register
654         mov     $ivp,$_ivp
655         mov     $ctx,$_ctx
656         mov     $in0,$_in0
657         mov     %r11,$_rsp
658 ___
659 $code.=<<___ if ($win64);
660         movaps  %xmm6,`$framesz+16*0`(%rsp)
661         movaps  %xmm7,`$framesz+16*1`(%rsp)
662         movaps  %xmm8,`$framesz+16*2`(%rsp)
663         movaps  %xmm9,`$framesz+16*3`(%rsp)
664         movaps  %xmm10,`$framesz+16*4`(%rsp)
665         movaps  %xmm11,`$framesz+16*5`(%rsp)
666         movaps  %xmm12,`$framesz+16*6`(%rsp)
667         movaps  %xmm13,`$framesz+16*7`(%rsp)
668         movaps  %xmm14,`$framesz+16*8`(%rsp)
669         movaps  %xmm15,`$framesz+16*9`(%rsp)
670 ___
671 $code.=<<___;
672 .Lprologue_avx:
673         vzeroall
674
675         mov     $inp,%r12               # borrow $a4
676         lea     0x80($key),$inp         # size optimization, reassign
677         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
678         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
679         mov     $ctx,%r15               # borrow $a2
680         mov     $in0,%rsi               # borrow $a3
681         vmovdqu ($ivp),$iv              # load IV
682         sub     \$9,%r14
683
684         mov     $SZ*0(%r15),$A
685         mov     $SZ*1(%r15),$B
686         mov     $SZ*2(%r15),$C
687         mov     $SZ*3(%r15),$D
688         mov     $SZ*4(%r15),$E
689         mov     $SZ*5(%r15),$F
690         mov     $SZ*6(%r15),$G
691         mov     $SZ*7(%r15),$H
692
693         vmovdqa 0x00(%r13,%r14,8),$mask14
694         vmovdqa 0x10(%r13,%r14,8),$mask12
695         vmovdqa 0x20(%r13,%r14,8),$mask10
696         vmovdqu 0x00-0x80($inp),$roundkey
697 ___
698                                         if ($SZ==4) {   # SHA256
699     my @X = map("%xmm$_",(0..3));
700     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
701
702 $code.=<<___;
703         jmp     .Lloop_avx
704 .align  16
705 .Lloop_avx:
706         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
707         vmovdqu 0x00(%rsi,%r12),@X[0]
708         vmovdqu 0x10(%rsi,%r12),@X[1]
709         vmovdqu 0x20(%rsi,%r12),@X[2]
710         vmovdqu 0x30(%rsi,%r12),@X[3]
711         vpshufb $t3,@X[0],@X[0]
712         lea     $TABLE(%rip),$Tbl
713         vpshufb $t3,@X[1],@X[1]
714         vpshufb $t3,@X[2],@X[2]
715         vpaddd  0x00($Tbl),@X[0],$t0
716         vpshufb $t3,@X[3],@X[3]
717         vpaddd  0x20($Tbl),@X[1],$t1
718         vpaddd  0x40($Tbl),@X[2],$t2
719         vpaddd  0x60($Tbl),@X[3],$t3
720         vmovdqa $t0,0x00(%rsp)
721         mov     $A,$a1
722         vmovdqa $t1,0x10(%rsp)
723         mov     $B,$a3
724         vmovdqa $t2,0x20(%rsp)
725         xor     $C,$a3                  # magic
726         vmovdqa $t3,0x30(%rsp)
727         mov     $E,$a0
728         jmp     .Lavx_00_47
729
730 .align  16
731 .Lavx_00_47:
732         sub     \$-16*2*$SZ,$Tbl        # size optimization
733         vmovdqu (%r12),$inout           # $a4
734         mov     %r12,$_inp              # $a4
735 ___
736 sub Xupdate_256_AVX () {
737         (
738         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
739          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
740         '&vpsrld        ($t2,$t0,$sigma0[0]);',
741          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
742         '&vpsrld        ($t3,$t0,$sigma0[2])',
743         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
744         '&vpxor         ($t0,$t3,$t2)',
745          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
746         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
747         '&vpxor         ($t0,$t0,$t1)',
748         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
749         '&vpxor         ($t0,$t0,$t2)',
750          '&vpsrld       ($t2,$t3,$sigma1[2]);',
751         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
752          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
753         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
754          '&vpxor        ($t2,$t2,$t3);',
755          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
756          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
757          '&vpshufd      ($t2,$t2,0b10000100)',
758          '&vpsrldq      ($t2,$t2,8)',
759         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
760          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
761          '&vpsrld       ($t2,$t3,$sigma1[2])',
762          '&vpsrlq       ($t3,$t3,$sigma1[0])',
763          '&vpxor        ($t2,$t2,$t3);',
764          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
765          '&vpxor        ($t2,$t2,$t3)',
766          '&vpshufd      ($t2,$t2,0b11101000)',
767          '&vpslldq      ($t2,$t2,8)',
768         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
769         );
770 }
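
# (Xupdate_256_AVX returns the plain-AVX message-schedule update -- the
# vprotd rotations of the XOP path rebuilt from vpslld/vpsrld/vpsrlq and
# vpxor -- as a fragment list; it is reused verbatim by the AVX2 path,
# which simply substitutes %ymm registers for @X and the temporaries.)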
771
772 sub AVX_256_00_47 () {
773 my $j = shift;
774 my $body = shift;
775 my @X = @_;
776 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
777
778         foreach (Xupdate_256_AVX()) {           # 29 instructions
779             eval;
780             eval(shift(@insns));
781             eval(shift(@insns));
782             eval(shift(@insns));
783         }
784         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
785           foreach (@insns) { eval; }            # remaining instructions
786         &vmovdqa        (16*$j."(%rsp)",$t2);
787 }
788
789     $aesni_cbc_idx=0;
790     for ($i=0,$j=0; $j<4; $j++) {
791         &AVX_256_00_47($j,\&body_00_15,@X);
792         push(@X,shift(@X));                     # rotate(@X)
793     }
794         &mov            ("%r12",$_inp);         # borrow $a4
795         &vpand          ($temp,$temp,$mask14);
796         &mov            ("%r15",$_out);         # borrow $a2
797         &vpor           ($iv,$iv,$temp);
798         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
799         &lea            ("%r12","16(%r12)");    # inp++
800
801         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
802         &jne    (".Lavx_00_47");
803
804         &vmovdqu        ($inout,"(%r12)");
805         &mov            ($_inp,"%r12");
806
807     $aesni_cbc_idx=0;
808     for ($i=0; $i<16; ) {
809         foreach(body_00_15()) { eval; }
810     }
811
812                                         }
813 $code.=<<___;
814         mov     $_inp,%r12              # borrow $a4
815         mov     $_out,%r13              # borrow $a0
816         mov     $_ctx,%r15              # borrow $a2
817         mov     $_in0,%rsi              # borrow $a3
818
819         vpand   $mask14,$temp,$temp
820         mov     $a1,$A
821         vpor    $temp,$iv,$iv
822         vmovdqu $iv,(%r13,%r12)         # write output
823         lea     16(%r12),%r12           # inp++
824
825         add     $SZ*0(%r15),$A
826         add     $SZ*1(%r15),$B
827         add     $SZ*2(%r15),$C
828         add     $SZ*3(%r15),$D
829         add     $SZ*4(%r15),$E
830         add     $SZ*5(%r15),$F
831         add     $SZ*6(%r15),$G
832         add     $SZ*7(%r15),$H
833
834         cmp     $_end,%r12
835
836         mov     $A,$SZ*0(%r15)
837         mov     $B,$SZ*1(%r15)
838         mov     $C,$SZ*2(%r15)
839         mov     $D,$SZ*3(%r15)
840         mov     $E,$SZ*4(%r15)
841         mov     $F,$SZ*5(%r15)
842         mov     $G,$SZ*6(%r15)
843         mov     $H,$SZ*7(%r15)
844         jb      .Lloop_avx
845
846         mov     $_ivp,$ivp
847         mov     $_rsp,%rsi
848         vmovdqu $iv,($ivp)              # output IV
849         vzeroall
850 ___
851 $code.=<<___ if ($win64);
852         movaps  `$framesz+16*0`(%rsp),%xmm6
853         movaps  `$framesz+16*1`(%rsp),%xmm7
854         movaps  `$framesz+16*2`(%rsp),%xmm8
855         movaps  `$framesz+16*3`(%rsp),%xmm9
856         movaps  `$framesz+16*4`(%rsp),%xmm10
857         movaps  `$framesz+16*5`(%rsp),%xmm11
858         movaps  `$framesz+16*6`(%rsp),%xmm12
859         movaps  `$framesz+16*7`(%rsp),%xmm13
860         movaps  `$framesz+16*8`(%rsp),%xmm14
861         movaps  `$framesz+16*9`(%rsp),%xmm15
862 ___
863 $code.=<<___;
864         mov     (%rsi),%r15
865         mov     8(%rsi),%r14
866         mov     16(%rsi),%r13
867         mov     24(%rsi),%r12
868         mov     32(%rsi),%rbp
869         mov     40(%rsi),%rbx
870         lea     48(%rsi),%rsp
871 .Lepilogue_avx:
872         ret
873 .size   ${func}_avx,.-${func}_avx
874 ___
875
876 if ($avx>1) {{
877 ######################################################################
878 # AVX2+BMI code path
879 #
880 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
881 my $PUSH8=8*2*$SZ;
882 use integer;
883
884 sub bodyx_00_15 () {
885         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
886         (
887         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
888
889         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
890         '&and   ($a4,$e)',              # f&e
891         '&rorx  ($a0,$e,$Sigma1[2])',
892         '&rorx  ($a2,$e,$Sigma1[1])',
893
894         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
895         '&lea   ($h,"($h,$a4)")',
896         '&andn  ($a4,$e,$g)',           # ~e&g
897         '&xor   ($a0,$a2)',
898
899         '&rorx  ($a1,$e,$Sigma1[0])',
900         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
901         '&xor   ($a0,$a1)',             # Sigma1(e)
902         '&mov   ($a2,$a)',
903
904         '&rorx  ($a4,$a,$Sigma0[2])',
905         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
906         '&xor   ($a2,$b)',              # a^b, b^c in next round
907         '&rorx  ($a1,$a,$Sigma0[1])',
908
909         '&rorx  ($a0,$a,$Sigma0[0])',
910         '&lea   ($d,"($d,$h)")',        # d+=h
911         '&and   ($a3,$a2)',             # (b^c)&(a^b)
912         @aesni_cbc_block[$aesni_cbc_idx++].
913         '&xor   ($a1,$a4)',
914
915         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
916         '&xor   ($a1,$a0)',             # Sigma0(a)
917         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
918         '&mov   ($a4,$e)',              # copy of f in future
919
920         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
921         );
922         # and at the finish one has to $a+=$a1
923 }
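
# (bodyx_00_15 is the BMI2 flavour of the round body: rorx replaces the
# ror/mov pairs, andn computes ~e&g directly, and Sigma0(a) is carried
# over in $a1 and folded in at the start of the following round -- hence
# the "from the past" comment and the final "add $a1,$A" after the loops.)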
924
925 $code.=<<___;
926 .type   ${func}_avx2,\@function,6
927 .align  64
928 ${func}_avx2:
929 .Lavx2_shortcut:
930         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
931         push    %rbx
932         push    %rbp
933         push    %r12
934         push    %r13
935         push    %r14
936         push    %r15
937         mov     %rsp,%r11               # copy %rsp
938         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
939         and     \$-256*$SZ,%rsp         # align stack frame
940         add     \$`2*$SZ*($rounds-8)`,%rsp
941
942         shl     \$6,$len
943         sub     $inp,$out               # re-bias
944         sub     $inp,$in0
945         add     $inp,$len               # end of input
946
947         #mov    $inp,$_inp              # saved later
948         #mov    $out,$_out              # kept in $offload
949         mov     $len,$_end
950         #mov    $key,$_key              # remains resident in $inp register
951         mov     $ivp,$_ivp
952         mov     $ctx,$_ctx
953         mov     $in0,$_in0
954         mov     %r11,$_rsp
955 ___
956 $code.=<<___ if ($win64);
957         movaps  %xmm6,`$framesz+16*0`(%rsp)
958         movaps  %xmm7,`$framesz+16*1`(%rsp)
959         movaps  %xmm8,`$framesz+16*2`(%rsp)
960         movaps  %xmm9,`$framesz+16*3`(%rsp)
961         movaps  %xmm10,`$framesz+16*4`(%rsp)
962         movaps  %xmm11,`$framesz+16*5`(%rsp)
963         movaps  %xmm12,`$framesz+16*6`(%rsp)
964         movaps  %xmm13,`$framesz+16*7`(%rsp)
965         movaps  %xmm14,`$framesz+16*8`(%rsp)
966         movaps  %xmm15,`$framesz+16*9`(%rsp)
967 ___
968 $code.=<<___;
969 .Lprologue_avx2:
970         vzeroall
971
972         mov     $inp,%r13               # borrow $a0
973         vpinsrq \$1,$out,$offload,$offload
974         lea     0x80($key),$inp         # size optimization, reassign
975         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
976         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
977         mov     $ctx,%r15               # borrow $a2
978         mov     $in0,%rsi               # borrow $a3
979         vmovdqu ($ivp),$iv              # load IV
980         lea     -9(%r14),%r14
981
982         vmovdqa 0x00(%r12,%r14,8),$mask14
983         vmovdqa 0x10(%r12,%r14,8),$mask12
984         vmovdqa 0x20(%r12,%r14,8),$mask10
985
986         sub     \$-16*$SZ,%r13          # inp++, size optimization
987         mov     $SZ*0(%r15),$A
988         lea     (%rsi,%r13),%r12        # borrow $a0
989         mov     $SZ*1(%r15),$B
990         cmp     $len,%r13               # $_end
991         mov     $SZ*2(%r15),$C
992         cmove   %rsp,%r12               # next block or random data
993         mov     $SZ*3(%r15),$D
994         mov     $SZ*4(%r15),$E
995         mov     $SZ*5(%r15),$F
996         mov     $SZ*6(%r15),$G
997         mov     $SZ*7(%r15),$H
998         vmovdqu 0x00-0x80($inp),$roundkey
999 ___
1000                                         if ($SZ==4) {   # SHA256
1001     my @X = map("%ymm$_",(0..3));
1002     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1003
1004 $code.=<<___;
1005         jmp     .Loop_avx2
1006 .align  16
1007 .Loop_avx2:
1008         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1009         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1010         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1011         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1012         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1013
1014         vinserti128     \$1,(%r12),@X[0],@X[0]
1015         vinserti128     \$1,16(%r12),@X[1],@X[1]
1016          vpshufb        $t3,@X[0],@X[0]
1017         vinserti128     \$1,32(%r12),@X[2],@X[2]
1018          vpshufb        $t3,@X[1],@X[1]
1019         vinserti128     \$1,48(%r12),@X[3],@X[3]
1020
1021         lea     $TABLE(%rip),$Tbl
1022         vpshufb $t3,@X[2],@X[2]
1023         lea     -16*$SZ(%r13),%r13
1024         vpaddd  0x00($Tbl),@X[0],$t0
1025         vpshufb $t3,@X[3],@X[3]
1026         vpaddd  0x20($Tbl),@X[1],$t1
1027         vpaddd  0x40($Tbl),@X[2],$t2
1028         vpaddd  0x60($Tbl),@X[3],$t3
1029         vmovdqa $t0,0x00(%rsp)
1030         xor     $a1,$a1
1031         vmovdqa $t1,0x20(%rsp)
1032         lea     -$PUSH8(%rsp),%rsp
1033         mov     $B,$a3
1034         vmovdqa $t2,0x00(%rsp)
1035         xor     $C,$a3                  # magic
1036         vmovdqa $t3,0x20(%rsp)
1037         mov     $F,$a4
1038         sub     \$-16*2*$SZ,$Tbl        # size optimization
1039         jmp     .Lavx2_00_47
1040
1041 .align  16
1042 .Lavx2_00_47:
1043         vmovdqu (%r13),$inout
1044         vpinsrq \$0,%r13,$offload,$offload
1045 ___
1046
1047 sub AVX2_256_00_47 () {
1048 my $j = shift;
1049 my $body = shift;
1050 my @X = @_;
1051 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1052 my $base = "+2*$PUSH8(%rsp)";
1053
1054         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1055         foreach (Xupdate_256_AVX()) {           # 29 instructions
1056             eval;
1057             eval(shift(@insns));
1058             eval(shift(@insns));
1059             eval(shift(@insns));
1060         }
1061         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1062           foreach (@insns) { eval; }            # remaining instructions
1063         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1064 }
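# (In the ymm code the low 128-bit lanes of @X carry the current 64-byte
# block and the high lanes the following one, loaded via vinserti128
# above; .Loop_avx2 runs the rounds for the first block against the
# freshly computed schedule, and .Lower_avx2 replays them for the second
# block from the W+K values parked on the stack -- which is why the frame
# holds 2*$SZ*$rounds bytes of schedule.)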
1065     $aesni_cbc_idx=0;
1066     for ($i=0,$j=0; $j<4; $j++) {
1067         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1068         push(@X,shift(@X));                     # rotate(@X)
1069     }
1070         &vmovq          ("%r13",$offload);      # borrow $a0
1071         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1072         &vpand          ($temp,$temp,$mask14);
1073         &vpor           ($iv,$iv,$temp);
1074         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1075         &lea            ("%r13","16(%r13)");    # inp++
1076
1077         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1078         &cmpb   (($SZ-1)."($Tbl)",0);
1079         &jne    (".Lavx2_00_47");
1080
1081         &vmovdqu        ($inout,"(%r13)");
1082         &vpinsrq        ($offload,$offload,"%r13",0);
1083
1084     $aesni_cbc_idx=0;
1085     for ($i=0; $i<16; ) {
1086         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1087         foreach(bodyx_00_15()) { eval; }
1088     }
1089                                         }
1090 $code.=<<___;
1091         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1092         vmovq   $offload,%r13                   # $_inp, borrow $a0
1093         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1094         add     $a1,$A
1095         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1096
1097         vpand   $mask14,$temp,$temp
1098         vpor    $temp,$iv,$iv
1099         vmovdqu $iv,(%r12,%r13)                 # write output
1100         lea     16(%r13),%r13
1101
1102         add     $SZ*0(%r15),$A
1103         add     $SZ*1(%r15),$B
1104         add     $SZ*2(%r15),$C
1105         add     $SZ*3(%r15),$D
1106         add     $SZ*4(%r15),$E
1107         add     $SZ*5(%r15),$F
1108         add     $SZ*6(%r15),$G
1109         add     $SZ*7(%r15),$H
1110
1111         mov     $A,$SZ*0(%r15)
1112         mov     $B,$SZ*1(%r15)
1113         mov     $C,$SZ*2(%r15)
1114         mov     $D,$SZ*3(%r15)
1115         mov     $E,$SZ*4(%r15)
1116         mov     $F,$SZ*5(%r15)
1117         mov     $G,$SZ*6(%r15)
1118         mov     $H,$SZ*7(%r15)
1119
1120         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1121         je      .Ldone_avx2
1122
1123         xor     $a1,$a1
1124         mov     $B,$a3
1125         mov     $F,$a4
1126         xor     $C,$a3                  # magic
1127         jmp     .Lower_avx2
1128 .align  16
1129 .Lower_avx2:
1130         vmovdqu (%r13),$inout
1131         vpinsrq \$0,%r13,$offload,$offload
1132 ___
1133     $aesni_cbc_idx=0;
1134     for ($i=0; $i<16; ) {
1135         my $base="+16($Tbl)";
1136         foreach(bodyx_00_15()) { eval; }
1137         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1138     }
1139 $code.=<<___;
1140         vmovq   $offload,%r13                   # borrow $a0
1141         vpextrq \$1,$offload,%r15               # borrow $a2
1142         vpand   $mask14,$temp,$temp
1143         vpor    $temp,$iv,$iv
1144         lea     -$PUSH8($Tbl),$Tbl
1145         vmovdqu $iv,(%r15,%r13)                 # write output
1146         lea     16(%r13),%r13                   # inp++
1147         cmp     %rsp,$Tbl
1148         jae     .Lower_avx2
1149
1150         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1151         lea     16*$SZ(%r13),%r13
1152         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1153         add     $a1,$A
1154         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1155
1156         add     $SZ*0(%r15),$A
1157         add     $SZ*1(%r15),$B
1158         add     $SZ*2(%r15),$C
1159         add     $SZ*3(%r15),$D
1160         add     $SZ*4(%r15),$E
1161         add     $SZ*5(%r15),$F
1162         add     $SZ*6(%r15),$G
1163         lea     (%rsi,%r13),%r12
1164         add     $SZ*7(%r15),$H
1165
1166         cmp     $_end,%r13
1167
1168         mov     $A,$SZ*0(%r15)
1169         cmove   %rsp,%r12               # next block or stale data
1170         mov     $B,$SZ*1(%r15)
1171         mov     $C,$SZ*2(%r15)
1172         mov     $D,$SZ*3(%r15)
1173         mov     $E,$SZ*4(%r15)
1174         mov     $F,$SZ*5(%r15)
1175         mov     $G,$SZ*6(%r15)
1176         mov     $H,$SZ*7(%r15)
1177
1178         jbe     .Loop_avx2
1179         lea     (%rsp),$Tbl
1180
1181 .Ldone_avx2:
1182         lea     ($Tbl),%rsp
1183         mov     $_ivp,$ivp
1184         mov     $_rsp,%rsi
1185         vmovdqu $iv,($ivp)              # output IV
1186         vzeroall
1187 ___
1188 $code.=<<___ if ($win64);
1189         movaps  `$framesz+16*0`(%rsp),%xmm6
1190         movaps  `$framesz+16*1`(%rsp),%xmm7
1191         movaps  `$framesz+16*2`(%rsp),%xmm8
1192         movaps  `$framesz+16*3`(%rsp),%xmm9
1193         movaps  `$framesz+16*4`(%rsp),%xmm10
1194         movaps  `$framesz+16*5`(%rsp),%xmm11
1195         movaps  `$framesz+16*6`(%rsp),%xmm12
1196         movaps  `$framesz+16*7`(%rsp),%xmm13
1197         movaps  `$framesz+16*8`(%rsp),%xmm14
1198         movaps  `$framesz+16*9`(%rsp),%xmm15
1199 ___
1200 $code.=<<___;
1201         mov     (%rsi),%r15
1202         mov     8(%rsi),%r14
1203         mov     16(%rsi),%r13
1204         mov     24(%rsi),%r12
1205         mov     32(%rsi),%rbp
1206         mov     40(%rsi),%rbx
1207         lea     48(%rsi),%rsp
1208 .Lepilogue_avx2:
1209         ret
1210 .size   ${func}_avx2,.-${func}_avx2
1211 ___
1212 }}
1213 }}
1214 {{
1215 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1216
1217 my ($rounds,$Tbl)=("%r11d","%rbx");
1218
1219 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1220 my @rndkey=("%xmm4","%xmm5");
1221 my $r=0;
1222 my $sn=0;
1223
1224 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1225 my @MSG=map("%xmm$_",(10..13));
1226
1227 my $aesenc=sub {
1228   use integer;
1229   my ($n,$k)=($r/10,$r%10);
1230     if ($k==0) {
1231       $code.=<<___;
1232         movups          `16*$n`($in0),$in               # load input
1233         xorps           $rndkey0,$in
1234 ___
1235       $code.=<<___ if ($n);
1236         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1237 ___
1238       $code.=<<___;
1239         xorps           $in,$iv
1240         movups          `32+16*$k-112`($key),$rndkey[1]
1241         aesenc          $rndkey[0],$iv
1242 ___
1243     } elsif ($k==9) {
1244       $sn++;
1245       $code.=<<___;
1246         cmp             \$11,$rounds
1247         jb              .Laesenclast$sn
1248         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1249         aesenc          $rndkey[0],$iv
1250         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1251         aesenc          $rndkey[1],$iv
1252         je              .Laesenclast$sn
1253         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1254         aesenc          $rndkey[0],$iv
1255         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1256         aesenc          $rndkey[1],$iv
1257 .Laesenclast$sn:
1258         aesenclast      $rndkey[0],$iv
1259         movups          16-112($key),$rndkey[1]         # forward reference
1260         nop
1261 ___
1262     } else {
1263       $code.=<<___;
1264         movups          `32+16*$k-112`($key),$rndkey[1]
1265         aesenc          $rndkey[0],$iv
1266 ___
1267     }
1268     $r++;       unshift(@rndkey,pop(@rndkey));
1269 };
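
# ($aesenc emits one AES-CBC round per invocation for the SHA-NI path:
# $r counts rounds overall, $n=$r/10 is the block and $k=$r%10 the round
# within it.  k==0 loads and whitens the next plaintext block and stores
# the previous ciphertext; k==9 handles the extra rounds of 192/256-bit
# keys via the cmp \$11 ladder before aesenclast.  Ten invocations per
# block, four blocks per 64-byte SHA256 block, hence "while ($r<40)"
# further down.)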
1270
1271 if ($shaext) {
1272 my $Tbl="%rax";
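
# The SHA-NI path keeps the hash state in the ABEF/CDGH layout required
# by sha256rnds2 (see the pshufd/palignr shuffle below and its inverse at
# the end), consumes one 64-byte block per .Loop_shaext iteration, and
# lets the &$aesenc closure above weave four AES-CBC blocks into it.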
1273
1274 $code.=<<___;
1275 .type   ${func}_shaext,\@function,6
1276 .align  32
1277 ${func}_shaext:
1278         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1279 ___
1280 $code.=<<___ if ($win64);
1281         lea     `-8-10*16`(%rsp),%rsp
1282         movaps  %xmm6,-8-10*16(%rax)
1283         movaps  %xmm7,-8-9*16(%rax)
1284         movaps  %xmm8,-8-8*16(%rax)
1285         movaps  %xmm9,-8-7*16(%rax)
1286         movaps  %xmm10,-8-6*16(%rax)
1287         movaps  %xmm11,-8-5*16(%rax)
1288         movaps  %xmm12,-8-4*16(%rax)
1289         movaps  %xmm13,-8-3*16(%rax)
1290         movaps  %xmm14,-8-2*16(%rax)
1291         movaps  %xmm15,-8-1*16(%rax)
1292 .Lprologue_shaext:
1293 ___
1294 $code.=<<___;
1295         lea             K256+0x80(%rip),$Tbl
1296         movdqu          ($ctx),$ABEF            # DCBA
1297         movdqu          16($ctx),$CDGH          # HGFE
1298         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1299
1300         mov             240($key),$rounds
1301         sub             $in0,$out
1302         movups          ($key),$rndkey0         # $key[0]
1303         movups          16($key),$rndkey[0]     # forward reference
1304         lea             112($key),$key          # size optimization
1305
1306         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1307         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1308         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1309         movdqa          $TMP,$BSWAP             # offload
1310         palignr         \$8,$CDGH,$ABEF         # ABEF
1311         punpcklqdq      $Wi,$CDGH               # CDGH
1312
1313         jmp     .Loop_shaext
1314
1315 .align  16
1316 .Loop_shaext:
1317         movdqu          ($inp),@MSG[0]
1318         movdqu          0x10($inp),@MSG[1]
1319         movdqu          0x20($inp),@MSG[2]
1320         pshufb          $TMP,@MSG[0]
1321         movdqu          0x30($inp),@MSG[3]
1322
1323         movdqa          0*32-0x80($Tbl),$Wi
1324         paddd           @MSG[0],$Wi
1325         pshufb          $TMP,@MSG[1]
1326         movdqa          $CDGH,$CDGH_SAVE        # offload
1327         movdqa          $ABEF,$ABEF_SAVE        # offload
1328 ___
1329         &$aesenc();
1330 $code.=<<___;
1331         sha256rnds2     $ABEF,$CDGH             # 0-3
1332         pshufd          \$0x0e,$Wi,$Wi
1333 ___
1334         &$aesenc();
1335 $code.=<<___;
1336         sha256rnds2     $CDGH,$ABEF
1337
1338         movdqa          1*32-0x80($Tbl),$Wi
1339         paddd           @MSG[1],$Wi
1340         pshufb          $TMP,@MSG[2]
1341         lea             0x40($inp),$inp
1342 ___
1343         &$aesenc();
1344 $code.=<<___;
1345         sha256rnds2     $ABEF,$CDGH             # 4-7
1346         pshufd          \$0x0e,$Wi,$Wi
1347 ___
1348         &$aesenc();
1349 $code.=<<___;
1350         sha256rnds2     $CDGH,$ABEF
1351
1352         movdqa          2*32-0x80($Tbl),$Wi
1353         paddd           @MSG[2],$Wi
1354         pshufb          $TMP,@MSG[3]
1355         sha256msg1      @MSG[1],@MSG[0]
1356 ___
1357         &$aesenc();
1358 $code.=<<___;
1359         sha256rnds2     $ABEF,$CDGH             # 8-11
1360         pshufd          \$0x0e,$Wi,$Wi
1361         movdqa          @MSG[3],$TMP
1362         palignr         \$4,@MSG[2],$TMP
1363         paddd           $TMP,@MSG[0]
1364 ___
1365         &$aesenc();
1366 $code.=<<___;
1367         sha256rnds2     $CDGH,$ABEF
1368
1369         movdqa          3*32-0x80($Tbl),$Wi
1370         paddd           @MSG[3],$Wi
1371         sha256msg2      @MSG[3],@MSG[0]
1372         sha256msg1      @MSG[2],@MSG[1]
1373 ___
1374         &$aesenc();
1375 $code.=<<___;
1376         sha256rnds2     $ABEF,$CDGH             # 12-15
1377         pshufd          \$0x0e,$Wi,$Wi
1378 ___
1379         &$aesenc();
1380 $code.=<<___;
1381         movdqa          @MSG[0],$TMP
1382         palignr         \$4,@MSG[3],$TMP
1383         paddd           $TMP,@MSG[1]
1384         sha256rnds2     $CDGH,$ABEF
1385 ___
1386 for($i=4;$i<16-3;$i++) {
1387         &$aesenc()      if (($r%10)==0);
1388 $code.=<<___;
1389         movdqa          $i*32-0x80($Tbl),$Wi
1390         paddd           @MSG[0],$Wi
1391         sha256msg2      @MSG[0],@MSG[1]
1392         sha256msg1      @MSG[3],@MSG[2]
1393 ___
1394         &$aesenc();
1395 $code.=<<___;
1396         sha256rnds2     $ABEF,$CDGH             # 16-19...
1397         pshufd          \$0x0e,$Wi,$Wi
1398         movdqa          @MSG[1],$TMP
1399         palignr         \$4,@MSG[0],$TMP
1400         paddd           $TMP,@MSG[2]
1401 ___
1402         &$aesenc();
1403         &$aesenc()      if ($r==19);
1404 $code.=<<___;
1405         sha256rnds2     $CDGH,$ABEF
1406 ___
1407         push(@MSG,shift(@MSG));
1408 }
1409 $code.=<<___;
1410         movdqa          13*32-0x80($Tbl),$Wi
1411         paddd           @MSG[0],$Wi
1412         sha256msg2      @MSG[0],@MSG[1]
1413         sha256msg1      @MSG[3],@MSG[2]
1414 ___
1415         &$aesenc();
1416 $code.=<<___;
1417         sha256rnds2     $ABEF,$CDGH             # 52-55
1418         pshufd          \$0x0e,$Wi,$Wi
1419         movdqa          @MSG[1],$TMP
1420         palignr         \$4,@MSG[0],$TMP
1421         paddd           $TMP,@MSG[2]
1422 ___
1423         &$aesenc();
1424         &$aesenc();
1425 $code.=<<___;
1426         sha256rnds2     $CDGH,$ABEF
1427
1428         movdqa          14*32-0x80($Tbl),$Wi
1429         paddd           @MSG[1],$Wi
1430         sha256msg2      @MSG[1],@MSG[2]
1431         movdqa          $BSWAP,$TMP
1432 ___
1433         &$aesenc();
1434 $code.=<<___;
1435         sha256rnds2     $ABEF,$CDGH             # 56-59
1436         pshufd          \$0x0e,$Wi,$Wi
1437 ___
1438         &$aesenc();
1439 $code.=<<___;
1440         sha256rnds2     $CDGH,$ABEF
1441
1442         movdqa          15*32-0x80($Tbl),$Wi
1443         paddd           @MSG[2],$Wi
1444 ___
1445         &$aesenc();
1446         &$aesenc();
1447 $code.=<<___;
1448         sha256rnds2     $ABEF,$CDGH             # 60-63
1449         pshufd          \$0x0e,$Wi,$Wi
1450 ___
1451         &$aesenc();
1452 $code.=<<___;
1453         sha256rnds2     $CDGH,$ABEF
1454         #pxor           $CDGH,$rndkey0          # black magic
1455 ___
1456         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1457 $code.=<<___;
1458         #xorps          $CDGH,$rndkey0          # black magic
1459         paddd           $CDGH_SAVE,$CDGH
1460         paddd           $ABEF_SAVE,$ABEF
1461
1462         dec             $len
1463         movups          $iv,48($out,$in0)       # write output
1464         lea             64($in0),$in0
1465         jnz             .Loop_shaext
1466
1467         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1468         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1469         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1470         punpckhqdq      $CDGH,$ABEF             # DCBA
1471         palignr         \$8,$TMP,$CDGH          # HGFE
1472
1473         movups          $iv,($ivp)              # write IV
1474         movdqu          $ABEF,($ctx)
1475         movdqu          $CDGH,16($ctx)
1476 ___
1477 $code.=<<___ if ($win64);
1478         movaps  0*16(%rsp),%xmm6
1479         movaps  1*16(%rsp),%xmm7
1480         movaps  2*16(%rsp),%xmm8
1481         movaps  3*16(%rsp),%xmm9
1482         movaps  4*16(%rsp),%xmm10
1483         movaps  5*16(%rsp),%xmm11
1484         movaps  6*16(%rsp),%xmm12
1485         movaps  7*16(%rsp),%xmm13
1486         movaps  8*16(%rsp),%xmm14
1487         movaps  9*16(%rsp),%xmm15
1488         lea     8+10*16(%rsp),%rsp
1489 .Lepilogue_shaext:
1490 ___
1491 $code.=<<___;
1492         ret
1493 .size   ${func}_shaext,.-${func}_shaext
1494 ___
1495 }
1496 }}}}}
1497
1498 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1499 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1500 if ($win64 && $avx) {
1501 $rec="%rcx";
1502 $frame="%rdx";
1503 $context="%r8";
1504 $disp="%r9";
1505
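# se_handler is the common Win64 SEH handler: guided by the prologue and
# epilogue labels recorded in HandlerData below, it recovers the original
# %rsp (from the $_rsp slot, or by re-deriving the AVX2/SHAEXT frames),
# restores the non-volatile GPRs and %xmm6-%xmm15, and hands the rest of
# the unwind to RtlVirtualUnwind.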
1506 $code.=<<___;
1507 .extern __imp_RtlVirtualUnwind
1508 .type   se_handler,\@abi-omnipotent
1509 .align  16
1510 se_handler:
1511         push    %rsi
1512         push    %rdi
1513         push    %rbx
1514         push    %rbp
1515         push    %r12
1516         push    %r13
1517         push    %r14
1518         push    %r15
1519         pushfq
1520         sub     \$64,%rsp
1521
1522         mov     120($context),%rax      # pull context->Rax
1523         mov     248($context),%rbx      # pull context->Rip
1524
1525         mov     8($disp),%rsi           # disp->ImageBase
1526         mov     56($disp),%r11          # disp->HandlerData
1527
1528         mov     0(%r11),%r10d           # HandlerData[0]
1529         lea     (%rsi,%r10),%r10        # prologue label
1530         cmp     %r10,%rbx               # context->Rip<prologue label
1531         jb      .Lin_prologue
1532
1533         mov     152($context),%rax      # pull context->Rsp
1534
1535         mov     4(%r11),%r10d           # HandlerData[1]
1536         lea     (%rsi,%r10),%r10        # epilogue label
1537         cmp     %r10,%rbx               # context->Rip>=epilogue label
1538         jae     .Lin_prologue
1539 ___
1540 $code.=<<___ if ($shaext);
1541         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1542         cmp     %r10,%rbx
1543         jb      .Lnot_in_shaext
1544
1545         lea     (%rax),%rsi
1546         lea     512($context),%rdi      # &context.Xmm6
1547         mov     \$20,%ecx
1548         .long   0xa548f3fc              # cld; rep movsq
1549         lea     168(%rax),%rax          # adjust stack pointer
1550         jmp     .Lin_prologue
1551 .Lnot_in_shaext:
1552 ___
1553 $code.=<<___ if ($avx>1);
1554         lea     .Lavx2_shortcut(%rip),%r10
1555         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1556         jb      .Lnot_in_avx2
1557
1558         and     \$-256*$SZ,%rax
1559         add     \$`2*$SZ*($rounds-8)`,%rax
1560 .Lnot_in_avx2:
1561 ___
1562 $code.=<<___;
1563         mov     %rax,%rsi               # put aside Rsp
1564         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1565         lea     48(%rax),%rax
1566
1567         mov     -8(%rax),%rbx
1568         mov     -16(%rax),%rbp
1569         mov     -24(%rax),%r12
1570         mov     -32(%rax),%r13
1571         mov     -40(%rax),%r14
1572         mov     -48(%rax),%r15
1573         mov     %rbx,144($context)      # restore context->Rbx
1574         mov     %rbp,160($context)      # restore context->Rbp
1575         mov     %r12,216($context)      # restore context->R12
1576         mov     %r13,224($context)      # restore context->R13
1577         mov     %r14,232($context)      # restore context->R14
1578         mov     %r15,240($context)      # restore context->R15
1579
1580         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1581         lea     512($context),%rdi      # &context.Xmm6
1582         mov     \$20,%ecx
1583         .long   0xa548f3fc              # cld; rep movsq
1584
1585 .Lin_prologue:
1586         mov     8(%rax),%rdi
1587         mov     16(%rax),%rsi
1588         mov     %rax,152($context)      # restore context->Rsp
1589         mov     %rsi,168($context)      # restore context->Rsi
1590         mov     %rdi,176($context)      # restore context->Rdi
1591
1592         mov     40($disp),%rdi          # disp->ContextRecord
1593         mov     $context,%rsi           # context
1594         mov     \$154,%ecx              # sizeof(CONTEXT)
1595         .long   0xa548f3fc              # cld; rep movsq
1596
1597         mov     $disp,%rsi
1598         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1599         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1600         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1601         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1602         mov     40(%rsi),%r10           # disp->ContextRecord
1603         lea     56(%rsi),%r11           # &disp->HandlerData
1604         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1605         mov     %r10,32(%rsp)           # arg5
1606         mov     %r11,40(%rsp)           # arg6
1607         mov     %r12,48(%rsp)           # arg7
1608         mov     %rcx,56(%rsp)           # arg8, (NULL)
1609         call    *__imp_RtlVirtualUnwind(%rip)
1610
1611         mov     \$1,%eax                # ExceptionContinueSearch
1612         add     \$64,%rsp
1613         popfq
1614         pop     %r15
1615         pop     %r14
1616         pop     %r13
1617         pop     %r12
1618         pop     %rbp
1619         pop     %rbx
1620         pop     %rdi
1621         pop     %rsi
1622         ret
1623 .size   se_handler,.-se_handler
1624
1625 .section        .pdata
1626         .rva    .LSEH_begin_${func}_xop
1627         .rva    .LSEH_end_${func}_xop
1628         .rva    .LSEH_info_${func}_xop
1629
1630         .rva    .LSEH_begin_${func}_avx
1631         .rva    .LSEH_end_${func}_avx
1632         .rva    .LSEH_info_${func}_avx
1633 ___
1634 $code.=<<___ if ($avx>1);
1635         .rva    .LSEH_begin_${func}_avx2
1636         .rva    .LSEH_end_${func}_avx2
1637         .rva    .LSEH_info_${func}_avx2
1638 ___
1639 $code.=<<___ if ($shaext);
1640         .rva    .LSEH_begin_${func}_shaext
1641         .rva    .LSEH_end_${func}_shaext
1642         .rva    .LSEH_info_${func}_shaext
1643 ___
1644 $code.=<<___;
1645 .section        .xdata
1646 .align  8
1647 .LSEH_info_${func}_xop:
1648         .byte   9,0,0,0
1649         .rva    se_handler
1650         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1651
1652 .LSEH_info_${func}_avx:
1653         .byte   9,0,0,0
1654         .rva    se_handler
1655         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1656 ___
1657 $code.=<<___ if ($avx>1);
1658 .LSEH_info_${func}_avx2:
1659         .byte   9,0,0,0
1660         .rva    se_handler
1661         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1662 ___
1663 $code.=<<___ if ($shaext);
1664 .LSEH_info_${func}_shaext:
1665         .byte   9,0,0,0
1666         .rva    se_handler
1667         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1668 ___
1669 }
1670
1671 ####################################################################
1672 sub rex {
1673   local *opcode=shift;
1674   my ($dst,$src)=@_;
1675   my $rex=0;
1676
1677     $rex|=0x04                  if($dst>=8);
1678     $rex|=0x01                  if($src>=8);
1679     unshift @opcode,$rex|0x40   if($rex);
1680 }
1681
1682 {
1683   my %opcodelet = (
1684                 "sha256rnds2" => 0xcb,
1685                 "sha256msg1"  => 0xcc,
1686                 "sha256msg2"  => 0xcd   );
1687
1688   sub sha256op38 {
1689     my $instr = shift;
1690
1691     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1692       my @opcode=(0x0f,0x38);
1693         rex(\@opcode,$2,$1);
1694         push @opcode,$opcodelet{$instr};
1695         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1696         return ".byte\t".join(',',@opcode);
1697     } else {
1698         return $instr."\t".@_[0];
1699     }
1700   }
1701 }
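
# (rex() and sha256op38() let the module assemble even with tools that do
# not know the SHA extension: the three sha256* mnemonics are emitted by
# hand as 0x0f,0x38,<opcode> plus a ModR/M byte, with a REX prefix when
# %xmm8-%xmm15 are involved, via the substitution below.)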
1702
1703 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1704 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1705 print $code;
1706 close STDOUT;