1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows processor
16 # resources to be utilized better and overall performance to improve.
17 # The SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**) gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
29 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
30 #
31 # (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is left
32 #       out because the gain there was not estimated to be high
33 #       enough to justify the effort;
34 # (**)  these are EVP-free results; results obtained with 'speed
35 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
36
37 $flavour = shift;
38 $output  = shift;
39 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
40
41 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
42
43 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
44 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
45 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
46 die "can't locate x86_64-xlate.pl";
47
48 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
49                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
50         $avx = ($1>=2.19) + ($1>=2.22);
51 }
52
53 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
54            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
55         $avx = ($1>=2.09) + ($1>=2.10);
56 }
57
58 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
59            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
60         $avx = ($1>=10) + ($1>=12);
61 }
62
63 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
64         $avx = ($2>=3.0) + ($2>3.0);
65 }
66
67 $shaext=$avx;   ### set to zero if compiling for 1.0.1
68 $avx=1          if (!$shaext && $avx);
69
70 open OUT,"| \"$^X\" $xlate $flavour $output";
71 *STDOUT=*OUT;
72
73 $func="aesni_cbc_sha256_enc";
74 $TABLE="K256";
75 $SZ=4;
76 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
77                                 "%r8d","%r9d","%r10d","%r11d");
78 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
79 @Sigma0=( 2,13,22);
80 @Sigma1=( 6,11,25);
81 @sigma0=( 7,18, 3);
82 @sigma1=(17,19,10);
83 $rounds=64;
84
85 ########################################################################
86 # void aesni_cbc_sha256_enc(const void *inp,
87 #                       void *out,
88 #                       size_t length,
89 #                       const AES_KEY *key,
90 #                       unsigned char *iv,
91 #                       SHA256_CTX *ctx,
92 #                       const void *in0);
93 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
94 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
95
96 $Tbl="%rbp";
97
98 $_inp="16*$SZ+0*8(%rsp)";
99 $_out="16*$SZ+1*8(%rsp)";
100 $_end="16*$SZ+2*8(%rsp)";
101 $_key="16*$SZ+3*8(%rsp)";
102 $_ivp="16*$SZ+4*8(%rsp)";
103 $_ctx="16*$SZ+5*8(%rsp)";
104 $_in0="16*$SZ+6*8(%rsp)";
105 $_rsp="16*$SZ+7*8(%rsp)";
106 $framesz=16*$SZ+8*8;
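#
# Editorial sketch of the stack frame implied by the definitions above
# (XOP and AVX1 paths): the first 16*$SZ bytes at %rsp hold the 16-entry
# message schedule X[] with the round constants already folded in (this is
# what the "h+=X[i]+K[i]" step in body_00_15 reads back), followed by the
# seven saved pointers and the caller's %rsp (the 8*8 term in $framesz);
# on WIN64 a further 16*10 bytes above $framesz preserve %xmm6-%xmm15.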
107
108 $code=<<___;
109 .text
110
111 .extern OPENSSL_ia32cap_P
112 .globl  $func
113 .type   $func,\@abi-omnipotent
114 .align  16
115 $func:
116 ___
117                                                 if ($avx) {
118 $code.=<<___;
119         lea     OPENSSL_ia32cap_P(%rip),%r11
120         mov     \$1,%eax
121         cmp     \$0,`$win64?"%rcx":"%rdi"`
122         je      .Lprobe
123         mov     0(%r11),%eax
124         mov     4(%r11),%r10
125 ___
126 $code.=<<___ if ($shaext);
127         bt      \$61,%r10                       # check for SHA
128         jc      ${func}_shaext
129 ___
130 $code.=<<___;
131         mov     %r10,%r11
132         shr     \$32,%r11
133
134         test    \$`1<<11`,%r10d                 # check for XOP
135         jnz     ${func}_xop
136 ___
137 $code.=<<___ if ($avx>1);
138         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
139         cmp     \$`1<<8|1<<5|1<<3`,%r11d
140         je      ${func}_avx2
141 ___
142 $code.=<<___;
143         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
144         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
145         or      %eax,%r10d
146         cmp     \$`1<<28|1<<9|1<<30`,%r10d
147         je      ${func}_avx
148         ud2
149 ___
150                                                 }
151 $code.=<<___;
152         xor     %eax,%eax
153         cmp     \$0,`$win64?"%rcx":"%rdi"`
154         je      .Lprobe
155         ud2
156 .Lprobe:
157         ret
158 .size   $func,.-$func
159
160 .align  64
161 .type   $TABLE,\@object
162 $TABLE:
163         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
164         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
165         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
166         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
167         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
168         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
169         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
170         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
171         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
172         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
173         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
174         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
175         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
176         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
177         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
178         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
179         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
180         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
181         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
182         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
183         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
184         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
185         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
186         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
187         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
188         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
189         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
190         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
191         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
192         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
193         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
194         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
195
196         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
197         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
198         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
199         .long   0,0,0,0,   0,0,0,0
200         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
201 .align  64
202 ___
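# Editorial note: each 16-byte row of $TABLE is stored twice so that the
# AVX2 path can consume it directly as a 256-bit constant spanning both
# 128-bit lanes; the rows following the round constants are the byte-swap
# mask and the zero/all-ones blend masks that get indexed by the AES round
# count (they become $mask14/$mask12/$mask10 below).
#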
203
204 ######################################################################
205 # SIMD code paths
206 #
207 {{{
208 ($iv,$inout,$roundkey,$temp,
209  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
210
211 $aesni_cbc_idx=0;
212 @aesni_cbc_block = (
213 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
214 ##      &vmovdqu        ($inout,($inp));
215 ##      &mov            ($_inp,$inp);
216
217         '&vpxor         ($inout,$inout,$roundkey);'.
218         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
219
220         '&vpxor         ($inout,$inout,$iv);',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
230
231         '&vaesenc       ($inout,$inout,$roundkey);'.
232         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
233
234         '&vaesenc       ($inout,$inout,$roundkey);'.
235         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
236
237         '&vaesenc       ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
239
240         '&vaesenc       ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
242
243         '&vaesenc       ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
245
246         '&vaesenc       ($inout,$inout,$roundkey);'.
247         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
248
249         '&vaesenclast   ($temp,$inout,$roundkey);'.
250         ' &vaesenc      ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
252
253         '&vpand         ($iv,$temp,$mask10);'.
254         ' &vaesenc      ($inout,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
256
257         '&vaesenclast   ($temp,$inout,$roundkey);'.
258         ' &vaesenc      ($inout,$inout,$roundkey);'.
259         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
260
261         '&vpand         ($temp,$temp,$mask12);'.
262         ' &vaesenc      ($inout,$inout,$roundkey);'.
263          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
264
265         '&vpor          ($iv,$iv,$temp);'.
266         ' &vaesenclast  ($temp,$inout,$roundkey);'.
267         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
268
269 ##      &mov            ($inp,$_inp);
270 ##      &mov            ($out,$_out);
271 ##      &vpand          ($temp,$temp,$mask14);
272 ##      &vpor           ($iv,$iv,$temp);
273 ##      &vmovdqu        ("($out,$inp)",$iv);
274 ##      &lea            ($inp,"16($inp)");
275 );
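#
# Editorial note on the blend masks: the stream above always executes a
# vaesenclast at the 10-, 12- and 14-round positions while continuing with
# vaesenc; $mask10/$mask12/$mask14 are loaded according to the actual round
# count so that exactly one of them is all-ones, and the three candidate
# results are combined as (r10&mask10)|(r12&mask12)|(r14&mask14).  One
# instruction sequence therefore serves AES-128/-192/-256 without branches.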
276
277 my $a4=$T1;
278 my ($a,$b,$c,$d,$e,$f,$g,$h);
279
280 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
281 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
282   my $arg = pop;
283     $arg = "\$$arg" if ($arg*1 eq $arg);
284     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
285 }
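# For example, &ror($a0,14) falls through AUTOLOAD and appends
# "\tror\t\$14,%r13d\n" to $code (AT&T order, immediate operand first).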
286
287 sub body_00_15 () {
288         (
289         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
290
291         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
292         '&mov   ($a,$a1)',
293         '&mov   ($a4,$f)',
294
295         '&xor   ($a0,$e)',
296         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
297         '&xor   ($a4,$g)',                      # f^g
298
299         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
300         '&xor   ($a1,$a)',
301         '&and   ($a4,$e)',                      # (f^g)&e
302
303         @aesni_cbc_block[$aesni_cbc_idx++].
304         '&xor   ($a0,$e)',
305         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
306         '&mov   ($a2,$a)',
307
308         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
309         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
310         '&xor   ($a2,$b)',                      # a^b, b^c in next round
311
312         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
313         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
314         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
315
316         '&xor   ($a1,$a)',
317         '&add   ($h,$a0)',                      # h+=Sigma1(e)
318         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
319
320         '&add   ($d,$h)',                       # d+=h
321         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
322         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
323
324         '&mov   ($a0,$d)',
325         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
326         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
327         );
328 }
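#
# The fragments above add up to one scalar SHA256 round, using the usual
# branch-free identities Ch(e,f,g)=((f^g)&e)^g and Maj(a,b,c)=((b^c)&(a^b))^b,
# with b^c for the next round pre-computed in $a3 and h+Sigma0(a) accumulated
# in $a1 to become the next round's a.  Below is a minimal, purely
# illustrative reference of the same round in plain Perl; the helper is ours,
# is never called by the generator, and exists only as documentation.
sub _sha256_round_ref {			# editorial reference, not called
  my ($a,$b,$c,$d,$e,$f,$g,$h,$Kt,$Wt) = @_;	# 32-bit state words + K[t], W[t]
  my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
  my $S1  = $rotr->($e,6) ^ $rotr->($e,11) ^ $rotr->($e,25);	# Sigma1(e)
  my $Ch  = (($f ^ $g) & $e) ^ $g;				# Ch(e,f,g)
  my $T1  = ($h + $S1 + $Ch + $Kt + $Wt) & 0xffffffff;
  my $S0  = $rotr->($a,2) ^ $rotr->($a,13) ^ $rotr->($a,22);	# Sigma0(a)
  my $Maj = (($b ^ $c) & ($a ^ $b)) ^ $b;			# Maj(a,b,c)
  return ((($T1+$S0+$Maj)&0xffffffff), $a,$b,$c, (($d+$T1)&0xffffffff), $e,$f,$g);
}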
329
330 if ($avx) {{
331 ######################################################################
332 # XOP code path
333 #
334 $code.=<<___;
335 .type   ${func}_xop,\@function,6
336 .align  64
337 ${func}_xop:
338 .Lxop_shortcut:
339         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
340         push    %rbx
341         push    %rbp
342         push    %r12
343         push    %r13
344         push    %r14
345         push    %r15
346         mov     %rsp,%r11               # copy %rsp
347         sub     \$`$framesz+$win64*16*10`,%rsp
348         and     \$-64,%rsp              # align stack frame
349
350         shl     \$6,$len
351         sub     $inp,$out               # re-bias
352         sub     $inp,$in0
353         add     $inp,$len               # end of input
354
355         #mov    $inp,$_inp              # saved later
356         mov     $out,$_out
357         mov     $len,$_end
358         #mov    $key,$_key              # remains resident in $inp register
359         mov     $ivp,$_ivp
360         mov     $ctx,$_ctx
361         mov     $in0,$_in0
362         mov     %r11,$_rsp
363 ___
364 $code.=<<___ if ($win64);
365         movaps  %xmm6,`$framesz+16*0`(%rsp)
366         movaps  %xmm7,`$framesz+16*1`(%rsp)
367         movaps  %xmm8,`$framesz+16*2`(%rsp)
368         movaps  %xmm9,`$framesz+16*3`(%rsp)
369         movaps  %xmm10,`$framesz+16*4`(%rsp)
370         movaps  %xmm11,`$framesz+16*5`(%rsp)
371         movaps  %xmm12,`$framesz+16*6`(%rsp)
372         movaps  %xmm13,`$framesz+16*7`(%rsp)
373         movaps  %xmm14,`$framesz+16*8`(%rsp)
374         movaps  %xmm15,`$framesz+16*9`(%rsp)
375 ___
376 $code.=<<___;
377 .Lprologue_xop:
378         vzeroall
379
380         mov     $inp,%r12               # borrow $a4
381         lea     0x80($key),$inp         # size optimization, reassign
382         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
383         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
384         mov     $ctx,%r15               # borrow $a2
385         mov     $in0,%rsi               # borrow $a3
386         vmovdqu ($ivp),$iv              # load IV
387         sub     \$9,%r14
388
389         mov     $SZ*0(%r15),$A
390         mov     $SZ*1(%r15),$B
391         mov     $SZ*2(%r15),$C
392         mov     $SZ*3(%r15),$D
393         mov     $SZ*4(%r15),$E
394         mov     $SZ*5(%r15),$F
395         mov     $SZ*6(%r15),$G
396         mov     $SZ*7(%r15),$H
397
398         vmovdqa 0x00(%r13,%r14,8),$mask14
399         vmovdqa 0x10(%r13,%r14,8),$mask12
400         vmovdqa 0x20(%r13,%r14,8),$mask10
401         vmovdqu 0x00-0x80($inp),$roundkey
402         jmp     .Lloop_xop
403 ___
404                                         if ($SZ==4) {   # SHA256
405     my @X = map("%xmm$_",(0..3));
406     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
407
408 $code.=<<___;
409 .align  16
410 .Lloop_xop:
411         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
412         vmovdqu 0x00(%rsi,%r12),@X[0]
413         vmovdqu 0x10(%rsi,%r12),@X[1]
414         vmovdqu 0x20(%rsi,%r12),@X[2]
415         vmovdqu 0x30(%rsi,%r12),@X[3]
416         vpshufb $t3,@X[0],@X[0]
417         lea     $TABLE(%rip),$Tbl
418         vpshufb $t3,@X[1],@X[1]
419         vpshufb $t3,@X[2],@X[2]
420         vpaddd  0x00($Tbl),@X[0],$t0
421         vpshufb $t3,@X[3],@X[3]
422         vpaddd  0x20($Tbl),@X[1],$t1
423         vpaddd  0x40($Tbl),@X[2],$t2
424         vpaddd  0x60($Tbl),@X[3],$t3
425         vmovdqa $t0,0x00(%rsp)
426         mov     $A,$a1
427         vmovdqa $t1,0x10(%rsp)
428         mov     $B,$a3
429         vmovdqa $t2,0x20(%rsp)
430         xor     $C,$a3                  # magic
431         vmovdqa $t3,0x30(%rsp)
432         mov     $E,$a0
433         jmp     .Lxop_00_47
434
435 .align  16
436 .Lxop_00_47:
437         sub     \$-16*2*$SZ,$Tbl        # size optimization
438         vmovdqu (%r12),$inout           # $a4
439         mov     %r12,$_inp              # $a4
440 ___
441 sub XOP_256_00_47 () {
442 my $j = shift;
443 my $body = shift;
444 my @X = @_;
445 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
446
447         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
448           eval(shift(@insns));
449           eval(shift(@insns));
450          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
451           eval(shift(@insns));
452           eval(shift(@insns));
453         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
454           eval(shift(@insns));
455           eval(shift(@insns));
456         &vpsrld         ($t0,$t0,$sigma0[2]);
457           eval(shift(@insns));
458           eval(shift(@insns));
459          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
460           eval(shift(@insns));
461           eval(shift(@insns));
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
465           eval(shift(@insns));
466           eval(shift(@insns));
467         &vpxor          ($t0,$t0,$t1);
468           eval(shift(@insns));
469           eval(shift(@insns));
470           eval(shift(@insns));
471           eval(shift(@insns));
472          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
473           eval(shift(@insns));
474           eval(shift(@insns));
475         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
476           eval(shift(@insns));
477           eval(shift(@insns));
478          &vpsrld        ($t2,@X[3],$sigma1[2]);
479           eval(shift(@insns));
480           eval(shift(@insns));
481         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
482           eval(shift(@insns));
483           eval(shift(@insns));
484          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
485           eval(shift(@insns));
486           eval(shift(@insns));
487          &vpxor         ($t3,$t3,$t2);
488           eval(shift(@insns));
489           eval(shift(@insns));
490           eval(shift(@insns));
491           eval(shift(@insns));
492          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
493           eval(shift(@insns));
494           eval(shift(@insns));
495           eval(shift(@insns));
496           eval(shift(@insns));
497         &vpsrldq        ($t3,$t3,8);
498           eval(shift(@insns));
499           eval(shift(@insns));
500           eval(shift(@insns));
501           eval(shift(@insns));
502         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
503           eval(shift(@insns));
504           eval(shift(@insns));
505           eval(shift(@insns));
506           eval(shift(@insns));
507          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
508           eval(shift(@insns));
509           eval(shift(@insns));
510          &vpsrld        ($t2,@X[0],$sigma1[2]);
511           eval(shift(@insns));
512           eval(shift(@insns));
513          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
514           eval(shift(@insns));
515           eval(shift(@insns));
516          &vpxor         ($t3,$t3,$t2);
517           eval(shift(@insns));
518           eval(shift(@insns));
519           eval(shift(@insns));
520           eval(shift(@insns));
521          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
522           eval(shift(@insns));
523           eval(shift(@insns));
524           eval(shift(@insns));
525           eval(shift(@insns));
526         &vpslldq        ($t3,$t3,8);            # 22 instructions
527           eval(shift(@insns));
528           eval(shift(@insns));
529           eval(shift(@insns));
530           eval(shift(@insns));
531         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
532           eval(shift(@insns));
533           eval(shift(@insns));
534           eval(shift(@insns));
535           eval(shift(@insns));
536         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
537           foreach (@insns) { eval; }            # remaining instructions
538         &vmovdqa        (16*$j."(%rsp)",$t2);
539 }
540
541     $aesni_cbc_idx=0;
542     for ($i=0,$j=0; $j<4; $j++) {
543         &XOP_256_00_47($j,\&body_00_15,@X);
544         push(@X,shift(@X));                     # rotate(@X)
545     }
546         &mov            ("%r12",$_inp);         # borrow $a4
547         &vpand          ($temp,$temp,$mask14);
548         &mov            ("%r15",$_out);         # borrow $a2
549         &vpor           ($iv,$iv,$temp);
550         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
551         &lea            ("%r12","16(%r12)");    # inp++
552
553         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
554         &jne    (".Lxop_00_47");
555
556         &vmovdqu        ($inout,"(%r12)");
557         &mov            ($_inp,"%r12");
558
559     $aesni_cbc_idx=0;
560     for ($i=0; $i<16; ) {
561         foreach(body_00_15()) { eval; }
562     }
563                                         }
564 $code.=<<___;
565         mov     $_inp,%r12              # borrow $a4
566         mov     $_out,%r13              # borrow $a0
567         mov     $_ctx,%r15              # borrow $a2
568         mov     $_in0,%rsi              # borrow $a3
569
570         vpand   $mask14,$temp,$temp
571         mov     $a1,$A
572         vpor    $temp,$iv,$iv
573         vmovdqu $iv,(%r13,%r12)         # write output
574         lea     16(%r12),%r12           # inp++
575
576         add     $SZ*0(%r15),$A
577         add     $SZ*1(%r15),$B
578         add     $SZ*2(%r15),$C
579         add     $SZ*3(%r15),$D
580         add     $SZ*4(%r15),$E
581         add     $SZ*5(%r15),$F
582         add     $SZ*6(%r15),$G
583         add     $SZ*7(%r15),$H
584
585         cmp     $_end,%r12
586
587         mov     $A,$SZ*0(%r15)
588         mov     $B,$SZ*1(%r15)
589         mov     $C,$SZ*2(%r15)
590         mov     $D,$SZ*3(%r15)
591         mov     $E,$SZ*4(%r15)
592         mov     $F,$SZ*5(%r15)
593         mov     $G,$SZ*6(%r15)
594         mov     $H,$SZ*7(%r15)
595
596         jb      .Lloop_xop
597
598         mov     $_ivp,$ivp
599         mov     $_rsp,%rsi
600         vmovdqu $iv,($ivp)              # output IV
601         vzeroall
602 ___
603 $code.=<<___ if ($win64);
604         movaps  `$framesz+16*0`(%rsp),%xmm6
605         movaps  `$framesz+16*1`(%rsp),%xmm7
606         movaps  `$framesz+16*2`(%rsp),%xmm8
607         movaps  `$framesz+16*3`(%rsp),%xmm9
608         movaps  `$framesz+16*4`(%rsp),%xmm10
609         movaps  `$framesz+16*5`(%rsp),%xmm11
610         movaps  `$framesz+16*6`(%rsp),%xmm12
611         movaps  `$framesz+16*7`(%rsp),%xmm13
612         movaps  `$framesz+16*8`(%rsp),%xmm14
613         movaps  `$framesz+16*9`(%rsp),%xmm15
614 ___
615 $code.=<<___;
616         mov     (%rsi),%r15
617         mov     8(%rsi),%r14
618         mov     16(%rsi),%r13
619         mov     24(%rsi),%r12
620         mov     32(%rsi),%rbp
621         mov     40(%rsi),%rbx
622         lea     48(%rsi),%rsp
623 .Lepilogue_xop:
624         ret
625 .size   ${func}_xop,.-${func}_xop
626 ___
627 ######################################################################
628 # AVX+shrd code path
629 #
630 local *ror = sub { &shrd(@_[0],@_) };
631
632 $code.=<<___;
633 .type   ${func}_avx,\@function,6
634 .align  64
635 ${func}_avx:
636 .Lavx_shortcut:
637         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
638         push    %rbx
639         push    %rbp
640         push    %r12
641         push    %r13
642         push    %r14
643         push    %r15
644         mov     %rsp,%r11               # copy %rsp
645         sub     \$`$framesz+$win64*16*10`,%rsp
646         and     \$-64,%rsp              # align stack frame
647
648         shl     \$6,$len
649         sub     $inp,$out               # re-bias
650         sub     $inp,$in0
651         add     $inp,$len               # end of input
652
653         #mov    $inp,$_inp              # saved later
654         mov     $out,$_out
655         mov     $len,$_end
656         #mov    $key,$_key              # remains resident in $inp register
657         mov     $ivp,$_ivp
658         mov     $ctx,$_ctx
659         mov     $in0,$_in0
660         mov     %r11,$_rsp
661 ___
662 $code.=<<___ if ($win64);
663         movaps  %xmm6,`$framesz+16*0`(%rsp)
664         movaps  %xmm7,`$framesz+16*1`(%rsp)
665         movaps  %xmm8,`$framesz+16*2`(%rsp)
666         movaps  %xmm9,`$framesz+16*3`(%rsp)
667         movaps  %xmm10,`$framesz+16*4`(%rsp)
668         movaps  %xmm11,`$framesz+16*5`(%rsp)
669         movaps  %xmm12,`$framesz+16*6`(%rsp)
670         movaps  %xmm13,`$framesz+16*7`(%rsp)
671         movaps  %xmm14,`$framesz+16*8`(%rsp)
672         movaps  %xmm15,`$framesz+16*9`(%rsp)
673 ___
674 $code.=<<___;
675 .Lprologue_avx:
676         vzeroall
677
678         mov     $inp,%r12               # borrow $a4
679         lea     0x80($key),$inp         # size optimization, reassign
680         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
681         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
682         mov     $ctx,%r15               # borrow $a2
683         mov     $in0,%rsi               # borrow $a3
684         vmovdqu ($ivp),$iv              # load IV
685         sub     \$9,%r14
686
687         mov     $SZ*0(%r15),$A
688         mov     $SZ*1(%r15),$B
689         mov     $SZ*2(%r15),$C
690         mov     $SZ*3(%r15),$D
691         mov     $SZ*4(%r15),$E
692         mov     $SZ*5(%r15),$F
693         mov     $SZ*6(%r15),$G
694         mov     $SZ*7(%r15),$H
695
696         vmovdqa 0x00(%r13,%r14,8),$mask14
697         vmovdqa 0x10(%r13,%r14,8),$mask12
698         vmovdqa 0x20(%r13,%r14,8),$mask10
699         vmovdqu 0x00-0x80($inp),$roundkey
700 ___
701                                         if ($SZ==4) {   # SHA256
702     my @X = map("%xmm$_",(0..3));
703     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
704
705 $code.=<<___;
706         jmp     .Lloop_avx
707 .align  16
708 .Lloop_avx:
709         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
710         vmovdqu 0x00(%rsi,%r12),@X[0]
711         vmovdqu 0x10(%rsi,%r12),@X[1]
712         vmovdqu 0x20(%rsi,%r12),@X[2]
713         vmovdqu 0x30(%rsi,%r12),@X[3]
714         vpshufb $t3,@X[0],@X[0]
715         lea     $TABLE(%rip),$Tbl
716         vpshufb $t3,@X[1],@X[1]
717         vpshufb $t3,@X[2],@X[2]
718         vpaddd  0x00($Tbl),@X[0],$t0
719         vpshufb $t3,@X[3],@X[3]
720         vpaddd  0x20($Tbl),@X[1],$t1
721         vpaddd  0x40($Tbl),@X[2],$t2
722         vpaddd  0x60($Tbl),@X[3],$t3
723         vmovdqa $t0,0x00(%rsp)
724         mov     $A,$a1
725         vmovdqa $t1,0x10(%rsp)
726         mov     $B,$a3
727         vmovdqa $t2,0x20(%rsp)
728         xor     $C,$a3                  # magic
729         vmovdqa $t3,0x30(%rsp)
730         mov     $E,$a0
731         jmp     .Lavx_00_47
732
733 .align  16
734 .Lavx_00_47:
735         sub     \$-16*2*$SZ,$Tbl        # size optimization
736         vmovdqu (%r12),$inout           # $a4
737         mov     %r12,$_inp              # $a4
738 ___
739 sub Xupdate_256_AVX () {
740         (
741         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
742          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
743         '&vpsrld        ($t2,$t0,$sigma0[0]);',
744          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
745         '&vpsrld        ($t3,$t0,$sigma0[2])',
746         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
747         '&vpxor         ($t0,$t3,$t2)',
748          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
749         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
750         '&vpxor         ($t0,$t0,$t1)',
751         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
752         '&vpxor         ($t0,$t0,$t2)',
753          '&vpsrld       ($t2,$t3,$sigma1[2]);',
754         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
755          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
756         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
757          '&vpxor        ($t2,$t2,$t3);',
758          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
759          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
760          '&vpshufd      ($t2,$t2,0b10000100)',
761          '&vpsrldq      ($t2,$t2,8)',
762         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
763          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
764          '&vpsrld       ($t2,$t3,$sigma1[2])',
765          '&vpsrlq       ($t3,$t3,$sigma1[0])',
766          '&vpxor        ($t2,$t2,$t3);',
767          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
768          '&vpxor        ($t2,$t2,$t3)',
769          '&vpshufd      ($t2,$t2,0b11101000)',
770          '&vpslldq      ($t2,$t2,8)',
771         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
772         );
773 }
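#
# Scalar equivalent of the four-lane message-schedule update above, given as
# an editorial reference only (the helper below is ours and is never called
# by the generator): for t>=16, W[t]=sigma1(W[t-2])+W[t-7]+sigma0(W[t-15])+W[t-16]
# with sigma0(x)=ROTR7(x)^ROTR18(x)^(x>>3) and sigma1(x)=ROTR17(x)^ROTR19(x)^(x>>10).
# The SIMD code produces four consecutive W[t] per call, which is why sigma1 of
# the freshly computed X[16..17] has to be folded in as a second step.
sub _sha256_Wt_ref {			# editorial reference, not called
  my ($W,$t) = @_;			# $W: array ref of 32-bit schedule words
  my $rotr = sub { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
  my $s0 = $rotr->($W->[$t-15],7)  ^ $rotr->($W->[$t-15],18) ^ ($W->[$t-15] >> 3);
  my $s1 = $rotr->($W->[$t-2],17)  ^ $rotr->($W->[$t-2],19)  ^ ($W->[$t-2] >> 10);
  return ($s1 + $W->[$t-7] + $s0 + $W->[$t-16]) & 0xffffffff;
}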
774
775 sub AVX_256_00_47 () {
776 my $j = shift;
777 my $body = shift;
778 my @X = @_;
779 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
780
781         foreach (Xupdate_256_AVX()) {           # 29 instructions
782             eval;
783             eval(shift(@insns));
784             eval(shift(@insns));
785             eval(shift(@insns));
786         }
787         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
788           foreach (@insns) { eval; }            # remaining instructions
789         &vmovdqa        (16*$j."(%rsp)",$t2);
790 }
791
792     $aesni_cbc_idx=0;
793     for ($i=0,$j=0; $j<4; $j++) {
794         &AVX_256_00_47($j,\&body_00_15,@X);
795         push(@X,shift(@X));                     # rotate(@X)
796     }
797         &mov            ("%r12",$_inp);         # borrow $a4
798         &vpand          ($temp,$temp,$mask14);
799         &mov            ("%r15",$_out);         # borrow $a2
800         &vpor           ($iv,$iv,$temp);
801         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
802         &lea            ("%r12","16(%r12)");    # inp++
803
804         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
805         &jne    (".Lavx_00_47");
806
807         &vmovdqu        ($inout,"(%r12)");
808         &mov            ($_inp,"%r12");
809
810     $aesni_cbc_idx=0;
811     for ($i=0; $i<16; ) {
812         foreach(body_00_15()) { eval; }
813     }
814
815                                         }
816 $code.=<<___;
817         mov     $_inp,%r12              # borrow $a4
818         mov     $_out,%r13              # borrow $a0
819         mov     $_ctx,%r15              # borrow $a2
820         mov     $_in0,%rsi              # borrow $a3
821
822         vpand   $mask14,$temp,$temp
823         mov     $a1,$A
824         vpor    $temp,$iv,$iv
825         vmovdqu $iv,(%r13,%r12)         # write output
826         lea     16(%r12),%r12           # inp++
827
828         add     $SZ*0(%r15),$A
829         add     $SZ*1(%r15),$B
830         add     $SZ*2(%r15),$C
831         add     $SZ*3(%r15),$D
832         add     $SZ*4(%r15),$E
833         add     $SZ*5(%r15),$F
834         add     $SZ*6(%r15),$G
835         add     $SZ*7(%r15),$H
836
837         cmp     $_end,%r12
838
839         mov     $A,$SZ*0(%r15)
840         mov     $B,$SZ*1(%r15)
841         mov     $C,$SZ*2(%r15)
842         mov     $D,$SZ*3(%r15)
843         mov     $E,$SZ*4(%r15)
844         mov     $F,$SZ*5(%r15)
845         mov     $G,$SZ*6(%r15)
846         mov     $H,$SZ*7(%r15)
847         jb      .Lloop_avx
848
849         mov     $_ivp,$ivp
850         mov     $_rsp,%rsi
851         vmovdqu $iv,($ivp)              # output IV
852         vzeroall
853 ___
854 $code.=<<___ if ($win64);
855         movaps  `$framesz+16*0`(%rsp),%xmm6
856         movaps  `$framesz+16*1`(%rsp),%xmm7
857         movaps  `$framesz+16*2`(%rsp),%xmm8
858         movaps  `$framesz+16*3`(%rsp),%xmm9
859         movaps  `$framesz+16*4`(%rsp),%xmm10
860         movaps  `$framesz+16*5`(%rsp),%xmm11
861         movaps  `$framesz+16*6`(%rsp),%xmm12
862         movaps  `$framesz+16*7`(%rsp),%xmm13
863         movaps  `$framesz+16*8`(%rsp),%xmm14
864         movaps  `$framesz+16*9`(%rsp),%xmm15
865 ___
866 $code.=<<___;
867         mov     (%rsi),%r15
868         mov     8(%rsi),%r14
869         mov     16(%rsi),%r13
870         mov     24(%rsi),%r12
871         mov     32(%rsi),%rbp
872         mov     40(%rsi),%rbx
873         lea     48(%rsi),%rsp
874 .Lepilogue_avx:
875         ret
876 .size   ${func}_avx,.-${func}_avx
877 ___
878
879 if ($avx>1) {{
880 ######################################################################
881 # AVX2+BMI code path
882 #
883 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
884 my $PUSH8=8*2*$SZ;
885 use integer;
886
887 sub bodyx_00_15 () {
888         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
889         (
890         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
891
892         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
893         '&and   ($a4,$e)',              # f&e
894         '&rorx  ($a0,$e,$Sigma1[2])',
895         '&rorx  ($a2,$e,$Sigma1[1])',
896
897         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
898         '&lea   ($h,"($h,$a4)")',
899         '&andn  ($a4,$e,$g)',           # ~e&g
900         '&xor   ($a0,$a2)',
901
902         '&rorx  ($a1,$e,$Sigma1[0])',
903         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
904         '&xor   ($a0,$a1)',             # Sigma1(e)
905         '&mov   ($a2,$a)',
906
907         '&rorx  ($a4,$a,$Sigma0[2])',
908         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
909         '&xor   ($a2,$b)',              # a^b, b^c in next round
910         '&rorx  ($a1,$a,$Sigma0[1])',
911
912         '&rorx  ($a0,$a,$Sigma0[0])',
913         '&lea   ($d,"($d,$h)")',        # d+=h
914         '&and   ($a3,$a2)',             # (b^c)&(a^b)
915         @aesni_cbc_block[$aesni_cbc_idx++].
916         '&xor   ($a1,$a4)',
917
918         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
919         '&xor   ($a1,$a0)',             # Sigma0(a)
920         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
921         '&mov   ($a4,$e)',              # copy of f in future
922
923         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
924         );
925         # and at the finish one has to $a+=$a1
926 }
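#
# Editorial note: in this BMI variant Ch(e,f,g) is accumulated as the sum
# (e&f)+(~e&g) via two lea instructions; the two terms never have a set bit
# in common, so the addition is equivalent to the OR/XOR of the canonical
# definition.  Sigma0(a) is likewise carried over in $a1 and only added into
# $a at the start of the next round, which is why a final "$a+=$a1" is needed
# after the last round (see the note above).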
927
928 $code.=<<___;
929 .type   ${func}_avx2,\@function,6
930 .align  64
931 ${func}_avx2:
932 .Lavx2_shortcut:
933         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
934         push    %rbx
935         push    %rbp
936         push    %r12
937         push    %r13
938         push    %r14
939         push    %r15
940         mov     %rsp,%r11               # copy %rsp
941         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
942         and     \$-256*$SZ,%rsp         # align stack frame
943         add     \$`2*$SZ*($rounds-8)`,%rsp
944
945         shl     \$6,$len
946         sub     $inp,$out               # re-bias
947         sub     $inp,$in0
948         add     $inp,$len               # end of input
949
950         #mov    $inp,$_inp              # saved later
951         #mov    $out,$_out              # kept in $offload
952         mov     $len,$_end
953         #mov    $key,$_key              # remains resident in $inp register
954         mov     $ivp,$_ivp
955         mov     $ctx,$_ctx
956         mov     $in0,$_in0
957         mov     %r11,$_rsp
958 ___
959 $code.=<<___ if ($win64);
960         movaps  %xmm6,`$framesz+16*0`(%rsp)
961         movaps  %xmm7,`$framesz+16*1`(%rsp)
962         movaps  %xmm8,`$framesz+16*2`(%rsp)
963         movaps  %xmm9,`$framesz+16*3`(%rsp)
964         movaps  %xmm10,`$framesz+16*4`(%rsp)
965         movaps  %xmm11,`$framesz+16*5`(%rsp)
966         movaps  %xmm12,`$framesz+16*6`(%rsp)
967         movaps  %xmm13,`$framesz+16*7`(%rsp)
968         movaps  %xmm14,`$framesz+16*8`(%rsp)
969         movaps  %xmm15,`$framesz+16*9`(%rsp)
970 ___
971 $code.=<<___;
972 .Lprologue_avx2:
973         vzeroall
974
975         mov     $inp,%r13               # borrow $a0
976         vpinsrq \$1,$out,$offload,$offload
977         lea     0x80($key),$inp         # size optimization, reassign
978         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
979         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
980         mov     $ctx,%r15               # borrow $a2
981         mov     $in0,%rsi               # borrow $a3
982         vmovdqu ($ivp),$iv              # load IV
983         lea     -9(%r14),%r14
984
985         vmovdqa 0x00(%r12,%r14,8),$mask14
986         vmovdqa 0x10(%r12,%r14,8),$mask12
987         vmovdqa 0x20(%r12,%r14,8),$mask10
988
989         sub     \$-16*$SZ,%r13          # inp++, size optimization
990         mov     $SZ*0(%r15),$A
991         lea     (%rsi,%r13),%r12        # borrow $a0
992         mov     $SZ*1(%r15),$B
993         cmp     $len,%r13               # $_end
994         mov     $SZ*2(%r15),$C
995         cmove   %rsp,%r12               # next block or random data
996         mov     $SZ*3(%r15),$D
997         mov     $SZ*4(%r15),$E
998         mov     $SZ*5(%r15),$F
999         mov     $SZ*6(%r15),$G
1000         mov     $SZ*7(%r15),$H
1001         vmovdqu 0x00-0x80($inp),$roundkey
1002 ___
1003                                         if ($SZ==4) {   # SHA256
1004     my @X = map("%ymm$_",(0..3));
1005     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1006
1007 $code.=<<___;
1008         jmp     .Loop_avx2
1009 .align  16
1010 .Loop_avx2:
1011         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1012         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1013         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1014         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1015         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1016
1017         vinserti128     \$1,(%r12),@X[0],@X[0]
1018         vinserti128     \$1,16(%r12),@X[1],@X[1]
1019          vpshufb        $t3,@X[0],@X[0]
1020         vinserti128     \$1,32(%r12),@X[2],@X[2]
1021          vpshufb        $t3,@X[1],@X[1]
1022         vinserti128     \$1,48(%r12),@X[3],@X[3]
1023
1024         lea     $TABLE(%rip),$Tbl
1025         vpshufb $t3,@X[2],@X[2]
1026         lea     -16*$SZ(%r13),%r13
1027         vpaddd  0x00($Tbl),@X[0],$t0
1028         vpshufb $t3,@X[3],@X[3]
1029         vpaddd  0x20($Tbl),@X[1],$t1
1030         vpaddd  0x40($Tbl),@X[2],$t2
1031         vpaddd  0x60($Tbl),@X[3],$t3
1032         vmovdqa $t0,0x00(%rsp)
1033         xor     $a1,$a1
1034         vmovdqa $t1,0x20(%rsp)
1035         lea     -$PUSH8(%rsp),%rsp
1036         mov     $B,$a3
1037         vmovdqa $t2,0x00(%rsp)
1038         xor     $C,$a3                  # magic
1039         vmovdqa $t3,0x20(%rsp)
1040         mov     $F,$a4
1041         sub     \$-16*2*$SZ,$Tbl        # size optimization
1042         jmp     .Lavx2_00_47
1043
1044 .align  16
1045 .Lavx2_00_47:
1046         vmovdqu (%r13),$inout
1047         vpinsrq \$0,%r13,$offload,$offload
1048 ___
1049
1050 sub AVX2_256_00_47 () {
1051 my $j = shift;
1052 my $body = shift;
1053 my @X = @_;
1054 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1055 my $base = "+2*$PUSH8(%rsp)";
1056
1057         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1058         foreach (Xupdate_256_AVX()) {           # 29 instructions
1059             eval;
1060             eval(shift(@insns));
1061             eval(shift(@insns));
1062             eval(shift(@insns));
1063         }
1064         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1065           foreach (@insns) { eval; }            # remaining instructions
1066         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1067 }
1068     $aesni_cbc_idx=0;
1069     for ($i=0,$j=0; $j<4; $j++) {
1070         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1071         push(@X,shift(@X));                     # rotate(@X)
1072     }
1073         &vmovq          ("%r13",$offload);      # borrow $a0
1074         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1075         &vpand          ($temp,$temp,$mask14);
1076         &vpor           ($iv,$iv,$temp);
1077         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1078         &lea            ("%r13","16(%r13)");    # inp++
1079
1080         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1081         &cmpb   (($SZ-1)."($Tbl)",0);
1082         &jne    (".Lavx2_00_47");
1083
1084         &vmovdqu        ($inout,"(%r13)");
1085         &vpinsrq        ($offload,$offload,"%r13",0);
1086
1087     $aesni_cbc_idx=0;
1088     for ($i=0; $i<16; ) {
1089         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1090         foreach(bodyx_00_15()) { eval; }
1091     }
1092                                         }
1093 $code.=<<___;
1094         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1095         vmovq   $offload,%r13                   # $_inp, borrow $a0
1096         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1097         add     $a1,$A
1098         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1099
1100         vpand   $mask14,$temp,$temp
1101         vpor    $temp,$iv,$iv
1102         vmovdqu $iv,(%r12,%r13)                 # write output
1103         lea     16(%r13),%r13
1104
1105         add     $SZ*0(%r15),$A
1106         add     $SZ*1(%r15),$B
1107         add     $SZ*2(%r15),$C
1108         add     $SZ*3(%r15),$D
1109         add     $SZ*4(%r15),$E
1110         add     $SZ*5(%r15),$F
1111         add     $SZ*6(%r15),$G
1112         add     $SZ*7(%r15),$H
1113
1114         mov     $A,$SZ*0(%r15)
1115         mov     $B,$SZ*1(%r15)
1116         mov     $C,$SZ*2(%r15)
1117         mov     $D,$SZ*3(%r15)
1118         mov     $E,$SZ*4(%r15)
1119         mov     $F,$SZ*5(%r15)
1120         mov     $G,$SZ*6(%r15)
1121         mov     $H,$SZ*7(%r15)
1122
1123         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1124         je      .Ldone_avx2
1125
1126         xor     $a1,$a1
1127         mov     $B,$a3
1128         mov     $F,$a4
1129         xor     $C,$a3                  # magic
1130         jmp     .Lower_avx2
1131 .align  16
1132 .Lower_avx2:
1133         vmovdqu (%r13),$inout
1134         vpinsrq \$0,%r13,$offload,$offload
1135 ___
1136     $aesni_cbc_idx=0;
1137     for ($i=0; $i<16; ) {
1138         my $base="+16($Tbl)";
1139         foreach(bodyx_00_15()) { eval; }
1140         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1141     }
1142 $code.=<<___;
1143         vmovq   $offload,%r13                   # borrow $a0
1144         vpextrq \$1,$offload,%r15               # borrow $a2
1145         vpand   $mask14,$temp,$temp
1146         vpor    $temp,$iv,$iv
1147         lea     -$PUSH8($Tbl),$Tbl
1148         vmovdqu $iv,(%r15,%r13)                 # write output
1149         lea     16(%r13),%r13                   # inp++
1150         cmp     %rsp,$Tbl
1151         jae     .Lower_avx2
1152
1153         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1154         lea     16*$SZ(%r13),%r13
1155         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1156         add     $a1,$A
1157         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1158
1159         add     $SZ*0(%r15),$A
1160         add     $SZ*1(%r15),$B
1161         add     $SZ*2(%r15),$C
1162         add     $SZ*3(%r15),$D
1163         add     $SZ*4(%r15),$E
1164         add     $SZ*5(%r15),$F
1165         add     $SZ*6(%r15),$G
1166         lea     (%rsi,%r13),%r12
1167         add     $SZ*7(%r15),$H
1168
1169         cmp     $_end,%r13
1170
1171         mov     $A,$SZ*0(%r15)
1172         cmove   %rsp,%r12               # next block or stale data
1173         mov     $B,$SZ*1(%r15)
1174         mov     $C,$SZ*2(%r15)
1175         mov     $D,$SZ*3(%r15)
1176         mov     $E,$SZ*4(%r15)
1177         mov     $F,$SZ*5(%r15)
1178         mov     $G,$SZ*6(%r15)
1179         mov     $H,$SZ*7(%r15)
1180
1181         jbe     .Loop_avx2
1182         lea     (%rsp),$Tbl
1183
1184 .Ldone_avx2:
1185         lea     ($Tbl),%rsp
1186         mov     $_ivp,$ivp
1187         mov     $_rsp,%rsi
1188         vmovdqu $iv,($ivp)              # output IV
1189         vzeroall
1190 ___
1191 $code.=<<___ if ($win64);
1192         movaps  `$framesz+16*0`(%rsp),%xmm6
1193         movaps  `$framesz+16*1`(%rsp),%xmm7
1194         movaps  `$framesz+16*2`(%rsp),%xmm8
1195         movaps  `$framesz+16*3`(%rsp),%xmm9
1196         movaps  `$framesz+16*4`(%rsp),%xmm10
1197         movaps  `$framesz+16*5`(%rsp),%xmm11
1198         movaps  `$framesz+16*6`(%rsp),%xmm12
1199         movaps  `$framesz+16*7`(%rsp),%xmm13
1200         movaps  `$framesz+16*8`(%rsp),%xmm14
1201         movaps  `$framesz+16*9`(%rsp),%xmm15
1202 ___
1203 $code.=<<___;
1204         mov     (%rsi),%r15
1205         mov     8(%rsi),%r14
1206         mov     16(%rsi),%r13
1207         mov     24(%rsi),%r12
1208         mov     32(%rsi),%rbp
1209         mov     40(%rsi),%rbx
1210         lea     48(%rsi),%rsp
1211 .Lepilogue_avx2:
1212         ret
1213 .size   ${func}_avx2,.-${func}_avx2
1214 ___
1215 }}
1216 }}
1217 {{
1218 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1219
1220 my ($rounds,$Tbl)=("%r11d","%rbx");
1221
1222 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1223 my @rndkey=("%xmm4","%xmm5");
1224 my $r=0;
1225 my $sn=0;
1226
1227 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1228 my @MSG=map("%xmm$_",(10..13));
1229
1230 my $aesenc=sub {
1231   use integer;
1232   my ($n,$k)=($r/10,$r%10);
1233     if ($k==0) {
1234       $code.=<<___;
1235         movups          `16*$n`($in0),$in               # load input
1236         xorps           $rndkey0,$in
1237 ___
1238       $code.=<<___ if ($n);
1239         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1240 ___
1241       $code.=<<___;
1242         xorps           $in,$iv
1243         movups          `32+16*$k-112`($key),$rndkey[1]
1244         aesenc          $rndkey[0],$iv
1245 ___
1246     } elsif ($k==9) {
1247       $sn++;
1248       $code.=<<___;
1249         cmp             \$11,$rounds
1250         jb              .Laesenclast$sn
1251         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1252         aesenc          $rndkey[0],$iv
1253         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1254         aesenc          $rndkey[1],$iv
1255         je              .Laesenclast$sn
1256         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1257         aesenc          $rndkey[0],$iv
1258         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1259         aesenc          $rndkey[1],$iv
1260 .Laesenclast$sn:
1261         aesenclast      $rndkey[0],$iv
1262         movups          16-112($key),$rndkey[1]         # forward reference
1263         nop
1264 ___
1265     } else {
1266       $code.=<<___;
1267         movups          `32+16*$k-112`($key),$rndkey[1]
1268         aesenc          $rndkey[0],$iv
1269 ___
1270     }
1271     $r++;       unshift(@rndkey,pop(@rndkey));
1272 };
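#
# Editorial note: each invocation of this closure emits one step of a single
# AES-CBC encryption interleaved with the SHA256 rounds.  $r/10 selects which
# of the four 16-byte blocks of the current 64-byte chunk is being encrypted,
# and the $k==9 case finishes that block, branching on $rounds so that the
# same instruction stream serves 10-, 12- and 14-round key schedules.  Exactly
# 40 such steps are issued per 64-byte chunk; see the "while ($r<40)" top-up
# further down.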
1273
1274 if ($shaext) {
1275 my $Tbl="%rax";
1276
1277 $code.=<<___;
1278 .type   ${func}_shaext,\@function,6
1279 .align  32
1280 ${func}_shaext:
1281         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1282 ___
1283 $code.=<<___ if ($win64);
1284         lea     `-8-10*16`(%rsp),%rsp
1285         movaps  %xmm6,-8-10*16(%rax)
1286         movaps  %xmm7,-8-9*16(%rax)
1287         movaps  %xmm8,-8-8*16(%rax)
1288         movaps  %xmm9,-8-7*16(%rax)
1289         movaps  %xmm10,-8-6*16(%rax)
1290         movaps  %xmm11,-8-5*16(%rax)
1291         movaps  %xmm12,-8-4*16(%rax)
1292         movaps  %xmm13,-8-3*16(%rax)
1293         movaps  %xmm14,-8-2*16(%rax)
1294         movaps  %xmm15,-8-1*16(%rax)
1295 .Lprologue_shaext:
1296 ___
1297 $code.=<<___;
1298         lea             K256+0x80(%rip),$Tbl
1299         movdqu          ($ctx),$ABEF            # DCBA
1300         movdqu          16($ctx),$CDGH          # HGFE
1301         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1302
1303         mov             240($key),$rounds
1304         sub             $in0,$out
1305         movups          ($key),$rndkey0         # $key[0]
1306         movups          16($key),$rndkey[0]     # forward reference
1307         lea             112($key),$key          # size optimization
1308
1309         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1310         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1311         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1312         movdqa          $TMP,$BSWAP             # offload
1313         palignr         \$8,$CDGH,$ABEF         # ABEF
1314         punpcklqdq      $Wi,$CDGH               # CDGH
1315
1316         jmp     .Loop_shaext
1317
1318 .align  16
1319 .Loop_shaext:
1320         movdqu          ($inp),@MSG[0]
1321         movdqu          0x10($inp),@MSG[1]
1322         movdqu          0x20($inp),@MSG[2]
1323         pshufb          $TMP,@MSG[0]
1324         movdqu          0x30($inp),@MSG[3]
1325
1326         movdqa          0*32-0x80($Tbl),$Wi
1327         paddd           @MSG[0],$Wi
1328         pshufb          $TMP,@MSG[1]
1329         movdqa          $CDGH,$CDGH_SAVE        # offload
1330         movdqa          $ABEF,$ABEF_SAVE        # offload
1331 ___
1332         &$aesenc();
1333 $code.=<<___;
1334         sha256rnds2     $ABEF,$CDGH             # 0-3
1335         pshufd          \$0x0e,$Wi,$Wi
1336 ___
1337         &$aesenc();
1338 $code.=<<___;
1339         sha256rnds2     $CDGH,$ABEF
1340
1341         movdqa          1*32-0x80($Tbl),$Wi
1342         paddd           @MSG[1],$Wi
1343         pshufb          $TMP,@MSG[2]
1344         lea             0x40($inp),$inp
1345 ___
1346         &$aesenc();
1347 $code.=<<___;
1348         sha256rnds2     $ABEF,$CDGH             # 4-7
1349         pshufd          \$0x0e,$Wi,$Wi
1350 ___
1351         &$aesenc();
1352 $code.=<<___;
1353         sha256rnds2     $CDGH,$ABEF
1354
1355         movdqa          2*32-0x80($Tbl),$Wi
1356         paddd           @MSG[2],$Wi
1357         pshufb          $TMP,@MSG[3]
1358         sha256msg1      @MSG[1],@MSG[0]
1359 ___
1360         &$aesenc();
1361 $code.=<<___;
1362         sha256rnds2     $ABEF,$CDGH             # 8-11
1363         pshufd          \$0x0e,$Wi,$Wi
1364         movdqa          @MSG[3],$TMP
1365         palignr         \$4,@MSG[2],$TMP
1366         paddd           $TMP,@MSG[0]
1367 ___
1368         &$aesenc();
1369 $code.=<<___;
1370         sha256rnds2     $CDGH,$ABEF
1371
1372         movdqa          3*32-0x80($Tbl),$Wi
1373         paddd           @MSG[3],$Wi
1374         sha256msg2      @MSG[3],@MSG[0]
1375         sha256msg1      @MSG[2],@MSG[1]
1376 ___
1377         &$aesenc();
1378 $code.=<<___;
1379         sha256rnds2     $ABEF,$CDGH             # 12-15
1380         pshufd          \$0x0e,$Wi,$Wi
1381 ___
1382         &$aesenc();
1383 $code.=<<___;
1384         movdqa          @MSG[0],$TMP
1385         palignr         \$4,@MSG[3],$TMP
1386         paddd           $TMP,@MSG[1]
1387         sha256rnds2     $CDGH,$ABEF
1388 ___
1389 for($i=4;$i<16-3;$i++) {
1390         &$aesenc()      if (($r%10)==0);
1391 $code.=<<___;
1392         movdqa          $i*32-0x80($Tbl),$Wi
1393         paddd           @MSG[0],$Wi
1394         sha256msg2      @MSG[0],@MSG[1]
1395         sha256msg1      @MSG[3],@MSG[2]
1396 ___
1397         &$aesenc();
1398 $code.=<<___;
1399         sha256rnds2     $ABEF,$CDGH             # 16-19...
1400         pshufd          \$0x0e,$Wi,$Wi
1401         movdqa          @MSG[1],$TMP
1402         palignr         \$4,@MSG[0],$TMP
1403         paddd           $TMP,@MSG[2]
1404 ___
1405         &$aesenc();
1406         &$aesenc()      if ($r==19);
1407 $code.=<<___;
1408         sha256rnds2     $CDGH,$ABEF
1409 ___
1410         push(@MSG,shift(@MSG));
1411 }
1412 $code.=<<___;
1413         movdqa          13*32-0x80($Tbl),$Wi
1414         paddd           @MSG[0],$Wi
1415         sha256msg2      @MSG[0],@MSG[1]
1416         sha256msg1      @MSG[3],@MSG[2]
1417 ___
1418         &$aesenc();
1419 $code.=<<___;
1420         sha256rnds2     $ABEF,$CDGH             # 52-55
1421         pshufd          \$0x0e,$Wi,$Wi
1422         movdqa          @MSG[1],$TMP
1423         palignr         \$4,@MSG[0],$TMP
1424         paddd           $TMP,@MSG[2]
1425 ___
1426         &$aesenc();
1427         &$aesenc();
1428 $code.=<<___;
1429         sha256rnds2     $CDGH,$ABEF
1430
1431         movdqa          14*32-0x80($Tbl),$Wi
1432         paddd           @MSG[1],$Wi
1433         sha256msg2      @MSG[1],@MSG[2]
1434         movdqa          $BSWAP,$TMP
1435 ___
1436         &$aesenc();
1437 $code.=<<___;
1438         sha256rnds2     $ABEF,$CDGH             # 56-59
1439         pshufd          \$0x0e,$Wi,$Wi
1440 ___
1441         &$aesenc();
1442 $code.=<<___;
1443         sha256rnds2     $CDGH,$ABEF
1444
1445         movdqa          15*32-0x80($Tbl),$Wi
1446         paddd           @MSG[2],$Wi
1447 ___
1448         &$aesenc();
1449         &$aesenc();
1450 $code.=<<___;
1451         sha256rnds2     $ABEF,$CDGH             # 60-63
1452         pshufd          \$0x0e,$Wi,$Wi
1453 ___
1454         &$aesenc();
1455 $code.=<<___;
1456         sha256rnds2     $CDGH,$ABEF
1457         #pxor           $CDGH,$rndkey0          # black magic
1458 ___
1459         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1460 $code.=<<___;
1461         #xorps          $CDGH,$rndkey0          # black magic
1462         paddd           $CDGH_SAVE,$CDGH
1463         paddd           $ABEF_SAVE,$ABEF
1464
1465         dec             $len
1466         movups          $iv,48($out,$in0)       # write output
1467         lea             64($in0),$in0
1468         jnz             .Loop_shaext
1469
1470         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1471         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1472         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1473         punpckhqdq      $CDGH,$ABEF             # DCBA
1474         palignr         \$8,$TMP,$CDGH          # HGFE
1475
1476         movups          $iv,($ivp)              # write IV
1477         movdqu          $ABEF,($ctx)
1478         movdqu          $CDGH,16($ctx)
1479 ___
1480 $code.=<<___ if ($win64);
1481         movaps  0*16(%rsp),%xmm6
1482         movaps  1*16(%rsp),%xmm7
1483         movaps  2*16(%rsp),%xmm8
1484         movaps  3*16(%rsp),%xmm9
1485         movaps  4*16(%rsp),%xmm10
1486         movaps  5*16(%rsp),%xmm11
1487         movaps  6*16(%rsp),%xmm12
1488         movaps  7*16(%rsp),%xmm13
1489         movaps  8*16(%rsp),%xmm14
1490         movaps  9*16(%rsp),%xmm15
1491         lea     8+10*16(%rsp),%rsp
1492 .Lepilogue_shaext:
1493 ___
1494 $code.=<<___;
1495         ret
1496 .size   ${func}_shaext,.-${func}_shaext
1497 ___
1498 }
1499 }}}}}
1500
1501 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1502 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1503 if ($win64 && $avx) {
1504 $rec="%rcx";
1505 $frame="%rdx";
1506 $context="%r8";
1507 $disp="%r9";
1508
1509 $code.=<<___;
1510 .extern __imp_RtlVirtualUnwind
1511 .type   se_handler,\@abi-omnipotent
1512 .align  16
1513 se_handler:
1514         push    %rsi
1515         push    %rdi
1516         push    %rbx
1517         push    %rbp
1518         push    %r12
1519         push    %r13
1520         push    %r14
1521         push    %r15
1522         pushfq
1523         sub     \$64,%rsp
1524
1525         mov     120($context),%rax      # pull context->Rax
1526         mov     248($context),%rbx      # pull context->Rip
1527
1528         mov     8($disp),%rsi           # disp->ImageBase
1529         mov     56($disp),%r11          # disp->HandlerData
1530
1531         mov     0(%r11),%r10d           # HandlerData[0]
1532         lea     (%rsi,%r10),%r10        # prologue label
1533         cmp     %r10,%rbx               # context->Rip<prologue label
1534         jb      .Lin_prologue
1535
1536         mov     152($context),%rax      # pull context->Rsp
1537
1538         mov     4(%r11),%r10d           # HandlerData[1]
1539         lea     (%rsi,%r10),%r10        # epilogue label
1540         cmp     %r10,%rbx               # context->Rip>=epilogue label
1541         jae     .Lin_prologue
1542 ___
1543 $code.=<<___ if ($shaext);
1544         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1545         cmp     %r10,%rbx
1546         jb      .Lnot_in_shaext
1547
1548         lea     (%rax),%rsi
1549         lea     512($context),%rdi      # &context.Xmm6
1550         mov     \$20,%ecx
1551         .long   0xa548f3fc              # cld; rep movsq
1552         lea     168(%rax),%rax          # adjust stack pointer
1553         jmp     .Lin_prologue
1554 .Lnot_in_shaext:
1555 ___
1556 $code.=<<___ if ($avx>1);
1557         lea     .Lavx2_shortcut(%rip),%r10
1558         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1559         jb      .Lnot_in_avx2
1560
1561         and     \$-256*$SZ,%rax
1562         add     \$`2*$SZ*($rounds-8)`,%rax
1563 .Lnot_in_avx2:
1564 ___
1565 $code.=<<___;
1566         mov     %rax,%rsi               # put aside Rsp
1567         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1568         lea     48(%rax),%rax
1569
1570         mov     -8(%rax),%rbx
1571         mov     -16(%rax),%rbp
1572         mov     -24(%rax),%r12
1573         mov     -32(%rax),%r13
1574         mov     -40(%rax),%r14
1575         mov     -48(%rax),%r15
1576         mov     %rbx,144($context)      # restore context->Rbx
1577         mov     %rbp,160($context)      # restore context->Rbp
1578         mov     %r12,216($context)      # restore context->R12
1579         mov     %r13,224($context)      # restore context->R13
1580         mov     %r14,232($context)      # restore context->R14
1581         mov     %r15,240($context)      # restore context->R15
1582
1583         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1584         lea     512($context),%rdi      # &context.Xmm6
1585         mov     \$20,%ecx
1586         .long   0xa548f3fc              # cld; rep movsq
1587
1588 .Lin_prologue:
1589         mov     8(%rax),%rdi
1590         mov     16(%rax),%rsi
1591         mov     %rax,152($context)      # restore context->Rsp
1592         mov     %rsi,168($context)      # restore context->Rsi
1593         mov     %rdi,176($context)      # restore context->Rdi
1594
1595         mov     40($disp),%rdi          # disp->ContextRecord
1596         mov     $context,%rsi           # context
1597         mov     \$154,%ecx              # sizeof(CONTEXT)
1598         .long   0xa548f3fc              # cld; rep movsq
1599
1600         mov     $disp,%rsi
1601         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1602         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1603         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1604         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1605         mov     40(%rsi),%r10           # disp->ContextRecord
1606         lea     56(%rsi),%r11           # &disp->HandlerData
1607         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1608         mov     %r10,32(%rsp)           # arg5
1609         mov     %r11,40(%rsp)           # arg6
1610         mov     %r12,48(%rsp)           # arg7
1611         mov     %rcx,56(%rsp)           # arg8, (NULL)
1612         call    *__imp_RtlVirtualUnwind(%rip)
1613
1614         mov     \$1,%eax                # ExceptionContinueSearch
1615         add     \$64,%rsp
1616         popfq
1617         pop     %r15
1618         pop     %r14
1619         pop     %r13
1620         pop     %r12
1621         pop     %rbp
1622         pop     %rbx
1623         pop     %rdi
1624         pop     %rsi
1625         ret
1626 .size   se_handler,.-se_handler
1627
1628 .section        .pdata
1629         .rva    .LSEH_begin_${func}_xop
1630         .rva    .LSEH_end_${func}_xop
1631         .rva    .LSEH_info_${func}_xop
1632
1633         .rva    .LSEH_begin_${func}_avx
1634         .rva    .LSEH_end_${func}_avx
1635         .rva    .LSEH_info_${func}_avx
1636 ___
1637 $code.=<<___ if ($avx>1);
1638         .rva    .LSEH_begin_${func}_avx2
1639         .rva    .LSEH_end_${func}_avx2
1640         .rva    .LSEH_info_${func}_avx2
1641 ___
1642 $code.=<<___ if ($shaext);
1643         .rva    .LSEH_begin_${func}_shaext
1644         .rva    .LSEH_end_${func}_shaext
1645         .rva    .LSEH_info_${func}_shaext
1646 ___
1647 $code.=<<___;
1648 .section        .xdata
1649 .align  8
1650 .LSEH_info_${func}_xop:
1651         .byte   9,0,0,0
1652         .rva    se_handler
1653         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1654
1655 .LSEH_info_${func}_avx:
1656         .byte   9,0,0,0
1657         .rva    se_handler
1658         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1659 ___
1660 $code.=<<___ if ($avx>1);
1661 .LSEH_info_${func}_avx2:
1662         .byte   9,0,0,0
1663         .rva    se_handler
1664         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1665 ___
1666 $code.=<<___ if ($shaext);
1667 .LSEH_info_${func}_shaext:
1668         .byte   9,0,0,0
1669         .rva    se_handler
1670         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1671 ___
1672 }
1673
1674 ####################################################################
1675 sub rex {
1676   local *opcode=shift;
1677   my ($dst,$src)=@_;
1678   my $rex=0;
1679
1680     $rex|=0x04                  if($dst>=8);
1681     $rex|=0x01                  if($src>=8);
1682     unshift @opcode,$rex|0x40   if($rex);
1683 }
1684
1685 {
1686   my %opcodelet = (
1687                 "sha256rnds2" => 0xcb,
1688                 "sha256msg1"  => 0xcc,
1689                 "sha256msg2"  => 0xcd   );
1690
1691   sub sha256op38 {
1692     my $instr = shift;
1693
1694     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1695       my @opcode=(0x0f,0x38);
1696         rex(\@opcode,$2,$1);
1697         push @opcode,$opcodelet{$instr};
1698         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1699         return ".byte\t".join(',',@opcode);
1700     } else {
1701         return $instr."\t".@_[0];
1702     }
1703   }
1704 }
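#
# Editorial example: under this fallback, "sha256rnds2 %xmm1,%xmm2" is turned
# into ".byte 0x0f,0x38,0xcb,0xd1" (ModR/M 0xd1 = 0xc0 | src 1 | dst 2<<3), so
# the module still assembles with toolchains that predate the SHA extensions.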
1705
1706 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1707 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1708 print $code;
1709 close STDOUT;