#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves higher performance. SHA256 instruction
# sequences(*) are taken from sha512-x86_64.pl and the AESNI code is
# woven into them. As SHA256 dominates execution time, stitch
# performance does not depend on AES key length. Below are performance
# numbers in cycles per processed byte, less is better, for standalone
# AESNI-CBC encrypt, standalone SHA256, and the stitched subroutine:
#
#                AES-128/-192/-256+SHA256       this(**)gain
# Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
# Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
# Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
# Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
#
# (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
#       Westmere is omitted from the loop; this is because the gain was
#       not estimated high enough to justify the effort;
# (**)  these are EVP-free results, results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
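#
# A minimal sketch of the interleaving mechanism used throughout this
# module (illustration only, not part of the generated code): each
# SHA256 round body below is a list of single-instruction strings, and
# one AES-CBC step from @aesni_cbc_block is spliced into each round, so
# eval'ing the strings in order emits the two algorithms interleaved:
#
#   foreach (body_00_15()) { eval; }   # one SHA256 round + one AES step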

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

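# $avx ends up 0, 1 or 2: 0 disables the SIMD paths altogether, 1
# enables the XOP and AVX1 paths, 2 additionally enables the AVX2+BMI
# path (all still subject to the run-time CPUID checks in $func below).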
$shaext=$avx;   ### set to zero if compiling for 1.0.1
$avx=1          if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                                "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
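# The rotation/shift amounts above implement the SHA256 functions
#   Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#   Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#   sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#   sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)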
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#                       void *out,
#                       size_t length,
#                       const AES_KEY *key,
#                       unsigned char *iv,
#                       SHA256_CTX *ctx,
#                       const void *in0);
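#
# Note on the contract (as implied by the code below, e.g. 'shl \$6,$len'
# in the prologues): |length| is measured in 64-byte blocks; |inp| is
# encrypted to |out| with AES-CBC while the same amount of data at |in0|
# is hashed into |ctx|, and the two input pointers may differ.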
($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
$framesz=16*$SZ+8*8;

$code=<<___;
.text

.extern OPENSSL_ia32cap_P
.globl  $func
.type   $func,\@abi-omnipotent
.align  16
$func:
___
                                                if ($avx) {
$code.=<<___;
        lea     OPENSSL_ia32cap_P(%rip),%r11
        mov     \$1,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        mov     0(%r11),%eax
        mov     4(%r11),%r10
___
$code.=<<___ if ($shaext);
        bt      \$61,%r10                       # check for SHA
        jc      ${func}_shaext
___
$code.=<<___;
        mov     %r10,%r11
        shr     \$32,%r11

        test    \$`1<<11`,%r10d                 # check for XOP
        jnz     ${func}_xop
___
$code.=<<___ if ($avx>1);
        and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
        cmp     \$`1<<8|1<<5|1<<3`,%r11d
        je      ${func}_avx2
___
$code.=<<___;
        and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
        and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
        or      %eax,%r10d
        cmp     \$`1<<28|1<<9|1<<30`,%r10d
        je      ${func}_avx
        ud2
___
                                                }
$code.=<<___;
        xor     %eax,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        ud2
.Lprobe:
        ret
.size   $func,.-$func

.align  64
.type   $TABLE,\@object
$TABLE:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
        .long   0,0,0,0,   0,0,0,0
        .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
##      &vmovdqu        ($inout,($inp));
##      &mov            ($_inp,$inp);

        '&vpxor         ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',

        '&vpxor         ($inout,$inout,$iv);',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',

        '&vpand         ($iv,$temp,$mask10);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',

        '&vpand         ($temp,$temp,$mask12);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xe0-0x80($inp)");',

        '&vpor          ($iv,$iv,$temp);'.
        ' &vaesenclast  ($temp,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'

##      &mov            ($inp,$_inp);
##      &mov            ($out,$_out);
##      &vpand          ($temp,$temp,$mask14);
##      &vpor           ($iv,$iv,$temp);
##      &vmovdqu        ("($out,$inp)",$iv);
##      &lea            ($inp,"16($inp)");
);
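
# The 16 fragments above push exactly one 16-byte CBC block through the
# AES rounds while 16 SHA256 rounds execute; body_00_15 below splices
# them in one at a time via $aesni_cbc_idx (the vaesenclast/vpand/vpor
# steps select the correct last round for 10-, 12- and 14-round keys).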

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
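
# A sketch of what the AUTOLOAD thunk does: any undefined sub call is
# turned into one line of assembly, with the last Perl argument becoming
# the first (AT&T-order) operand and numeric arguments getting a '$'
# prefix, e.g.
#
#   &ror($a0,14);           # appends "\tror\t\$14,%r13d\n" to $code
#   &add($h,$a0);           # appends "\tadd\t%r13d,%r11d\n"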

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
        '&mov   ($a,$a1)',
        '&mov   ($a4,$f)',

        '&xor   ($a0,$e)',
        '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
        '&xor   ($a4,$g)',                      # f^g

        '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
        '&xor   ($a1,$a)',
        '&and   ($a4,$e)',                      # (f^g)&e

        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a0,$e)',
        '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
        '&mov   ($a2,$a)',

        '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
        '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
        '&xor   ($a2,$b)',                      # a^b, b^c in next round

        '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
        '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
        '&and   ($a3,$a2)',                     # (b^c)&(a^b)

        '&xor   ($a1,$a)',
        '&add   ($h,$a0)',                      # h+=Sigma1(e)
        '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)

        '&add   ($d,$h)',                       # d+=h
        '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
        '&add   ($h,$a3)',                      # h+=Maj(a,b,c)

        '&mov   ($a0,$d)',
        '&add   ($a1,$h);'.                     # h+=Sigma0(a)
        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
}
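
# Consumption pattern (as used in the code paths below): evaluating the
# returned list of instruction strings once per round emits one SHA256
# round interleaved with one AES-CBC step:
#
#   $aesni_cbc_idx=0;
#   for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } }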

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type   ${func}_xop,\@function,6
.align  64
${func}_xop:
.Lxop_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
        jmp     .Lloop_xop
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align  16
.Lloop_xop:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lxop_00_47

.align  16
.Lxop_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
          eval(shift(@insns));
          eval(shift(@insns));
         &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrld         ($t0,$t0,$sigma0[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t1);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[3],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrldq        ($t3,$t3,8);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[0],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpslldq        ($t3,$t3,8);            # 22 instructions
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &XOP_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lxop_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jb      .Lloop_xop

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_xop:
        ret
.size   ${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd($_[0],@_) };

$code.=<<___;
.type   ${func}_avx,\@function,6
.align  64
${func}_avx:
.Lavx_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
        jmp     .Lloop_avx
.align  16
.Lloop_avx:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lavx_00_47

.align  16
.Lavx_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub Xupdate_256_AVX () {
        (
        '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
         '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
        '&vpsrld        ($t2,$t0,$sigma0[0]);',
         '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
        '&vpsrld        ($t3,$t0,$sigma0[2])',
        '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
        '&vpxor         ($t0,$t3,$t2)',
         '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
        '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t1)',
        '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t2)',
         '&vpsrld       ($t2,$t3,$sigma1[2]);',
        '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
         '&vpsrlq       ($t3,$t3,$sigma1[0]);',
        '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
         '&vpshufd      ($t2,$t2,0b10000100)',
         '&vpsrldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
         '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
         '&vpsrld       ($t2,$t3,$sigma1[2])',
         '&vpsrlq       ($t3,$t3,$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',
         '&vpshufd      ($t2,$t2,0b11101000)',
         '&vpslldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
        );
}
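
# The sequence above vectorizes four steps of the SHA256 message
# schedule at once,
#   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
# with the X[1..4]/X[9..12]/X[14..15] comments giving each operand's
# position relative to @X[0] = W[t-16..t-13].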

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lavx_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }

                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)
        jb      .Lloop_avx

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_avx:
        ret
.size   ${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
        # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
        '&and   ($a4,$e)',              # f&e
        '&rorx  ($a0,$e,$Sigma1[2])',
        '&rorx  ($a2,$e,$Sigma1[1])',

        '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
        '&lea   ($h,"($h,$a4)")',
        '&andn  ($a4,$e,$g)',           # ~e&g
        '&xor   ($a0,$a2)',

        '&rorx  ($a1,$e,$Sigma1[0])',
        '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
        '&xor   ($a0,$a1)',             # Sigma1(e)
        '&mov   ($a2,$a)',

        '&rorx  ($a4,$a,$Sigma0[2])',
        '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
        '&xor   ($a2,$b)',              # a^b, b^c in next round
        '&rorx  ($a1,$a,$Sigma0[1])',

        '&rorx  ($a0,$a,$Sigma0[0])',
        '&lea   ($d,"($d,$h)")',        # d+=h
        '&and   ($a3,$a2)',             # (b^c)&(a^b)
        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a1,$a4)',

        '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
        '&xor   ($a1,$a0)',             # Sigma0(a)
        '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
        '&mov   ($a4,$e)',              # copy of f in future

        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
        # and at the finish one has to $a+=$a1
}

$code.=<<___;
.type   ${func}_avx2,\@function,6
.align  64
${func}_avx2:
.Lavx2_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
        and     \$-256*$SZ,%rsp         # align stack frame
        add     \$`2*$SZ*($rounds-8)`,%rsp

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        #mov    $out,$_out              # kept in $offload
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
        vzeroall

        mov     $inp,%r13               # borrow $a0
        vpinsrq \$1,$out,$offload,$offload
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        lea     -9(%r14),%r14

        vmovdqa 0x00(%r12,%r14,8),$mask14
        vmovdqa 0x10(%r12,%r14,8),$mask12
        vmovdqa 0x20(%r12,%r14,8),$mask10

        sub     \$-16*$SZ,%r13          # inp++, size optimization
        mov     $SZ*0(%r15),$A
        lea     (%rsi,%r13),%r12        # borrow $a0
        mov     $SZ*1(%r15),$B
        cmp     $len,%r13               # $_end
        mov     $SZ*2(%r15),$C
        cmove   %rsp,%r12               # next block or random data
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H
        vmovdqu 0x00-0x80($inp),$roundkey
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
        jmp     .Loop_avx2
.align  16
.Loop_avx2:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
        vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
        vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
        vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3

        vinserti128     \$1,(%r12),@X[0],@X[0]
        vinserti128     \$1,16(%r12),@X[1],@X[1]
         vpshufb        $t3,@X[0],@X[0]
        vinserti128     \$1,32(%r12),@X[2],@X[2]
         vpshufb        $t3,@X[1],@X[1]
        vinserti128     \$1,48(%r12),@X[3],@X[3]

        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[2],@X[2]
        lea     -16*$SZ(%r13),%r13
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        xor     $a1,$a1
        vmovdqa $t1,0x20(%rsp)
        lea     -$PUSH8(%rsp),%rsp
        mov     $B,$a3
        vmovdqa $t2,0x00(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x20(%rsp)
        mov     $F,$a4
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        jmp     .Lavx2_00_47

.align  16
.Lavx2_00_47:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
my $base = "+2*$PUSH8(%rsp)";

        &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX2_256_00_47($j,\&bodyx_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &vmovq          ("%r13",$offload);      # borrow $a0
        &vpextrq        ("%r15",$offload,1);    # borrow $a2
        &vpand          ($temp,$temp,$mask14);
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r13)",$iv);    # write output
        &lea            ("%r13","16(%r13)");    # inp++

        &lea    ($Tbl,16*2*$SZ."($Tbl)");
        &cmpb   (($SZ-1)."($Tbl)",0);
        &jne    (".Lavx2_00_47");

        &vmovdqu        ($inout,"(%r13)");
        &vpinsrq        ($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
        foreach(bodyx_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        vpextrq \$1,$offload,%r12               # $_out, borrow $a4
        vmovq   $offload,%r13                   # $_inp, borrow $a0
        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl

        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r12,%r13)                 # write output
        lea     16(%r13),%r13

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
        je      .Ldone_avx2

        xor     $a1,$a1
        mov     $B,$a3
        mov     $F,$a4
        xor     $C,$a3                  # magic
        jmp     .Lower_avx2
.align  16
.Lower_avx2:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base="+16($Tbl)";
        foreach(bodyx_00_15()) { eval; }
        &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
    }
$code.=<<___;
        vmovq   $offload,%r13                   # borrow $a0
        vpextrq \$1,$offload,%r15               # borrow $a2
        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        lea     -$PUSH8($Tbl),$Tbl
        vmovdqu $iv,(%r15,%r13)                 # write output
        lea     16(%r13),%r13                   # inp++
        cmp     %rsp,$Tbl
        jae     .Lower_avx2

        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        lea     16*$SZ(%r13),%r13
        mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),%rsp

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        lea     (%rsi,%r13),%r12
        add     $SZ*7(%r15),$H

        cmp     $_end,%r13

        mov     $A,$SZ*0(%r15)
        cmove   %rsp,%r12               # next block or stale data
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jbe     .Loop_avx2
        lea     (%rsp),$Tbl

.Ldone_avx2:
        lea     ($Tbl),%rsp
        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_avx2:
        ret
.size   ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
        movups          `16*$n`($in0),$in               # load input
        xorps           $rndkey0,$in
___
      $code.=<<___ if ($n);
        movups          $iv,`16*($n-1)`($out,$in0)      # write output
___
      $code.=<<___;
        xorps           $in,$iv
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
        cmp             \$11,$rounds
        jb              .Laesenclast$sn
        movups          `32+16*($k+0)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+1)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
        je              .Laesenclast$sn
        movups          `32+16*($k+2)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+3)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
.Laesenclast$sn:
        aesenclast      $rndkey[0],$iv
        movups          16-112($key),$rndkey[1]         # forward reference
        nop
___
    } else {
      $code.=<<___;
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    }
    $r++;       unshift(@rndkey,pop(@rndkey));
};
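
# One CBC block is threaded through ten $aesenc slots per 64-byte SHA256
# block ($r counts slots): slot 0 loads and whitens the input and writes
# the previous block out, slot 9 finishes the block, branching on
# $rounds to accommodate 10-, 12- and 14-round (AES-128/-192/-256) key
# schedules.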

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type   ${func}_shaext,\@function,6
.align  32
${func}_shaext:
        mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
___
$code.=<<___ if ($win64);
        mov     %rsp,%rax               # copy %rsp
        lea     `-8-10*16`(%rsp),%rsp
        movaps  %xmm6,-8-10*16(%rax)
        movaps  %xmm7,-8-9*16(%rax)
        movaps  %xmm8,-8-8*16(%rax)
        movaps  %xmm9,-8-7*16(%rax)
        movaps  %xmm10,-8-6*16(%rax)
        movaps  %xmm11,-8-5*16(%rax)
        movaps  %xmm12,-8-4*16(%rax)
        movaps  %xmm13,-8-3*16(%rax)
        movaps  %xmm14,-8-2*16(%rax)
        movaps  %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
        lea             K256+0x80(%rip),$Tbl
        movdqu          ($ctx),$ABEF            # DCBA
        movdqu          16($ctx),$CDGH          # HGFE
        movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask

        mov             240($key),$rounds
        sub             $in0,$out
        movups          ($key),$rndkey0         # $key[0]
        movups          16($key),$rndkey[0]     # forward reference
        lea             112($key),$key          # size optimization

        pshufd          \$0x1b,$ABEF,$Wi        # ABCD
        pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
        pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
        movdqa          $TMP,$BSWAP             # offload
        palignr         \$8,$CDGH,$ABEF         # ABEF
        punpcklqdq      $Wi,$CDGH               # CDGH

        jmp     .Loop_shaext

.align  16
.Loop_shaext:
        movdqu          ($inp),@MSG[0]
        movdqu          0x10($inp),@MSG[1]
        movdqu          0x20($inp),@MSG[2]
        pshufb          $TMP,@MSG[0]
        movdqu          0x30($inp),@MSG[3]

        movdqa          0*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        pshufb          $TMP,@MSG[1]
        movdqa          $CDGH,$CDGH_SAVE        # offload
        movdqa          $ABEF,$ABEF_SAVE        # offload
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 0-3
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          1*32-0x80($Tbl),$Wi
        paddd           @MSG[1],$Wi
        pshufb          $TMP,@MSG[2]
        lea             0x40($inp),$inp
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 4-7
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          2*32-0x80($Tbl),$Wi
        paddd           @MSG[2],$Wi
        pshufb          $TMP,@MSG[3]
        sha256msg1      @MSG[1],@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 8-11
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[3],$TMP
        palignr         \$4,@MSG[2],$TMP
        paddd           $TMP,@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          3*32-0x80($Tbl),$Wi
        paddd           @MSG[3],$Wi
        sha256msg2      @MSG[3],@MSG[0]
        sha256msg1      @MSG[2],@MSG[1]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 12-15
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        movdqa          @MSG[0],$TMP
        palignr         \$4,@MSG[3],$TMP
        paddd           $TMP,@MSG[1]
        sha256rnds2     $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
        &$aesenc()      if (($r%10)==0);
$code.=<<___;
        movdqa          $i*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        sha256msg2      @MSG[0],@MSG[1]
        sha256msg1      @MSG[3],@MSG[2]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 16-19...
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[1],$TMP
        palignr         \$4,@MSG[0],$TMP
        paddd           $TMP,@MSG[2]
___
        &$aesenc();
        &$aesenc()      if ($r==19);
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF
___
        push(@MSG,shift(@MSG));
}
$code.=<<___;
        movdqa          13*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        sha256msg2      @MSG[0],@MSG[1]
        sha256msg1      @MSG[3],@MSG[2]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 52-55
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[1],$TMP
        palignr         \$4,@MSG[0],$TMP
        paddd           $TMP,@MSG[2]
___
        &$aesenc();
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          14*32-0x80($Tbl),$Wi
        paddd           @MSG[1],$Wi
        sha256msg2      @MSG[1],@MSG[2]
        movdqa          $BSWAP,$TMP
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 56-59
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          15*32-0x80($Tbl),$Wi
        paddd           @MSG[2],$Wi
___
        &$aesenc();
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 60-63
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF
        #pxor           $CDGH,$rndkey0          # black magic
___
        while ($r<40)   { &$aesenc(); }         # remaining aesenc's
$code.=<<___;
        #xorps          $CDGH,$rndkey0          # black magic
        paddd           $CDGH_SAVE,$CDGH
        paddd           $ABEF_SAVE,$ABEF

        dec             $len
        movups          $iv,48($out,$in0)       # write output
        lea             64($in0),$in0
        jnz             .Loop_shaext

        pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
        pshufd          \$0x1b,$ABEF,$TMP       # FEBA
        pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
        punpckhqdq      $CDGH,$ABEF             # DCBA
        palignr         \$8,$TMP,$CDGH          # HGFE

        movups          $iv,($ivp)              # write IV
        movdqu          $ABEF,($ctx)
        movdqu          $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
        movaps  0*16(%rsp),%xmm6
        movaps  1*16(%rsp),%xmm7
        movaps  2*16(%rsp),%xmm8
        movaps  3*16(%rsp),%xmm9
        movaps  4*16(%rsp),%xmm10
        movaps  5*16(%rsp),%xmm11
        movaps  6*16(%rsp),%xmm12
        movaps  7*16(%rsp),%xmm13
        movaps  8*16(%rsp),%xmm14
        movaps  9*16(%rsp),%xmm15
        lea     8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
        ret
.size   ${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___ if ($avx);
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue
___
$code.=<<___ if ($shaext);
        lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
        cmp     %r10,%rbx
        jb      .Lnot_in_shaext

        lea     (%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq
        lea     168(%rax),%rax          # adjust stack pointer
        jmp     .Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
        lea     .Lavx2_shortcut(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<avx2_shortcut
        jb      .Lnot_in_avx2

        and     \$-256*$SZ,%rax
        add     \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
        mov     %rax,%rsi               # put aside Rsp
        mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
        lea     48(%rax),%rax

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler

.section        .pdata
        .rva    .LSEH_begin_${func}_xop
        .rva    .LSEH_end_${func}_xop
        .rva    .LSEH_info_${func}_xop

        .rva    .LSEH_begin_${func}_avx
        .rva    .LSEH_end_${func}_avx
        .rva    .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
        .rva    .LSEH_begin_${func}_avx2
        .rva    .LSEH_end_${func}_avx2
        .rva    .LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
        .rva    .LSEH_begin_${func}_shaext
        .rva    .LSEH_end_${func}_shaext
        .rva    .LSEH_info_${func}_shaext
___
$code.=<<___ if ($avx);
.section        .xdata
.align  8
.LSEH_info_${func}_xop:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]

.LSEH_info_${func}_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    unshift @opcode,$rex|0x40   if($rex);
}

{
  my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$2,$1);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$_[0];
    }
  }
}
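
# Example of the hardcoded encoding above (for assemblers that lack SHA
# extension support): "sha256rnds2 %xmm1,%xmm2" matches the regexp with
# $1=1 (source) and $2=2 (destination), no REX prefix is needed, and the
# line is replaced with
#
#   .byte   0x0f,0x38,0xcb,0xd1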

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;