1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # January 2013
18 #
19 # This is an AESNI-CBC+SHA256 "stitch" implementation. The idea, as spelled
20 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encryption exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm allows processor
23 # resources to be utilized better, for better overall performance.
24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
25 # the AESNI code is woven into them. As SHA256 dominates execution time,
26 # stitch performance does not depend on AES key length. Below are
27 # performance numbers in cycles per processed byte, less is better,
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
29 # subroutine:
30 #
31 #                AES-128/-192/-256+SHA256   this(**)    gain
32 # Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
33 # Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
34 # Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
35 # Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
36 # Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
37 # Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
38 # Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
39 #
40 # (*)   there are XOP, AVX1 and AVX2 code paths; Westmere is omitted
41 #       from this lineup because the estimated gain was not high enough
42 #       to justify the effort;
43 # (**)  these are EVP-free results; results obtained with 'speed
44 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
45 # (***) these are SHAEXT results;
46
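# The "gain" column above is simply how much cheaper the stitched routine is
# than running the two standalone primitives back to back; a minimal sketch of
# that arithmetic (not used by the generator):

sub _stitch_gain {
	my ($aes_cpb,$sha_cpb,$stitched_cpb) = @_;	# cycles per processed byte
	return sprintf("+%.0f%%",100*(($aes_cpb+$sha_cpb)/$stitched_cpb-1));
}
# e.g. _stitch_gain(4.43,7.80,8.79) gives "+39%", the Haswell AES-128 entry.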
47 $flavour = shift;
48 $output  = shift;
49 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
50
51 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
52
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56 die "can't locate x86_64-xlate.pl";
57
58 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
59                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
60         $avx = ($1>=2.19) + ($1>=2.22);
61 }
62
63 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
64            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
65         $avx = ($1>=2.09) + ($1>=2.10);
66 }
67
68 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
69            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
70         $avx = ($1>=10) + ($1>=12);
71 }
72
73 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
74         $avx = ($2>=3.0) + ($2>3.0);
75 }
76
77 $shaext=$avx;   ### set to zero if compiling for 1.0.1
78 $avx=1          if (!$shaext && $avx);
79
80 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
81 *STDOUT=*OUT;
82
83 $func="aesni_cbc_sha256_enc";
84 $TABLE="K256";
85 $SZ=4;
86 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
87                                 "%r8d","%r9d","%r10d","%r11d");
88 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
89 @Sigma0=( 2,13,22);
90 @Sigma1=( 6,11,25);
91 @sigma0=( 7,18, 3);
92 @sigma1=(17,19,10);
93 $rounds=64;
94
95 ########################################################################
96 # void aesni_cbc_sha256_enc(const void *inp,
97 #                       void *out,
98 #                       size_t length,
99 #                       const AES_KEY *key,
100 #                       unsigned char *iv,
101 #                       SHA256_CTX *ctx,
102 #                       const void *in0);
103 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
104 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
105
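# As the prologues below make explicit, $len arrives in 64-byte blocks (it is
# shifted left by 6 before being turned into an end-of-input pointer), $inp is
# the data being CBC-encrypted, $in0 the data being hashed, and $out receives
# the ciphertext.  $out and $in0 are immediately "re-biased" into deltas from
# $inp so that advancing a single index walks all three buffers; a minimal
# sketch of that trick with plain integers (not used by the generator):

sub _rebias_demo {
	my ($inp,$out,$in0,$blocks) = @_;		# hypothetical addresses, $blocks in 64-byte units
	my ($out_d,$in0_d) = ($out-$inp,$in0-$inp);	# what "sub $inp,$out" / "sub $inp,$in0" leave behind
	my @triples;
	for (my $p=$inp; $p<$inp+64*$blocks; $p+=16) {	# one 16-byte AES block per step
		push @triples,[$p,$p+$out_d,$p+$in0_d];	# encrypt-from, write-to, hash-from
	}
	return @triples;
}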
106 $Tbl="%rbp";
107
108 $_inp="16*$SZ+0*8(%rsp)";
109 $_out="16*$SZ+1*8(%rsp)";
110 $_end="16*$SZ+2*8(%rsp)";
111 $_key="16*$SZ+3*8(%rsp)";
112 $_ivp="16*$SZ+4*8(%rsp)";
113 $_ctx="16*$SZ+5*8(%rsp)";
114 $_in0="16*$SZ+6*8(%rsp)";
115 $_rsp="`16*$SZ+7*8`(%rsp)";
116 $framesz=16*$SZ+8*8;
117
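# The low 16*$SZ bytes of this frame hold the sixteen pre-added X[i]+K[i]
# words for the current chunk of rounds (filled by the vmovdqa stores at the
# top of each loop and consumed by the "add $SZ*($i&15)(%rsp)" in body_00_15);
# the eight quadwords above them are the saved pointers listed above.  A
# trivial slot-addressing sketch (not used by the generator):

sub _wk_slot { my $i=shift; return $SZ*($i&15); }	# stack offset of X[i]+K[i]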
118 $code=<<___;
119 .text
120
121 .extern OPENSSL_ia32cap_P
122 .globl  $func
123 .type   $func,\@abi-omnipotent
124 .align  16
125 $func:
126 ___
127                                                 if ($avx) {
128 $code.=<<___;
129         lea     OPENSSL_ia32cap_P(%rip),%r11
130         mov     \$1,%eax
131         cmp     \$0,`$win64?"%rcx":"%rdi"`
132         je      .Lprobe
133         mov     0(%r11),%eax
134         mov     4(%r11),%r10
135 ___
136 $code.=<<___ if ($shaext);
137         bt      \$61,%r10                       # check for SHA
138         jc      ${func}_shaext
139 ___
140 $code.=<<___;
141         mov     %r10,%r11
142         shr     \$32,%r11
143
144         test    \$`1<<11`,%r10d                 # check for XOP
145         jnz     ${func}_xop
146 ___
147 $code.=<<___ if ($avx>1);
148         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
149         cmp     \$`1<<8|1<<5|1<<3`,%r11d
150         je      ${func}_avx2
151 ___
152 $code.=<<___;
153         and     \$`1<<28`,%r10d                 # check for AVX
154         jnz     ${func}_avx
155         ud2
156 ___
157                                                 }
158 $code.=<<___;
159         xor     %eax,%eax
160         cmp     \$0,`$win64?"%rcx":"%rdi"`
161         je      .Lprobe
162         ud2
163 .Lprobe:
164         ret
165 .size   $func,.-$func
166
167 .align  64
168 .type   $TABLE,\@object
169 $TABLE:
170         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
171         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
172         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
173         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
174         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
175         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
176         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
177         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
178         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
179         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
180         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
181         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
182         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
183         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
184         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
185         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
186         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
187         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
188         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
189         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
190         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
191         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
192         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
193         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
194         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
195         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
196         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
197         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
198         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
199         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
200         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
201         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
202
203         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
204         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
205         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
206         .long   0,0,0,0,   0,0,0,0
207         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
208 .align  64
209 ___
210
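# Layout notes (a sketch, not used by the generator): every 16-byte row of K
# constants above is stored twice, so the 32-byte loads in the AVX2 path below
# hand the same four constants to both 128-bit lanes; the rows after the K
# values are the byte-swap mask and the zero/-1 masks that get indexed by the
# AES round count.

sub _k256_offsets {
	my ($sz,$rounds) = (4,64);
	return (
		k_bytes	=> 2*$sz*$rounds,	# 512 bytes of (duplicated) round constants
		bswap	=> 2*$sz*$rounds,	# vpshufb mask loaded at the top of each loop
		masks	=> 2*$sz*$rounds+32,	# base the prologues point %r13/%r12 at before indexing by rounds-9
	);
}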
211 ######################################################################
212 # SIMD code paths
213 #
214 {{{
215 ($iv,$inout,$roundkey,$temp,
216  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
217
218 $aesni_cbc_idx=0;
219 @aesni_cbc_block = (
220 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
221 ##      &vmovdqu        ($inout,($inp));
222 ##      &mov            ($_inp,$inp);
223
224         '&vpxor         ($inout,$inout,$roundkey);'.
225         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
226
227         '&vpxor         ($inout,$inout,$iv);',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
231
232         '&vaesenc       ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
234
235         '&vaesenc       ($inout,$inout,$roundkey);'.
236         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
237
238         '&vaesenc       ($inout,$inout,$roundkey);'.
239         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
240
241         '&vaesenc       ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
243
244         '&vaesenc       ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
246
247         '&vaesenc       ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
249
250         '&vaesenc       ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
252
253         '&vaesenc       ($inout,$inout,$roundkey);'.
254         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
255
256         '&vaesenclast   ($temp,$inout,$roundkey);'.
257         ' &vaesenc      ($inout,$inout,$roundkey);'.
258         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
259
260         '&vpand         ($iv,$temp,$mask10);'.
261         ' &vaesenc      ($inout,$inout,$roundkey);'.
262         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
263
264         '&vaesenclast   ($temp,$inout,$roundkey);'.
265         ' &vaesenc      ($inout,$inout,$roundkey);'.
266         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
267
268         '&vpand         ($temp,$temp,$mask12);'.
269         ' &vaesenc      ($inout,$inout,$roundkey);'.
270          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
271
272         '&vpor          ($iv,$iv,$temp);'.
273         ' &vaesenclast  ($temp,$inout,$roundkey);'.
274         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
275
276 ##      &mov            ($inp,$_inp);
277 ##      &mov            ($out,$_out);
278 ##      &vpand          ($temp,$temp,$mask14);
279 ##      &vpor           ($iv,$iv,$temp);
280 ##      &vmovdqu        ("($out,$inp)",$iv);
281 ##      &lea            ($inp,"16($inp)");
282 );
283
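# Interleave sketch (not used by the generator): the table above holds 16
# snippets and body_00_15 below splices exactly one of them into every SHA-256
# round via $aesni_cbc_idx, so each run of 16 rounds retires one 16-byte
# CBC block -- 64 bytes of AES-CBC output per 64-byte SHA-256 block.  The
# trailing vaesenclast/vpand/vpor entries produce candidate last rounds for
# 10-, 12- and 14-round key schedules and blend them under $mask10/$mask12/
# $mask14 (picked at setup from the round count), so the key length is never
# branched on inside the round loop.

sub _aes_step_for_round {
	my $sha_round = shift;			# 0..63 within one SHA-256 block
	return $sha_round%16;			# index into @aesni_cbc_block
}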
284 my $a4=$T1;
285 my ($a,$b,$c,$d,$e,$f,$g,$h);
286
287 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
288 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
289   my $arg = pop;
290     $arg = "\$$arg" if ($arg*1 eq $arg);
291     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
292 }
293
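# Example of what the thunk turns a "32-bit style" call into (kept commented
# out so it does not disturb $code):
#
#	&ror	($a0,$Sigma1[2]-$Sigma1[1]);	# appends "\tror\t\$14,%r13d\n"
#
# i.e. a numeric last argument gains the '$' immediate prefix and the operand
# list is reversed into AT&T order, so the first perl argument ends up as the
# destination register.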
294 sub body_00_15 () {
295         (
296         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
297
298         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
299         '&mov   ($a,$a1)',
300         '&mov   ($a4,$f)',
301
302         '&xor   ($a0,$e)',
303         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
304         '&xor   ($a4,$g)',                      # f^g
305
306         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
307         '&xor   ($a1,$a)',
308         '&and   ($a4,$e)',                      # (f^g)&e
309
310         @aesni_cbc_block[$aesni_cbc_idx++].
311         '&xor   ($a0,$e)',
312         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
313         '&mov   ($a2,$a)',
314
315         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
316         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
317         '&xor   ($a2,$b)',                      # a^b, b^c in next round
318
319         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
320         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
321         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
322
323         '&xor   ($a1,$a)',
324         '&add   ($h,$a0)',                      # h+=Sigma1(e)
325         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
326
327         '&add   ($d,$h)',                       # d+=h
328         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
329         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
330
331         '&mov   ($a0,$d)',
332         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
333         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
334         );
335 }
336
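# Reference versions of what the interleaved stream above computes each round
# (a sketch, not used by the generator).  The successive "ror" by
# $Sigma1[2]-$Sigma1[1], $Sigma1[1]-$Sigma1[0] and $Sigma1[0] with xors in
# between is just Sigma1(e) evaluated through rotation differences, and the
# $a2/$a3 juggling is the usual one-xor-per-round form of Maj.

sub _ror32	{ my ($x,$n)=@_; return (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub _Sigma1	{ my $e=shift; return _ror32($e,6)^_ror32($e,11)^_ror32($e,25); }
sub _Sigma0	{ my $a=shift; return _ror32($a,2)^_ror32($a,13)^_ror32($a,22); }
sub _Ch		{ my ($e,$f,$g)=@_; return (($f^$g)&$e)^$g; }		# ((f^g)&e)^g
sub _Maj	{ my ($a,$b,$c)=@_; return (($a^$b)&($b^$c))^$b; }	# ((a^b)&(b^c))^b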
337 if ($avx) {{
338 ######################################################################
339 # XOP code path
340 #
341 $code.=<<___;
342 .type   ${func}_xop,\@function,6
343 .align  64
344 ${func}_xop:
345 .cfi_startproc
346 .Lxop_shortcut:
347         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
348         mov     %rsp,%rax               # copy %rsp
349 .cfi_def_cfa_register   %rax
350         push    %rbx
351 .cfi_push       %rbx
352         push    %rbp
353 .cfi_push       %rbp
354         push    %r12
355 .cfi_push       %r12
356         push    %r13
357 .cfi_push       %r13
358         push    %r14
359 .cfi_push       %r14
360         push    %r15
361 .cfi_push       %r15
362         sub     \$`$framesz+$win64*16*10`,%rsp
363         and     \$-64,%rsp              # align stack frame
364
365         shl     \$6,$len
366         sub     $inp,$out               # re-bias
367         sub     $inp,$in0
368         add     $inp,$len               # end of input
369
370         #mov    $inp,$_inp              # saved later
371         mov     $out,$_out
372         mov     $len,$_end
373         #mov    $key,$_key              # remains resident in $inp register
374         mov     $ivp,$_ivp
375         mov     $ctx,$_ctx
376         mov     $in0,$_in0
377         mov     %rax,$_rsp
378 .cfi_cfa_expression     $_rsp,deref,+8
379 ___
380 $code.=<<___ if ($win64);
381         movaps  %xmm6,`$framesz+16*0`(%rsp)
382         movaps  %xmm7,`$framesz+16*1`(%rsp)
383         movaps  %xmm8,`$framesz+16*2`(%rsp)
384         movaps  %xmm9,`$framesz+16*3`(%rsp)
385         movaps  %xmm10,`$framesz+16*4`(%rsp)
386         movaps  %xmm11,`$framesz+16*5`(%rsp)
387         movaps  %xmm12,`$framesz+16*6`(%rsp)
388         movaps  %xmm13,`$framesz+16*7`(%rsp)
389         movaps  %xmm14,`$framesz+16*8`(%rsp)
390         movaps  %xmm15,`$framesz+16*9`(%rsp)
391 ___
392 $code.=<<___;
393 .Lprologue_xop:
394         vzeroall
395
396         mov     $inp,%r12               # borrow $a4
397         lea     0x80($key),$inp         # size optimization, reassign
398         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
399         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
400         mov     $ctx,%r15               # borrow $a2
401         mov     $in0,%rsi               # borrow $a3
402         vmovdqu ($ivp),$iv              # load IV
403         sub     \$9,%r14
404
405         mov     $SZ*0(%r15),$A
406         mov     $SZ*1(%r15),$B
407         mov     $SZ*2(%r15),$C
408         mov     $SZ*3(%r15),$D
409         mov     $SZ*4(%r15),$E
410         mov     $SZ*5(%r15),$F
411         mov     $SZ*6(%r15),$G
412         mov     $SZ*7(%r15),$H
413
414         vmovdqa 0x00(%r13,%r14,8),$mask14
415         vmovdqa 0x10(%r13,%r14,8),$mask12
416         vmovdqa 0x20(%r13,%r14,8),$mask10
417         vmovdqu 0x00-0x80($inp),$roundkey
418         jmp     .Lloop_xop
419 ___
420                                         if ($SZ==4) {   # SHA256
421     my @X = map("%xmm$_",(0..3));
422     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
423
424 $code.=<<___;
425 .align  16
426 .Lloop_xop:
427         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
428         vmovdqu 0x00(%rsi,%r12),@X[0]
429         vmovdqu 0x10(%rsi,%r12),@X[1]
430         vmovdqu 0x20(%rsi,%r12),@X[2]
431         vmovdqu 0x30(%rsi,%r12),@X[3]
432         vpshufb $t3,@X[0],@X[0]
433         lea     $TABLE(%rip),$Tbl
434         vpshufb $t3,@X[1],@X[1]
435         vpshufb $t3,@X[2],@X[2]
436         vpaddd  0x00($Tbl),@X[0],$t0
437         vpshufb $t3,@X[3],@X[3]
438         vpaddd  0x20($Tbl),@X[1],$t1
439         vpaddd  0x40($Tbl),@X[2],$t2
440         vpaddd  0x60($Tbl),@X[3],$t3
441         vmovdqa $t0,0x00(%rsp)
442         mov     $A,$a1
443         vmovdqa $t1,0x10(%rsp)
444         mov     $B,$a3
445         vmovdqa $t2,0x20(%rsp)
446         xor     $C,$a3                  # magic
447         vmovdqa $t3,0x30(%rsp)
448         mov     $E,$a0
449         jmp     .Lxop_00_47
450
451 .align  16
452 .Lxop_00_47:
453         sub     \$-16*2*$SZ,$Tbl        # size optimization
454         vmovdqu (%r12),$inout           # $a4
455         mov     %r12,$_inp              # $a4
456 ___
457 sub XOP_256_00_47 () {
458 my $j = shift;
459 my $body = shift;
460 my @X = @_;
461 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
462
463         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
464           eval(shift(@insns));
465           eval(shift(@insns));
466          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
467           eval(shift(@insns));
468           eval(shift(@insns));
469         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
470           eval(shift(@insns));
471           eval(shift(@insns));
472         &vpsrld         ($t0,$t0,$sigma0[2]);
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
481           eval(shift(@insns));
482           eval(shift(@insns));
483         &vpxor          ($t0,$t0,$t1);
484           eval(shift(@insns));
485           eval(shift(@insns));
486           eval(shift(@insns));
487           eval(shift(@insns));
488          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
489           eval(shift(@insns));
490           eval(shift(@insns));
491         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
492           eval(shift(@insns));
493           eval(shift(@insns));
494          &vpsrld        ($t2,@X[3],$sigma1[2]);
495           eval(shift(@insns));
496           eval(shift(@insns));
497         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
498           eval(shift(@insns));
499           eval(shift(@insns));
500          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
501           eval(shift(@insns));
502           eval(shift(@insns));
503          &vpxor         ($t3,$t3,$t2);
504           eval(shift(@insns));
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
509           eval(shift(@insns));
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513         &vpsrldq        ($t3,$t3,8);
514           eval(shift(@insns));
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
519           eval(shift(@insns));
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
524           eval(shift(@insns));
525           eval(shift(@insns));
526          &vpsrld        ($t2,@X[0],$sigma1[2]);
527           eval(shift(@insns));
528           eval(shift(@insns));
529          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
530           eval(shift(@insns));
531           eval(shift(@insns));
532          &vpxor         ($t3,$t3,$t2);
533           eval(shift(@insns));
534           eval(shift(@insns));
535           eval(shift(@insns));
536           eval(shift(@insns));
537          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
538           eval(shift(@insns));
539           eval(shift(@insns));
540           eval(shift(@insns));
541           eval(shift(@insns));
542         &vpslldq        ($t3,$t3,8);            # 22 instructions
543           eval(shift(@insns));
544           eval(shift(@insns));
545           eval(shift(@insns));
546           eval(shift(@insns));
547         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
548           eval(shift(@insns));
549           eval(shift(@insns));
550           eval(shift(@insns));
551           eval(shift(@insns));
552         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
553           foreach (@insns) { eval; }            # remaining instructions
554         &vmovdqa        (16*$j."(%rsp)",$t2);
555 }
556
557     $aesni_cbc_idx=0;
558     for ($i=0,$j=0; $j<4; $j++) {
559         &XOP_256_00_47($j,\&body_00_15,@X);
560         push(@X,shift(@X));                     # rotate(@X)
561     }
562         &mov            ("%r12",$_inp);         # borrow $a4
563         &vpand          ($temp,$temp,$mask14);
564         &mov            ("%r15",$_out);         # borrow $a2
565         &vpor           ($iv,$iv,$temp);
566         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
567         &lea            ("%r12","16(%r12)");    # inp++
568
569         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
570         &jne    (".Lxop_00_47");
571
572         &vmovdqu        ($inout,"(%r12)");
573         &mov            ($_inp,"%r12");
574
575     $aesni_cbc_idx=0;
576     for ($i=0; $i<16; ) {
577         foreach(body_00_15()) { eval; }
578     }
579                                         }
580 $code.=<<___;
581         mov     $_inp,%r12              # borrow $a4
582         mov     $_out,%r13              # borrow $a0
583         mov     $_ctx,%r15              # borrow $a2
584         mov     $_in0,%rsi              # borrow $a3
585
586         vpand   $mask14,$temp,$temp
587         mov     $a1,$A
588         vpor    $temp,$iv,$iv
589         vmovdqu $iv,(%r13,%r12)         # write output
590         lea     16(%r12),%r12           # inp++
591
592         add     $SZ*0(%r15),$A
593         add     $SZ*1(%r15),$B
594         add     $SZ*2(%r15),$C
595         add     $SZ*3(%r15),$D
596         add     $SZ*4(%r15),$E
597         add     $SZ*5(%r15),$F
598         add     $SZ*6(%r15),$G
599         add     $SZ*7(%r15),$H
600
601         cmp     $_end,%r12
602
603         mov     $A,$SZ*0(%r15)
604         mov     $B,$SZ*1(%r15)
605         mov     $C,$SZ*2(%r15)
606         mov     $D,$SZ*3(%r15)
607         mov     $E,$SZ*4(%r15)
608         mov     $F,$SZ*5(%r15)
609         mov     $G,$SZ*6(%r15)
610         mov     $H,$SZ*7(%r15)
611
612         jb      .Lloop_xop
613
614         mov     $_ivp,$ivp
615         mov     $_rsp,%rsi
616 .cfi_def_cfa    %rsi,8
617         vmovdqu $iv,($ivp)              # output IV
618         vzeroall
619 ___
620 $code.=<<___ if ($win64);
621         movaps  `$framesz+16*0`(%rsp),%xmm6
622         movaps  `$framesz+16*1`(%rsp),%xmm7
623         movaps  `$framesz+16*2`(%rsp),%xmm8
624         movaps  `$framesz+16*3`(%rsp),%xmm9
625         movaps  `$framesz+16*4`(%rsp),%xmm10
626         movaps  `$framesz+16*5`(%rsp),%xmm11
627         movaps  `$framesz+16*6`(%rsp),%xmm12
628         movaps  `$framesz+16*7`(%rsp),%xmm13
629         movaps  `$framesz+16*8`(%rsp),%xmm14
630         movaps  `$framesz+16*9`(%rsp),%xmm15
631 ___
632 $code.=<<___;
633         mov     -48(%rsi),%r15
634 .cfi_restore    %r15
635         mov     -40(%rsi),%r14
636 .cfi_restore    %r14
637         mov     -32(%rsi),%r13
638 .cfi_restore    %r13
639         mov     -24(%rsi),%r12
640 .cfi_restore    %r12
641         mov     -16(%rsi),%rbp
642 .cfi_restore    %rbp
643         mov     -8(%rsi),%rbx
644 .cfi_restore    %rbx
645         lea     (%rsi),%rsp
646 .cfi_def_cfa_register   %rsp
647 .Lepilogue_xop:
648         ret
649 .cfi_endproc
650 .size   ${func}_xop,.-${func}_xop
651 ___
652 ######################################################################
653 # AVX+shrd code path
654 #
655 local *ror = sub { &shrd(@_[0],@_) };
656
657 $code.=<<___;
658 .type   ${func}_avx,\@function,6
659 .align  64
660 ${func}_avx:
661 .cfi_startproc
662 .Lavx_shortcut:
663         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
664         mov     %rsp,%rax               # copy %rsp
665 .cfi_def_cfa_register   %rax
666         push    %rbx
667 .cfi_push       %rbx
668         push    %rbp
669 .cfi_push       %rbp
670         push    %r12
671 .cfi_push       %r12
672         push    %r13
673 .cfi_push       %r13
674         push    %r14
675 .cfi_push       %r14
676         push    %r15
677 .cfi_push       %r15
678         sub     \$`$framesz+$win64*16*10`,%rsp
679         and     \$-64,%rsp              # align stack frame
680
681         shl     \$6,$len
682         sub     $inp,$out               # re-bias
683         sub     $inp,$in0
684         add     $inp,$len               # end of input
685
686         #mov    $inp,$_inp              # saved later
687         mov     $out,$_out
688         mov     $len,$_end
689         #mov    $key,$_key              # remains resident in $inp register
690         mov     $ivp,$_ivp
691         mov     $ctx,$_ctx
692         mov     $in0,$_in0
693         mov     %rax,$_rsp
694 .cfi_cfa_expression     $_rsp,deref,+8
695 ___
696 $code.=<<___ if ($win64);
697         movaps  %xmm6,`$framesz+16*0`(%rsp)
698         movaps  %xmm7,`$framesz+16*1`(%rsp)
699         movaps  %xmm8,`$framesz+16*2`(%rsp)
700         movaps  %xmm9,`$framesz+16*3`(%rsp)
701         movaps  %xmm10,`$framesz+16*4`(%rsp)
702         movaps  %xmm11,`$framesz+16*5`(%rsp)
703         movaps  %xmm12,`$framesz+16*6`(%rsp)
704         movaps  %xmm13,`$framesz+16*7`(%rsp)
705         movaps  %xmm14,`$framesz+16*8`(%rsp)
706         movaps  %xmm15,`$framesz+16*9`(%rsp)
707 ___
708 $code.=<<___;
709 .Lprologue_avx:
710         vzeroall
711
712         mov     $inp,%r12               # borrow $a4
713         lea     0x80($key),$inp         # size optimization, reassign
714         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
715         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
716         mov     $ctx,%r15               # borrow $a2
717         mov     $in0,%rsi               # borrow $a3
718         vmovdqu ($ivp),$iv              # load IV
719         sub     \$9,%r14
720
721         mov     $SZ*0(%r15),$A
722         mov     $SZ*1(%r15),$B
723         mov     $SZ*2(%r15),$C
724         mov     $SZ*3(%r15),$D
725         mov     $SZ*4(%r15),$E
726         mov     $SZ*5(%r15),$F
727         mov     $SZ*6(%r15),$G
728         mov     $SZ*7(%r15),$H
729
730         vmovdqa 0x00(%r13,%r14,8),$mask14
731         vmovdqa 0x10(%r13,%r14,8),$mask12
732         vmovdqa 0x20(%r13,%r14,8),$mask10
733         vmovdqu 0x00-0x80($inp),$roundkey
734 ___
735                                         if ($SZ==4) {   # SHA256
736     my @X = map("%xmm$_",(0..3));
737     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
738
739 $code.=<<___;
740         jmp     .Lloop_avx
741 .align  16
742 .Lloop_avx:
743         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
744         vmovdqu 0x00(%rsi,%r12),@X[0]
745         vmovdqu 0x10(%rsi,%r12),@X[1]
746         vmovdqu 0x20(%rsi,%r12),@X[2]
747         vmovdqu 0x30(%rsi,%r12),@X[3]
748         vpshufb $t3,@X[0],@X[0]
749         lea     $TABLE(%rip),$Tbl
750         vpshufb $t3,@X[1],@X[1]
751         vpshufb $t3,@X[2],@X[2]
752         vpaddd  0x00($Tbl),@X[0],$t0
753         vpshufb $t3,@X[3],@X[3]
754         vpaddd  0x20($Tbl),@X[1],$t1
755         vpaddd  0x40($Tbl),@X[2],$t2
756         vpaddd  0x60($Tbl),@X[3],$t3
757         vmovdqa $t0,0x00(%rsp)
758         mov     $A,$a1
759         vmovdqa $t1,0x10(%rsp)
760         mov     $B,$a3
761         vmovdqa $t2,0x20(%rsp)
762         xor     $C,$a3                  # magic
763         vmovdqa $t3,0x30(%rsp)
764         mov     $E,$a0
765         jmp     .Lavx_00_47
766
767 .align  16
768 .Lavx_00_47:
769         sub     \$-16*2*$SZ,$Tbl        # size optimization
770         vmovdqu (%r12),$inout           # $a4
771         mov     %r12,$_inp              # $a4
772 ___
773 sub Xupdate_256_AVX () {
774         (
775         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
776          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
777         '&vpsrld        ($t2,$t0,$sigma0[0]);',
778          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
779         '&vpsrld        ($t3,$t0,$sigma0[2])',
780         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
781         '&vpxor         ($t0,$t3,$t2)',
782          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
783         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
784         '&vpxor         ($t0,$t0,$t1)',
785         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
786         '&vpxor         ($t0,$t0,$t2)',
787          '&vpsrld       ($t2,$t3,$sigma1[2]);',
788         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
789          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
790         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
791          '&vpxor        ($t2,$t2,$t3);',
792          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
793          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
794          '&vpshufd      ($t2,$t2,0b10000100)',
795          '&vpsrldq      ($t2,$t2,8)',
796         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
797          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
798          '&vpsrld       ($t2,$t3,$sigma1[2])',
799          '&vpsrlq       ($t3,$t3,$sigma1[0])',
800          '&vpxor        ($t2,$t2,$t3);',
801          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
802          '&vpxor        ($t2,$t2,$t3)',
803          '&vpshufd      ($t2,$t2,0b11101000)',
804          '&vpslldq      ($t2,$t2,8)',
805         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
806         );
807 }
808
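# Reference form of the message-schedule update that Xupdate_256_AVX above
# vectorizes four elements at a time (a sketch, not used by the generator,
# reusing _ror32 from the sketch further up):
#
#	W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]   (mod 2^32)

sub _sigma0	{ my $x=shift; return _ror32($x,7)^_ror32($x,18)^($x>>3); }
sub _sigma1	{ my $x=shift; return _ror32($x,17)^_ror32($x,19)^($x>>10); }
sub _next_W {
	my @W = @_;				# previous 16 schedule words, W[t-16]..W[t-1]
	return (_sigma1($W[14])+$W[9]+_sigma0($W[1])+$W[0])&0xffffffff;
}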
809 sub AVX_256_00_47 () {
810 my $j = shift;
811 my $body = shift;
812 my @X = @_;
813 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
814
815         foreach (Xupdate_256_AVX()) {           # 29 instructions
816             eval;
817             eval(shift(@insns));
818             eval(shift(@insns));
819             eval(shift(@insns));
820         }
821         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
822           foreach (@insns) { eval; }            # remaining instructions
823         &vmovdqa        (16*$j."(%rsp)",$t2);
824 }
825
826     $aesni_cbc_idx=0;
827     for ($i=0,$j=0; $j<4; $j++) {
828         &AVX_256_00_47($j,\&body_00_15,@X);
829         push(@X,shift(@X));                     # rotate(@X)
830     }
831         &mov            ("%r12",$_inp);         # borrow $a4
832         &vpand          ($temp,$temp,$mask14);
833         &mov            ("%r15",$_out);         # borrow $a2
834         &vpor           ($iv,$iv,$temp);
835         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
836         &lea            ("%r12","16(%r12)");    # inp++
837
838         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
839         &jne    (".Lavx_00_47");
840
841         &vmovdqu        ($inout,"(%r12)");
842         &mov            ($_inp,"%r12");
843
844     $aesni_cbc_idx=0;
845     for ($i=0; $i<16; ) {
846         foreach(body_00_15()) { eval; }
847     }
848
849                                         }
850 $code.=<<___;
851         mov     $_inp,%r12              # borrow $a4
852         mov     $_out,%r13              # borrow $a0
853         mov     $_ctx,%r15              # borrow $a2
854         mov     $_in0,%rsi              # borrow $a3
855
856         vpand   $mask14,$temp,$temp
857         mov     $a1,$A
858         vpor    $temp,$iv,$iv
859         vmovdqu $iv,(%r13,%r12)         # write output
860         lea     16(%r12),%r12           # inp++
861
862         add     $SZ*0(%r15),$A
863         add     $SZ*1(%r15),$B
864         add     $SZ*2(%r15),$C
865         add     $SZ*3(%r15),$D
866         add     $SZ*4(%r15),$E
867         add     $SZ*5(%r15),$F
868         add     $SZ*6(%r15),$G
869         add     $SZ*7(%r15),$H
870
871         cmp     $_end,%r12
872
873         mov     $A,$SZ*0(%r15)
874         mov     $B,$SZ*1(%r15)
875         mov     $C,$SZ*2(%r15)
876         mov     $D,$SZ*3(%r15)
877         mov     $E,$SZ*4(%r15)
878         mov     $F,$SZ*5(%r15)
879         mov     $G,$SZ*6(%r15)
880         mov     $H,$SZ*7(%r15)
881         jb      .Lloop_avx
882
883         mov     $_ivp,$ivp
884         mov     $_rsp,%rsi
885 .cfi_def_cfa    %rsi,8
886         vmovdqu $iv,($ivp)              # output IV
887         vzeroall
888 ___
889 $code.=<<___ if ($win64);
890         movaps  `$framesz+16*0`(%rsp),%xmm6
891         movaps  `$framesz+16*1`(%rsp),%xmm7
892         movaps  `$framesz+16*2`(%rsp),%xmm8
893         movaps  `$framesz+16*3`(%rsp),%xmm9
894         movaps  `$framesz+16*4`(%rsp),%xmm10
895         movaps  `$framesz+16*5`(%rsp),%xmm11
896         movaps  `$framesz+16*6`(%rsp),%xmm12
897         movaps  `$framesz+16*7`(%rsp),%xmm13
898         movaps  `$framesz+16*8`(%rsp),%xmm14
899         movaps  `$framesz+16*9`(%rsp),%xmm15
900 ___
901 $code.=<<___;
902         mov     -48(%rsi),%r15
903 .cfi_restore    %r15
904         mov     -40(%rsi),%r14
905 .cfi_restore    %r14
906         mov     -32(%rsi),%r13
907 .cfi_restore    %r13
908         mov     -24(%rsi),%r12
909 .cfi_restore    %r12
910         mov     -16(%rsi),%rbp
911 .cfi_restore    %rbp
912         mov     -8(%rsi),%rbx
913 .cfi_restore    %rbx
914         lea     (%rsi),%rsp
915 .cfi_def_cfa_register   %rsp
916 .Lepilogue_avx:
917         ret
918 .cfi_endproc
919 .size   ${func}_avx,.-${func}_avx
920 ___
921
922 if ($avx>1) {{
923 ######################################################################
924 # AVX2+BMI code path
925 #
926 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
927 my $PUSH8=8*2*$SZ;
928 use integer;
929
930 sub bodyx_00_15 () {
931         # on entry $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
932         (
933         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
934
935         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
936         '&and   ($a4,$e)',              # f&e
937         '&rorx  ($a0,$e,$Sigma1[2])',
938         '&rorx  ($a2,$e,$Sigma1[1])',
939
940         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
941         '&lea   ($h,"($h,$a4)")',
942         '&andn  ($a4,$e,$g)',           # ~e&g
943         '&xor   ($a0,$a2)',
944
945         '&rorx  ($a1,$e,$Sigma1[0])',
946         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
947         '&xor   ($a0,$a1)',             # Sigma1(e)
948         '&mov   ($a2,$a)',
949
950         '&rorx  ($a4,$a,$Sigma0[2])',
951         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
952         '&xor   ($a2,$b)',              # a^b, b^c in next round
953         '&rorx  ($a1,$a,$Sigma0[1])',
954
955         '&rorx  ($a0,$a,$Sigma0[0])',
956         '&lea   ($d,"($d,$h)")',        # d+=h
957         '&and   ($a3,$a2)',             # (b^c)&(a^b)
958         @aesni_cbc_block[$aesni_cbc_idx++].
959         '&xor   ($a1,$a4)',
960
961         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
962         '&xor   ($a1,$a0)',             # Sigma0(a)
963         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
964         '&mov   ($a4,$e)',              # copy of f in future
965
966         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
967         );
968         # and at the finish one has to do $a+=$a1
969 }
970
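# Why this body can use lea/add where the scalar one uses xor/or (a sketch,
# not used by the generator): the two halves of Ch(e,f,g)=(e&f)+(~e&g) are
# bit-disjoint, so adding them can never carry and is the same as or-ing them,
# which lets rorx/andn/lea run without fighting over the flags.  Note also
# that Sigma0(a) of the previous round is only folded in at the top of the
# next one ("from the past"), hence the final $a+=$a1 mentioned above.

sub _Ch_via_add {
	my ($e,$f,$g) = @_;
	my ($t0,$t1) = ($e&$f, ~$e&$g&0xffffffff);
	die "can't happen" if ($t0&$t1);	# the two terms never overlap
	return ($t0+$t1)&0xffffffff;		# == ($t0|$t1) == Ch(e,f,g)
}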
971 $code.=<<___;
972 .type   ${func}_avx2,\@function,6
973 .align  64
974 ${func}_avx2:
975 .cfi_startproc
976 .Lavx2_shortcut:
977         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
978         mov     %rsp,%rax               # copy %rsp
979 .cfi_def_cfa_register   %rax
980         push    %rbx
981 .cfi_push       %rbx
982         push    %rbp
983 .cfi_push       %rbp
984         push    %r12
985 .cfi_push       %r12
986         push    %r13
987 .cfi_push       %r13
988         push    %r14
989 .cfi_push       %r14
990         push    %r15
991 .cfi_push       %r15
992         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
993         and     \$-256*$SZ,%rsp         # align stack frame
994         add     \$`2*$SZ*($rounds-8)`,%rsp
995
996         shl     \$6,$len
997         sub     $inp,$out               # re-bias
998         sub     $inp,$in0
999         add     $inp,$len               # end of input
1000
1001         #mov    $inp,$_inp              # saved later
1002         #mov    $out,$_out              # kept in $offload
1003         mov     $len,$_end
1004         #mov    $key,$_key              # remains resident in $inp register
1005         mov     $ivp,$_ivp
1006         mov     $ctx,$_ctx
1007         mov     $in0,$_in0
1008         mov     %rax,$_rsp
1009 .cfi_cfa_expression     $_rsp,deref,+8
1010 ___
1011 $code.=<<___ if ($win64);
1012         movaps  %xmm6,`$framesz+16*0`(%rsp)
1013         movaps  %xmm7,`$framesz+16*1`(%rsp)
1014         movaps  %xmm8,`$framesz+16*2`(%rsp)
1015         movaps  %xmm9,`$framesz+16*3`(%rsp)
1016         movaps  %xmm10,`$framesz+16*4`(%rsp)
1017         movaps  %xmm11,`$framesz+16*5`(%rsp)
1018         movaps  %xmm12,`$framesz+16*6`(%rsp)
1019         movaps  %xmm13,`$framesz+16*7`(%rsp)
1020         movaps  %xmm14,`$framesz+16*8`(%rsp)
1021         movaps  %xmm15,`$framesz+16*9`(%rsp)
1022 ___
1023 $code.=<<___;
1024 .Lprologue_avx2:
1025         vzeroall
1026
1027         mov     $inp,%r13               # borrow $a0
1028         vpinsrq \$1,$out,$offload,$offload
1029         lea     0x80($key),$inp         # size optimization, reassign
1030         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1031         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1032         mov     $ctx,%r15               # borrow $a2
1033         mov     $in0,%rsi               # borrow $a3
1034         vmovdqu ($ivp),$iv              # load IV
1035         lea     -9(%r14),%r14
1036
1037         vmovdqa 0x00(%r12,%r14,8),$mask14
1038         vmovdqa 0x10(%r12,%r14,8),$mask12
1039         vmovdqa 0x20(%r12,%r14,8),$mask10
1040
1041         sub     \$-16*$SZ,%r13          # inp++, size optimization
1042         mov     $SZ*0(%r15),$A
1043         lea     (%rsi,%r13),%r12        # borrow $a0
1044         mov     $SZ*1(%r15),$B
1045         cmp     $len,%r13               # $_end
1046         mov     $SZ*2(%r15),$C
1047         cmove   %rsp,%r12               # next block or random data
1048         mov     $SZ*3(%r15),$D
1049         mov     $SZ*4(%r15),$E
1050         mov     $SZ*5(%r15),$F
1051         mov     $SZ*6(%r15),$G
1052         mov     $SZ*7(%r15),$H
1053         vmovdqu 0x00-0x80($inp),$roundkey
1054 ___
1055                                         if ($SZ==4) {   # SHA256
1056     my @X = map("%ymm$_",(0..3));
1057     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1058
1059 $code.=<<___;
1060         jmp     .Loop_avx2
1061 .align  16
1062 .Loop_avx2:
1063         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1064         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1065         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1066         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1067         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1068
1069         vinserti128     \$1,(%r12),@X[0],@X[0]
1070         vinserti128     \$1,16(%r12),@X[1],@X[1]
1071          vpshufb        $t3,@X[0],@X[0]
1072         vinserti128     \$1,32(%r12),@X[2],@X[2]
1073          vpshufb        $t3,@X[1],@X[1]
1074         vinserti128     \$1,48(%r12),@X[3],@X[3]
1075
1076         lea     $TABLE(%rip),$Tbl
1077         vpshufb $t3,@X[2],@X[2]
1078         lea     -16*$SZ(%r13),%r13
1079         vpaddd  0x00($Tbl),@X[0],$t0
1080         vpshufb $t3,@X[3],@X[3]
1081         vpaddd  0x20($Tbl),@X[1],$t1
1082         vpaddd  0x40($Tbl),@X[2],$t2
1083         vpaddd  0x60($Tbl),@X[3],$t3
1084         vmovdqa $t0,0x00(%rsp)
1085         xor     $a1,$a1
1086         vmovdqa $t1,0x20(%rsp)
1087         lea     -$PUSH8(%rsp),%rsp
1088         mov     $B,$a3
1089         vmovdqa $t2,0x00(%rsp)
1090         xor     $C,$a3                  # magic
1091         vmovdqa $t3,0x20(%rsp)
1092         mov     $F,$a4
1093         sub     \$-16*2*$SZ,$Tbl        # size optimization
1094         jmp     .Lavx2_00_47
1095
1096 .align  16
1097 .Lavx2_00_47:
1098         vmovdqu (%r13),$inout
1099         vpinsrq \$0,%r13,$offload,$offload
1100 ___
1101
1102 sub AVX2_256_00_47 () {
1103 my $j = shift;
1104 my $body = shift;
1105 my @X = @_;
1106 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1107 my $base = "+2*$PUSH8(%rsp)";
1108
1109         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1110         foreach (Xupdate_256_AVX()) {           # 29 instructions
1111             eval;
1112             eval(shift(@insns));
1113             eval(shift(@insns));
1114             eval(shift(@insns));
1115         }
1116         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1117           foreach (@insns) { eval; }            # remaining instructions
1118         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1119 }
1120     $aesni_cbc_idx=0;
1121     for ($i=0,$j=0; $j<4; $j++) {
1122         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1123         push(@X,shift(@X));                     # rotate(@X)
1124     }
1125         &vmovq          ("%r13",$offload);      # borrow $a0
1126         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1127         &vpand          ($temp,$temp,$mask14);
1128         &vpor           ($iv,$iv,$temp);
1129         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1130         &lea            ("%r13","16(%r13)");    # inp++
1131
1132         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1133         &cmpb   (($SZ-1)."($Tbl)",0);
1134         &jne    (".Lavx2_00_47");
1135
1136         &vmovdqu        ($inout,"(%r13)");
1137         &vpinsrq        ($offload,$offload,"%r13",0);
1138
1139     $aesni_cbc_idx=0;
1140     for ($i=0; $i<16; ) {
1141         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1142         foreach(bodyx_00_15()) { eval; }
1143     }
1144                                         }
1145 $code.=<<___;
1146         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1147         vmovq   $offload,%r13                   # $_inp, borrow $a0
1148         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1149         add     $a1,$A
1150         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1151
1152         vpand   $mask14,$temp,$temp
1153         vpor    $temp,$iv,$iv
1154         vmovdqu $iv,(%r12,%r13)                 # write output
1155         lea     16(%r13),%r13
1156
1157         add     $SZ*0(%r15),$A
1158         add     $SZ*1(%r15),$B
1159         add     $SZ*2(%r15),$C
1160         add     $SZ*3(%r15),$D
1161         add     $SZ*4(%r15),$E
1162         add     $SZ*5(%r15),$F
1163         add     $SZ*6(%r15),$G
1164         add     $SZ*7(%r15),$H
1165
1166         mov     $A,$SZ*0(%r15)
1167         mov     $B,$SZ*1(%r15)
1168         mov     $C,$SZ*2(%r15)
1169         mov     $D,$SZ*3(%r15)
1170         mov     $E,$SZ*4(%r15)
1171         mov     $F,$SZ*5(%r15)
1172         mov     $G,$SZ*6(%r15)
1173         mov     $H,$SZ*7(%r15)
1174
1175         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1176         je      .Ldone_avx2
1177
1178         xor     $a1,$a1
1179         mov     $B,$a3
1180         mov     $F,$a4
1181         xor     $C,$a3                  # magic
1182         jmp     .Lower_avx2
1183 .align  16
1184 .Lower_avx2:
1185         vmovdqu (%r13),$inout
1186         vpinsrq \$0,%r13,$offload,$offload
1187 ___
1188     $aesni_cbc_idx=0;
1189     for ($i=0; $i<16; ) {
1190         my $base="+16($Tbl)";
1191         foreach(bodyx_00_15()) { eval; }
1192         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1193     }
1194 $code.=<<___;
1195         vmovq   $offload,%r13                   # borrow $a0
1196         vpextrq \$1,$offload,%r15               # borrow $a2
1197         vpand   $mask14,$temp,$temp
1198         vpor    $temp,$iv,$iv
1199         lea     -$PUSH8($Tbl),$Tbl
1200         vmovdqu $iv,(%r15,%r13)                 # write output
1201         lea     16(%r13),%r13                   # inp++
1202         cmp     %rsp,$Tbl
1203         jae     .Lower_avx2
1204
1205         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1206         lea     16*$SZ(%r13),%r13
1207         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1208         add     $a1,$A
1209         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1210
1211         add     $SZ*0(%r15),$A
1212         add     $SZ*1(%r15),$B
1213         add     $SZ*2(%r15),$C
1214         add     $SZ*3(%r15),$D
1215         add     $SZ*4(%r15),$E
1216         add     $SZ*5(%r15),$F
1217         add     $SZ*6(%r15),$G
1218         lea     (%rsi,%r13),%r12
1219         add     $SZ*7(%r15),$H
1220
1221         cmp     $_end,%r13
1222
1223         mov     $A,$SZ*0(%r15)
1224         cmove   %rsp,%r12               # next block or stale data
1225         mov     $B,$SZ*1(%r15)
1226         mov     $C,$SZ*2(%r15)
1227         mov     $D,$SZ*3(%r15)
1228         mov     $E,$SZ*4(%r15)
1229         mov     $F,$SZ*5(%r15)
1230         mov     $G,$SZ*6(%r15)
1231         mov     $H,$SZ*7(%r15)
1232
1233         jbe     .Loop_avx2
1234         lea     (%rsp),$Tbl
1235
1236 .Ldone_avx2:
1237         lea     ($Tbl),%rsp
1238         mov     $_ivp,$ivp
1239         mov     $_rsp,%rsi
1240 .cfi_def_cfa    %rsi,8
1241         vmovdqu $iv,($ivp)              # output IV
1242         vzeroall
1243 ___
1244 $code.=<<___ if ($win64);
1245         movaps  `$framesz+16*0`(%rsp),%xmm6
1246         movaps  `$framesz+16*1`(%rsp),%xmm7
1247         movaps  `$framesz+16*2`(%rsp),%xmm8
1248         movaps  `$framesz+16*3`(%rsp),%xmm9
1249         movaps  `$framesz+16*4`(%rsp),%xmm10
1250         movaps  `$framesz+16*5`(%rsp),%xmm11
1251         movaps  `$framesz+16*6`(%rsp),%xmm12
1252         movaps  `$framesz+16*7`(%rsp),%xmm13
1253         movaps  `$framesz+16*8`(%rsp),%xmm14
1254         movaps  `$framesz+16*9`(%rsp),%xmm15
1255 ___
1256 $code.=<<___;
1257         mov     -48(%rsi),%r15
1258 .cfi_restore    %r15
1259         mov     -40(%rsi),%r14
1260 .cfi_restore    %r14
1261         mov     -32(%rsi),%r13
1262 .cfi_restore    %r13
1263         mov     -24(%rsi),%r12
1264 .cfi_restore    %r12
1265         mov     -16(%rsi),%rbp
1266 .cfi_restore    %rbp
1267         mov     -8(%rsi),%rbx
1268 .cfi_restore    %rbx
1269         lea     (%rsi),%rsp
1270 .cfi_def_cfa_register   %rsp
1271 .Lepilogue_avx2:
1272         ret
1273 .cfi_endproc
1274 .size   ${func}_avx2,.-${func}_avx2
1275 ___
1276 }}
1277 }}
1278 {{
1279 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1280
1281 my ($rounds,$Tbl)=("%r11d","%rbx");
1282
1283 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1284 my @rndkey=("%xmm4","%xmm5");
1285 my $r=0;
1286 my $sn=0;
1287
1288 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1289 my @MSG=map("%xmm$_",(10..13));
1290
1291 my $aesenc=sub {
1292   use integer;
1293   my ($n,$k)=($r/10,$r%10);
1294     if ($k==0) {
1295       $code.=<<___;
1296         movups          `16*$n`($in0),$in               # load input
1297         xorps           $rndkey0,$in
1298 ___
1299       $code.=<<___ if ($n);
1300         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1301 ___
1302       $code.=<<___;
1303         xorps           $in,$iv
1304         movups          `32+16*$k-112`($key),$rndkey[1]
1305         aesenc          $rndkey[0],$iv
1306 ___
1307     } elsif ($k==9) {
1308       $sn++;
1309       $code.=<<___;
1310         cmp             \$11,$rounds
1311         jb              .Laesenclast$sn
1312         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1313         aesenc          $rndkey[0],$iv
1314         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1315         aesenc          $rndkey[1],$iv
1316         je              .Laesenclast$sn
1317         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1318         aesenc          $rndkey[0],$iv
1319         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1320         aesenc          $rndkey[1],$iv
1321 .Laesenclast$sn:
1322         aesenclast      $rndkey[0],$iv
1323         movups          16-112($key),$rndkey[1]         # forward reference
1324         nop
1325 ___
1326     } else {
1327       $code.=<<___;
1328         movups          `32+16*$k-112`($key),$rndkey[1]
1329         aesenc          $rndkey[0],$iv
1330 ___
1331     }
1332     $r++;       unshift(@rndkey,pop(@rndkey));
1333 };
1334
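# Pacing sketch (not used by the generator): $r counts the &$aesenc() calls
# issued so far, so ($r/10,$r%10) is (CBC block index, AES round within it);
# ten calls retire one 16-byte block and a 64-byte SHA-256 block needs four of
# them, hence the trailing "while ($r<40)" in the SHAEXT path below.

sub _shaext_pacing {
	my $r = shift;				# number of &$aesenc() calls issued so far
	return (int($r/10),$r%10);		# (block index, round within block)
}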
1335 if ($shaext) {
1336 my $Tbl="%rax";
1337
1338 $code.=<<___;
1339 .type   ${func}_shaext,\@function,6
1340 .align  32
1341 ${func}_shaext:
1342         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1343 ___
1344 $code.=<<___ if ($win64);
1345         lea     `-8-10*16`(%rsp),%rsp
1346         movaps  %xmm6,-8-10*16(%rax)
1347         movaps  %xmm7,-8-9*16(%rax)
1348         movaps  %xmm8,-8-8*16(%rax)
1349         movaps  %xmm9,-8-7*16(%rax)
1350         movaps  %xmm10,-8-6*16(%rax)
1351         movaps  %xmm11,-8-5*16(%rax)
1352         movaps  %xmm12,-8-4*16(%rax)
1353         movaps  %xmm13,-8-3*16(%rax)
1354         movaps  %xmm14,-8-2*16(%rax)
1355         movaps  %xmm15,-8-1*16(%rax)
1356 .Lprologue_shaext:
1357 ___
1358 $code.=<<___;
1359         lea             K256+0x80(%rip),$Tbl
1360         movdqu          ($ctx),$ABEF            # DCBA
1361         movdqu          16($ctx),$CDGH          # HGFE
1362         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1363
1364         mov             240($key),$rounds
1365         sub             $in0,$out
1366         movups          ($key),$rndkey0         # $key[0]
1367         movups          ($ivp),$iv              # load IV
1368         movups          16($key),$rndkey[0]     # forward reference
1369         lea             112($key),$key          # size optimization
1370
1371         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1372         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1373         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1374         movdqa          $TMP,$BSWAP             # offload
1375         palignr         \$8,$CDGH,$ABEF         # ABEF
1376         punpcklqdq      $Wi,$CDGH               # CDGH
1377
1378         jmp     .Loop_shaext
1379
1380 .align  16
1381 .Loop_shaext:
1382         movdqu          ($inp),@MSG[0]
1383         movdqu          0x10($inp),@MSG[1]
1384         movdqu          0x20($inp),@MSG[2]
1385         pshufb          $TMP,@MSG[0]
1386         movdqu          0x30($inp),@MSG[3]
1387
1388         movdqa          0*32-0x80($Tbl),$Wi
1389         paddd           @MSG[0],$Wi
1390         pshufb          $TMP,@MSG[1]
1391         movdqa          $CDGH,$CDGH_SAVE        # offload
1392         movdqa          $ABEF,$ABEF_SAVE        # offload
1393 ___
1394         &$aesenc();
1395 $code.=<<___;
1396         sha256rnds2     $ABEF,$CDGH             # 0-3
1397         pshufd          \$0x0e,$Wi,$Wi
1398 ___
1399         &$aesenc();
1400 $code.=<<___;
1401         sha256rnds2     $CDGH,$ABEF
1402
1403         movdqa          1*32-0x80($Tbl),$Wi
1404         paddd           @MSG[1],$Wi
1405         pshufb          $TMP,@MSG[2]
1406         lea             0x40($inp),$inp
1407 ___
1408         &$aesenc();
1409 $code.=<<___;
1410         sha256rnds2     $ABEF,$CDGH             # 4-7
1411         pshufd          \$0x0e,$Wi,$Wi
1412 ___
1413         &$aesenc();
1414 $code.=<<___;
1415         sha256rnds2     $CDGH,$ABEF
1416
1417         movdqa          2*32-0x80($Tbl),$Wi
1418         paddd           @MSG[2],$Wi
1419         pshufb          $TMP,@MSG[3]
1420         sha256msg1      @MSG[1],@MSG[0]
1421 ___
1422         &$aesenc();
1423 $code.=<<___;
1424         sha256rnds2     $ABEF,$CDGH             # 8-11
1425         pshufd          \$0x0e,$Wi,$Wi
1426         movdqa          @MSG[3],$TMP
1427         palignr         \$4,@MSG[2],$TMP
1428         paddd           $TMP,@MSG[0]
1429 ___
1430         &$aesenc();
1431 $code.=<<___;
1432         sha256rnds2     $CDGH,$ABEF
1433
1434         movdqa          3*32-0x80($Tbl),$Wi
1435         paddd           @MSG[3],$Wi
1436         sha256msg2      @MSG[3],@MSG[0]
1437         sha256msg1      @MSG[2],@MSG[1]
1438 ___
1439         &$aesenc();
1440 $code.=<<___;
1441         sha256rnds2     $ABEF,$CDGH             # 12-15
1442         pshufd          \$0x0e,$Wi,$Wi
1443 ___
1444         &$aesenc();
1445 $code.=<<___;
1446         movdqa          @MSG[0],$TMP
1447         palignr         \$4,@MSG[3],$TMP
1448         paddd           $TMP,@MSG[1]
1449         sha256rnds2     $CDGH,$ABEF
1450 ___
1451 for($i=4;$i<16-3;$i++) {
1452         &$aesenc()      if (($r%10)==0);
1453 $code.=<<___;
1454         movdqa          $i*32-0x80($Tbl),$Wi
1455         paddd           @MSG[0],$Wi
1456         sha256msg2      @MSG[0],@MSG[1]
1457         sha256msg1      @MSG[3],@MSG[2]
1458 ___
1459         &$aesenc();
1460 $code.=<<___;
1461         sha256rnds2     $ABEF,$CDGH             # 16-19...
1462         pshufd          \$0x0e,$Wi,$Wi
1463         movdqa          @MSG[1],$TMP
1464         palignr         \$4,@MSG[0],$TMP
1465         paddd           $TMP,@MSG[2]
1466 ___
1467         &$aesenc();
1468         &$aesenc()      if ($r==19);
1469 $code.=<<___;
1470         sha256rnds2     $CDGH,$ABEF
1471 ___
1472         push(@MSG,shift(@MSG));
1473 }
1474 $code.=<<___;
1475         movdqa          13*32-0x80($Tbl),$Wi
1476         paddd           @MSG[0],$Wi
1477         sha256msg2      @MSG[0],@MSG[1]
1478         sha256msg1      @MSG[3],@MSG[2]
1479 ___
1480         &$aesenc();
1481 $code.=<<___;
1482         sha256rnds2     $ABEF,$CDGH             # 52-55
1483         pshufd          \$0x0e,$Wi,$Wi
1484         movdqa          @MSG[1],$TMP
1485         palignr         \$4,@MSG[0],$TMP
1486         paddd           $TMP,@MSG[2]
1487 ___
1488         &$aesenc();
1489         &$aesenc();
1490 $code.=<<___;
1491         sha256rnds2     $CDGH,$ABEF
1492
1493         movdqa          14*32-0x80($Tbl),$Wi
1494         paddd           @MSG[1],$Wi
1495         sha256msg2      @MSG[1],@MSG[2]
1496         movdqa          $BSWAP,$TMP
1497 ___
1498         &$aesenc();
1499 $code.=<<___;
1500         sha256rnds2     $ABEF,$CDGH             # 56-59
1501         pshufd          \$0x0e,$Wi,$Wi
1502 ___
1503         &$aesenc();
1504 $code.=<<___;
1505         sha256rnds2     $CDGH,$ABEF
1506
1507         movdqa          15*32-0x80($Tbl),$Wi
1508         paddd           @MSG[2],$Wi
1509 ___
1510         &$aesenc();
1511         &$aesenc();
1512 $code.=<<___;
1513         sha256rnds2     $ABEF,$CDGH             # 60-63
1514         pshufd          \$0x0e,$Wi,$Wi
1515 ___
1516         &$aesenc();
1517 $code.=<<___;
1518         sha256rnds2     $CDGH,$ABEF
1519         #pxor           $CDGH,$rndkey0          # black magic
1520 ___
1521         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1522 $code.=<<___;
1523         #xorps          $CDGH,$rndkey0          # black magic
1524         paddd           $CDGH_SAVE,$CDGH
1525         paddd           $ABEF_SAVE,$ABEF
1526
1527         dec             $len
1528         movups          $iv,48($out,$in0)       # write output
1529         lea             64($in0),$in0
1530         jnz             .Loop_shaext
1531
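	# The sha256rnds2 state lives in the interleaved ABEF/CDGH order;
	# the shuffles below rebuild the canonical A..H layout (see the
	# DCBA/HGFE annotations) before the digest is stored to the context.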
1532         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1533         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1534         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1535         punpckhqdq      $CDGH,$ABEF             # DCBA
1536         palignr         \$8,$TMP,$CDGH          # HGFE
1537
1538         movups          $iv,($ivp)              # write IV
1539         movdqu          $ABEF,($ctx)
1540         movdqu          $CDGH,16($ctx)
1541 ___
1542 $code.=<<___ if ($win64);
1543         movaps  0*16(%rsp),%xmm6
1544         movaps  1*16(%rsp),%xmm7
1545         movaps  2*16(%rsp),%xmm8
1546         movaps  3*16(%rsp),%xmm9
1547         movaps  4*16(%rsp),%xmm10
1548         movaps  5*16(%rsp),%xmm11
1549         movaps  6*16(%rsp),%xmm12
1550         movaps  7*16(%rsp),%xmm13
1551         movaps  8*16(%rsp),%xmm14
1552         movaps  9*16(%rsp),%xmm15
1553         lea     8+10*16(%rsp),%rsp
1554 .Lepilogue_shaext:
1555 ___
1556 $code.=<<___;
1557         ret
1558 .size   ${func}_shaext,.-${func}_shaext
1559 ___
1560 }
1561 }}}}}
1562
1563 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1564 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
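# se_handler below decides whether context->Rip is still in the prologue or
# already past the epilogue of one of the stitched routines; otherwise it
# restores the non-volatile GPRs and the xmm6-xmm15 save area from the frame
# so RtlVirtualUnwind can keep unwinding.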
1565 if ($win64 && $avx) {
1566 $rec="%rcx";
1567 $frame="%rdx";
1568 $context="%r8";
1569 $disp="%r9";
1570
1571 $code.=<<___;
1572 .extern __imp_RtlVirtualUnwind
1573 .type   se_handler,\@abi-omnipotent
1574 .align  16
1575 se_handler:
1576         push    %rsi
1577         push    %rdi
1578         push    %rbx
1579         push    %rbp
1580         push    %r12
1581         push    %r13
1582         push    %r14
1583         push    %r15
1584         pushfq
1585         sub     \$64,%rsp
1586
1587         mov     120($context),%rax      # pull context->Rax
1588         mov     248($context),%rbx      # pull context->Rip
1589
1590         mov     8($disp),%rsi           # disp->ImageBase
1591         mov     56($disp),%r11          # disp->HandlerData
1592
1593         mov     0(%r11),%r10d           # HandlerData[0]
1594         lea     (%rsi,%r10),%r10        # prologue label
1595         cmp     %r10,%rbx               # context->Rip<prologue label
1596         jb      .Lin_prologue
1597
1598         mov     152($context),%rax      # pull context->Rsp
1599
1600         mov     4(%r11),%r10d           # HandlerData[1]
1601         lea     (%rsi,%r10),%r10        # epilogue label
1602         cmp     %r10,%rbx               # context->Rip>=epilogue label
1603         jae     .Lin_prologue
1604 ___
1605 $code.=<<___ if ($shaext);
1606         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1607         cmp     %r10,%rbx
1608         jb      .Lnot_in_shaext
1609
1610         lea     (%rax),%rsi
1611         lea     512($context),%rdi      # &context.Xmm6
1612         mov     \$20,%ecx
1613         .long   0xa548f3fc              # cld; rep movsq
1614         lea     168(%rax),%rax          # adjust stack pointer
1615         jmp     .Lin_prologue
1616 .Lnot_in_shaext:
1617 ___
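# The shaext routine keeps a fixed frame of 8 bytes of alignment plus
# 10*16 bytes of saved xmm6-xmm15 (matching "lea 8+10*16(%rsp),%rsp" in its
# epilogue), hence the 20-qword copy and the 168-byte adjustment above.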
1618 $code.=<<___ if ($avx>1);
1619         lea     .Lavx2_shortcut(%rip),%r10
1620         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1621         jb      .Lnot_in_avx2
1622
1623         and     \$-256*$SZ,%rax
1624         add     \$`2*$SZ*($rounds-8)`,%rax
1625 .Lnot_in_avx2:
1626 ___
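# The AVX2 path aligned its frame down to a 256*$SZ boundary, so %rax is
# first adjusted back to that frame base here; the common code below then
# pulls the saved stack pointer ($_rsp) from it.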
1627 $code.=<<___;
1628         mov     %rax,%rsi               # put aside Rsp
1629         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1630
1631         mov     -8(%rax),%rbx
1632         mov     -16(%rax),%rbp
1633         mov     -24(%rax),%r12
1634         mov     -32(%rax),%r13
1635         mov     -40(%rax),%r14
1636         mov     -48(%rax),%r15
1637         mov     %rbx,144($context)      # restore context->Rbx
1638         mov     %rbp,160($context)      # restore context->Rbp
1639         mov     %r12,216($context)      # restore context->R12
1640         mov     %r13,224($context)      # restore context->R13
1641         mov     %r14,232($context)      # restore context->R14
1642         mov     %r15,240($context)      # restore context->R15
1643
1644         lea     16*$SZ+8*8(%rsi),%rsi   # xmm6..xmm15 save area
1645         lea     512($context),%rdi      # &context.Xmm6
1646         mov     \$20,%ecx
1647         .long   0xa548f3fc              # cld; rep movsq
1648
1649 .Lin_prologue:
1650         mov     8(%rax),%rdi
1651         mov     16(%rax),%rsi
1652         mov     %rax,152($context)      # restore context->Rsp
1653         mov     %rsi,168($context)      # restore context->Rsi
1654         mov     %rdi,176($context)      # restore context->Rdi
1655
1656         mov     40($disp),%rdi          # disp->ContextRecord
1657         mov     $context,%rsi           # context
1658         mov     \$154,%ecx              # sizeof(CONTEXT) in qwords
1659         .long   0xa548f3fc              # cld; rep movsq
1660
1661         mov     $disp,%rsi
1662         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1663         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1664         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1665         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1666         mov     40(%rsi),%r10           # disp->ContextRecord
1667         lea     56(%rsi),%r11           # &disp->HandlerData
1668         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1669         mov     %r10,32(%rsp)           # arg5
1670         mov     %r11,40(%rsp)           # arg6
1671         mov     %r12,48(%rsp)           # arg7
1672         mov     %rcx,56(%rsp)           # arg8, (NULL)
1673         call    *__imp_RtlVirtualUnwind(%rip)
1674
1675         mov     \$1,%eax                # ExceptionContinueSearch
1676         add     \$64,%rsp
1677         popfq
1678         pop     %r15
1679         pop     %r14
1680         pop     %r13
1681         pop     %r12
1682         pop     %rbp
1683         pop     %rbx
1684         pop     %rdi
1685         pop     %rsi
1686         ret
1687 .size   se_handler,.-se_handler
1688
1689 .section        .pdata
1690         .rva    .LSEH_begin_${func}_xop
1691         .rva    .LSEH_end_${func}_xop
1692         .rva    .LSEH_info_${func}_xop
1693
1694         .rva    .LSEH_begin_${func}_avx
1695         .rva    .LSEH_end_${func}_avx
1696         .rva    .LSEH_info_${func}_avx
1697 ___
1698 $code.=<<___ if ($avx>1);
1699         .rva    .LSEH_begin_${func}_avx2
1700         .rva    .LSEH_end_${func}_avx2
1701         .rva    .LSEH_info_${func}_avx2
1702 ___
1703 $code.=<<___ if ($shaext);
1704         .rva    .LSEH_begin_${func}_shaext
1705         .rva    .LSEH_end_${func}_shaext
1706         .rva    .LSEH_info_${func}_shaext
1707 ___
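# Each unwind-info record below carries flags byte 9 (unwind-info version 1,
# UNW_FLAG_EHANDLER), names se_handler as the language-specific handler and
# hands it the prologue/epilogue labels as HandlerData[].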
1708 $code.=<<___;
1709 .section        .xdata
1710 .align  8
1711 .LSEH_info_${func}_xop:
1712         .byte   9,0,0,0
1713         .rva    se_handler
1714         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1715
1716 .LSEH_info_${func}_avx:
1717         .byte   9,0,0,0
1718         .rva    se_handler
1719         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1720 ___
1721 $code.=<<___ if ($avx>1);
1722 .LSEH_info_${func}_avx2:
1723         .byte   9,0,0,0
1724         .rva    se_handler
1725         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1726 ___
1727 $code.=<<___ if ($shaext);
1728 .LSEH_info_${func}_shaext:
1729         .byte   9,0,0,0
1730         .rva    se_handler
1731         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1732 ___
1733 }
1734
1735 ####################################################################
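# The helpers below hand-assemble the SHA extension instructions so the
# module still builds with assemblers that lack the sha256* mnemonics:
# rex() prepends a REX prefix (REX.R for a destination in xmm8-15, REX.B for
# a source in xmm8-15), and sha256op38() emits the 0x0f,0x38,<opcodelet>
# encoding plus a register-register ModR/M byte as a raw .byte sequence.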
1736 sub rex {
1737   local *opcode=shift;
1738   my ($dst,$src)=@_;
1739   my $rex=0;
1740
1741     $rex|=0x04                  if($dst>=8);
1742     $rex|=0x01                  if($src>=8);
1743     unshift @opcode,$rex|0x40   if($rex);
1744 }
1745
1746 {
1747   my %opcodelet = (
1748                 "sha256rnds2" => 0xcb,
1749                 "sha256msg1"  => 0xcc,
1750                 "sha256msg2"  => 0xcd   );
1751
1752   sub sha256op38 {
1753     my $instr = shift;
1754
1755     if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1756       my @opcode=(0x0f,0x38);
1757         rex(\@opcode,$2,$1);
1758         push @opcode,$opcodelet{$instr};
1759         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1760         return ".byte\t".join(',',@opcode);
1761     } else {
1762         return $instr."\t".$_[0];
1763     }
1764   }
1765 }
1766
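# Post-process the generated code: evaluate compile-time `...` expressions,
# then rewrite every sha256* mnemonic via sha256op38() into its raw .byte
# encoding before printing.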
1767 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1768 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1769 print $code;
1770 close STDOUT or die "error closing STDOUT: $!";