crypto/aes/asm/aesni-sha256-x86_64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # January 2013
  18 #
  19 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  20 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  21 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
  22 # parallelism, interleaving it with another algorithm would allow to
  23 # utilize processor resources better and achieve better performance.
  24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  25 # AESNI code is weaved into it. As SHA256 dominates execution time,
  26 # stitch performance does not depend on AES key length. Below are
  27 # performance numbers in cycles per processed byte, less is better,
  28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  29 # subroutine:
  30 #
  31 #                AES-128/-192/-256+SHA256   this(**)    gain
  32 # Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
  33 # Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
  34 # Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
  35 # Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
  36 # Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
  37 # Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
  38 # Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
  39 #
  40 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
  41 #       Westmere is omitted from loop, this is because gain was not
  42 #       estimated high enough to justify the effort;
  43 # (**)  these are EVP-free results, results obtained with 'speed
  44 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
  45 # (***) these are SHAEXT results;
  46
  47 # $output is the last argument if it looks like a file (it has an extension)
  48 # $flavour is the first argument if it doesn't look like a file
  49 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  50 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  51
  52 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  53
  54 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  55 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  56 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  57 die "can't locate x86_64-xlate.pl";
  58
  59 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  60                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  61         $avx = ($1>=2.19) + ($1>=2.22);
  62 }
  63
  64 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  65            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  66         $avx = ($1>=2.09) + ($1>=2.10);
  67 }
  68
  69 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  70            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  71         $avx = ($1>=10) + ($1>=12);
  72 }
  73
  74 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
  75         $avx = ($2>=3.0) + ($2>3.0);
  76 }
  77
  78 $shaext=$avx;   ### set to zero if compiling for 1.0.1
  79 $avx=1          if (!$shaext && $avx);
  80
  81 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  82     or die "can't call $xlate: $!";
  83 *STDOUT=*OUT;
  84
  85 $func="aesni_cbc_sha256_enc";
  86 $TABLE="K256";
  87 $SZ=4;
  88 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  89                                 "%r8d","%r9d","%r10d","%r11d");
  90 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  91 @Sigma0=( 2,13,22);
  92 @Sigma1=( 6,11,25);
  93 @sigma0=( 7,18, 3);
  94 @sigma1=(17,19,10);
  95 $rounds=64;
  96
  97 ########################################################################
  98 # void aesni_cbc_sha256_enc(const void *inp,
  99 #                       void *out,
 100 #                       size_t length,
 101 #                       const AES_KEY *key,
 102 #                       unsigned char *iv,
 103 #                       SHA256_CTX *ctx,
 104 #                       const void *in0);
 105 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
 106 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 107
 108 $Tbl="%rbp";
 109
 110 $_inp="16*$SZ+0*8(%rsp)";
 111 $_out="16*$SZ+1*8(%rsp)";
 112 $_end="16*$SZ+2*8(%rsp)";
 113 $_key="16*$SZ+3*8(%rsp)";
 114 $_ivp="16*$SZ+4*8(%rsp)";
 115 $_ctx="16*$SZ+5*8(%rsp)";
 116 $_in0="16*$SZ+6*8(%rsp)";
 117 $_rsp="`16*$SZ+7*8`(%rsp)";
 118 $framesz=16*$SZ+8*8;
 119
 120 $code=<<___;
 121 .text
 122
 123 .extern OPENSSL_ia32cap_P
 124 .globl  $func
 125 .type   $func,\@abi-omnipotent
 126 .align  16
 127 $func:
 128 .cfi_startproc
 129 ___
 130                                                 if ($avx) {
 131 $code.=<<___;
 132         lea     OPENSSL_ia32cap_P(%rip),%r11
 133         mov     \$1,%eax
 134         cmp     \$0,`$win64?"%rcx":"%rdi"`
 135         je      .Lprobe
 136         mov     0(%r11),%eax
 137         mov     4(%r11),%r10
 138 ___
 139 $code.=<<___ if ($shaext);
 140         bt      \$61,%r10                       # check for SHA
 141         jc      ${func}_shaext
 142 ___
 143 $code.=<<___;
 144         mov     %r10,%r11
 145         shr     \$32,%r11
 146
 147         test    \$`1<<11`,%r10d                 # check for XOP
 148         jnz     ${func}_xop
 149 ___
 150 $code.=<<___ if ($avx>1);
 151         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
 152         cmp     \$`1<<8|1<<5|1<<3`,%r11d
 153         je      ${func}_avx2
 154 ___
 155 $code.=<<___;
 156         and     \$`1<<28`,%r10d                 # check for AVX
 157         jnz     ${func}_avx
 158         ud2
 159 ___
 160                                                 }
 161 $code.=<<___;
 162         xor     %eax,%eax
 163         cmp     \$0,`$win64?"%rcx":"%rdi"`
 164         je      .Lprobe
 165         ud2
 166 .Lprobe:
 167         ret
 168 .cfi_endproc
 169 .size   $func,.-$func
 170
 171 .align  64
 172 .type   $TABLE,\@object
 173 $TABLE:
 174         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 175         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 176         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 177         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 178         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 179         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 180         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 181         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 182         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 183         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 184         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 185         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 186         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 187         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 188         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 189         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 190         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 191         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 192         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 193         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 194         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 195         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 196         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 197         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 198         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 199         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 200         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 201         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 202         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 203         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 204         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 205         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 206
 207         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 208         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 209         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
 210         .long   0,0,0,0,   0,0,0,0
 211         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 212 .align  64
 213 ___
 214
 215 ######################################################################
 216 # SIMD code paths
 217 #
 218 {{{
 219 ($iv,$inout,$roundkey,$temp,
 220  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
 221
 222 $aesni_cbc_idx=0;
 223 @aesni_cbc_block = (
 224 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
 225 ##      &vmovdqu        ($inout,($inp));
 226 ##      &mov            ($_inp,$inp);
 227
 228         '&vpxor         ($inout,$inout,$roundkey);'.
 229         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
 230
 231         '&vpxor         ($inout,$inout,$iv);',
 232
 233         '&vaesenc       ($inout,$inout,$roundkey);'.
 234         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
 235
 236         '&vaesenc       ($inout,$inout,$roundkey);'.
 237         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
 238
 239         '&vaesenc       ($inout,$inout,$roundkey);'.
 240         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
 241
 242         '&vaesenc       ($inout,$inout,$roundkey);'.
 243         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
 244
 245         '&vaesenc       ($inout,$inout,$roundkey);'.
 246         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
 247
 248         '&vaesenc       ($inout,$inout,$roundkey);'.
 249         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
 250
 251         '&vaesenc       ($inout,$inout,$roundkey);'.
 252         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
 253
 254         '&vaesenc       ($inout,$inout,$roundkey);'.
 255         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
 256
 257         '&vaesenc       ($inout,$inout,$roundkey);'.
 258         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
 259
 260         '&vaesenclast   ($temp,$inout,$roundkey);'.
 261         ' &vaesenc      ($inout,$inout,$roundkey);'.
 262         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
 263
 264         '&vpand         ($iv,$temp,$mask10);'.
 265         ' &vaesenc      ($inout,$inout,$roundkey);'.
 266         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
 267
 268         '&vaesenclast   ($temp,$inout,$roundkey);'.
 269         ' &vaesenc      ($inout,$inout,$roundkey);'.
 270         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
 271
 272         '&vpand         ($temp,$temp,$mask12);'.
 273         ' &vaesenc      ($inout,$inout,$roundkey);'.
 274          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
 275
 276         '&vpor          ($iv,$iv,$temp);'.
 277         ' &vaesenclast  ($temp,$inout,$roundkey);'.
 278         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
 279
 280 ##      &mov            ($inp,$_inp);
 281 ##      &mov            ($out,$_out);
 282 ##      &vpand          ($temp,$temp,$mask14);
 283 ##      &vpor           ($iv,$iv,$temp);
 284 ##      &vmovdqu        ($iv,($out,$inp);
 285 ##      &lea            (inp,16($inp));
 286 );
 287
 288 my $a4=$T1;
 289 my ($a,$b,$c,$d,$e,$f,$g,$h);
 290
 291 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 292 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 293   my $arg = pop;
 294     $arg = "\$$arg" if ($arg*1 eq $arg);
 295     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 296 }
 297
 298 sub body_00_15 () {
 299         (
 300         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 301
 302         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
 303         '&mov   ($a,$a1)',
 304         '&mov   ($a4,$f)',
 305
 306         '&xor   ($a0,$e)',
 307         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
 308         '&xor   ($a4,$g)',                      # f^g
 309
 310         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
 311         '&xor   ($a1,$a)',
 312         '&and   ($a4,$e)',                      # (f^g)&e
 313
 314         @aesni_cbc_block[$aesni_cbc_idx++].
 315         '&xor   ($a0,$e)',
 316         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
 317         '&mov   ($a2,$a)',
 318
 319         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
 320         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
 321         '&xor   ($a2,$b)',                      # a^b, b^c in next round
 322
 323         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
 324         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
 325         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
 326
 327         '&xor   ($a1,$a)',
 328         '&add   ($h,$a0)',                      # h+=Sigma1(e)
 329         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
 330
 331         '&add   ($d,$h)',                       # d+=h
 332         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
 333         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
 334
 335         '&mov   ($a0,$d)',
 336         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
 337         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 338         );
 339 }
 340
 341 if ($avx) {{
 342 ######################################################################
 343 # XOP code path
 344 #
 345 $code.=<<___;
 346 .type   ${func}_xop,\@function,6
 347 .align  64
 348 ${func}_xop:
 349 .cfi_startproc
 350 .Lxop_shortcut:
 351         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 352         mov     %rsp,%rax               # copy %rsp
 353 .cfi_def_cfa_register   %rax
 354         push    %rbx
 355 .cfi_push       %rbx
 356         push    %rbp
 357 .cfi_push       %rbp
 358         push    %r12
 359 .cfi_push       %r12
 360         push    %r13
 361 .cfi_push       %r13
 362         push    %r14
 363 .cfi_push       %r14
 364         push    %r15
 365 .cfi_push       %r15
 366         sub     \$`$framesz+$win64*16*10`,%rsp
 367         and     \$-64,%rsp              # align stack frame
 368
 369         shl     \$6,$len
 370         sub     $inp,$out               # re-bias
 371         sub     $inp,$in0
 372         add     $inp,$len               # end of input
 373
 374         #mov    $inp,$_inp              # saved later
 375         mov     $out,$_out
 376         mov     $len,$_end
 377         #mov    $key,$_key              # remains resident in $inp register
 378         mov     $ivp,$_ivp
 379         mov     $ctx,$_ctx
 380         mov     $in0,$_in0
 381         mov     %rax,$_rsp
 382 .cfi_cfa_expression     $_rsp,deref,+8
 383 ___
 384 $code.=<<___ if ($win64);
 385         movaps  %xmm6,`$framesz+16*0`(%rsp)
 386         movaps  %xmm7,`$framesz+16*1`(%rsp)
 387         movaps  %xmm8,`$framesz+16*2`(%rsp)
 388         movaps  %xmm9,`$framesz+16*3`(%rsp)
 389         movaps  %xmm10,`$framesz+16*4`(%rsp)
 390         movaps  %xmm11,`$framesz+16*5`(%rsp)
 391         movaps  %xmm12,`$framesz+16*6`(%rsp)
 392         movaps  %xmm13,`$framesz+16*7`(%rsp)
 393         movaps  %xmm14,`$framesz+16*8`(%rsp)
 394         movaps  %xmm15,`$framesz+16*9`(%rsp)
 395 ___
 396 $code.=<<___;
 397 .Lprologue_xop:
 398         vzeroall
 399
 400         mov     $inp,%r12               # borrow $a4
 401         lea     0x80($key),$inp         # size optimization, reassign
 402         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 403         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 404         mov     $ctx,%r15               # borrow $a2
 405         mov     $in0,%rsi               # borrow $a3
 406         vmovdqu ($ivp),$iv              # load IV
 407         sub     \$9,%r14
 408
 409         mov     $SZ*0(%r15),$A
 410         mov     $SZ*1(%r15),$B
 411         mov     $SZ*2(%r15),$C
 412         mov     $SZ*3(%r15),$D
 413         mov     $SZ*4(%r15),$E
 414         mov     $SZ*5(%r15),$F
 415         mov     $SZ*6(%r15),$G
 416         mov     $SZ*7(%r15),$H
 417
 418         vmovdqa 0x00(%r13,%r14,8),$mask14
 419         vmovdqa 0x10(%r13,%r14,8),$mask12
 420         vmovdqa 0x20(%r13,%r14,8),$mask10
 421         vmovdqu 0x00-0x80($inp),$roundkey
 422         jmp     .Lloop_xop
 423 ___
 424                                         if ($SZ==4) {   # SHA256
 425     my @X = map("%xmm$_",(0..3));
 426     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 427
 428 $code.=<<___;
 429 .align  16
 430 .Lloop_xop:
 431         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 432         vmovdqu 0x00(%rsi,%r12),@X[0]
 433         vmovdqu 0x10(%rsi,%r12),@X[1]
 434         vmovdqu 0x20(%rsi,%r12),@X[2]
 435         vmovdqu 0x30(%rsi,%r12),@X[3]
 436         vpshufb $t3,@X[0],@X[0]
 437         lea     $TABLE(%rip),$Tbl
 438         vpshufb $t3,@X[1],@X[1]
 439         vpshufb $t3,@X[2],@X[2]
 440         vpaddd  0x00($Tbl),@X[0],$t0
 441         vpshufb $t3,@X[3],@X[3]
 442         vpaddd  0x20($Tbl),@X[1],$t1
 443         vpaddd  0x40($Tbl),@X[2],$t2
 444         vpaddd  0x60($Tbl),@X[3],$t3
 445         vmovdqa $t0,0x00(%rsp)
 446         mov     $A,$a1
 447         vmovdqa $t1,0x10(%rsp)
 448         mov     $B,$a3
 449         vmovdqa $t2,0x20(%rsp)
 450         xor     $C,$a3                  # magic
 451         vmovdqa $t3,0x30(%rsp)
 452         mov     $E,$a0
 453         jmp     .Lxop_00_47
 454
 455 .align  16
 456 .Lxop_00_47:
 457         sub     \$-16*2*$SZ,$Tbl        # size optimization
 458         vmovdqu (%r12),$inout           # $a4
 459         mov     %r12,$_inp              # $a4
 460 ___
 461 sub XOP_256_00_47 () {
 462 my $j = shift;
 463 my $body = shift;
 464 my @X = @_;
 465 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 466
 467         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
 468           eval(shift(@insns));
 469           eval(shift(@insns));
 470          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
 471           eval(shift(@insns));
 472           eval(shift(@insns));
 473         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
 474           eval(shift(@insns));
 475           eval(shift(@insns));
 476         &vpsrld         ($t0,$t0,$sigma0[2]);
 477           eval(shift(@insns));
 478           eval(shift(@insns));
 479          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
 480           eval(shift(@insns));
 481           eval(shift(@insns));
 482           eval(shift(@insns));
 483           eval(shift(@insns));
 484         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
 485           eval(shift(@insns));
 486           eval(shift(@insns));
 487         &vpxor          ($t0,$t0,$t1);
 488           eval(shift(@insns));
 489           eval(shift(@insns));
 490           eval(shift(@insns));
 491           eval(shift(@insns));
 492          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
 493           eval(shift(@insns));
 494           eval(shift(@insns));
 495         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
 496           eval(shift(@insns));
 497           eval(shift(@insns));
 498          &vpsrld        ($t2,@X[3],$sigma1[2]);
 499           eval(shift(@insns));
 500           eval(shift(@insns));
 501         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
 502           eval(shift(@insns));
 503           eval(shift(@insns));
 504          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 505           eval(shift(@insns));
 506           eval(shift(@insns));
 507          &vpxor         ($t3,$t3,$t2);
 508           eval(shift(@insns));
 509           eval(shift(@insns));
 510           eval(shift(@insns));
 511           eval(shift(@insns));
 512          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
 513           eval(shift(@insns));
 514           eval(shift(@insns));
 515           eval(shift(@insns));
 516           eval(shift(@insns));
 517         &vpsrldq        ($t3,$t3,8);
 518           eval(shift(@insns));
 519           eval(shift(@insns));
 520           eval(shift(@insns));
 521           eval(shift(@insns));
 522         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
 523           eval(shift(@insns));
 524           eval(shift(@insns));
 525           eval(shift(@insns));
 526           eval(shift(@insns));
 527          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
 528           eval(shift(@insns));
 529           eval(shift(@insns));
 530          &vpsrld        ($t2,@X[0],$sigma1[2]);
 531           eval(shift(@insns));
 532           eval(shift(@insns));
 533          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 534           eval(shift(@insns));
 535           eval(shift(@insns));
 536          &vpxor         ($t3,$t3,$t2);
 537           eval(shift(@insns));
 538           eval(shift(@insns));
 539           eval(shift(@insns));
 540           eval(shift(@insns));
 541          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
 542           eval(shift(@insns));
 543           eval(shift(@insns));
 544           eval(shift(@insns));
 545           eval(shift(@insns));
 546         &vpslldq        ($t3,$t3,8);            # 22 instructions
 547           eval(shift(@insns));
 548           eval(shift(@insns));
 549           eval(shift(@insns));
 550           eval(shift(@insns));
 551         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
 552           eval(shift(@insns));
 553           eval(shift(@insns));
 554           eval(shift(@insns));
 555           eval(shift(@insns));
 556         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 557           foreach (@insns) { eval; }            # remaining instructions
 558         &vmovdqa        (16*$j."(%rsp)",$t2);
 559 }
 560
 561     $aesni_cbc_idx=0;
 562     for ($i=0,$j=0; $j<4; $j++) {
 563         &XOP_256_00_47($j,\&body_00_15,@X);
 564         push(@X,shift(@X));                     # rotate(@X)
 565     }
 566         &mov            ("%r12",$_inp);         # borrow $a4
 567         &vpand          ($temp,$temp,$mask14);
 568         &mov            ("%r15",$_out);         # borrow $a2
 569         &vpor           ($iv,$iv,$temp);
 570         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 571         &lea            ("%r12","16(%r12)");    # inp++
 572
 573         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 574         &jne    (".Lxop_00_47");
 575
 576         &vmovdqu        ($inout,"(%r12)");
 577         &mov            ($_inp,"%r12");
 578
 579     $aesni_cbc_idx=0;
 580     for ($i=0; $i<16; ) {
 581         foreach(body_00_15()) { eval; }
 582     }
 583                                         }
 584 $code.=<<___;
 585         mov     $_inp,%r12              # borrow $a4
 586         mov     $_out,%r13              # borrow $a0
 587         mov     $_ctx,%r15              # borrow $a2
 588         mov     $_in0,%rsi              # borrow $a3
 589
 590         vpand   $mask14,$temp,$temp
 591         mov     $a1,$A
 592         vpor    $temp,$iv,$iv
 593         vmovdqu $iv,(%r13,%r12)         # write output
 594         lea     16(%r12),%r12           # inp++
 595
 596         add     $SZ*0(%r15),$A
 597         add     $SZ*1(%r15),$B
 598         add     $SZ*2(%r15),$C
 599         add     $SZ*3(%r15),$D
 600         add     $SZ*4(%r15),$E
 601         add     $SZ*5(%r15),$F
 602         add     $SZ*6(%r15),$G
 603         add     $SZ*7(%r15),$H
 604
 605         cmp     $_end,%r12
 606
 607         mov     $A,$SZ*0(%r15)
 608         mov     $B,$SZ*1(%r15)
 609         mov     $C,$SZ*2(%r15)
 610         mov     $D,$SZ*3(%r15)
 611         mov     $E,$SZ*4(%r15)
 612         mov     $F,$SZ*5(%r15)
 613         mov     $G,$SZ*6(%r15)
 614         mov     $H,$SZ*7(%r15)
 615
 616         jb      .Lloop_xop
 617
 618         mov     $_ivp,$ivp
 619         mov     $_rsp,%rsi
 620 .cfi_def_cfa    %rsi,8
 621         vmovdqu $iv,($ivp)              # output IV
 622         vzeroall
 623 ___
 624 $code.=<<___ if ($win64);
 625         movaps  `$framesz+16*0`(%rsp),%xmm6
 626         movaps  `$framesz+16*1`(%rsp),%xmm7
 627         movaps  `$framesz+16*2`(%rsp),%xmm8
 628         movaps  `$framesz+16*3`(%rsp),%xmm9
 629         movaps  `$framesz+16*4`(%rsp),%xmm10
 630         movaps  `$framesz+16*5`(%rsp),%xmm11
 631         movaps  `$framesz+16*6`(%rsp),%xmm12
 632         movaps  `$framesz+16*7`(%rsp),%xmm13
 633         movaps  `$framesz+16*8`(%rsp),%xmm14
 634         movaps  `$framesz+16*9`(%rsp),%xmm15
 635 ___
 636 $code.=<<___;
 637         mov     -48(%rsi),%r15
 638 .cfi_restore    %r15
 639         mov     -40(%rsi),%r14
 640 .cfi_restore    %r14
 641         mov     -32(%rsi),%r13
 642 .cfi_restore    %r13
 643         mov     -24(%rsi),%r12
 644 .cfi_restore    %r12
 645         mov     -16(%rsi),%rbp
 646 .cfi_restore    %rbp
 647         mov     -8(%rsi),%rbx
 648 .cfi_restore    %rbx
 649         lea     (%rsi),%rsp
 650 .cfi_def_cfa_register   %rsp
 651 .Lepilogue_xop:
 652         ret
 653 .cfi_endproc
 654 .size   ${func}_xop,.-${func}_xop
 655 ___
 656 ######################################################################
 657 # AVX+shrd code path
 658 #
 659 local *ror = sub { &shrd(@_[0],@_) };
 660
 661 $code.=<<___;
 662 .type   ${func}_avx,\@function,6
 663 .align  64
 664 ${func}_avx:
 665 .cfi_startproc
 666 .Lavx_shortcut:
 667         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 668         mov     %rsp,%rax               # copy %rsp
 669 .cfi_def_cfa_register   %rax
 670         push    %rbx
 671 .cfi_push       %rbx
 672         push    %rbp
 673 .cfi_push       %rbp
 674         push    %r12
 675 .cfi_push       %r12
 676         push    %r13
 677 .cfi_push       %r13
 678         push    %r14
 679 .cfi_push       %r14
 680         push    %r15
 681 .cfi_push       %r15
 682         sub     \$`$framesz+$win64*16*10`,%rsp
 683         and     \$-64,%rsp              # align stack frame
 684
 685         shl     \$6,$len
 686         sub     $inp,$out               # re-bias
 687         sub     $inp,$in0
 688         add     $inp,$len               # end of input
 689
 690         #mov    $inp,$_inp              # saved later
 691         mov     $out,$_out
 692         mov     $len,$_end
 693         #mov    $key,$_key              # remains resident in $inp register
 694         mov     $ivp,$_ivp
 695         mov     $ctx,$_ctx
 696         mov     $in0,$_in0
 697         mov     %rax,$_rsp
 698 .cfi_cfa_expression     $_rsp,deref,+8
 699 ___
 700 $code.=<<___ if ($win64);
 701         movaps  %xmm6,`$framesz+16*0`(%rsp)
 702         movaps  %xmm7,`$framesz+16*1`(%rsp)
 703         movaps  %xmm8,`$framesz+16*2`(%rsp)
 704         movaps  %xmm9,`$framesz+16*3`(%rsp)
 705         movaps  %xmm10,`$framesz+16*4`(%rsp)
 706         movaps  %xmm11,`$framesz+16*5`(%rsp)
 707         movaps  %xmm12,`$framesz+16*6`(%rsp)
 708         movaps  %xmm13,`$framesz+16*7`(%rsp)
 709         movaps  %xmm14,`$framesz+16*8`(%rsp)
 710         movaps  %xmm15,`$framesz+16*9`(%rsp)
 711 ___
 712 $code.=<<___;
 713 .Lprologue_avx:
 714         vzeroall
 715
 716         mov     $inp,%r12               # borrow $a4
 717         lea     0x80($key),$inp         # size optimization, reassign
 718         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 719         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 720         mov     $ctx,%r15               # borrow $a2
 721         mov     $in0,%rsi               # borrow $a3
 722         vmovdqu ($ivp),$iv              # load IV
 723         sub     \$9,%r14
 724
 725         mov     $SZ*0(%r15),$A
 726         mov     $SZ*1(%r15),$B
 727         mov     $SZ*2(%r15),$C
 728         mov     $SZ*3(%r15),$D
 729         mov     $SZ*4(%r15),$E
 730         mov     $SZ*5(%r15),$F
 731         mov     $SZ*6(%r15),$G
 732         mov     $SZ*7(%r15),$H
 733
 734         vmovdqa 0x00(%r13,%r14,8),$mask14
 735         vmovdqa 0x10(%r13,%r14,8),$mask12
 736         vmovdqa 0x20(%r13,%r14,8),$mask10
 737         vmovdqu 0x00-0x80($inp),$roundkey
 738 ___
 739                                         if ($SZ==4) {   # SHA256
 740     my @X = map("%xmm$_",(0..3));
 741     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 742
 743 $code.=<<___;
 744         jmp     .Lloop_avx
 745 .align  16
 746 .Lloop_avx:
 747         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 748         vmovdqu 0x00(%rsi,%r12),@X[0]
 749         vmovdqu 0x10(%rsi,%r12),@X[1]
 750         vmovdqu 0x20(%rsi,%r12),@X[2]
 751         vmovdqu 0x30(%rsi,%r12),@X[3]
 752         vpshufb $t3,@X[0],@X[0]
 753         lea     $TABLE(%rip),$Tbl
 754         vpshufb $t3,@X[1],@X[1]
 755         vpshufb $t3,@X[2],@X[2]
 756         vpaddd  0x00($Tbl),@X[0],$t0
 757         vpshufb $t3,@X[3],@X[3]
 758         vpaddd  0x20($Tbl),@X[1],$t1
 759         vpaddd  0x40($Tbl),@X[2],$t2
 760         vpaddd  0x60($Tbl),@X[3],$t3
 761         vmovdqa $t0,0x00(%rsp)
 762         mov     $A,$a1
 763         vmovdqa $t1,0x10(%rsp)
 764         mov     $B,$a3
 765         vmovdqa $t2,0x20(%rsp)
 766         xor     $C,$a3                  # magic
 767         vmovdqa $t3,0x30(%rsp)
 768         mov     $E,$a0
 769         jmp     .Lavx_00_47
 770
 771 .align  16
 772 .Lavx_00_47:
 773         sub     \$-16*2*$SZ,$Tbl        # size optimization
 774         vmovdqu (%r12),$inout           # $a4
 775         mov     %r12,$_inp              # $a4
 776 ___
 777 sub Xupdate_256_AVX () {
 778         (
 779         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
 780          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
 781         '&vpsrld        ($t2,$t0,$sigma0[0]);',
 782          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
 783         '&vpsrld        ($t3,$t0,$sigma0[2])',
 784         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
 785         '&vpxor         ($t0,$t3,$t2)',
 786          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
 787         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
 788         '&vpxor         ($t0,$t0,$t1)',
 789         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
 790         '&vpxor         ($t0,$t0,$t2)',
 791          '&vpsrld       ($t2,$t3,$sigma1[2]);',
 792         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
 793          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
 794         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
 795          '&vpxor        ($t2,$t2,$t3);',
 796          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 797          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
 798          '&vpshufd      ($t2,$t2,0b10000100)',
 799          '&vpsrldq      ($t2,$t2,8)',
 800         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
 801          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
 802          '&vpsrld       ($t2,$t3,$sigma1[2])',
 803          '&vpsrlq       ($t3,$t3,$sigma1[0])',
 804          '&vpxor        ($t2,$t2,$t3);',
 805          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 806          '&vpxor        ($t2,$t2,$t3)',
 807          '&vpshufd      ($t2,$t2,0b11101000)',
 808          '&vpslldq      ($t2,$t2,8)',
 809         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
 810         );
 811 }
 812
 813 sub AVX_256_00_47 () {
 814 my $j = shift;
 815 my $body = shift;
 816 my @X = @_;
 817 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 818
 819         foreach (Xupdate_256_AVX()) {           # 29 instructions
 820             eval;
 821             eval(shift(@insns));
 822             eval(shift(@insns));
 823             eval(shift(@insns));
 824         }
 825         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 826           foreach (@insns) { eval; }            # remaining instructions
 827         &vmovdqa        (16*$j."(%rsp)",$t2);
 828 }
 829
 830     $aesni_cbc_idx=0;
 831     for ($i=0,$j=0; $j<4; $j++) {
 832         &AVX_256_00_47($j,\&body_00_15,@X);
 833         push(@X,shift(@X));                     # rotate(@X)
 834     }
 835         &mov            ("%r12",$_inp);         # borrow $a4
 836         &vpand          ($temp,$temp,$mask14);
 837         &mov            ("%r15",$_out);         # borrow $a2
 838         &vpor           ($iv,$iv,$temp);
 839         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 840         &lea            ("%r12","16(%r12)");    # inp++
 841
 842         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 843         &jne    (".Lavx_00_47");
 844
 845         &vmovdqu        ($inout,"(%r12)");
 846         &mov            ($_inp,"%r12");
 847
 848     $aesni_cbc_idx=0;
 849     for ($i=0; $i<16; ) {
 850         foreach(body_00_15()) { eval; }
 851     }
 852
 853                                         }
 854 $code.=<<___;
 855         mov     $_inp,%r12              # borrow $a4
 856         mov     $_out,%r13              # borrow $a0
 857         mov     $_ctx,%r15              # borrow $a2
 858         mov     $_in0,%rsi              # borrow $a3
 859
 860         vpand   $mask14,$temp,$temp
 861         mov     $a1,$A
 862         vpor    $temp,$iv,$iv
 863         vmovdqu $iv,(%r13,%r12)         # write output
 864         lea     16(%r12),%r12           # inp++
 865
 866         add     $SZ*0(%r15),$A
 867         add     $SZ*1(%r15),$B
 868         add     $SZ*2(%r15),$C
 869         add     $SZ*3(%r15),$D
 870         add     $SZ*4(%r15),$E
 871         add     $SZ*5(%r15),$F
 872         add     $SZ*6(%r15),$G
 873         add     $SZ*7(%r15),$H
 874
 875         cmp     $_end,%r12
 876
 877         mov     $A,$SZ*0(%r15)
 878         mov     $B,$SZ*1(%r15)
 879         mov     $C,$SZ*2(%r15)
 880         mov     $D,$SZ*3(%r15)
 881         mov     $E,$SZ*4(%r15)
 882         mov     $F,$SZ*5(%r15)
 883         mov     $G,$SZ*6(%r15)
 884         mov     $H,$SZ*7(%r15)
 885         jb      .Lloop_avx
 886
 887         mov     $_ivp,$ivp
 888         mov     $_rsp,%rsi
 889 .cfi_def_cfa    %rsi,8
 890         vmovdqu $iv,($ivp)              # output IV
 891         vzeroall
 892 ___
 893 $code.=<<___ if ($win64);
 894         movaps  `$framesz+16*0`(%rsp),%xmm6
 895         movaps  `$framesz+16*1`(%rsp),%xmm7
 896         movaps  `$framesz+16*2`(%rsp),%xmm8
 897         movaps  `$framesz+16*3`(%rsp),%xmm9
 898         movaps  `$framesz+16*4`(%rsp),%xmm10
 899         movaps  `$framesz+16*5`(%rsp),%xmm11
 900         movaps  `$framesz+16*6`(%rsp),%xmm12
 901         movaps  `$framesz+16*7`(%rsp),%xmm13
 902         movaps  `$framesz+16*8`(%rsp),%xmm14
 903         movaps  `$framesz+16*9`(%rsp),%xmm15
 904 ___
 905 $code.=<<___;
 906         mov     -48(%rsi),%r15
 907 .cfi_restore    %r15
 908         mov     -40(%rsi),%r14
 909 .cfi_restore    %r14
 910         mov     -32(%rsi),%r13
 911 .cfi_restore    %r13
 912         mov     -24(%rsi),%r12
 913 .cfi_restore    %r12
 914         mov     -16(%rsi),%rbp
 915 .cfi_restore    %rbp
 916         mov     -8(%rsi),%rbx
 917 .cfi_restore    %rbx
 918         lea     (%rsi),%rsp
 919 .cfi_def_cfa_register   %rsp
 920 .Lepilogue_avx:
 921         ret
 922 .cfi_endproc
 923 .size   ${func}_avx,.-${func}_avx
 924 ___
 925
 926 if ($avx>1) {{
 927 ######################################################################
 928 # AVX2+BMI code path
 929 #
 930 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
 931 my $PUSH8=8*2*$SZ;
 932 use integer;
 933
 934 sub bodyx_00_15 () {
 935         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
 936         (
 937         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 938
 939         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
 940         '&and   ($a4,$e)',              # f&e
 941         '&rorx  ($a0,$e,$Sigma1[2])',
 942         '&rorx  ($a2,$e,$Sigma1[1])',
 943
 944         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
 945         '&lea   ($h,"($h,$a4)")',
 946         '&andn  ($a4,$e,$g)',           # ~e&g
 947         '&xor   ($a0,$a2)',
 948
 949         '&rorx  ($a1,$e,$Sigma1[0])',
 950         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
 951         '&xor   ($a0,$a1)',             # Sigma1(e)
 952         '&mov   ($a2,$a)',
 953
 954         '&rorx  ($a4,$a,$Sigma0[2])',
 955         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
 956         '&xor   ($a2,$b)',              # a^b, b^c in next round
 957         '&rorx  ($a1,$a,$Sigma0[1])',
 958
 959         '&rorx  ($a0,$a,$Sigma0[0])',
 960         '&lea   ($d,"($d,$h)")',        # d+=h
 961         '&and   ($a3,$a2)',             # (b^c)&(a^b)
 962         @aesni_cbc_block[$aesni_cbc_idx++].
 963         '&xor   ($a1,$a4)',
 964
 965         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
 966         '&xor   ($a1,$a0)',             # Sigma0(a)
 967         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
 968         '&mov   ($a4,$e)',              # copy of f in future
 969
 970         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 971         );
 972         # and at the finish one has to $a+=$a1
 973 }
 974
 975 $code.=<<___;
 976 .type   ${func}_avx2,\@function,6
 977 .align  64
 978 ${func}_avx2:
 979 .cfi_startproc
 980 .Lavx2_shortcut:
 981         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 982         mov     %rsp,%rax               # copy %rsp
 983 .cfi_def_cfa_register   %rax
 984         push    %rbx
 985 .cfi_push       %rbx
 986         push    %rbp
 987 .cfi_push       %rbp
 988         push    %r12
 989 .cfi_push       %r12
 990         push    %r13
 991 .cfi_push       %r13
 992         push    %r14
 993 .cfi_push       %r14
 994         push    %r15
 995 .cfi_push       %r15
 996         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
 997         and     \$-256*$SZ,%rsp         # align stack frame
 998         add     \$`2*$SZ*($rounds-8)`,%rsp
 999
1000         shl     \$6,$len
1001         sub     $inp,$out               # re-bias
1002         sub     $inp,$in0
1003         add     $inp,$len               # end of input
1004
1005         #mov    $inp,$_inp              # saved later
1006         #mov    $out,$_out              # kept in $offload
1007         mov     $len,$_end
1008         #mov    $key,$_key              # remains resident in $inp register
1009         mov     $ivp,$_ivp
1010         mov     $ctx,$_ctx
1011         mov     $in0,$_in0
1012         mov     %rax,$_rsp
1013 .cfi_cfa_expression     $_rsp,deref,+8
1014 ___
1015 $code.=<<___ if ($win64);
1016         movaps  %xmm6,`$framesz+16*0`(%rsp)
1017         movaps  %xmm7,`$framesz+16*1`(%rsp)
1018         movaps  %xmm8,`$framesz+16*2`(%rsp)
1019         movaps  %xmm9,`$framesz+16*3`(%rsp)
1020         movaps  %xmm10,`$framesz+16*4`(%rsp)
1021         movaps  %xmm11,`$framesz+16*5`(%rsp)
1022         movaps  %xmm12,`$framesz+16*6`(%rsp)
1023         movaps  %xmm13,`$framesz+16*7`(%rsp)
1024         movaps  %xmm14,`$framesz+16*8`(%rsp)
1025         movaps  %xmm15,`$framesz+16*9`(%rsp)
1026 ___
1027 $code.=<<___;
1028 .Lprologue_avx2:
1029         vzeroall
1030
1031         mov     $inp,%r13               # borrow $a0
1032         vpinsrq \$1,$out,$offload,$offload
1033         lea     0x80($key),$inp         # size optimization, reassign
1034         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1035         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1036         mov     $ctx,%r15               # borrow $a2
1037         mov     $in0,%rsi               # borrow $a3
1038         vmovdqu ($ivp),$iv              # load IV
1039         lea     -9(%r14),%r14
1040
1041         vmovdqa 0x00(%r12,%r14,8),$mask14
1042         vmovdqa 0x10(%r12,%r14,8),$mask12
1043         vmovdqa 0x20(%r12,%r14,8),$mask10
1044
1045         sub     \$-16*$SZ,%r13          # inp++, size optimization
1046         mov     $SZ*0(%r15),$A
1047         lea     (%rsi,%r13),%r12        # borrow $a0
1048         mov     $SZ*1(%r15),$B
1049         cmp     $len,%r13               # $_end
1050         mov     $SZ*2(%r15),$C
1051         cmove   %rsp,%r12               # next block or random data
1052         mov     $SZ*3(%r15),$D
1053         mov     $SZ*4(%r15),$E
1054         mov     $SZ*5(%r15),$F
1055         mov     $SZ*6(%r15),$G
1056         mov     $SZ*7(%r15),$H
1057         vmovdqu 0x00-0x80($inp),$roundkey
1058 ___
1059                                         if ($SZ==4) {   # SHA256
1060     my @X = map("%ymm$_",(0..3));
1061     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063 $code.=<<___;
1064         jmp     .Loop_avx2
1065 .align  16
1066 .Loop_avx2:
1067         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1068         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1069         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1070         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1071         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1072
1073         vinserti128     \$1,(%r12),@X[0],@X[0]
1074         vinserti128     \$1,16(%r12),@X[1],@X[1]
1075          vpshufb        $t3,@X[0],@X[0]
1076         vinserti128     \$1,32(%r12),@X[2],@X[2]
1077          vpshufb        $t3,@X[1],@X[1]
1078         vinserti128     \$1,48(%r12),@X[3],@X[3]
1079
1080         lea     $TABLE(%rip),$Tbl
1081         vpshufb $t3,@X[2],@X[2]
1082         lea     -16*$SZ(%r13),%r13
1083         vpaddd  0x00($Tbl),@X[0],$t0
1084         vpshufb $t3,@X[3],@X[3]
1085         vpaddd  0x20($Tbl),@X[1],$t1
1086         vpaddd  0x40($Tbl),@X[2],$t2
1087         vpaddd  0x60($Tbl),@X[3],$t3
1088         vmovdqa $t0,0x00(%rsp)
1089         xor     $a1,$a1
1090         vmovdqa $t1,0x20(%rsp)
1091 ___
1092 $code.=<<___ if (!$win64);
1093 # temporarily use %rsi as frame pointer
1094         mov     $_rsp,%rsi
1095 .cfi_def_cfa    %rsi,8
1096 ___
1097 $code.=<<___;
1098         lea     -$PUSH8(%rsp),%rsp
1099 ___
1100 $code.=<<___ if (!$win64);
1101 # the frame info is at $_rsp, but the stack is moving...
1102 # so a second frame pointer is saved at -8(%rsp)
1103 # that is in the red zone
1104         mov     %rsi,-8(%rsp)
1105 .cfi_cfa_expression     %rsp-8,deref,+8
1106 ___
1107 $code.=<<___;
1108         mov     $B,$a3
1109         vmovdqa $t2,0x00(%rsp)
1110         xor     $C,$a3                  # magic
1111         vmovdqa $t3,0x20(%rsp)
1112         mov     $F,$a4
1113         sub     \$-16*2*$SZ,$Tbl        # size optimization
1114         jmp     .Lavx2_00_47
1115
1116 .align  16
1117 .Lavx2_00_47:
1118         vmovdqu (%r13),$inout
1119         vpinsrq \$0,%r13,$offload,$offload
1120 ___
1121
1122 sub AVX2_256_00_47 () {
1123 my $j = shift;
1124 my $body = shift;
1125 my @X = @_;
1126 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1127 my $base = "+2*$PUSH8(%rsp)";
1128
1129         if (($j%2)==0) {
1130         &lea    ("%rsp","-$PUSH8(%rsp)");
1131 $code.=<<___ if (!$win64);
1132 .cfi_cfa_expression     %rsp+`$PUSH8-8`,deref,+8
1133 # copy secondary frame pointer to new location again at -8(%rsp)
1134         pushq   $PUSH8-8(%rsp)
1135 .cfi_cfa_expression     %rsp,deref,+8
1136         lea     8(%rsp),%rsp
1137 .cfi_cfa_expression     %rsp-8,deref,+8
1138 ___
1139         }
1140         foreach (Xupdate_256_AVX()) {           # 29 instructions
1141             eval;
1142             eval(shift(@insns));
1143             eval(shift(@insns));
1144             eval(shift(@insns));
1145         }
1146         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1147           foreach (@insns) { eval; }            # remaining instructions
1148         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1149 }
1150     $aesni_cbc_idx=0;
1151     for ($i=0,$j=0; $j<4; $j++) {
1152         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1153         push(@X,shift(@X));                     # rotate(@X)
1154     }
1155         &vmovq          ("%r13",$offload);      # borrow $a0
1156         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1157         &vpand          ($temp,$temp,$mask14);
1158         &vpor           ($iv,$iv,$temp);
1159         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1160         &lea            ("%r13","16(%r13)");    # inp++
1161
1162         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1163         &cmpb   (($SZ-1)."($Tbl)",0);
1164         &jne    (".Lavx2_00_47");
1165
1166         &vmovdqu        ($inout,"(%r13)");
1167         &vpinsrq        ($offload,$offload,"%r13",0);
1168
1169     $aesni_cbc_idx=0;
1170     for ($i=0; $i<16; ) {
1171         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1172         foreach(bodyx_00_15()) { eval; }
1173     }
1174                                         }
1175 $code.=<<___;
1176         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1177         vmovq   $offload,%r13                   # $_inp, borrow $a0
1178         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1179         add     $a1,$A
1180         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1181
1182         vpand   $mask14,$temp,$temp
1183         vpor    $temp,$iv,$iv
1184         vmovdqu $iv,(%r12,%r13)                 # write output
1185         lea     16(%r13),%r13
1186
1187         add     $SZ*0(%r15),$A
1188         add     $SZ*1(%r15),$B
1189         add     $SZ*2(%r15),$C
1190         add     $SZ*3(%r15),$D
1191         add     $SZ*4(%r15),$E
1192         add     $SZ*5(%r15),$F
1193         add     $SZ*6(%r15),$G
1194         add     $SZ*7(%r15),$H
1195
1196         mov     $A,$SZ*0(%r15)
1197         mov     $B,$SZ*1(%r15)
1198         mov     $C,$SZ*2(%r15)
1199         mov     $D,$SZ*3(%r15)
1200         mov     $E,$SZ*4(%r15)
1201         mov     $F,$SZ*5(%r15)
1202         mov     $G,$SZ*6(%r15)
1203         mov     $H,$SZ*7(%r15)
1204
1205         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1206         je      .Ldone_avx2
1207
1208         xor     $a1,$a1
1209         mov     $B,$a3
1210         mov     $F,$a4
1211         xor     $C,$a3                  # magic
1212         jmp     .Lower_avx2
1213 .align  16
1214 .Lower_avx2:
1215         vmovdqu (%r13),$inout
1216         vpinsrq \$0,%r13,$offload,$offload
1217 ___
1218     $aesni_cbc_idx=0;
1219     for ($i=0; $i<16; ) {
1220         my $base="+16($Tbl)";
1221         foreach(bodyx_00_15()) { eval; }
1222         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1223     }
1224 $code.=<<___;
1225         vmovq   $offload,%r13                   # borrow $a0
1226         vpextrq \$1,$offload,%r15               # borrow $a2
1227         vpand   $mask14,$temp,$temp
1228         vpor    $temp,$iv,$iv
1229         lea     -$PUSH8($Tbl),$Tbl
1230         vmovdqu $iv,(%r15,%r13)                 # write output
1231         lea     16(%r13),%r13                   # inp++
1232         cmp     %rsp,$Tbl
1233         jae     .Lower_avx2
1234
1235         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1236         lea     16*$SZ(%r13),%r13
1237         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1238         add     $a1,$A
1239         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1240
1241         add     $SZ*0(%r15),$A
1242         add     $SZ*1(%r15),$B
1243         add     $SZ*2(%r15),$C
1244         add     $SZ*3(%r15),$D
1245         add     $SZ*4(%r15),$E
1246         add     $SZ*5(%r15),$F
1247         add     $SZ*6(%r15),$G
1248         lea     (%rsi,%r13),%r12
1249         add     $SZ*7(%r15),$H
1250
1251         cmp     $_end,%r13
1252
1253         mov     $A,$SZ*0(%r15)
1254         cmove   %rsp,%r12               # next block or stale data
1255         mov     $B,$SZ*1(%r15)
1256         mov     $C,$SZ*2(%r15)
1257         mov     $D,$SZ*3(%r15)
1258         mov     $E,$SZ*4(%r15)
1259         mov     $F,$SZ*5(%r15)
1260         mov     $G,$SZ*6(%r15)
1261         mov     $H,$SZ*7(%r15)
1262
1263         jbe     .Loop_avx2
1264         lea     (%rsp),$Tbl
1265 # temporarily use $Tbl as index to $_rsp
1266 # this avoids the need to save a secondary frame pointer at -8(%rsp)
1267 .cfi_cfa_expression     $Tbl+`16*$SZ+7*8`,deref,+8
1268
1269 .Ldone_avx2:
1270         mov     16*$SZ+4*8($Tbl),$ivp
1271         mov     16*$SZ+7*8($Tbl),%rsi
1272 .cfi_def_cfa    %rsi,8
1273         vmovdqu $iv,($ivp)              # output IV
1274         vzeroall
1275 ___
1276 $code.=<<___ if ($win64);
1277         movaps  `$framesz+16*0`($Tbl),%xmm6
1278         movaps  `$framesz+16*1`($Tbl),%xmm7
1279         movaps  `$framesz+16*2`($Tbl),%xmm8
1280         movaps  `$framesz+16*3`($Tbl),%xmm9
1281         movaps  `$framesz+16*4`($Tbl),%xmm10
1282         movaps  `$framesz+16*5`($Tbl),%xmm11
1283         movaps  `$framesz+16*6`($Tbl),%xmm12
1284         movaps  `$framesz+16*7`($Tbl),%xmm13
1285         movaps  `$framesz+16*8`($Tbl),%xmm14
1286         movaps  `$framesz+16*9`($Tbl),%xmm15
1287 ___
1288 $code.=<<___;
1289         mov     -48(%rsi),%r15
1290 .cfi_restore    %r15
1291         mov     -40(%rsi),%r14
1292 .cfi_restore    %r14
1293         mov     -32(%rsi),%r13
1294 .cfi_restore    %r13
1295         mov     -24(%rsi),%r12
1296 .cfi_restore    %r12
1297         mov     -16(%rsi),%rbp
1298 .cfi_restore    %rbp
1299         mov     -8(%rsi),%rbx
1300 .cfi_restore    %rbx
1301         lea     (%rsi),%rsp
1302 .cfi_def_cfa_register   %rsp
1303 .Lepilogue_avx2:
1304         ret
1305 .cfi_endproc
1306 .size   ${func}_avx2,.-${func}_avx2
1307 ___
1308 }}
1309 }}
1310 {{
1311 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1312
1313 my ($rounds,$Tbl)=("%r11d","%rbx");
1314
1315 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1316 my @rndkey=("%xmm4","%xmm5");
1317 my $r=0;
1318 my $sn=0;
1319
1320 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1321 my @MSG=map("%xmm$_",(10..13));
1322
1323 my $aesenc=sub {
1324   use integer;
1325   my ($n,$k)=($r/10,$r%10);
1326     if ($k==0) {
1327       $code.=<<___;
1328         movups          `16*$n`($in0),$in               # load input
1329         xorps           $rndkey0,$in
1330 ___
1331       $code.=<<___ if ($n);
1332         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1333 ___
1334       $code.=<<___;
1335         xorps           $in,$iv
1336         movups          `32+16*$k-112`($key),$rndkey[1]
1337         aesenc          $rndkey[0],$iv
1338 ___
1339     } elsif ($k==9) {
1340       $sn++;
1341       $code.=<<___;
1342         cmp             \$11,$rounds
1343         jb              .Laesenclast$sn
1344         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1345         aesenc          $rndkey[0],$iv
1346         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1347         aesenc          $rndkey[1],$iv
1348         je              .Laesenclast$sn
1349         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1350         aesenc          $rndkey[0],$iv
1351         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1352         aesenc          $rndkey[1],$iv
1353 .Laesenclast$sn:
1354         aesenclast      $rndkey[0],$iv
1355         movups          16-112($key),$rndkey[1]         # forward reference
1356         nop
1357 ___
1358     } else {
1359       $code.=<<___;
1360         movups          `32+16*$k-112`($key),$rndkey[1]
1361         aesenc          $rndkey[0],$iv
1362 ___
1363     }
1364     $r++;       unshift(@rndkey,pop(@rndkey));
1365 };
1366
1367 if ($shaext) {
1368 my $Tbl="%rax";
1369
1370 $code.=<<___;
1371 .type   ${func}_shaext,\@function,6
1372 .align  32
1373 ${func}_shaext:
1374 .cfi_startproc
1375         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1376 ___
1377 $code.=<<___ if ($win64);
1378         lea     `-8-10*16`(%rsp),%rsp
1379         movaps  %xmm6,-8-10*16(%rax)
1380         movaps  %xmm7,-8-9*16(%rax)
1381         movaps  %xmm8,-8-8*16(%rax)
1382         movaps  %xmm9,-8-7*16(%rax)
1383         movaps  %xmm10,-8-6*16(%rax)
1384         movaps  %xmm11,-8-5*16(%rax)
1385         movaps  %xmm12,-8-4*16(%rax)
1386         movaps  %xmm13,-8-3*16(%rax)
1387         movaps  %xmm14,-8-2*16(%rax)
1388         movaps  %xmm15,-8-1*16(%rax)
1389 .Lprologue_shaext:
1390 ___
1391 $code.=<<___;
1392         lea             K256+0x80(%rip),$Tbl
1393         movdqu          ($ctx),$ABEF            # DCBA
1394         movdqu          16($ctx),$CDGH          # HGFE
1395         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1396
1397         mov             240($key),$rounds
1398         sub             $in0,$out
1399         movups          ($key),$rndkey0         # $key[0]
1400         movups          ($ivp),$iv              # load IV
1401         movups          16($key),$rndkey[0]     # forward reference
1402         lea             112($key),$key          # size optimization
1403
1404         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1405         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1406         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1407         movdqa          $TMP,$BSWAP             # offload
1408         palignr         \$8,$CDGH,$ABEF         # ABEF
1409         punpcklqdq      $Wi,$CDGH               # CDGH
1410
1411         jmp     .Loop_shaext
1412
1413 .align  16
1414 .Loop_shaext:
1415         movdqu          ($inp),@MSG[0]
1416         movdqu          0x10($inp),@MSG[1]
1417         movdqu          0x20($inp),@MSG[2]
1418         pshufb          $TMP,@MSG[0]
1419         movdqu          0x30($inp),@MSG[3]
1420
1421         movdqa          0*32-0x80($Tbl),$Wi
1422         paddd           @MSG[0],$Wi
1423         pshufb          $TMP,@MSG[1]
1424         movdqa          $CDGH,$CDGH_SAVE        # offload
1425         movdqa          $ABEF,$ABEF_SAVE        # offload
1426 ___
1427         &$aesenc();
1428 $code.=<<___;
1429         sha256rnds2     $ABEF,$CDGH             # 0-3
1430         pshufd          \$0x0e,$Wi,$Wi
1431 ___
1432         &$aesenc();
1433 $code.=<<___;
1434         sha256rnds2     $CDGH,$ABEF
1435
1436         movdqa          1*32-0x80($Tbl),$Wi
1437         paddd           @MSG[1],$Wi
1438         pshufb          $TMP,@MSG[2]
1439         lea             0x40($inp),$inp
1440 ___
1441         &$aesenc();
1442 $code.=<<___;
1443         sha256rnds2     $ABEF,$CDGH             # 4-7
1444         pshufd          \$0x0e,$Wi,$Wi
1445 ___
1446         &$aesenc();
1447 $code.=<<___;
1448         sha256rnds2     $CDGH,$ABEF
1449
1450         movdqa          2*32-0x80($Tbl),$Wi
1451         paddd           @MSG[2],$Wi
1452         pshufb          $TMP,@MSG[3]
1453         sha256msg1      @MSG[1],@MSG[0]
1454 ___
1455         &$aesenc();
1456 $code.=<<___;
1457         sha256rnds2     $ABEF,$CDGH             # 8-11
1458         pshufd          \$0x0e,$Wi,$Wi
1459         movdqa          @MSG[3],$TMP
1460         palignr         \$4,@MSG[2],$TMP
1461         paddd           $TMP,@MSG[0]
1462 ___
1463         &$aesenc();
1464 $code.=<<___;
1465         sha256rnds2     $CDGH,$ABEF
1466
1467         movdqa          3*32-0x80($Tbl),$Wi
1468         paddd           @MSG[3],$Wi
1469         sha256msg2      @MSG[3],@MSG[0]
1470         sha256msg1      @MSG[2],@MSG[1]
1471 ___
1472         &$aesenc();
1473 $code.=<<___;
1474         sha256rnds2     $ABEF,$CDGH             # 12-15
1475         pshufd          \$0x0e,$Wi,$Wi
1476 ___
1477         &$aesenc();
1478 $code.=<<___;
1479         movdqa          @MSG[0],$TMP
1480         palignr         \$4,@MSG[3],$TMP
1481         paddd           $TMP,@MSG[1]
1482         sha256rnds2     $CDGH,$ABEF
1483 ___
1484 for($i=4;$i<16-3;$i++) {
1485         &$aesenc()      if (($r%10)==0);
1486 $code.=<<___;
1487         movdqa          $i*32-0x80($Tbl),$Wi
1488         paddd           @MSG[0],$Wi
1489         sha256msg2      @MSG[0],@MSG[1]
1490         sha256msg1      @MSG[3],@MSG[2]
1491 ___
1492         &$aesenc();
1493 $code.=<<___;
1494         sha256rnds2     $ABEF,$CDGH             # 16-19...
1495         pshufd          \$0x0e,$Wi,$Wi
1496         movdqa          @MSG[1],$TMP
1497         palignr         \$4,@MSG[0],$TMP
1498         paddd           $TMP,@MSG[2]
1499 ___
1500         &$aesenc();
1501         &$aesenc()      if ($r==19);
1502 $code.=<<___;
1503         sha256rnds2     $CDGH,$ABEF
1504 ___
1505         push(@MSG,shift(@MSG));
1506 }
1507 $code.=<<___;
1508         movdqa          13*32-0x80($Tbl),$Wi
1509         paddd           @MSG[0],$Wi
1510         sha256msg2      @MSG[0],@MSG[1]
1511         sha256msg1      @MSG[3],@MSG[2]
1512 ___
1513         &$aesenc();
1514 $code.=<<___;
1515         sha256rnds2     $ABEF,$CDGH             # 52-55
1516         pshufd          \$0x0e,$Wi,$Wi
1517         movdqa          @MSG[1],$TMP
1518         palignr         \$4,@MSG[0],$TMP
1519         paddd           $TMP,@MSG[2]
1520 ___
1521         &$aesenc();
1522         &$aesenc();
1523 $code.=<<___;
1524         sha256rnds2     $CDGH,$ABEF
1525
1526         movdqa          14*32-0x80($Tbl),$Wi
1527         paddd           @MSG[1],$Wi
1528         sha256msg2      @MSG[1],@MSG[2]
1529         movdqa          $BSWAP,$TMP
1530 ___
1531         &$aesenc();
1532 $code.=<<___;
1533         sha256rnds2     $ABEF,$CDGH             # 56-59
1534         pshufd          \$0x0e,$Wi,$Wi
1535 ___
1536         &$aesenc();
1537 $code.=<<___;
1538         sha256rnds2     $CDGH,$ABEF
1539
1540         movdqa          15*32-0x80($Tbl),$Wi
1541         paddd           @MSG[2],$Wi
1542 ___
1543         &$aesenc();
1544         &$aesenc();
1545 $code.=<<___;
1546         sha256rnds2     $ABEF,$CDGH             # 60-63
1547         pshufd          \$0x0e,$Wi,$Wi
1548 ___
1549         &$aesenc();
1550 $code.=<<___;
1551         sha256rnds2     $CDGH,$ABEF
1552         #pxor           $CDGH,$rndkey0          # black magic
1553 ___
1554         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1555 $code.=<<___;
1556         #xorps          $CDGH,$rndkey0          # black magic
1557         paddd           $CDGH_SAVE,$CDGH
1558         paddd           $ABEF_SAVE,$ABEF
1559
1560         dec             $len
1561         movups          $iv,48($out,$in0)       # write output
1562         lea             64($in0),$in0
1563         jnz             .Loop_shaext
1564
1565         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1566         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1567         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1568         punpckhqdq      $CDGH,$ABEF             # DCBA
1569         palignr         \$8,$TMP,$CDGH          # HGFE
1570
1571         movups          $iv,($ivp)              # write IV
1572         movdqu          $ABEF,($ctx)
1573         movdqu          $CDGH,16($ctx)
1574 ___
1575 $code.=<<___ if ($win64);
1576         movaps  0*16(%rsp),%xmm6
1577         movaps  1*16(%rsp),%xmm7
1578         movaps  2*16(%rsp),%xmm8
1579         movaps  3*16(%rsp),%xmm9
1580         movaps  4*16(%rsp),%xmm10
1581         movaps  5*16(%rsp),%xmm11
1582         movaps  6*16(%rsp),%xmm12
1583         movaps  7*16(%rsp),%xmm13
1584         movaps  8*16(%rsp),%xmm14
1585         movaps  9*16(%rsp),%xmm15
1586         lea     8+10*16(%rsp),%rsp
1587 .Lepilogue_shaext:
1588 ___
1589 $code.=<<___;
1590         ret
1591 .cfi_endproc
1592 .size   ${func}_shaext,.-${func}_shaext
1593 ___
1594 }
1595 }}}}}
1596
1597 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1598 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1599 if ($win64 && $avx) {
1600 $rec="%rcx";
1601 $frame="%rdx";
1602 $context="%r8";
1603 $disp="%r9";
1604
1605 $code.=<<___;
1606 .extern __imp_RtlVirtualUnwind
1607 .type   se_handler,\@abi-omnipotent
1608 .align  16
1609 se_handler:
1610         push    %rsi
1611         push    %rdi
1612         push    %rbx
1613         push    %rbp
1614         push    %r12
1615         push    %r13
1616         push    %r14
1617         push    %r15
1618         pushfq
1619         sub     \$64,%rsp
1620
1621         mov     120($context),%rax      # pull context->Rax
1622         mov     248($context),%rbx      # pull context->Rip
1623
1624         mov     8($disp),%rsi           # disp->ImageBase
1625         mov     56($disp),%r11          # disp->HanderlData
1626
1627         mov     0(%r11),%r10d           # HandlerData[0]
1628         lea     (%rsi,%r10),%r10        # prologue label
1629         cmp     %r10,%rbx               # context->Rip<prologue label
1630         jb      .Lin_prologue
1631
1632         mov     152($context),%rax      # pull context->Rsp
1633
1634         mov     4(%r11),%r10d           # HandlerData[1]
1635         lea     (%rsi,%r10),%r10        # epilogue label
1636         cmp     %r10,%rbx               # context->Rip>=epilogue label
1637         jae     .Lin_prologue
1638 ___
1639 $code.=<<___ if ($shaext);
1640         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1641         cmp     %r10,%rbx
1642         jb      .Lnot_in_shaext
1643
1644         lea     (%rax),%rsi
1645         lea     512($context),%rdi      # &context.Xmm6
1646         mov     \$20,%ecx
1647         .long   0xa548f3fc              # cld; rep movsq
1648         lea     168(%rax),%rax          # adjust stack pointer
1649         jmp     .Lin_prologue
1650 .Lnot_in_shaext:
1651 ___
1652 $code.=<<___ if ($avx>1);
1653         lea     .Lavx2_shortcut(%rip),%r10
1654         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1655         jb      .Lnot_in_avx2
1656
1657         and     \$-256*$SZ,%rax
1658         add     \$`2*$SZ*($rounds-8)`,%rax
1659 .Lnot_in_avx2:
1660 ___
1661 $code.=<<___;
1662         mov     %rax,%rsi               # put aside Rsp
1663         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1664
1665         mov     -8(%rax),%rbx
1666         mov     -16(%rax),%rbp
1667         mov     -24(%rax),%r12
1668         mov     -32(%rax),%r13
1669         mov     -40(%rax),%r14
1670         mov     -48(%rax),%r15
1671         mov     %rbx,144($context)      # restore context->Rbx
1672         mov     %rbp,160($context)      # restore context->Rbp
1673         mov     %r12,216($context)      # restore context->R12
1674         mov     %r13,224($context)      # restore context->R13
1675         mov     %r14,232($context)      # restore context->R14
1676         mov     %r15,240($context)      # restore context->R15
1677
1678         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1679         lea     512($context),%rdi      # &context.Xmm6
1680         mov     \$20,%ecx
1681         .long   0xa548f3fc              # cld; rep movsq
1682
1683 .Lin_prologue:
1684         mov     8(%rax),%rdi
1685         mov     16(%rax),%rsi
1686         mov     %rax,152($context)      # restore context->Rsp
1687         mov     %rsi,168($context)      # restore context->Rsi
1688         mov     %rdi,176($context)      # restore context->Rdi
1689
1690         mov     40($disp),%rdi          # disp->ContextRecord
1691         mov     $context,%rsi           # context
1692         mov     \$154,%ecx              # sizeof(CONTEXT)
1693         .long   0xa548f3fc              # cld; rep movsq
1694
1695         mov     $disp,%rsi
1696         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1697         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1698         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1699         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1700         mov     40(%rsi),%r10           # disp->ContextRecord
1701         lea     56(%rsi),%r11           # &disp->HandlerData
1702         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1703         mov     %r10,32(%rsp)           # arg5
1704         mov     %r11,40(%rsp)           # arg6
1705         mov     %r12,48(%rsp)           # arg7
1706         mov     %rcx,56(%rsp)           # arg8, (NULL)
1707         call    *__imp_RtlVirtualUnwind(%rip)
1708
1709         mov     \$1,%eax                # ExceptionContinueSearch
1710         add     \$64,%rsp
1711         popfq
1712         pop     %r15
1713         pop     %r14
1714         pop     %r13
1715         pop     %r12
1716         pop     %rbp
1717         pop     %rbx
1718         pop     %rdi
1719         pop     %rsi
1720         ret
1721 .size   se_handler,.-se_handler
1722
1723 .section        .pdata
1724         .rva    .LSEH_begin_${func}_xop
1725         .rva    .LSEH_end_${func}_xop
1726         .rva    .LSEH_info_${func}_xop
1727
1728         .rva    .LSEH_begin_${func}_avx
1729         .rva    .LSEH_end_${func}_avx
1730         .rva    .LSEH_info_${func}_avx
1731 ___
1732 $code.=<<___ if ($avx>1);
1733         .rva    .LSEH_begin_${func}_avx2
1734         .rva    .LSEH_end_${func}_avx2
1735         .rva    .LSEH_info_${func}_avx2
1736 ___
1737 $code.=<<___ if ($shaext);
1738         .rva    .LSEH_begin_${func}_shaext
1739         .rva    .LSEH_end_${func}_shaext
1740         .rva    .LSEH_info_${func}_shaext
1741 ___
1742 $code.=<<___;
1743 .section        .xdata
1744 .align  8
1745 .LSEH_info_${func}_xop:
1746         .byte   9,0,0,0
1747         .rva    se_handler
1748         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1749
1750 .LSEH_info_${func}_avx:
1751         .byte   9,0,0,0
1752         .rva    se_handler
1753         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1754 ___
1755 $code.=<<___ if ($avx>1);
1756 .LSEH_info_${func}_avx2:
1757         .byte   9,0,0,0
1758         .rva    se_handler
1759         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1760 ___
1761 $code.=<<___ if ($shaext);
1762 .LSEH_info_${func}_shaext:
1763         .byte   9,0,0,0
1764         .rva    se_handler
1765         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1766 ___
1767 }
1768
1769 ####################################################################
1770 sub rex {
1771   local *opcode=shift;
1772   my ($dst,$src)=@_;
1773   my $rex=0;
1774
1775     $rex|=0x04                  if($dst>=8);
1776     $rex|=0x01                  if($src>=8);
1777     unshift @opcode,$rex|0x40   if($rex);
1778 }
1779
1780 {
1781   my %opcodelet = (
1782                 "sha256rnds2" => 0xcb,
1783                 "sha256msg1"  => 0xcc,
1784                 "sha256msg2"  => 0xcd   );
1785
1786   sub sha256op38 {
1787     my $instr = shift;
1788
1789     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1790       my @opcode=(0x0f,0x38);
1791         rex(\@opcode,$2,$1);
1792         push @opcode,$opcodelet{$instr};
1793         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1794         return ".byte\t".join(',',@opcode);
1795     } else {
1796         return $instr."\t".@_[0];
1797     }
1798   }
1799 }
1800
1801 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1802 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1803 print $code;
1804 close STDOUT or die "error closing STDOUT: $!";