crypto/aes/asm/aesni-sha256-x86_64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # January 2013
  18 #
  19 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  20 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  21 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
  22 # parallelism, interleaving it with another algorithm would allow to
  23 # utilize processor resources better and achieve better performance.
  24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  25 # AESNI code is weaved into it. As SHA256 dominates execution time,
  26 # stitch performance does not depend on AES key length. Below are
  27 # performance numbers in cycles per processed byte, less is better,
  28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  29 # subroutine:
  30 #
  31 #                AES-128/-192/-256+SHA256   this(**)    gain
  32 # Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
  33 # Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
  34 # Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
  35 # Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
  36 # Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
  37 # Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
  38 # Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
  39 #
  40 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
  41 #       Westmere is omitted from loop, this is because gain was not
  42 #       estimated high enough to justify the effort;
  43 # (**)  these are EVP-free results, results obtained with 'speed
  44 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
  45 # (***) these are SHAEXT results;
  46
  47 # $output is the last argument if it looks like a file (it has an extension)
  48 # $flavour is the first argument if it doesn't look like a file
  49 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  50 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  51
  52 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  53
  54 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  55 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  56 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  57 die "can't locate x86_64-xlate.pl";
  58
  59 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  60                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  61         $avx = ($1>=2.19) + ($1>=2.22);
  62 }
  63
  64 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  65            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  66         $avx = ($1>=2.09) + ($1>=2.10);
  67 }
  68
  69 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  70            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  71         $avx = ($1>=10) + ($1>=12);
  72 }
  73
  74 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  75         $avx = ($2>=3.0) + ($2>3.0);
  76 }
  77
  78 $shaext=$avx;   ### set to zero if compiling for 1.0.1
  79 $avx=1          if (!$shaext && $avx);
  80
  81 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  82     or die "can't call $xlate: $!";
  83 *STDOUT=*OUT;
  84
  85 $func="aesni_cbc_sha256_enc";
  86 $TABLE="K256";
  87 $SZ=4;
  88 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  89                                 "%r8d","%r9d","%r10d","%r11d");
  90 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  91 @Sigma0=( 2,13,22);
  92 @Sigma1=( 6,11,25);
  93 @sigma0=( 7,18, 3);
  94 @sigma1=(17,19,10);
  95 $rounds=64;
  96
  97 ########################################################################
  98 # void aesni_cbc_sha256_enc(const void *inp,
  99 #                       void *out,
 100 #                       size_t length,
 101 #                       const AES_KEY *key,
 102 #                       unsigned char *iv,
 103 #                       SHA256_CTX *ctx,
 104 #                       const void *in0);
 105 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
 106 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 107
 108 $Tbl="%rbp";
 109
 110 $_inp="16*$SZ+0*8(%rsp)";
 111 $_out="16*$SZ+1*8(%rsp)";
 112 $_end="16*$SZ+2*8(%rsp)";
 113 $_key="16*$SZ+3*8(%rsp)";
 114 $_ivp="16*$SZ+4*8(%rsp)";
 115 $_ctx="16*$SZ+5*8(%rsp)";
 116 $_in0="16*$SZ+6*8(%rsp)";
 117 $_rsp="`16*$SZ+7*8`(%rsp)";
 118 $framesz=16*$SZ+8*8;
 119
 120 $code=<<___;
 121 .text
 122
 123 .extern OPENSSL_ia32cap_P
 124 .globl  $func
 125 .type   $func,\@abi-omnipotent
 126 .align  16
 127 $func:
 128 .cfi_startproc
 129 ___
 130                                                 if ($avx) {
 131 $code.=<<___;
 132         lea     OPENSSL_ia32cap_P(%rip),%r11
 133         mov     \$1,%eax
 134         cmp     \$0,`$win64?"%rcx":"%rdi"`
 135         je      .Lprobe
 136         mov     0(%r11),%eax
 137         mov     4(%r11),%r10
 138 ___
 139 $code.=<<___ if ($shaext);
 140         bt      \$61,%r10                       # check for SHA
 141         jc      ${func}_shaext
 142 ___
 143 $code.=<<___;
 144         mov     %r10,%r11
 145         shr     \$32,%r11
 146
 147         test    \$`1<<11`,%r10d                 # check for XOP
 148         jnz     ${func}_xop
 149 ___
 150 $code.=<<___ if ($avx>1);
 151         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
 152         cmp     \$`1<<8|1<<5|1<<3`,%r11d
 153         je      ${func}_avx2
 154 ___
 155 $code.=<<___;
 156         and     \$`1<<28`,%r10d                 # check for AVX
 157         jnz     ${func}_avx
 158         ud2
 159 ___
 160                                                 }
 161 $code.=<<___;
 162         xor     %eax,%eax
 163         cmp     \$0,`$win64?"%rcx":"%rdi"`
 164         je      .Lprobe
 165         ud2
 166 .Lprobe:
 167         ret
 168 .cfi_endproc
 169 .size   $func,.-$func
 170
 171 .align  64
 172 .type   $TABLE,\@object
 173 $TABLE:
 174         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 175         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 176         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 177         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 178         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 179         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 180         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 181         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 182         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 183         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 184         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 185         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 186         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 187         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 188         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 189         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 190         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 191         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 192         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 193         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 194         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 195         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 196         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 197         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 198         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 199         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 200         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 201         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 202         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 203         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 204         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 205         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 206
 207         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 208         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 209         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
 210         .long   0,0,0,0,   0,0,0,0
 211         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 212 .align  64
 213 ___
 214
 215 ######################################################################
 216 # SIMD code paths
 217 #
 218 {{{
 219 ($iv,$inout,$roundkey,$temp,
 220  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
 221
 222 $aesni_cbc_idx=0;
 223 @aesni_cbc_block = (
 224 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
 225 ##      &vmovdqu        ($inout,($inp));
 226 ##      &mov            ($_inp,$inp);
 227
 228         '&vpxor         ($inout,$inout,$roundkey);'.
 229         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
 230
 231         '&vpxor         ($inout,$inout,$iv);',
 232
 233         '&vaesenc       ($inout,$inout,$roundkey);'.
 234         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
 235
 236         '&vaesenc       ($inout,$inout,$roundkey);'.
 237         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
 238
 239         '&vaesenc       ($inout,$inout,$roundkey);'.
 240         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
 241
 242         '&vaesenc       ($inout,$inout,$roundkey);'.
 243         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
 244
 245         '&vaesenc       ($inout,$inout,$roundkey);'.
 246         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
 247
 248         '&vaesenc       ($inout,$inout,$roundkey);'.
 249         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
 250
 251         '&vaesenc       ($inout,$inout,$roundkey);'.
 252         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
 253
 254         '&vaesenc       ($inout,$inout,$roundkey);'.
 255         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
 256
 257         '&vaesenc       ($inout,$inout,$roundkey);'.
 258         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
 259
 260         '&vaesenclast   ($temp,$inout,$roundkey);'.
 261         ' &vaesenc      ($inout,$inout,$roundkey);'.
 262         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
 263
 264         '&vpand         ($iv,$temp,$mask10);'.
 265         ' &vaesenc      ($inout,$inout,$roundkey);'.
 266         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
 267
 268         '&vaesenclast   ($temp,$inout,$roundkey);'.
 269         ' &vaesenc      ($inout,$inout,$roundkey);'.
 270         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
 271
 272         '&vpand         ($temp,$temp,$mask12);'.
 273         ' &vaesenc      ($inout,$inout,$roundkey);'.
 274          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
 275
 276         '&vpor          ($iv,$iv,$temp);'.
 277         ' &vaesenclast  ($temp,$inout,$roundkey);'.
 278         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
 279
 280 ##      &mov            ($inp,$_inp);
 281 ##      &mov            ($out,$_out);
 282 ##      &vpand          ($temp,$temp,$mask14);
 283 ##      &vpor           ($iv,$iv,$temp);
 284 ##      &vmovdqu        ($iv,($out,$inp);
 285 ##      &lea            (inp,16($inp));
 286 );
 287
 288 my $a4=$T1;
 289 my ($a,$b,$c,$d,$e,$f,$g,$h);
 290
 291 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 292 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 293   my $arg = pop;
 294     $arg = "\$$arg" if ($arg*1 eq $arg);
 295     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 296 }
 297
 298 sub body_00_15 () {
 299         (
 300         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 301
 302         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
 303         '&mov   ($a,$a1)',
 304         '&mov   ($a4,$f)',
 305
 306         '&xor   ($a0,$e)',
 307         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
 308         '&xor   ($a4,$g)',                      # f^g
 309
 310         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
 311         '&xor   ($a1,$a)',
 312         '&and   ($a4,$e)',                      # (f^g)&e
 313
 314         @aesni_cbc_block[$aesni_cbc_idx++].
 315         '&xor   ($a0,$e)',
 316         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
 317         '&mov   ($a2,$a)',
 318
 319         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
 320         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
 321         '&xor   ($a2,$b)',                      # a^b, b^c in next round
 322
 323         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
 324         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
 325         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
 326
 327         '&xor   ($a1,$a)',
 328         '&add   ($h,$a0)',                      # h+=Sigma1(e)
 329         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
 330
 331         '&add   ($d,$h)',                       # d+=h
 332         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
 333         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
 334
 335         '&mov   ($a0,$d)',
 336         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
 337         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 338         );
 339 }
 340
 341 if ($avx) {{
 342 ######################################################################
 343 # XOP code path
 344 #
 345 $code.=<<___;
 346 .type   ${func}_xop,\@function,6
 347 .align  64
 348 ${func}_xop:
 349 .cfi_startproc
 350 .Lxop_shortcut:
 351         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 352         mov     %rsp,%rax               # copy %rsp
 353 .cfi_def_cfa_register   %rax
 354         push    %rbx
 355 .cfi_push       %rbx
 356         push    %rbp
 357 .cfi_push       %rbp
 358         push    %r12
 359 .cfi_push       %r12
 360         push    %r13
 361 .cfi_push       %r13
 362         push    %r14
 363 .cfi_push       %r14
 364         push    %r15
 365 .cfi_push       %r15
 366         sub     \$`$framesz+$win64*16*10`,%rsp
 367         and     \$-64,%rsp              # align stack frame
 368
 369         shl     \$6,$len
 370         sub     $inp,$out               # re-bias
 371         sub     $inp,$in0
 372         add     $inp,$len               # end of input
 373
 374         #mov    $inp,$_inp              # saved later
 375         mov     $out,$_out
 376         mov     $len,$_end
 377         #mov    $key,$_key              # remains resident in $inp register
 378         mov     $ivp,$_ivp
 379         mov     $ctx,$_ctx
 380         mov     $in0,$_in0
 381         mov     %rax,$_rsp
 382 .cfi_cfa_expression     $_rsp,deref,+8
 383 ___
 384 $code.=<<___ if ($win64);
 385         movaps  %xmm6,`$framesz+16*0`(%rsp)
 386         movaps  %xmm7,`$framesz+16*1`(%rsp)
 387         movaps  %xmm8,`$framesz+16*2`(%rsp)
 388         movaps  %xmm9,`$framesz+16*3`(%rsp)
 389         movaps  %xmm10,`$framesz+16*4`(%rsp)
 390         movaps  %xmm11,`$framesz+16*5`(%rsp)
 391         movaps  %xmm12,`$framesz+16*6`(%rsp)
 392         movaps  %xmm13,`$framesz+16*7`(%rsp)
 393         movaps  %xmm14,`$framesz+16*8`(%rsp)
 394         movaps  %xmm15,`$framesz+16*9`(%rsp)
 395 ___
 396 $code.=<<___;
 397 .Lprologue_xop:
 398         vzeroall
 399
 400         mov     $inp,%r12               # borrow $a4
 401         lea     0x80($key),$inp         # size optimization, reassign
 402         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 403         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 404         mov     $ctx,%r15               # borrow $a2
 405         mov     $in0,%rsi               # borrow $a3
 406         vmovdqu ($ivp),$iv              # load IV
 407         sub     \$9,%r14
 408
 409         mov     $SZ*0(%r15),$A
 410         mov     $SZ*1(%r15),$B
 411         mov     $SZ*2(%r15),$C
 412         mov     $SZ*3(%r15),$D
 413         mov     $SZ*4(%r15),$E
 414         mov     $SZ*5(%r15),$F
 415         mov     $SZ*6(%r15),$G
 416         mov     $SZ*7(%r15),$H
 417
 418         vmovdqa 0x00(%r13,%r14,8),$mask14
 419         vmovdqa 0x10(%r13,%r14,8),$mask12
 420         vmovdqa 0x20(%r13,%r14,8),$mask10
 421         vmovdqu 0x00-0x80($inp),$roundkey
 422         jmp     .Lloop_xop
 423 ___
 424                                         if ($SZ==4) {   # SHA256
 425     my @X = map("%xmm$_",(0..3));
 426     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 427
 428 $code.=<<___;
 429 .align  16
 430 .Lloop_xop:
 431         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 432         vmovdqu 0x00(%rsi,%r12),@X[0]
 433         vmovdqu 0x10(%rsi,%r12),@X[1]
 434         vmovdqu 0x20(%rsi,%r12),@X[2]
 435         vmovdqu 0x30(%rsi,%r12),@X[3]
 436         vpshufb $t3,@X[0],@X[0]
 437         lea     $TABLE(%rip),$Tbl
 438         vpshufb $t3,@X[1],@X[1]
 439         vpshufb $t3,@X[2],@X[2]
 440         vpaddd  0x00($Tbl),@X[0],$t0
 441         vpshufb $t3,@X[3],@X[3]
 442         vpaddd  0x20($Tbl),@X[1],$t1
 443         vpaddd  0x40($Tbl),@X[2],$t2
 444         vpaddd  0x60($Tbl),@X[3],$t3
 445         vmovdqa $t0,0x00(%rsp)
 446         mov     $A,$a1
 447         vmovdqa $t1,0x10(%rsp)
 448         mov     $B,$a3
 449         vmovdqa $t2,0x20(%rsp)
 450         xor     $C,$a3                  # magic
 451         vmovdqa $t3,0x30(%rsp)
 452         mov     $E,$a0
 453         jmp     .Lxop_00_47
 454
 455 .align  16
 456 .Lxop_00_47:
 457         sub     \$-16*2*$SZ,$Tbl        # size optimization
 458         vmovdqu (%r12),$inout           # $a4
 459         mov     %r12,$_inp              # $a4
 460 ___
 461 sub XOP_256_00_47 () {
 462 my $j = shift;
 463 my $body = shift;
 464 my @X = @_;
 465 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 466
 467         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
 468           eval(shift(@insns));
 469           eval(shift(@insns));
 470          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
 471           eval(shift(@insns));
 472           eval(shift(@insns));
 473         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
 474           eval(shift(@insns));
 475           eval(shift(@insns));
 476         &vpsrld         ($t0,$t0,$sigma0[2]);
 477           eval(shift(@insns));
 478           eval(shift(@insns));
 479          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
 480           eval(shift(@insns));
 481           eval(shift(@insns));
 482           eval(shift(@insns));
 483           eval(shift(@insns));
 484         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
 485           eval(shift(@insns));
 486           eval(shift(@insns));
 487         &vpxor          ($t0,$t0,$t1);
 488           eval(shift(@insns));
 489           eval(shift(@insns));
 490           eval(shift(@insns));
 491           eval(shift(@insns));
 492          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
 493           eval(shift(@insns));
 494           eval(shift(@insns));
 495         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
 496           eval(shift(@insns));
 497           eval(shift(@insns));
 498          &vpsrld        ($t2,@X[3],$sigma1[2]);
 499           eval(shift(@insns));
 500           eval(shift(@insns));
 501         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
 502           eval(shift(@insns));
 503           eval(shift(@insns));
 504          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 505           eval(shift(@insns));
 506           eval(shift(@insns));
 507          &vpxor         ($t3,$t3,$t2);
 508           eval(shift(@insns));
 509           eval(shift(@insns));
 510           eval(shift(@insns));
 511           eval(shift(@insns));
 512          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
 513           eval(shift(@insns));
 514           eval(shift(@insns));
 515           eval(shift(@insns));
 516           eval(shift(@insns));
 517         &vpsrldq        ($t3,$t3,8);
 518           eval(shift(@insns));
 519           eval(shift(@insns));
 520           eval(shift(@insns));
 521           eval(shift(@insns));
 522         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
 523           eval(shift(@insns));
 524           eval(shift(@insns));
 525           eval(shift(@insns));
 526           eval(shift(@insns));
 527          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
 528           eval(shift(@insns));
 529           eval(shift(@insns));
 530          &vpsrld        ($t2,@X[0],$sigma1[2]);
 531           eval(shift(@insns));
 532           eval(shift(@insns));
 533          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 534           eval(shift(@insns));
 535           eval(shift(@insns));
 536          &vpxor         ($t3,$t3,$t2);
 537           eval(shift(@insns));
 538           eval(shift(@insns));
 539           eval(shift(@insns));
 540           eval(shift(@insns));
 541          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
 542           eval(shift(@insns));
 543           eval(shift(@insns));
 544           eval(shift(@insns));
 545           eval(shift(@insns));
 546         &vpslldq        ($t3,$t3,8);            # 22 instructions
 547           eval(shift(@insns));
 548           eval(shift(@insns));
 549           eval(shift(@insns));
 550           eval(shift(@insns));
 551         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
 552           eval(shift(@insns));
 553           eval(shift(@insns));
 554           eval(shift(@insns));
 555           eval(shift(@insns));
 556         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 557           foreach (@insns) { eval; }            # remaining instructions
 558         &vmovdqa        (16*$j."(%rsp)",$t2);
 559 }
 560
 561     $aesni_cbc_idx=0;
 562     for ($i=0,$j=0; $j<4; $j++) {
 563         &XOP_256_00_47($j,\&body_00_15,@X);
 564         push(@X,shift(@X));                     # rotate(@X)
 565     }
 566         &mov            ("%r12",$_inp);         # borrow $a4
 567         &vpand          ($temp,$temp,$mask14);
 568         &mov            ("%r15",$_out);         # borrow $a2
 569         &vpor           ($iv,$iv,$temp);
 570         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 571         &lea            ("%r12","16(%r12)");    # inp++
 572
 573         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 574         &jne    (".Lxop_00_47");
 575
 576         &vmovdqu        ($inout,"(%r12)");
 577         &mov            ($_inp,"%r12");
 578
 579     $aesni_cbc_idx=0;
 580     for ($i=0; $i<16; ) {
 581         foreach(body_00_15()) { eval; }
 582     }
 583                                         }
 584 $code.=<<___;
 585         mov     $_inp,%r12              # borrow $a4
 586         mov     $_out,%r13              # borrow $a0
 587         mov     $_ctx,%r15              # borrow $a2
 588         mov     $_in0,%rsi              # borrow $a3
 589
 590         vpand   $mask14,$temp,$temp
 591         mov     $a1,$A
 592         vpor    $temp,$iv,$iv
 593         vmovdqu $iv,(%r13,%r12)         # write output
 594         lea     16(%r12),%r12           # inp++
 595
 596         add     $SZ*0(%r15),$A
 597         add     $SZ*1(%r15),$B
 598         add     $SZ*2(%r15),$C
 599         add     $SZ*3(%r15),$D
 600         add     $SZ*4(%r15),$E
 601         add     $SZ*5(%r15),$F
 602         add     $SZ*6(%r15),$G
 603         add     $SZ*7(%r15),$H
 604
 605         cmp     $_end,%r12
 606
 607         mov     $A,$SZ*0(%r15)
 608         mov     $B,$SZ*1(%r15)
 609         mov     $C,$SZ*2(%r15)
 610         mov     $D,$SZ*3(%r15)
 611         mov     $E,$SZ*4(%r15)
 612         mov     $F,$SZ*5(%r15)
 613         mov     $G,$SZ*6(%r15)
 614         mov     $H,$SZ*7(%r15)
 615
 616         jb      .Lloop_xop
 617
 618         mov     $_ivp,$ivp
 619         mov     $_rsp,%rsi
 620 .cfi_def_cfa    %rsi,8
 621         vmovdqu $iv,($ivp)              # output IV
 622         vzeroall
 623 ___
 624 $code.=<<___ if ($win64);
 625         movaps  `$framesz+16*0`(%rsp),%xmm6
 626         movaps  `$framesz+16*1`(%rsp),%xmm7
 627         movaps  `$framesz+16*2`(%rsp),%xmm8
 628         movaps  `$framesz+16*3`(%rsp),%xmm9
 629         movaps  `$framesz+16*4`(%rsp),%xmm10
 630         movaps  `$framesz+16*5`(%rsp),%xmm11
 631         movaps  `$framesz+16*6`(%rsp),%xmm12
 632         movaps  `$framesz+16*7`(%rsp),%xmm13
 633         movaps  `$framesz+16*8`(%rsp),%xmm14
 634         movaps  `$framesz+16*9`(%rsp),%xmm15
 635 ___
 636 $code.=<<___;
 637         mov     -48(%rsi),%r15
 638 .cfi_restore    %r15
 639         mov     -40(%rsi),%r14
 640 .cfi_restore    %r14
 641         mov     -32(%rsi),%r13
 642 .cfi_restore    %r13
 643         mov     -24(%rsi),%r12
 644 .cfi_restore    %r12
 645         mov     -16(%rsi),%rbp
 646 .cfi_restore    %rbp
 647         mov     -8(%rsi),%rbx
 648 .cfi_restore    %rbx
 649         lea     (%rsi),%rsp
 650 .cfi_def_cfa_register   %rsp
 651 .Lepilogue_xop:
 652         ret
 653 .cfi_endproc
 654 .size   ${func}_xop,.-${func}_xop
 655 ___
 656 ######################################################################
 657 # AVX+shrd code path
 658 #
 659 local *ror = sub { &shrd(@_[0],@_) };
 660
 661 $code.=<<___;
 662 .type   ${func}_avx,\@function,6
 663 .align  64
 664 ${func}_avx:
 665 .cfi_startproc
 666 .Lavx_shortcut:
 667         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 668         mov     %rsp,%rax               # copy %rsp
 669 .cfi_def_cfa_register   %rax
 670         push    %rbx
 671 .cfi_push       %rbx
 672         push    %rbp
 673 .cfi_push       %rbp
 674         push    %r12
 675 .cfi_push       %r12
 676         push    %r13
 677 .cfi_push       %r13
 678         push    %r14
 679 .cfi_push       %r14
 680         push    %r15
 681 .cfi_push       %r15
 682         sub     \$`$framesz+$win64*16*10`,%rsp
 683         and     \$-64,%rsp              # align stack frame
 684
 685         shl     \$6,$len
 686         sub     $inp,$out               # re-bias
 687         sub     $inp,$in0
 688         add     $inp,$len               # end of input
 689
 690         #mov    $inp,$_inp              # saved later
 691         mov     $out,$_out
 692         mov     $len,$_end
 693         #mov    $key,$_key              # remains resident in $inp register
 694         mov     $ivp,$_ivp
 695         mov     $ctx,$_ctx
 696         mov     $in0,$_in0
 697         mov     %rax,$_rsp
 698 .cfi_cfa_expression     $_rsp,deref,+8
 699 ___
 700 $code.=<<___ if ($win64);
 701         movaps  %xmm6,`$framesz+16*0`(%rsp)
 702         movaps  %xmm7,`$framesz+16*1`(%rsp)
 703         movaps  %xmm8,`$framesz+16*2`(%rsp)
 704         movaps  %xmm9,`$framesz+16*3`(%rsp)
 705         movaps  %xmm10,`$framesz+16*4`(%rsp)
 706         movaps  %xmm11,`$framesz+16*5`(%rsp)
 707         movaps  %xmm12,`$framesz+16*6`(%rsp)
 708         movaps  %xmm13,`$framesz+16*7`(%rsp)
 709         movaps  %xmm14,`$framesz+16*8`(%rsp)
 710         movaps  %xmm15,`$framesz+16*9`(%rsp)
 711 ___
 712 $code.=<<___;
 713 .Lprologue_avx:
 714         vzeroall
 715
 716         mov     $inp,%r12               # borrow $a4
 717         lea     0x80($key),$inp         # size optimization, reassign
 718         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 719         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 720         mov     $ctx,%r15               # borrow $a2
 721         mov     $in0,%rsi               # borrow $a3
 722         vmovdqu ($ivp),$iv              # load IV
 723         sub     \$9,%r14
 724
 725         mov     $SZ*0(%r15),$A
 726         mov     $SZ*1(%r15),$B
 727         mov     $SZ*2(%r15),$C
 728         mov     $SZ*3(%r15),$D
 729         mov     $SZ*4(%r15),$E
 730         mov     $SZ*5(%r15),$F
 731         mov     $SZ*6(%r15),$G
 732         mov     $SZ*7(%r15),$H
 733
 734         vmovdqa 0x00(%r13,%r14,8),$mask14
 735         vmovdqa 0x10(%r13,%r14,8),$mask12
 736         vmovdqa 0x20(%r13,%r14,8),$mask10
 737         vmovdqu 0x00-0x80($inp),$roundkey
 738 ___
 739                                         if ($SZ==4) {   # SHA256
 740     my @X = map("%xmm$_",(0..3));
 741     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 742
 743 $code.=<<___;
 744         jmp     .Lloop_avx
 745 .align  16
 746 .Lloop_avx:
 747         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 748         vmovdqu 0x00(%rsi,%r12),@X[0]
 749         vmovdqu 0x10(%rsi,%r12),@X[1]
 750         vmovdqu 0x20(%rsi,%r12),@X[2]
 751         vmovdqu 0x30(%rsi,%r12),@X[3]
 752         vpshufb $t3,@X[0],@X[0]
 753         lea     $TABLE(%rip),$Tbl
 754         vpshufb $t3,@X[1],@X[1]
 755         vpshufb $t3,@X[2],@X[2]
 756         vpaddd  0x00($Tbl),@X[0],$t0
 757         vpshufb $t3,@X[3],@X[3]
 758         vpaddd  0x20($Tbl),@X[1],$t1
 759         vpaddd  0x40($Tbl),@X[2],$t2
 760         vpaddd  0x60($Tbl),@X[3],$t3
 761         vmovdqa $t0,0x00(%rsp)
 762         mov     $A,$a1
 763         vmovdqa $t1,0x10(%rsp)
 764         mov     $B,$a3
 765         vmovdqa $t2,0x20(%rsp)
 766         xor     $C,$a3                  # magic
 767         vmovdqa $t3,0x30(%rsp)
 768         mov     $E,$a0
 769         jmp     .Lavx_00_47
 770
 771 .align  16
 772 .Lavx_00_47:
 773         sub     \$-16*2*$SZ,$Tbl        # size optimization
 774         vmovdqu (%r12),$inout           # $a4
 775         mov     %r12,$_inp              # $a4
 776 ___
 777 sub Xupdate_256_AVX () {
 778         (
 779         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
 780          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
 781         '&vpsrld        ($t2,$t0,$sigma0[0]);',
 782          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
 783         '&vpsrld        ($t3,$t0,$sigma0[2])',
 784         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
 785         '&vpxor         ($t0,$t3,$t2)',
 786          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
 787         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
 788         '&vpxor         ($t0,$t0,$t1)',
 789         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
 790         '&vpxor         ($t0,$t0,$t2)',
 791          '&vpsrld       ($t2,$t3,$sigma1[2]);',
 792         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
 793          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
 794         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
 795          '&vpxor        ($t2,$t2,$t3);',
 796          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 797          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
 798          '&vpshufd      ($t2,$t2,0b10000100)',
 799          '&vpsrldq      ($t2,$t2,8)',
 800         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
 801          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
 802          '&vpsrld       ($t2,$t3,$sigma1[2])',
 803          '&vpsrlq       ($t3,$t3,$sigma1[0])',
 804          '&vpxor        ($t2,$t2,$t3);',
 805          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 806          '&vpxor        ($t2,$t2,$t3)',
 807          '&vpshufd      ($t2,$t2,0b11101000)',
 808          '&vpslldq      ($t2,$t2,8)',
 809         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
 810         );
 811 }
 812
 813 sub AVX_256_00_47 () {
 814 my $j = shift;
 815 my $body = shift;
 816 my @X = @_;
 817 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 818
 819         foreach (Xupdate_256_AVX()) {           # 29 instructions
 820             eval;
 821             eval(shift(@insns));
 822             eval(shift(@insns));
 823             eval(shift(@insns));
 824         }
 825         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 826           foreach (@insns) { eval; }            # remaining instructions
 827         &vmovdqa        (16*$j."(%rsp)",$t2);
 828 }
 829
 830     $aesni_cbc_idx=0;
 831     for ($i=0,$j=0; $j<4; $j++) {
 832         &AVX_256_00_47($j,\&body_00_15,@X);
 833         push(@X,shift(@X));                     # rotate(@X)
 834     }
 835         &mov            ("%r12",$_inp);         # borrow $a4
 836         &vpand          ($temp,$temp,$mask14);
 837         &mov            ("%r15",$_out);         # borrow $a2
 838         &vpor           ($iv,$iv,$temp);
 839         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 840         &lea            ("%r12","16(%r12)");    # inp++
 841
 842         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 843         &jne    (".Lavx_00_47");
 844
 845         &vmovdqu        ($inout,"(%r12)");
 846         &mov            ($_inp,"%r12");
 847
 848     $aesni_cbc_idx=0;
 849     for ($i=0; $i<16; ) {
 850         foreach(body_00_15()) { eval; }
 851     }
 852
 853                                         }
 854 $code.=<<___;
 855         mov     $_inp,%r12              # borrow $a4
 856         mov     $_out,%r13              # borrow $a0
 857         mov     $_ctx,%r15              # borrow $a2
 858         mov     $_in0,%rsi              # borrow $a3
 859
 860         vpand   $mask14,$temp,$temp
 861         mov     $a1,$A
 862         vpor    $temp,$iv,$iv
 863         vmovdqu $iv,(%r13,%r12)         # write output
 864         lea     16(%r12),%r12           # inp++
 865
 866         add     $SZ*0(%r15),$A
 867         add     $SZ*1(%r15),$B
 868         add     $SZ*2(%r15),$C
 869         add     $SZ*3(%r15),$D
 870         add     $SZ*4(%r15),$E
 871         add     $SZ*5(%r15),$F
 872         add     $SZ*6(%r15),$G
 873         add     $SZ*7(%r15),$H
 874
 875         cmp     $_end,%r12
 876
 877         mov     $A,$SZ*0(%r15)
 878         mov     $B,$SZ*1(%r15)
 879         mov     $C,$SZ*2(%r15)
 880         mov     $D,$SZ*3(%r15)
 881         mov     $E,$SZ*4(%r15)
 882         mov     $F,$SZ*5(%r15)
 883         mov     $G,$SZ*6(%r15)
 884         mov     $H,$SZ*7(%r15)
 885         jb      .Lloop_avx
 886
 887         mov     $_ivp,$ivp
 888         mov     $_rsp,%rsi
 889 .cfi_def_cfa    %rsi,8
 890         vmovdqu $iv,($ivp)              # output IV
 891         vzeroall
 892 ___
 893 $code.=<<___ if ($win64);
 894         movaps  `$framesz+16*0`(%rsp),%xmm6
 895         movaps  `$framesz+16*1`(%rsp),%xmm7
 896         movaps  `$framesz+16*2`(%rsp),%xmm8
 897         movaps  `$framesz+16*3`(%rsp),%xmm9
 898         movaps  `$framesz+16*4`(%rsp),%xmm10
 899         movaps  `$framesz+16*5`(%rsp),%xmm11
 900         movaps  `$framesz+16*6`(%rsp),%xmm12
 901         movaps  `$framesz+16*7`(%rsp),%xmm13
 902         movaps  `$framesz+16*8`(%rsp),%xmm14
 903         movaps  `$framesz+16*9`(%rsp),%xmm15
 904 ___
 905 $code.=<<___;
 906         mov     -48(%rsi),%r15
 907 .cfi_restore    %r15
 908         mov     -40(%rsi),%r14
 909 .cfi_restore    %r14
 910         mov     -32(%rsi),%r13
 911 .cfi_restore    %r13
 912         mov     -24(%rsi),%r12
 913 .cfi_restore    %r12
 914         mov     -16(%rsi),%rbp
 915 .cfi_restore    %rbp
 916         mov     -8(%rsi),%rbx
 917 .cfi_restore    %rbx
 918         lea     (%rsi),%rsp
 919 .cfi_def_cfa_register   %rsp
 920 .Lepilogue_avx:
 921         ret
 922 .cfi_endproc
 923 .size   ${func}_avx,.-${func}_avx
 924 ___
 925
 926 if ($avx>1) {{
 927 ######################################################################
 928 # AVX2+BMI code path
 929 #
 930 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
 931 my $PUSH8=8*2*$SZ;
 932 use integer;
 933
 934 sub bodyx_00_15 () {
 935         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
 936         (
 937         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 938
 939         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
 940         '&and   ($a4,$e)',              # f&e
 941         '&rorx  ($a0,$e,$Sigma1[2])',
 942         '&rorx  ($a2,$e,$Sigma1[1])',
 943
 944         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
 945         '&lea   ($h,"($h,$a4)")',
 946         '&andn  ($a4,$e,$g)',           # ~e&g
 947         '&xor   ($a0,$a2)',
 948
 949         '&rorx  ($a1,$e,$Sigma1[0])',
 950         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
 951         '&xor   ($a0,$a1)',             # Sigma1(e)
 952         '&mov   ($a2,$a)',
 953
 954         '&rorx  ($a4,$a,$Sigma0[2])',
 955         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
 956         '&xor   ($a2,$b)',              # a^b, b^c in next round
 957         '&rorx  ($a1,$a,$Sigma0[1])',
 958
 959         '&rorx  ($a0,$a,$Sigma0[0])',
 960         '&lea   ($d,"($d,$h)")',        # d+=h
 961         '&and   ($a3,$a2)',             # (b^c)&(a^b)
 962         @aesni_cbc_block[$aesni_cbc_idx++].
 963         '&xor   ($a1,$a4)',
 964
 965         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
 966         '&xor   ($a1,$a0)',             # Sigma0(a)
 967         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
 968         '&mov   ($a4,$e)',              # copy of f in future
 969
 970         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 971         );
 972         # and at the finish one has to $a+=$a1
 973 }
 974
 975 $code.=<<___;
 976 .type   ${func}_avx2,\@function,6
 977 .align  64
 978 ${func}_avx2:
 979 .cfi_startproc
 980 .Lavx2_shortcut:
 981         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 982         mov     %rsp,%rax               # copy %rsp
 983 .cfi_def_cfa_register   %rax
 984         push    %rbx
 985 .cfi_push       %rbx
 986         push    %rbp
 987 .cfi_push       %rbp
 988         push    %r12
 989 .cfi_push       %r12
 990         push    %r13
 991 .cfi_push       %r13
 992         push    %r14
 993 .cfi_push       %r14
 994         push    %r15
 995 .cfi_push       %r15
 996         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
 997         and     \$-256*$SZ,%rsp         # align stack frame
 998         add     \$`2*$SZ*($rounds-8)`,%rsp
 999
1000         shl     \$6,$len
1001         sub     $inp,$out               # re-bias
1002         sub     $inp,$in0
1003         add     $inp,$len               # end of input
1004
1005         #mov    $inp,$_inp              # saved later
1006         #mov    $out,$_out              # kept in $offload
1007         mov     $len,$_end
1008         #mov    $key,$_key              # remains resident in $inp register
1009         mov     $ivp,$_ivp
1010         mov     $ctx,$_ctx
1011         mov     $in0,$_in0
1012         mov     %rax,$_rsp
1013 .cfi_cfa_expression     $_rsp,deref,+8
1014 ___
1015 $code.=<<___ if ($win64);
1016         movaps  %xmm6,`$framesz+16*0`(%rsp)
1017         movaps  %xmm7,`$framesz+16*1`(%rsp)
1018         movaps  %xmm8,`$framesz+16*2`(%rsp)
1019         movaps  %xmm9,`$framesz+16*3`(%rsp)
1020         movaps  %xmm10,`$framesz+16*4`(%rsp)
1021         movaps  %xmm11,`$framesz+16*5`(%rsp)
1022         movaps  %xmm12,`$framesz+16*6`(%rsp)
1023         movaps  %xmm13,`$framesz+16*7`(%rsp)
1024         movaps  %xmm14,`$framesz+16*8`(%rsp)
1025         movaps  %xmm15,`$framesz+16*9`(%rsp)
1026 ___
1027 $code.=<<___;
1028 .Lprologue_avx2:
1029         vzeroall
1030
1031         mov     $inp,%r13               # borrow $a0
1032         vpinsrq \$1,$out,$offload,$offload
1033         lea     0x80($key),$inp         # size optimization, reassign
1034         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1035         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1036         mov     $ctx,%r15               # borrow $a2
1037         mov     $in0,%rsi               # borrow $a3
1038         vmovdqu ($ivp),$iv              # load IV
1039         lea     -9(%r14),%r14
1040
1041         vmovdqa 0x00(%r12,%r14,8),$mask14
1042         vmovdqa 0x10(%r12,%r14,8),$mask12
1043         vmovdqa 0x20(%r12,%r14,8),$mask10
1044
1045         sub     \$-16*$SZ,%r13          # inp++, size optimization
1046         mov     $SZ*0(%r15),$A
1047         lea     (%rsi,%r13),%r12        # borrow $a0
1048         mov     $SZ*1(%r15),$B
1049         cmp     $len,%r13               # $_end
1050         mov     $SZ*2(%r15),$C
1051         cmove   %rsp,%r12               # next block or random data
1052         mov     $SZ*3(%r15),$D
1053         mov     $SZ*4(%r15),$E
1054         mov     $SZ*5(%r15),$F
1055         mov     $SZ*6(%r15),$G
1056         mov     $SZ*7(%r15),$H
1057         vmovdqu 0x00-0x80($inp),$roundkey
1058 ___
1059                                         if ($SZ==4) {   # SHA256
1060     my @X = map("%ymm$_",(0..3));
1061     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1062
1063 $code.=<<___;
1064         jmp     .Loop_avx2
1065 .align  16
1066 .Loop_avx2:
1067         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1068         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1069         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1070         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1071         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1072
1073         vinserti128     \$1,(%r12),@X[0],@X[0]
1074         vinserti128     \$1,16(%r12),@X[1],@X[1]
1075          vpshufb        $t3,@X[0],@X[0]
1076         vinserti128     \$1,32(%r12),@X[2],@X[2]
1077          vpshufb        $t3,@X[1],@X[1]
1078         vinserti128     \$1,48(%r12),@X[3],@X[3]
1079
1080         lea     $TABLE(%rip),$Tbl
1081         vpshufb $t3,@X[2],@X[2]
1082         lea     -16*$SZ(%r13),%r13
1083         vpaddd  0x00($Tbl),@X[0],$t0
1084         vpshufb $t3,@X[3],@X[3]
1085         vpaddd  0x20($Tbl),@X[1],$t1
1086         vpaddd  0x40($Tbl),@X[2],$t2
1087         vpaddd  0x60($Tbl),@X[3],$t3
1088         vmovdqa $t0,0x00(%rsp)
1089         xor     $a1,$a1
1090         vmovdqa $t1,0x20(%rsp)
1091         lea     -$PUSH8(%rsp),%rsp
1092         mov     $B,$a3
1093         vmovdqa $t2,0x00(%rsp)
1094         xor     $C,$a3                  # magic
1095         vmovdqa $t3,0x20(%rsp)
1096         mov     $F,$a4
1097         sub     \$-16*2*$SZ,$Tbl        # size optimization
1098         jmp     .Lavx2_00_47
1099
1100 .align  16
1101 .Lavx2_00_47:
1102         vmovdqu (%r13),$inout
1103         vpinsrq \$0,%r13,$offload,$offload
1104 ___
1105
1106 sub AVX2_256_00_47 () {
1107 my $j = shift;
1108 my $body = shift;
1109 my @X = @_;
1110 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1111 my $base = "+2*$PUSH8(%rsp)";
1112
1113         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1114         foreach (Xupdate_256_AVX()) {           # 29 instructions
1115             eval;
1116             eval(shift(@insns));
1117             eval(shift(@insns));
1118             eval(shift(@insns));
1119         }
1120         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1121           foreach (@insns) { eval; }            # remaining instructions
1122         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1123 }
1124     $aesni_cbc_idx=0;
1125     for ($i=0,$j=0; $j<4; $j++) {
1126         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1127         push(@X,shift(@X));                     # rotate(@X)
1128     }
1129         &vmovq          ("%r13",$offload);      # borrow $a0
1130         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1131         &vpand          ($temp,$temp,$mask14);
1132         &vpor           ($iv,$iv,$temp);
1133         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1134         &lea            ("%r13","16(%r13)");    # inp++
1135
1136         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1137         &cmpb   (($SZ-1)."($Tbl)",0);
1138         &jne    (".Lavx2_00_47");
1139
1140         &vmovdqu        ($inout,"(%r13)");
1141         &vpinsrq        ($offload,$offload,"%r13",0);
1142
1143     $aesni_cbc_idx=0;
1144     for ($i=0; $i<16; ) {
1145         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1146         foreach(bodyx_00_15()) { eval; }
1147     }
1148                                         }
1149 $code.=<<___;
1150         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1151         vmovq   $offload,%r13                   # $_inp, borrow $a0
1152         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1153         add     $a1,$A
1154         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1155
1156         vpand   $mask14,$temp,$temp
1157         vpor    $temp,$iv,$iv
1158         vmovdqu $iv,(%r12,%r13)                 # write output
1159         lea     16(%r13),%r13
1160
1161         add     $SZ*0(%r15),$A
1162         add     $SZ*1(%r15),$B
1163         add     $SZ*2(%r15),$C
1164         add     $SZ*3(%r15),$D
1165         add     $SZ*4(%r15),$E
1166         add     $SZ*5(%r15),$F
1167         add     $SZ*6(%r15),$G
1168         add     $SZ*7(%r15),$H
1169
1170         mov     $A,$SZ*0(%r15)
1171         mov     $B,$SZ*1(%r15)
1172         mov     $C,$SZ*2(%r15)
1173         mov     $D,$SZ*3(%r15)
1174         mov     $E,$SZ*4(%r15)
1175         mov     $F,$SZ*5(%r15)
1176         mov     $G,$SZ*6(%r15)
1177         mov     $H,$SZ*7(%r15)
1178
1179         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1180         je      .Ldone_avx2
1181
1182         xor     $a1,$a1
1183         mov     $B,$a3
1184         mov     $F,$a4
1185         xor     $C,$a3                  # magic
1186         jmp     .Lower_avx2
1187 .align  16
1188 .Lower_avx2:
1189         vmovdqu (%r13),$inout
1190         vpinsrq \$0,%r13,$offload,$offload
1191 ___
1192     $aesni_cbc_idx=0;
1193     for ($i=0; $i<16; ) {
1194         my $base="+16($Tbl)";
1195         foreach(bodyx_00_15()) { eval; }
1196         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1197     }
1198 $code.=<<___;
1199         vmovq   $offload,%r13                   # borrow $a0
1200         vpextrq \$1,$offload,%r15               # borrow $a2
1201         vpand   $mask14,$temp,$temp
1202         vpor    $temp,$iv,$iv
1203         lea     -$PUSH8($Tbl),$Tbl
1204         vmovdqu $iv,(%r15,%r13)                 # write output
1205         lea     16(%r13),%r13                   # inp++
1206         cmp     %rsp,$Tbl
1207         jae     .Lower_avx2
1208
1209         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1210         lea     16*$SZ(%r13),%r13
1211         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1212         add     $a1,$A
1213         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1214
1215         add     $SZ*0(%r15),$A
1216         add     $SZ*1(%r15),$B
1217         add     $SZ*2(%r15),$C
1218         add     $SZ*3(%r15),$D
1219         add     $SZ*4(%r15),$E
1220         add     $SZ*5(%r15),$F
1221         add     $SZ*6(%r15),$G
1222         lea     (%rsi,%r13),%r12
1223         add     $SZ*7(%r15),$H
1224
1225         cmp     $_end,%r13
1226
1227         mov     $A,$SZ*0(%r15)
1228         cmove   %rsp,%r12               # next block or stale data
1229         mov     $B,$SZ*1(%r15)
1230         mov     $C,$SZ*2(%r15)
1231         mov     $D,$SZ*3(%r15)
1232         mov     $E,$SZ*4(%r15)
1233         mov     $F,$SZ*5(%r15)
1234         mov     $G,$SZ*6(%r15)
1235         mov     $H,$SZ*7(%r15)
1236
1237         jbe     .Loop_avx2
1238         lea     (%rsp),$Tbl
1239
1240 .Ldone_avx2:
1241         lea     ($Tbl),%rsp
1242         mov     $_ivp,$ivp
1243         mov     $_rsp,%rsi
1244 .cfi_def_cfa    %rsi,8
1245         vmovdqu $iv,($ivp)              # output IV
1246         vzeroall
1247 ___
1248 $code.=<<___ if ($win64);
1249         movaps  `$framesz+16*0`(%rsp),%xmm6
1250         movaps  `$framesz+16*1`(%rsp),%xmm7
1251         movaps  `$framesz+16*2`(%rsp),%xmm8
1252         movaps  `$framesz+16*3`(%rsp),%xmm9
1253         movaps  `$framesz+16*4`(%rsp),%xmm10
1254         movaps  `$framesz+16*5`(%rsp),%xmm11
1255         movaps  `$framesz+16*6`(%rsp),%xmm12
1256         movaps  `$framesz+16*7`(%rsp),%xmm13
1257         movaps  `$framesz+16*8`(%rsp),%xmm14
1258         movaps  `$framesz+16*9`(%rsp),%xmm15
1259 ___
1260 $code.=<<___;
1261         mov     -48(%rsi),%r15
1262 .cfi_restore    %r15
1263         mov     -40(%rsi),%r14
1264 .cfi_restore    %r14
1265         mov     -32(%rsi),%r13
1266 .cfi_restore    %r13
1267         mov     -24(%rsi),%r12
1268 .cfi_restore    %r12
1269         mov     -16(%rsi),%rbp
1270 .cfi_restore    %rbp
1271         mov     -8(%rsi),%rbx
1272 .cfi_restore    %rbx
1273         lea     (%rsi),%rsp
1274 .cfi_def_cfa_register   %rsp
1275 .Lepilogue_avx2:
1276         ret
1277 .cfi_endproc
1278 .size   ${func}_avx2,.-${func}_avx2
1279 ___
1280 }}
1281 }}
1282 {{
1283 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1284
1285 my ($rounds,$Tbl)=("%r11d","%rbx");
1286
1287 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1288 my @rndkey=("%xmm4","%xmm5");
1289 my $r=0;
1290 my $sn=0;
1291
1292 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1293 my @MSG=map("%xmm$_",(10..13));
1294
1295 my $aesenc=sub {
1296   use integer;
1297   my ($n,$k)=($r/10,$r%10);
1298     if ($k==0) {
1299       $code.=<<___;
1300         movups          `16*$n`($in0),$in               # load input
1301         xorps           $rndkey0,$in
1302 ___
1303       $code.=<<___ if ($n);
1304         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1305 ___
1306       $code.=<<___;
1307         xorps           $in,$iv
1308         movups          `32+16*$k-112`($key),$rndkey[1]
1309         aesenc          $rndkey[0],$iv
1310 ___
1311     } elsif ($k==9) {
1312       $sn++;
1313       $code.=<<___;
1314         cmp             \$11,$rounds
1315         jb              .Laesenclast$sn
1316         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1317         aesenc          $rndkey[0],$iv
1318         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1319         aesenc          $rndkey[1],$iv
1320         je              .Laesenclast$sn
1321         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1322         aesenc          $rndkey[0],$iv
1323         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1324         aesenc          $rndkey[1],$iv
1325 .Laesenclast$sn:
1326         aesenclast      $rndkey[0],$iv
1327         movups          16-112($key),$rndkey[1]         # forward reference
1328         nop
1329 ___
1330     } else {
1331       $code.=<<___;
1332         movups          `32+16*$k-112`($key),$rndkey[1]
1333         aesenc          $rndkey[0],$iv
1334 ___
1335     }
1336     $r++;       unshift(@rndkey,pop(@rndkey));
1337 };
1338
1339 if ($shaext) {
1340 my $Tbl="%rax";
1341
1342 $code.=<<___;
1343 .type   ${func}_shaext,\@function,6
1344 .align  32
1345 ${func}_shaext:
1346         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1347 ___
1348 $code.=<<___ if ($win64);
1349         lea     `-8-10*16`(%rsp),%rsp
1350         movaps  %xmm6,-8-10*16(%rax)
1351         movaps  %xmm7,-8-9*16(%rax)
1352         movaps  %xmm8,-8-8*16(%rax)
1353         movaps  %xmm9,-8-7*16(%rax)
1354         movaps  %xmm10,-8-6*16(%rax)
1355         movaps  %xmm11,-8-5*16(%rax)
1356         movaps  %xmm12,-8-4*16(%rax)
1357         movaps  %xmm13,-8-3*16(%rax)
1358         movaps  %xmm14,-8-2*16(%rax)
1359         movaps  %xmm15,-8-1*16(%rax)
1360 .Lprologue_shaext:
1361 ___
1362 $code.=<<___;
1363         lea             K256+0x80(%rip),$Tbl
1364         movdqu          ($ctx),$ABEF            # DCBA
1365         movdqu          16($ctx),$CDGH          # HGFE
1366         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1367
1368         mov             240($key),$rounds
1369         sub             $in0,$out
1370         movups          ($key),$rndkey0         # $key[0]
1371         movups          ($ivp),$iv              # load IV
1372         movups          16($key),$rndkey[0]     # forward reference
1373         lea             112($key),$key          # size optimization
1374
1375         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1376         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1377         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1378         movdqa          $TMP,$BSWAP             # offload
1379         palignr         \$8,$CDGH,$ABEF         # ABEF
1380         punpcklqdq      $Wi,$CDGH               # CDGH
1381
1382         jmp     .Loop_shaext
1383
1384 .align  16
1385 .Loop_shaext:
1386         movdqu          ($inp),@MSG[0]
1387         movdqu          0x10($inp),@MSG[1]
1388         movdqu          0x20($inp),@MSG[2]
1389         pshufb          $TMP,@MSG[0]
1390         movdqu          0x30($inp),@MSG[3]
1391
1392         movdqa          0*32-0x80($Tbl),$Wi
1393         paddd           @MSG[0],$Wi
1394         pshufb          $TMP,@MSG[1]
1395         movdqa          $CDGH,$CDGH_SAVE        # offload
1396         movdqa          $ABEF,$ABEF_SAVE        # offload
1397 ___
1398         &$aesenc();
1399 $code.=<<___;
1400         sha256rnds2     $ABEF,$CDGH             # 0-3
1401         pshufd          \$0x0e,$Wi,$Wi
1402 ___
1403         &$aesenc();
1404 $code.=<<___;
1405         sha256rnds2     $CDGH,$ABEF
1406
1407         movdqa          1*32-0x80($Tbl),$Wi
1408         paddd           @MSG[1],$Wi
1409         pshufb          $TMP,@MSG[2]
1410         lea             0x40($inp),$inp
1411 ___
1412         &$aesenc();
1413 $code.=<<___;
1414         sha256rnds2     $ABEF,$CDGH             # 4-7
1415         pshufd          \$0x0e,$Wi,$Wi
1416 ___
1417         &$aesenc();
1418 $code.=<<___;
1419         sha256rnds2     $CDGH,$ABEF
1420
1421         movdqa          2*32-0x80($Tbl),$Wi
1422         paddd           @MSG[2],$Wi
1423         pshufb          $TMP,@MSG[3]
1424         sha256msg1      @MSG[1],@MSG[0]
1425 ___
1426         &$aesenc();
1427 $code.=<<___;
1428         sha256rnds2     $ABEF,$CDGH             # 8-11
1429         pshufd          \$0x0e,$Wi,$Wi
1430         movdqa          @MSG[3],$TMP
1431         palignr         \$4,@MSG[2],$TMP
1432         paddd           $TMP,@MSG[0]
1433 ___
1434         &$aesenc();
1435 $code.=<<___;
1436         sha256rnds2     $CDGH,$ABEF
1437
1438         movdqa          3*32-0x80($Tbl),$Wi
1439         paddd           @MSG[3],$Wi
1440         sha256msg2      @MSG[3],@MSG[0]
1441         sha256msg1      @MSG[2],@MSG[1]
1442 ___
1443         &$aesenc();
1444 $code.=<<___;
1445         sha256rnds2     $ABEF,$CDGH             # 12-15
1446         pshufd          \$0x0e,$Wi,$Wi
1447 ___
1448         &$aesenc();
1449 $code.=<<___;
1450         movdqa          @MSG[0],$TMP
1451         palignr         \$4,@MSG[3],$TMP
1452         paddd           $TMP,@MSG[1]
1453         sha256rnds2     $CDGH,$ABEF
1454 ___
1455 for($i=4;$i<16-3;$i++) {
1456         &$aesenc()      if (($r%10)==0);
1457 $code.=<<___;
1458         movdqa          $i*32-0x80($Tbl),$Wi
1459         paddd           @MSG[0],$Wi
1460         sha256msg2      @MSG[0],@MSG[1]
1461         sha256msg1      @MSG[3],@MSG[2]
1462 ___
1463         &$aesenc();
1464 $code.=<<___;
1465         sha256rnds2     $ABEF,$CDGH             # 16-19...
1466         pshufd          \$0x0e,$Wi,$Wi
1467         movdqa          @MSG[1],$TMP
1468         palignr         \$4,@MSG[0],$TMP
1469         paddd           $TMP,@MSG[2]
1470 ___
1471         &$aesenc();
1472         &$aesenc()      if ($r==19);
1473 $code.=<<___;
1474         sha256rnds2     $CDGH,$ABEF
1475 ___
1476         push(@MSG,shift(@MSG));
1477 }
1478 $code.=<<___;
1479         movdqa          13*32-0x80($Tbl),$Wi
1480         paddd           @MSG[0],$Wi
1481         sha256msg2      @MSG[0],@MSG[1]
1482         sha256msg1      @MSG[3],@MSG[2]
1483 ___
1484         &$aesenc();
1485 $code.=<<___;
1486         sha256rnds2     $ABEF,$CDGH             # 52-55
1487         pshufd          \$0x0e,$Wi,$Wi
1488         movdqa          @MSG[1],$TMP
1489         palignr         \$4,@MSG[0],$TMP
1490         paddd           $TMP,@MSG[2]
1491 ___
1492         &$aesenc();
1493         &$aesenc();
1494 $code.=<<___;
1495         sha256rnds2     $CDGH,$ABEF
1496
1497         movdqa          14*32-0x80($Tbl),$Wi
1498         paddd           @MSG[1],$Wi
1499         sha256msg2      @MSG[1],@MSG[2]
1500         movdqa          $BSWAP,$TMP
1501 ___
1502         &$aesenc();
1503 $code.=<<___;
1504         sha256rnds2     $ABEF,$CDGH             # 56-59
1505         pshufd          \$0x0e,$Wi,$Wi
1506 ___
1507         &$aesenc();
1508 $code.=<<___;
1509         sha256rnds2     $CDGH,$ABEF
1510
1511         movdqa          15*32-0x80($Tbl),$Wi
1512         paddd           @MSG[2],$Wi
1513 ___
1514         &$aesenc();
1515         &$aesenc();
1516 $code.=<<___;
1517         sha256rnds2     $ABEF,$CDGH             # 60-63
1518         pshufd          \$0x0e,$Wi,$Wi
1519 ___
1520         &$aesenc();
1521 $code.=<<___;
1522         sha256rnds2     $CDGH,$ABEF
1523         #pxor           $CDGH,$rndkey0          # black magic
1524 ___
1525         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1526 $code.=<<___;
1527         #xorps          $CDGH,$rndkey0          # black magic
1528         paddd           $CDGH_SAVE,$CDGH
1529         paddd           $ABEF_SAVE,$ABEF
1530
1531         dec             $len
1532         movups          $iv,48($out,$in0)       # write output
1533         lea             64($in0),$in0
1534         jnz             .Loop_shaext
1535
1536         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1537         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1538         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1539         punpckhqdq      $CDGH,$ABEF             # DCBA
1540         palignr         \$8,$TMP,$CDGH          # HGFE
1541
1542         movups          $iv,($ivp)              # write IV
1543         movdqu          $ABEF,($ctx)
1544         movdqu          $CDGH,16($ctx)
1545 ___
1546 $code.=<<___ if ($win64);
1547         movaps  0*16(%rsp),%xmm6
1548         movaps  1*16(%rsp),%xmm7
1549         movaps  2*16(%rsp),%xmm8
1550         movaps  3*16(%rsp),%xmm9
1551         movaps  4*16(%rsp),%xmm10
1552         movaps  5*16(%rsp),%xmm11
1553         movaps  6*16(%rsp),%xmm12
1554         movaps  7*16(%rsp),%xmm13
1555         movaps  8*16(%rsp),%xmm14
1556         movaps  9*16(%rsp),%xmm15
1557         lea     8+10*16(%rsp),%rsp
1558 .Lepilogue_shaext:
1559 ___
1560 $code.=<<___;
1561         ret
1562 .size   ${func}_shaext,.-${func}_shaext
1563 ___
1564 }
1565 }}}}}
1566
1567 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1568 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1569 if ($win64 && $avx) {
1570 $rec="%rcx";
1571 $frame="%rdx";
1572 $context="%r8";
1573 $disp="%r9";
1574
1575 $code.=<<___;
1576 .extern __imp_RtlVirtualUnwind
1577 .type   se_handler,\@abi-omnipotent
1578 .align  16
1579 se_handler:
1580         push    %rsi
1581         push    %rdi
1582         push    %rbx
1583         push    %rbp
1584         push    %r12
1585         push    %r13
1586         push    %r14
1587         push    %r15
1588         pushfq
1589         sub     \$64,%rsp
1590
1591         mov     120($context),%rax      # pull context->Rax
1592         mov     248($context),%rbx      # pull context->Rip
1593
1594         mov     8($disp),%rsi           # disp->ImageBase
1595         mov     56($disp),%r11          # disp->HanderlData
1596
1597         mov     0(%r11),%r10d           # HandlerData[0]
1598         lea     (%rsi,%r10),%r10        # prologue label
1599         cmp     %r10,%rbx               # context->Rip<prologue label
1600         jb      .Lin_prologue
1601
1602         mov     152($context),%rax      # pull context->Rsp
1603
1604         mov     4(%r11),%r10d           # HandlerData[1]
1605         lea     (%rsi,%r10),%r10        # epilogue label
1606         cmp     %r10,%rbx               # context->Rip>=epilogue label
1607         jae     .Lin_prologue
1608 ___
1609 $code.=<<___ if ($shaext);
1610         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1611         cmp     %r10,%rbx
1612         jb      .Lnot_in_shaext
1613
1614         lea     (%rax),%rsi
1615         lea     512($context),%rdi      # &context.Xmm6
1616         mov     \$20,%ecx
1617         .long   0xa548f3fc              # cld; rep movsq
1618         lea     168(%rax),%rax          # adjust stack pointer
1619         jmp     .Lin_prologue
1620 .Lnot_in_shaext:
1621 ___
1622 $code.=<<___ if ($avx>1);
1623         lea     .Lavx2_shortcut(%rip),%r10
1624         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1625         jb      .Lnot_in_avx2
1626
1627         and     \$-256*$SZ,%rax
1628         add     \$`2*$SZ*($rounds-8)`,%rax
1629 .Lnot_in_avx2:
1630 ___
1631 $code.=<<___;
1632         mov     %rax,%rsi               # put aside Rsp
1633         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1634
1635         mov     -8(%rax),%rbx
1636         mov     -16(%rax),%rbp
1637         mov     -24(%rax),%r12
1638         mov     -32(%rax),%r13
1639         mov     -40(%rax),%r14
1640         mov     -48(%rax),%r15
1641         mov     %rbx,144($context)      # restore context->Rbx
1642         mov     %rbp,160($context)      # restore context->Rbp
1643         mov     %r12,216($context)      # restore context->R12
1644         mov     %r13,224($context)      # restore context->R13
1645         mov     %r14,232($context)      # restore context->R14
1646         mov     %r15,240($context)      # restore context->R15
1647
1648         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1649         lea     512($context),%rdi      # &context.Xmm6
1650         mov     \$20,%ecx
1651         .long   0xa548f3fc              # cld; rep movsq
1652
1653 .Lin_prologue:
1654         mov     8(%rax),%rdi
1655         mov     16(%rax),%rsi
1656         mov     %rax,152($context)      # restore context->Rsp
1657         mov     %rsi,168($context)      # restore context->Rsi
1658         mov     %rdi,176($context)      # restore context->Rdi
1659
1660         mov     40($disp),%rdi          # disp->ContextRecord
1661         mov     $context,%rsi           # context
1662         mov     \$154,%ecx              # sizeof(CONTEXT)
1663         .long   0xa548f3fc              # cld; rep movsq
1664
1665         mov     $disp,%rsi
1666         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1667         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1668         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1669         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1670         mov     40(%rsi),%r10           # disp->ContextRecord
1671         lea     56(%rsi),%r11           # &disp->HandlerData
1672         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1673         mov     %r10,32(%rsp)           # arg5
1674         mov     %r11,40(%rsp)           # arg6
1675         mov     %r12,48(%rsp)           # arg7
1676         mov     %rcx,56(%rsp)           # arg8, (NULL)
1677         call    *__imp_RtlVirtualUnwind(%rip)
1678
1679         mov     \$1,%eax                # ExceptionContinueSearch
1680         add     \$64,%rsp
1681         popfq
1682         pop     %r15
1683         pop     %r14
1684         pop     %r13
1685         pop     %r12
1686         pop     %rbp
1687         pop     %rbx
1688         pop     %rdi
1689         pop     %rsi
1690         ret
1691 .size   se_handler,.-se_handler
1692
1693 .section        .pdata
1694         .rva    .LSEH_begin_${func}_xop
1695         .rva    .LSEH_end_${func}_xop
1696         .rva    .LSEH_info_${func}_xop
1697
1698         .rva    .LSEH_begin_${func}_avx
1699         .rva    .LSEH_end_${func}_avx
1700         .rva    .LSEH_info_${func}_avx
1701 ___
1702 $code.=<<___ if ($avx>1);
1703         .rva    .LSEH_begin_${func}_avx2
1704         .rva    .LSEH_end_${func}_avx2
1705         .rva    .LSEH_info_${func}_avx2
1706 ___
1707 $code.=<<___ if ($shaext);
1708         .rva    .LSEH_begin_${func}_shaext
1709         .rva    .LSEH_end_${func}_shaext
1710         .rva    .LSEH_info_${func}_shaext
1711 ___
1712 $code.=<<___;
1713 .section        .xdata
1714 .align  8
1715 .LSEH_info_${func}_xop:
1716         .byte   9,0,0,0
1717         .rva    se_handler
1718         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1719
1720 .LSEH_info_${func}_avx:
1721         .byte   9,0,0,0
1722         .rva    se_handler
1723         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1724 ___
1725 $code.=<<___ if ($avx>1);
1726 .LSEH_info_${func}_avx2:
1727         .byte   9,0,0,0
1728         .rva    se_handler
1729         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1730 ___
1731 $code.=<<___ if ($shaext);
1732 .LSEH_info_${func}_shaext:
1733         .byte   9,0,0,0
1734         .rva    se_handler
1735         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1736 ___
1737 }
1738
1739 ####################################################################
1740 sub rex {
1741   local *opcode=shift;
1742   my ($dst,$src)=@_;
1743   my $rex=0;
1744
1745     $rex|=0x04                  if($dst>=8);
1746     $rex|=0x01                  if($src>=8);
1747     unshift @opcode,$rex|0x40   if($rex);
1748 }
1749
1750 {
1751   my %opcodelet = (
1752                 "sha256rnds2" => 0xcb,
1753                 "sha256msg1"  => 0xcc,
1754                 "sha256msg2"  => 0xcd   );
1755
1756   sub sha256op38 {
1757     my $instr = shift;
1758
1759     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1760       my @opcode=(0x0f,0x38);
1761         rex(\@opcode,$2,$1);
1762         push @opcode,$opcodelet{$instr};
1763         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1764         return ".byte\t".join(',',@opcode);
1765     } else {
1766         return $instr."\t".@_[0];
1767     }
1768   }
1769 }
1770
1771 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1772 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1773 print $code;
1774 close STDOUT;