crypto/aes/asm/aesni-sha256-x86_64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # January 2013
  18 #
  19 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
  20 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  21 # that since AESNI-CBC encrypt exhibit *very* low instruction-level
  22 # parallelism, interleaving it with another algorithm would allow to
  23 # utilize processor resources better and achieve better performance.
  24 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
  25 # AESNI code is weaved into it. As SHA256 dominates execution time,
  26 # stitch performance does not depend on AES key length. Below are
  27 # performance numbers in cycles per processed byte, less is better,
  28 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
  29 # subroutine:
  30 #
  31 #                AES-128/-192/-256+SHA256   this(**)    gain
  32 # Sandy Bridge      5.05/6.05/7.05+11.6     13.0        +28%/36%/43%
  33 # Ivy Bridge        5.05/6.05/7.05+10.3     11.6        +32%/41%/50%
  34 # Haswell           4.43/5.29/6.19+7.80     8.79        +39%/49%/59%
  35 # Skylake           2.62/3.14/3.62+7.70     8.10        +27%/34%/40%
  36 # Bulldozer         5.77/6.89/8.00+13.7     13.7        +42%/50%/58%
  37 # Ryzen(***)        2.71/-/3.71+2.05        2.74/-/3.73 +74%/-/54%
  38 # Goldmont(***)     3.82/-/5.35+4.16        4.73/-/5.94 +69%/-/60%
  39 #
  40 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
  41 #       Westmere is omitted from loop, this is because gain was not
  42 #       estimated high enough to justify the effort;
  43 # (**)  these are EVP-free results, results obtained with 'speed
  44 #       -evp aes-256-cbc-hmac-sha256' will vary by percent or two;
  45 # (***) these are SHAEXT results;
  46
  47 # $output is the last argument if it looks like a file (it has an extension)
  48 # $flavour is the first argument if it doesn't look like a file
  49 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  50 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  51
  52 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  53
  54 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  55 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  56 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  57 die "can't locate x86_64-xlate.pl";
  58
  59 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  60                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  61         $avx = ($1>=2.19) + ($1>=2.22);
  62 }
  63
  64 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  65            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  66         $avx = ($1>=2.09) + ($1>=2.10);
  67 }
  68
  69 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  70            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  71         $avx = ($1>=10) + ($1>=12);
  72 }
  73
  74 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
  75         $avx = ($2>=3.0) + ($2>3.0);
  76 }
  77
  78 $shaext=$avx;   ### set to zero if compiling for 1.0.1
  79 $avx=1          if (!$shaext && $avx);
  80
  81 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  82     or die "can't call $xlate: $!";
  83 *STDOUT=*OUT;
  84
  85 $func="aesni_cbc_sha256_enc";
  86 $TABLE="K256";
  87 $SZ=4;
  88 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  89                                 "%r8d","%r9d","%r10d","%r11d");
  90 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
  91 @Sigma0=( 2,13,22);
  92 @Sigma1=( 6,11,25);
  93 @sigma0=( 7,18, 3);
  94 @sigma1=(17,19,10);
  95 $rounds=64;
  96
  97 ########################################################################
  98 # void aesni_cbc_sha256_enc(const void *inp,
  99 #                       void *out,
 100 #                       size_t length,
 101 #                       const AES_KEY *key,
 102 #                       unsigned char *iv,
 103 #                       SHA256_CTX *ctx,
 104 #                       const void *in0);
 105 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
 106 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 107
 108 $Tbl="%rbp";
 109
 110 $_inp="16*$SZ+0*8(%rsp)";
 111 $_out="16*$SZ+1*8(%rsp)";
 112 $_end="16*$SZ+2*8(%rsp)";
 113 $_key="16*$SZ+3*8(%rsp)";
 114 $_ivp="16*$SZ+4*8(%rsp)";
 115 $_ctx="16*$SZ+5*8(%rsp)";
 116 $_in0="16*$SZ+6*8(%rsp)";
 117 $_rsp="`16*$SZ+7*8`(%rsp)";
 118 $framesz=16*$SZ+8*8;
 119
 120 $code=<<___;
 121 .text
 122
 123 .extern OPENSSL_ia32cap_P
 124 .globl  $func
 125 .type   $func,\@abi-omnipotent
 126 .align  16
 127 $func:
 128 ___
 129                                                 if ($avx) {
 130 $code.=<<___;
 131         lea     OPENSSL_ia32cap_P(%rip),%r11
 132         mov     \$1,%eax
 133         cmp     \$0,`$win64?"%rcx":"%rdi"`
 134         je      .Lprobe
 135         mov     0(%r11),%eax
 136         mov     4(%r11),%r10
 137 ___
 138 $code.=<<___ if ($shaext);
 139         bt      \$61,%r10                       # check for SHA
 140         jc      ${func}_shaext
 141 ___
 142 $code.=<<___;
 143         mov     %r10,%r11
 144         shr     \$32,%r11
 145
 146         test    \$`1<<11`,%r10d                 # check for XOP
 147         jnz     ${func}_xop
 148 ___
 149 $code.=<<___ if ($avx>1);
 150         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
 151         cmp     \$`1<<8|1<<5|1<<3`,%r11d
 152         je      ${func}_avx2
 153 ___
 154 $code.=<<___;
 155         and     \$`1<<28`,%r10d                 # check for AVX
 156         jnz     ${func}_avx
 157         ud2
 158 ___
 159                                                 }
 160 $code.=<<___;
 161         xor     %eax,%eax
 162         cmp     \$0,`$win64?"%rcx":"%rdi"`
 163         je      .Lprobe
 164         ud2
 165 .Lprobe:
 166         ret
 167 .size   $func,.-$func
 168
 169 .align  64
 170 .type   $TABLE,\@object
 171 $TABLE:
 172         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 173         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 174         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 175         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 176         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 177         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 178         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 179         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 180         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 181         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 182         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 183         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 184         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 185         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 186         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 187         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 188         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 189         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 190         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 191         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 192         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 193         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 194         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 195         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 196         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 197         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 198         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 199         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 200         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 201         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 202         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 203         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 204
 205         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 206         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
 207         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
 208         .long   0,0,0,0,   0,0,0,0
 209         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 210 .align  64
 211 ___
 212
 213 ######################################################################
 214 # SIMD code paths
 215 #
 216 {{{
 217 ($iv,$inout,$roundkey,$temp,
 218  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
 219
 220 $aesni_cbc_idx=0;
 221 @aesni_cbc_block = (
 222 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
 223 ##      &vmovdqu        ($inout,($inp));
 224 ##      &mov            ($_inp,$inp);
 225
 226         '&vpxor         ($inout,$inout,$roundkey);'.
 227         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
 228
 229         '&vpxor         ($inout,$inout,$iv);',
 230
 231         '&vaesenc       ($inout,$inout,$roundkey);'.
 232         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
 233
 234         '&vaesenc       ($inout,$inout,$roundkey);'.
 235         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
 236
 237         '&vaesenc       ($inout,$inout,$roundkey);'.
 238         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
 239
 240         '&vaesenc       ($inout,$inout,$roundkey);'.
 241         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
 242
 243         '&vaesenc       ($inout,$inout,$roundkey);'.
 244         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
 245
 246         '&vaesenc       ($inout,$inout,$roundkey);'.
 247         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
 248
 249         '&vaesenc       ($inout,$inout,$roundkey);'.
 250         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
 251
 252         '&vaesenc       ($inout,$inout,$roundkey);'.
 253         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
 254
 255         '&vaesenc       ($inout,$inout,$roundkey);'.
 256         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
 257
 258         '&vaesenclast   ($temp,$inout,$roundkey);'.
 259         ' &vaesenc      ($inout,$inout,$roundkey);'.
 260         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
 261
 262         '&vpand         ($iv,$temp,$mask10);'.
 263         ' &vaesenc      ($inout,$inout,$roundkey);'.
 264         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
 265
 266         '&vaesenclast   ($temp,$inout,$roundkey);'.
 267         ' &vaesenc      ($inout,$inout,$roundkey);'.
 268         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
 269
 270         '&vpand         ($temp,$temp,$mask12);'.
 271         ' &vaesenc      ($inout,$inout,$roundkey);'.
 272          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
 273
 274         '&vpor          ($iv,$iv,$temp);'.
 275         ' &vaesenclast  ($temp,$inout,$roundkey);'.
 276         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
 277
 278 ##      &mov            ($inp,$_inp);
 279 ##      &mov            ($out,$_out);
 280 ##      &vpand          ($temp,$temp,$mask14);
 281 ##      &vpor           ($iv,$iv,$temp);
 282 ##      &vmovdqu        ($iv,($out,$inp);
 283 ##      &lea            (inp,16($inp));
 284 );
 285
 286 my $a4=$T1;
 287 my ($a,$b,$c,$d,$e,$f,$g,$h);
 288
 289 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
 290 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 291   my $arg = pop;
 292     $arg = "\$$arg" if ($arg*1 eq $arg);
 293     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 294 }
 295
 296 sub body_00_15 () {
 297         (
 298         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 299
 300         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
 301         '&mov   ($a,$a1)',
 302         '&mov   ($a4,$f)',
 303
 304         '&xor   ($a0,$e)',
 305         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
 306         '&xor   ($a4,$g)',                      # f^g
 307
 308         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
 309         '&xor   ($a1,$a)',
 310         '&and   ($a4,$e)',                      # (f^g)&e
 311
 312         @aesni_cbc_block[$aesni_cbc_idx++].
 313         '&xor   ($a0,$e)',
 314         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
 315         '&mov   ($a2,$a)',
 316
 317         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
 318         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
 319         '&xor   ($a2,$b)',                      # a^b, b^c in next round
 320
 321         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
 322         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
 323         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
 324
 325         '&xor   ($a1,$a)',
 326         '&add   ($h,$a0)',                      # h+=Sigma1(e)
 327         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
 328
 329         '&add   ($d,$h)',                       # d+=h
 330         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
 331         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
 332
 333         '&mov   ($a0,$d)',
 334         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
 335         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 336         );
 337 }
 338
 339 if ($avx) {{
 340 ######################################################################
 341 # XOP code path
 342 #
 343 $code.=<<___;
 344 .type   ${func}_xop,\@function,6
 345 .align  64
 346 ${func}_xop:
 347 .cfi_startproc
 348 .Lxop_shortcut:
 349         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 350         mov     %rsp,%rax               # copy %rsp
 351 .cfi_def_cfa_register   %rax
 352         push    %rbx
 353 .cfi_push       %rbx
 354         push    %rbp
 355 .cfi_push       %rbp
 356         push    %r12
 357 .cfi_push       %r12
 358         push    %r13
 359 .cfi_push       %r13
 360         push    %r14
 361 .cfi_push       %r14
 362         push    %r15
 363 .cfi_push       %r15
 364         sub     \$`$framesz+$win64*16*10`,%rsp
 365         and     \$-64,%rsp              # align stack frame
 366
 367         shl     \$6,$len
 368         sub     $inp,$out               # re-bias
 369         sub     $inp,$in0
 370         add     $inp,$len               # end of input
 371
 372         #mov    $inp,$_inp              # saved later
 373         mov     $out,$_out
 374         mov     $len,$_end
 375         #mov    $key,$_key              # remains resident in $inp register
 376         mov     $ivp,$_ivp
 377         mov     $ctx,$_ctx
 378         mov     $in0,$_in0
 379         mov     %rax,$_rsp
 380 .cfi_cfa_expression     $_rsp,deref,+8
 381 ___
 382 $code.=<<___ if ($win64);
 383         movaps  %xmm6,`$framesz+16*0`(%rsp)
 384         movaps  %xmm7,`$framesz+16*1`(%rsp)
 385         movaps  %xmm8,`$framesz+16*2`(%rsp)
 386         movaps  %xmm9,`$framesz+16*3`(%rsp)
 387         movaps  %xmm10,`$framesz+16*4`(%rsp)
 388         movaps  %xmm11,`$framesz+16*5`(%rsp)
 389         movaps  %xmm12,`$framesz+16*6`(%rsp)
 390         movaps  %xmm13,`$framesz+16*7`(%rsp)
 391         movaps  %xmm14,`$framesz+16*8`(%rsp)
 392         movaps  %xmm15,`$framesz+16*9`(%rsp)
 393 ___
 394 $code.=<<___;
 395 .Lprologue_xop:
 396         vzeroall
 397
 398         mov     $inp,%r12               # borrow $a4
 399         lea     0x80($key),$inp         # size optimization, reassign
 400         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 401         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 402         mov     $ctx,%r15               # borrow $a2
 403         mov     $in0,%rsi               # borrow $a3
 404         vmovdqu ($ivp),$iv              # load IV
 405         sub     \$9,%r14
 406
 407         mov     $SZ*0(%r15),$A
 408         mov     $SZ*1(%r15),$B
 409         mov     $SZ*2(%r15),$C
 410         mov     $SZ*3(%r15),$D
 411         mov     $SZ*4(%r15),$E
 412         mov     $SZ*5(%r15),$F
 413         mov     $SZ*6(%r15),$G
 414         mov     $SZ*7(%r15),$H
 415
 416         vmovdqa 0x00(%r13,%r14,8),$mask14
 417         vmovdqa 0x10(%r13,%r14,8),$mask12
 418         vmovdqa 0x20(%r13,%r14,8),$mask10
 419         vmovdqu 0x00-0x80($inp),$roundkey
 420         jmp     .Lloop_xop
 421 ___
 422                                         if ($SZ==4) {   # SHA256
 423     my @X = map("%xmm$_",(0..3));
 424     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 425
 426 $code.=<<___;
 427 .align  16
 428 .Lloop_xop:
 429         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 430         vmovdqu 0x00(%rsi,%r12),@X[0]
 431         vmovdqu 0x10(%rsi,%r12),@X[1]
 432         vmovdqu 0x20(%rsi,%r12),@X[2]
 433         vmovdqu 0x30(%rsi,%r12),@X[3]
 434         vpshufb $t3,@X[0],@X[0]
 435         lea     $TABLE(%rip),$Tbl
 436         vpshufb $t3,@X[1],@X[1]
 437         vpshufb $t3,@X[2],@X[2]
 438         vpaddd  0x00($Tbl),@X[0],$t0
 439         vpshufb $t3,@X[3],@X[3]
 440         vpaddd  0x20($Tbl),@X[1],$t1
 441         vpaddd  0x40($Tbl),@X[2],$t2
 442         vpaddd  0x60($Tbl),@X[3],$t3
 443         vmovdqa $t0,0x00(%rsp)
 444         mov     $A,$a1
 445         vmovdqa $t1,0x10(%rsp)
 446         mov     $B,$a3
 447         vmovdqa $t2,0x20(%rsp)
 448         xor     $C,$a3                  # magic
 449         vmovdqa $t3,0x30(%rsp)
 450         mov     $E,$a0
 451         jmp     .Lxop_00_47
 452
 453 .align  16
 454 .Lxop_00_47:
 455         sub     \$-16*2*$SZ,$Tbl        # size optimization
 456         vmovdqu (%r12),$inout           # $a4
 457         mov     %r12,$_inp              # $a4
 458 ___
 459 sub XOP_256_00_47 () {
 460 my $j = shift;
 461 my $body = shift;
 462 my @X = @_;
 463 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 464
 465         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
 466           eval(shift(@insns));
 467           eval(shift(@insns));
 468          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
 469           eval(shift(@insns));
 470           eval(shift(@insns));
 471         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
 472           eval(shift(@insns));
 473           eval(shift(@insns));
 474         &vpsrld         ($t0,$t0,$sigma0[2]);
 475           eval(shift(@insns));
 476           eval(shift(@insns));
 477          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
 478           eval(shift(@insns));
 479           eval(shift(@insns));
 480           eval(shift(@insns));
 481           eval(shift(@insns));
 482         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
 483           eval(shift(@insns));
 484           eval(shift(@insns));
 485         &vpxor          ($t0,$t0,$t1);
 486           eval(shift(@insns));
 487           eval(shift(@insns));
 488           eval(shift(@insns));
 489           eval(shift(@insns));
 490          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
 491           eval(shift(@insns));
 492           eval(shift(@insns));
 493         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
 494           eval(shift(@insns));
 495           eval(shift(@insns));
 496          &vpsrld        ($t2,@X[3],$sigma1[2]);
 497           eval(shift(@insns));
 498           eval(shift(@insns));
 499         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
 500           eval(shift(@insns));
 501           eval(shift(@insns));
 502          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 503           eval(shift(@insns));
 504           eval(shift(@insns));
 505          &vpxor         ($t3,$t3,$t2);
 506           eval(shift(@insns));
 507           eval(shift(@insns));
 508           eval(shift(@insns));
 509           eval(shift(@insns));
 510          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
 511           eval(shift(@insns));
 512           eval(shift(@insns));
 513           eval(shift(@insns));
 514           eval(shift(@insns));
 515         &vpsrldq        ($t3,$t3,8);
 516           eval(shift(@insns));
 517           eval(shift(@insns));
 518           eval(shift(@insns));
 519           eval(shift(@insns));
 520         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
 521           eval(shift(@insns));
 522           eval(shift(@insns));
 523           eval(shift(@insns));
 524           eval(shift(@insns));
 525          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
 526           eval(shift(@insns));
 527           eval(shift(@insns));
 528          &vpsrld        ($t2,@X[0],$sigma1[2]);
 529           eval(shift(@insns));
 530           eval(shift(@insns));
 531          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
 532           eval(shift(@insns));
 533           eval(shift(@insns));
 534          &vpxor         ($t3,$t3,$t2);
 535           eval(shift(@insns));
 536           eval(shift(@insns));
 537           eval(shift(@insns));
 538           eval(shift(@insns));
 539          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
 540           eval(shift(@insns));
 541           eval(shift(@insns));
 542           eval(shift(@insns));
 543           eval(shift(@insns));
 544         &vpslldq        ($t3,$t3,8);            # 22 instructions
 545           eval(shift(@insns));
 546           eval(shift(@insns));
 547           eval(shift(@insns));
 548           eval(shift(@insns));
 549         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
 550           eval(shift(@insns));
 551           eval(shift(@insns));
 552           eval(shift(@insns));
 553           eval(shift(@insns));
 554         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 555           foreach (@insns) { eval; }            # remaining instructions
 556         &vmovdqa        (16*$j."(%rsp)",$t2);
 557 }
 558
 559     $aesni_cbc_idx=0;
 560     for ($i=0,$j=0; $j<4; $j++) {
 561         &XOP_256_00_47($j,\&body_00_15,@X);
 562         push(@X,shift(@X));                     # rotate(@X)
 563     }
 564         &mov            ("%r12",$_inp);         # borrow $a4
 565         &vpand          ($temp,$temp,$mask14);
 566         &mov            ("%r15",$_out);         # borrow $a2
 567         &vpor           ($iv,$iv,$temp);
 568         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 569         &lea            ("%r12","16(%r12)");    # inp++
 570
 571         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 572         &jne    (".Lxop_00_47");
 573
 574         &vmovdqu        ($inout,"(%r12)");
 575         &mov            ($_inp,"%r12");
 576
 577     $aesni_cbc_idx=0;
 578     for ($i=0; $i<16; ) {
 579         foreach(body_00_15()) { eval; }
 580     }
 581                                         }
 582 $code.=<<___;
 583         mov     $_inp,%r12              # borrow $a4
 584         mov     $_out,%r13              # borrow $a0
 585         mov     $_ctx,%r15              # borrow $a2
 586         mov     $_in0,%rsi              # borrow $a3
 587
 588         vpand   $mask14,$temp,$temp
 589         mov     $a1,$A
 590         vpor    $temp,$iv,$iv
 591         vmovdqu $iv,(%r13,%r12)         # write output
 592         lea     16(%r12),%r12           # inp++
 593
 594         add     $SZ*0(%r15),$A
 595         add     $SZ*1(%r15),$B
 596         add     $SZ*2(%r15),$C
 597         add     $SZ*3(%r15),$D
 598         add     $SZ*4(%r15),$E
 599         add     $SZ*5(%r15),$F
 600         add     $SZ*6(%r15),$G
 601         add     $SZ*7(%r15),$H
 602
 603         cmp     $_end,%r12
 604
 605         mov     $A,$SZ*0(%r15)
 606         mov     $B,$SZ*1(%r15)
 607         mov     $C,$SZ*2(%r15)
 608         mov     $D,$SZ*3(%r15)
 609         mov     $E,$SZ*4(%r15)
 610         mov     $F,$SZ*5(%r15)
 611         mov     $G,$SZ*6(%r15)
 612         mov     $H,$SZ*7(%r15)
 613
 614         jb      .Lloop_xop
 615
 616         mov     $_ivp,$ivp
 617         mov     $_rsp,%rsi
 618 .cfi_def_cfa    %rsi,8
 619         vmovdqu $iv,($ivp)              # output IV
 620         vzeroall
 621 ___
 622 $code.=<<___ if ($win64);
 623         movaps  `$framesz+16*0`(%rsp),%xmm6
 624         movaps  `$framesz+16*1`(%rsp),%xmm7
 625         movaps  `$framesz+16*2`(%rsp),%xmm8
 626         movaps  `$framesz+16*3`(%rsp),%xmm9
 627         movaps  `$framesz+16*4`(%rsp),%xmm10
 628         movaps  `$framesz+16*5`(%rsp),%xmm11
 629         movaps  `$framesz+16*6`(%rsp),%xmm12
 630         movaps  `$framesz+16*7`(%rsp),%xmm13
 631         movaps  `$framesz+16*8`(%rsp),%xmm14
 632         movaps  `$framesz+16*9`(%rsp),%xmm15
 633 ___
 634 $code.=<<___;
 635         mov     -48(%rsi),%r15
 636 .cfi_restore    %r15
 637         mov     -40(%rsi),%r14
 638 .cfi_restore    %r14
 639         mov     -32(%rsi),%r13
 640 .cfi_restore    %r13
 641         mov     -24(%rsi),%r12
 642 .cfi_restore    %r12
 643         mov     -16(%rsi),%rbp
 644 .cfi_restore    %rbp
 645         mov     -8(%rsi),%rbx
 646 .cfi_restore    %rbx
 647         lea     (%rsi),%rsp
 648 .cfi_def_cfa_register   %rsp
 649 .Lepilogue_xop:
 650         ret
 651 .cfi_endproc
 652 .size   ${func}_xop,.-${func}_xop
 653 ___
 654 ######################################################################
 655 # AVX+shrd code path
 656 #
 657 local *ror = sub { &shrd(@_[0],@_) };
 658
 659 $code.=<<___;
 660 .type   ${func}_avx,\@function,6
 661 .align  64
 662 ${func}_avx:
 663 .cfi_startproc
 664 .Lavx_shortcut:
 665         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 666         mov     %rsp,%rax               # copy %rsp
 667 .cfi_def_cfa_register   %rax
 668         push    %rbx
 669 .cfi_push       %rbx
 670         push    %rbp
 671 .cfi_push       %rbp
 672         push    %r12
 673 .cfi_push       %r12
 674         push    %r13
 675 .cfi_push       %r13
 676         push    %r14
 677 .cfi_push       %r14
 678         push    %r15
 679 .cfi_push       %r15
 680         sub     \$`$framesz+$win64*16*10`,%rsp
 681         and     \$-64,%rsp              # align stack frame
 682
 683         shl     \$6,$len
 684         sub     $inp,$out               # re-bias
 685         sub     $inp,$in0
 686         add     $inp,$len               # end of input
 687
 688         #mov    $inp,$_inp              # saved later
 689         mov     $out,$_out
 690         mov     $len,$_end
 691         #mov    $key,$_key              # remains resident in $inp register
 692         mov     $ivp,$_ivp
 693         mov     $ctx,$_ctx
 694         mov     $in0,$_in0
 695         mov     %rax,$_rsp
 696 .cfi_cfa_expression     $_rsp,deref,+8
 697 ___
 698 $code.=<<___ if ($win64);
 699         movaps  %xmm6,`$framesz+16*0`(%rsp)
 700         movaps  %xmm7,`$framesz+16*1`(%rsp)
 701         movaps  %xmm8,`$framesz+16*2`(%rsp)
 702         movaps  %xmm9,`$framesz+16*3`(%rsp)
 703         movaps  %xmm10,`$framesz+16*4`(%rsp)
 704         movaps  %xmm11,`$framesz+16*5`(%rsp)
 705         movaps  %xmm12,`$framesz+16*6`(%rsp)
 706         movaps  %xmm13,`$framesz+16*7`(%rsp)
 707         movaps  %xmm14,`$framesz+16*8`(%rsp)
 708         movaps  %xmm15,`$framesz+16*9`(%rsp)
 709 ___
 710 $code.=<<___;
 711 .Lprologue_avx:
 712         vzeroall
 713
 714         mov     $inp,%r12               # borrow $a4
 715         lea     0x80($key),$inp         # size optimization, reassign
 716         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
 717         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
 718         mov     $ctx,%r15               # borrow $a2
 719         mov     $in0,%rsi               # borrow $a3
 720         vmovdqu ($ivp),$iv              # load IV
 721         sub     \$9,%r14
 722
 723         mov     $SZ*0(%r15),$A
 724         mov     $SZ*1(%r15),$B
 725         mov     $SZ*2(%r15),$C
 726         mov     $SZ*3(%r15),$D
 727         mov     $SZ*4(%r15),$E
 728         mov     $SZ*5(%r15),$F
 729         mov     $SZ*6(%r15),$G
 730         mov     $SZ*7(%r15),$H
 731
 732         vmovdqa 0x00(%r13,%r14,8),$mask14
 733         vmovdqa 0x10(%r13,%r14,8),$mask12
 734         vmovdqa 0x20(%r13,%r14,8),$mask10
 735         vmovdqu 0x00-0x80($inp),$roundkey
 736 ___
 737                                         if ($SZ==4) {   # SHA256
 738     my @X = map("%xmm$_",(0..3));
 739     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
 740
 741 $code.=<<___;
 742         jmp     .Lloop_avx
 743 .align  16
 744 .Lloop_avx:
 745         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
 746         vmovdqu 0x00(%rsi,%r12),@X[0]
 747         vmovdqu 0x10(%rsi,%r12),@X[1]
 748         vmovdqu 0x20(%rsi,%r12),@X[2]
 749         vmovdqu 0x30(%rsi,%r12),@X[3]
 750         vpshufb $t3,@X[0],@X[0]
 751         lea     $TABLE(%rip),$Tbl
 752         vpshufb $t3,@X[1],@X[1]
 753         vpshufb $t3,@X[2],@X[2]
 754         vpaddd  0x00($Tbl),@X[0],$t0
 755         vpshufb $t3,@X[3],@X[3]
 756         vpaddd  0x20($Tbl),@X[1],$t1
 757         vpaddd  0x40($Tbl),@X[2],$t2
 758         vpaddd  0x60($Tbl),@X[3],$t3
 759         vmovdqa $t0,0x00(%rsp)
 760         mov     $A,$a1
 761         vmovdqa $t1,0x10(%rsp)
 762         mov     $B,$a3
 763         vmovdqa $t2,0x20(%rsp)
 764         xor     $C,$a3                  # magic
 765         vmovdqa $t3,0x30(%rsp)
 766         mov     $E,$a0
 767         jmp     .Lavx_00_47
 768
 769 .align  16
 770 .Lavx_00_47:
 771         sub     \$-16*2*$SZ,$Tbl        # size optimization
 772         vmovdqu (%r12),$inout           # $a4
 773         mov     %r12,$_inp              # $a4
 774 ___
 775 sub Xupdate_256_AVX () {
 776         (
 777         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
 778          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
 779         '&vpsrld        ($t2,$t0,$sigma0[0]);',
 780          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
 781         '&vpsrld        ($t3,$t0,$sigma0[2])',
 782         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
 783         '&vpxor         ($t0,$t3,$t2)',
 784          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
 785         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
 786         '&vpxor         ($t0,$t0,$t1)',
 787         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
 788         '&vpxor         ($t0,$t0,$t2)',
 789          '&vpsrld       ($t2,$t3,$sigma1[2]);',
 790         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
 791          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
 792         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
 793          '&vpxor        ($t2,$t2,$t3);',
 794          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 795          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
 796          '&vpshufd      ($t2,$t2,0b10000100)',
 797          '&vpsrldq      ($t2,$t2,8)',
 798         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
 799          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
 800          '&vpsrld       ($t2,$t3,$sigma1[2])',
 801          '&vpsrlq       ($t3,$t3,$sigma1[0])',
 802          '&vpxor        ($t2,$t2,$t3);',
 803          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
 804          '&vpxor        ($t2,$t2,$t3)',
 805          '&vpshufd      ($t2,$t2,0b11101000)',
 806          '&vpslldq      ($t2,$t2,8)',
 807         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
 808         );
 809 }
 810
 811 sub AVX_256_00_47 () {
 812 my $j = shift;
 813 my $body = shift;
 814 my @X = @_;
 815 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
 816
 817         foreach (Xupdate_256_AVX()) {           # 29 instructions
 818             eval;
 819             eval(shift(@insns));
 820             eval(shift(@insns));
 821             eval(shift(@insns));
 822         }
 823         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
 824           foreach (@insns) { eval; }            # remaining instructions
 825         &vmovdqa        (16*$j."(%rsp)",$t2);
 826 }
 827
 828     $aesni_cbc_idx=0;
 829     for ($i=0,$j=0; $j<4; $j++) {
 830         &AVX_256_00_47($j,\&body_00_15,@X);
 831         push(@X,shift(@X));                     # rotate(@X)
 832     }
 833         &mov            ("%r12",$_inp);         # borrow $a4
 834         &vpand          ($temp,$temp,$mask14);
 835         &mov            ("%r15",$_out);         # borrow $a2
 836         &vpor           ($iv,$iv,$temp);
 837         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
 838         &lea            ("%r12","16(%r12)");    # inp++
 839
 840         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
 841         &jne    (".Lavx_00_47");
 842
 843         &vmovdqu        ($inout,"(%r12)");
 844         &mov            ($_inp,"%r12");
 845
 846     $aesni_cbc_idx=0;
 847     for ($i=0; $i<16; ) {
 848         foreach(body_00_15()) { eval; }
 849     }
 850
 851                                         }
 852 $code.=<<___;
 853         mov     $_inp,%r12              # borrow $a4
 854         mov     $_out,%r13              # borrow $a0
 855         mov     $_ctx,%r15              # borrow $a2
 856         mov     $_in0,%rsi              # borrow $a3
 857
 858         vpand   $mask14,$temp,$temp
 859         mov     $a1,$A
 860         vpor    $temp,$iv,$iv
 861         vmovdqu $iv,(%r13,%r12)         # write output
 862         lea     16(%r12),%r12           # inp++
 863
 864         add     $SZ*0(%r15),$A
 865         add     $SZ*1(%r15),$B
 866         add     $SZ*2(%r15),$C
 867         add     $SZ*3(%r15),$D
 868         add     $SZ*4(%r15),$E
 869         add     $SZ*5(%r15),$F
 870         add     $SZ*6(%r15),$G
 871         add     $SZ*7(%r15),$H
 872
 873         cmp     $_end,%r12
 874
 875         mov     $A,$SZ*0(%r15)
 876         mov     $B,$SZ*1(%r15)
 877         mov     $C,$SZ*2(%r15)
 878         mov     $D,$SZ*3(%r15)
 879         mov     $E,$SZ*4(%r15)
 880         mov     $F,$SZ*5(%r15)
 881         mov     $G,$SZ*6(%r15)
 882         mov     $H,$SZ*7(%r15)
 883         jb      .Lloop_avx
 884
 885         mov     $_ivp,$ivp
 886         mov     $_rsp,%rsi
 887 .cfi_def_cfa    %rsi,8
 888         vmovdqu $iv,($ivp)              # output IV
 889         vzeroall
 890 ___
 891 $code.=<<___ if ($win64);
 892         movaps  `$framesz+16*0`(%rsp),%xmm6
 893         movaps  `$framesz+16*1`(%rsp),%xmm7
 894         movaps  `$framesz+16*2`(%rsp),%xmm8
 895         movaps  `$framesz+16*3`(%rsp),%xmm9
 896         movaps  `$framesz+16*4`(%rsp),%xmm10
 897         movaps  `$framesz+16*5`(%rsp),%xmm11
 898         movaps  `$framesz+16*6`(%rsp),%xmm12
 899         movaps  `$framesz+16*7`(%rsp),%xmm13
 900         movaps  `$framesz+16*8`(%rsp),%xmm14
 901         movaps  `$framesz+16*9`(%rsp),%xmm15
 902 ___
 903 $code.=<<___;
 904         mov     -48(%rsi),%r15
 905 .cfi_restore    %r15
 906         mov     -40(%rsi),%r14
 907 .cfi_restore    %r14
 908         mov     -32(%rsi),%r13
 909 .cfi_restore    %r13
 910         mov     -24(%rsi),%r12
 911 .cfi_restore    %r12
 912         mov     -16(%rsi),%rbp
 913 .cfi_restore    %rbp
 914         mov     -8(%rsi),%rbx
 915 .cfi_restore    %rbx
 916         lea     (%rsi),%rsp
 917 .cfi_def_cfa_register   %rsp
 918 .Lepilogue_avx:
 919         ret
 920 .cfi_endproc
 921 .size   ${func}_avx,.-${func}_avx
 922 ___
 923
 924 if ($avx>1) {{
 925 ######################################################################
 926 # AVX2+BMI code path
 927 #
 928 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
 929 my $PUSH8=8*2*$SZ;
 930 use integer;
 931
 932 sub bodyx_00_15 () {
 933         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
 934         (
 935         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
 936
 937         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
 938         '&and   ($a4,$e)',              # f&e
 939         '&rorx  ($a0,$e,$Sigma1[2])',
 940         '&rorx  ($a2,$e,$Sigma1[1])',
 941
 942         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
 943         '&lea   ($h,"($h,$a4)")',
 944         '&andn  ($a4,$e,$g)',           # ~e&g
 945         '&xor   ($a0,$a2)',
 946
 947         '&rorx  ($a1,$e,$Sigma1[0])',
 948         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
 949         '&xor   ($a0,$a1)',             # Sigma1(e)
 950         '&mov   ($a2,$a)',
 951
 952         '&rorx  ($a4,$a,$Sigma0[2])',
 953         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
 954         '&xor   ($a2,$b)',              # a^b, b^c in next round
 955         '&rorx  ($a1,$a,$Sigma0[1])',
 956
 957         '&rorx  ($a0,$a,$Sigma0[0])',
 958         '&lea   ($d,"($d,$h)")',        # d+=h
 959         '&and   ($a3,$a2)',             # (b^c)&(a^b)
 960         @aesni_cbc_block[$aesni_cbc_idx++].
 961         '&xor   ($a1,$a4)',
 962
 963         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
 964         '&xor   ($a1,$a0)',             # Sigma0(a)
 965         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
 966         '&mov   ($a4,$e)',              # copy of f in future
 967
 968         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
 969         );
 970         # and at the finish one has to $a+=$a1
 971 }
 972
 973 $code.=<<___;
 974 .type   ${func}_avx2,\@function,6
 975 .align  64
 976 ${func}_avx2:
 977 .cfi_startproc
 978 .Lavx2_shortcut:
 979         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
 980         mov     %rsp,%rax               # copy %rsp
 981 .cfi_def_cfa_register   %rax
 982         push    %rbx
 983 .cfi_push       %rbx
 984         push    %rbp
 985 .cfi_push       %rbp
 986         push    %r12
 987 .cfi_push       %r12
 988         push    %r13
 989 .cfi_push       %r13
 990         push    %r14
 991 .cfi_push       %r14
 992         push    %r15
 993 .cfi_push       %r15
 994         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
 995         and     \$-256*$SZ,%rsp         # align stack frame
 996         add     \$`2*$SZ*($rounds-8)`,%rsp
 997
 998         shl     \$6,$len
 999         sub     $inp,$out               # re-bias
1000         sub     $inp,$in0
1001         add     $inp,$len               # end of input
1002
1003         #mov    $inp,$_inp              # saved later
1004         #mov    $out,$_out              # kept in $offload
1005         mov     $len,$_end
1006         #mov    $key,$_key              # remains resident in $inp register
1007         mov     $ivp,$_ivp
1008         mov     $ctx,$_ctx
1009         mov     $in0,$_in0
1010         mov     %rax,$_rsp
1011 .cfi_cfa_expression     $_rsp,deref,+8
1012 ___
1013 $code.=<<___ if ($win64);
1014         movaps  %xmm6,`$framesz+16*0`(%rsp)
1015         movaps  %xmm7,`$framesz+16*1`(%rsp)
1016         movaps  %xmm8,`$framesz+16*2`(%rsp)
1017         movaps  %xmm9,`$framesz+16*3`(%rsp)
1018         movaps  %xmm10,`$framesz+16*4`(%rsp)
1019         movaps  %xmm11,`$framesz+16*5`(%rsp)
1020         movaps  %xmm12,`$framesz+16*6`(%rsp)
1021         movaps  %xmm13,`$framesz+16*7`(%rsp)
1022         movaps  %xmm14,`$framesz+16*8`(%rsp)
1023         movaps  %xmm15,`$framesz+16*9`(%rsp)
1024 ___
1025 $code.=<<___;
1026 .Lprologue_avx2:
1027         vzeroall
1028
1029         mov     $inp,%r13               # borrow $a0
1030         vpinsrq \$1,$out,$offload,$offload
1031         lea     0x80($key),$inp         # size optimization, reassign
1032         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1033         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1034         mov     $ctx,%r15               # borrow $a2
1035         mov     $in0,%rsi               # borrow $a3
1036         vmovdqu ($ivp),$iv              # load IV
1037         lea     -9(%r14),%r14
1038
1039         vmovdqa 0x00(%r12,%r14,8),$mask14
1040         vmovdqa 0x10(%r12,%r14,8),$mask12
1041         vmovdqa 0x20(%r12,%r14,8),$mask10
1042
1043         sub     \$-16*$SZ,%r13          # inp++, size optimization
1044         mov     $SZ*0(%r15),$A
1045         lea     (%rsi,%r13),%r12        # borrow $a0
1046         mov     $SZ*1(%r15),$B
1047         cmp     $len,%r13               # $_end
1048         mov     $SZ*2(%r15),$C
1049         cmove   %rsp,%r12               # next block or random data
1050         mov     $SZ*3(%r15),$D
1051         mov     $SZ*4(%r15),$E
1052         mov     $SZ*5(%r15),$F
1053         mov     $SZ*6(%r15),$G
1054         mov     $SZ*7(%r15),$H
1055         vmovdqu 0x00-0x80($inp),$roundkey
1056 ___
1057                                         if ($SZ==4) {   # SHA256
1058     my @X = map("%ymm$_",(0..3));
1059     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1060
1061 $code.=<<___;
1062         jmp     .Loop_avx2
1063 .align  16
1064 .Loop_avx2:
1065         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1066         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1067         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1068         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1069         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1070
1071         vinserti128     \$1,(%r12),@X[0],@X[0]
1072         vinserti128     \$1,16(%r12),@X[1],@X[1]
1073          vpshufb        $t3,@X[0],@X[0]
1074         vinserti128     \$1,32(%r12),@X[2],@X[2]
1075          vpshufb        $t3,@X[1],@X[1]
1076         vinserti128     \$1,48(%r12),@X[3],@X[3]
1077
1078         lea     $TABLE(%rip),$Tbl
1079         vpshufb $t3,@X[2],@X[2]
1080         lea     -16*$SZ(%r13),%r13
1081         vpaddd  0x00($Tbl),@X[0],$t0
1082         vpshufb $t3,@X[3],@X[3]
1083         vpaddd  0x20($Tbl),@X[1],$t1
1084         vpaddd  0x40($Tbl),@X[2],$t2
1085         vpaddd  0x60($Tbl),@X[3],$t3
1086         vmovdqa $t0,0x00(%rsp)
1087         xor     $a1,$a1
1088         vmovdqa $t1,0x20(%rsp)
1089         lea     -$PUSH8(%rsp),%rsp
1090         mov     $B,$a3
1091         vmovdqa $t2,0x00(%rsp)
1092         xor     $C,$a3                  # magic
1093         vmovdqa $t3,0x20(%rsp)
1094         mov     $F,$a4
1095         sub     \$-16*2*$SZ,$Tbl        # size optimization
1096         jmp     .Lavx2_00_47
1097
1098 .align  16
1099 .Lavx2_00_47:
1100         vmovdqu (%r13),$inout
1101         vpinsrq \$0,%r13,$offload,$offload
1102 ___
1103
1104 sub AVX2_256_00_47 () {
1105 my $j = shift;
1106 my $body = shift;
1107 my @X = @_;
1108 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1109 my $base = "+2*$PUSH8(%rsp)";
1110
1111         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1112         foreach (Xupdate_256_AVX()) {           # 29 instructions
1113             eval;
1114             eval(shift(@insns));
1115             eval(shift(@insns));
1116             eval(shift(@insns));
1117         }
1118         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1119           foreach (@insns) { eval; }            # remaining instructions
1120         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1121 }
1122     $aesni_cbc_idx=0;
1123     for ($i=0,$j=0; $j<4; $j++) {
1124         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1125         push(@X,shift(@X));                     # rotate(@X)
1126     }
1127         &vmovq          ("%r13",$offload);      # borrow $a0
1128         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1129         &vpand          ($temp,$temp,$mask14);
1130         &vpor           ($iv,$iv,$temp);
1131         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1132         &lea            ("%r13","16(%r13)");    # inp++
1133
1134         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1135         &cmpb   (($SZ-1)."($Tbl)",0);
1136         &jne    (".Lavx2_00_47");
1137
1138         &vmovdqu        ($inout,"(%r13)");
1139         &vpinsrq        ($offload,$offload,"%r13",0);
1140
1141     $aesni_cbc_idx=0;
1142     for ($i=0; $i<16; ) {
1143         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1144         foreach(bodyx_00_15()) { eval; }
1145     }
1146                                         }
1147 $code.=<<___;
1148         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1149         vmovq   $offload,%r13                   # $_inp, borrow $a0
1150         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1151         add     $a1,$A
1152         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1153
1154         vpand   $mask14,$temp,$temp
1155         vpor    $temp,$iv,$iv
1156         vmovdqu $iv,(%r12,%r13)                 # write output
1157         lea     16(%r13),%r13
1158
1159         add     $SZ*0(%r15),$A
1160         add     $SZ*1(%r15),$B
1161         add     $SZ*2(%r15),$C
1162         add     $SZ*3(%r15),$D
1163         add     $SZ*4(%r15),$E
1164         add     $SZ*5(%r15),$F
1165         add     $SZ*6(%r15),$G
1166         add     $SZ*7(%r15),$H
1167
1168         mov     $A,$SZ*0(%r15)
1169         mov     $B,$SZ*1(%r15)
1170         mov     $C,$SZ*2(%r15)
1171         mov     $D,$SZ*3(%r15)
1172         mov     $E,$SZ*4(%r15)
1173         mov     $F,$SZ*5(%r15)
1174         mov     $G,$SZ*6(%r15)
1175         mov     $H,$SZ*7(%r15)
1176
1177         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1178         je      .Ldone_avx2
1179
1180         xor     $a1,$a1
1181         mov     $B,$a3
1182         mov     $F,$a4
1183         xor     $C,$a3                  # magic
1184         jmp     .Lower_avx2
1185 .align  16
1186 .Lower_avx2:
1187         vmovdqu (%r13),$inout
1188         vpinsrq \$0,%r13,$offload,$offload
1189 ___
1190     $aesni_cbc_idx=0;
1191     for ($i=0; $i<16; ) {
1192         my $base="+16($Tbl)";
1193         foreach(bodyx_00_15()) { eval; }
1194         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1195     }
1196 $code.=<<___;
1197         vmovq   $offload,%r13                   # borrow $a0
1198         vpextrq \$1,$offload,%r15               # borrow $a2
1199         vpand   $mask14,$temp,$temp
1200         vpor    $temp,$iv,$iv
1201         lea     -$PUSH8($Tbl),$Tbl
1202         vmovdqu $iv,(%r15,%r13)                 # write output
1203         lea     16(%r13),%r13                   # inp++
1204         cmp     %rsp,$Tbl
1205         jae     .Lower_avx2
1206
1207         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1208         lea     16*$SZ(%r13),%r13
1209         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1210         add     $a1,$A
1211         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1212
1213         add     $SZ*0(%r15),$A
1214         add     $SZ*1(%r15),$B
1215         add     $SZ*2(%r15),$C
1216         add     $SZ*3(%r15),$D
1217         add     $SZ*4(%r15),$E
1218         add     $SZ*5(%r15),$F
1219         add     $SZ*6(%r15),$G
1220         lea     (%rsi,%r13),%r12
1221         add     $SZ*7(%r15),$H
1222
1223         cmp     $_end,%r13
1224
1225         mov     $A,$SZ*0(%r15)
1226         cmove   %rsp,%r12               # next block or stale data
1227         mov     $B,$SZ*1(%r15)
1228         mov     $C,$SZ*2(%r15)
1229         mov     $D,$SZ*3(%r15)
1230         mov     $E,$SZ*4(%r15)
1231         mov     $F,$SZ*5(%r15)
1232         mov     $G,$SZ*6(%r15)
1233         mov     $H,$SZ*7(%r15)
1234
1235         jbe     .Loop_avx2
1236         lea     (%rsp),$Tbl
1237
1238 .Ldone_avx2:
1239         lea     ($Tbl),%rsp
1240         mov     $_ivp,$ivp
1241         mov     $_rsp,%rsi
1242 .cfi_def_cfa    %rsi,8
1243         vmovdqu $iv,($ivp)              # output IV
1244         vzeroall
1245 ___
1246 $code.=<<___ if ($win64);
1247         movaps  `$framesz+16*0`(%rsp),%xmm6
1248         movaps  `$framesz+16*1`(%rsp),%xmm7
1249         movaps  `$framesz+16*2`(%rsp),%xmm8
1250         movaps  `$framesz+16*3`(%rsp),%xmm9
1251         movaps  `$framesz+16*4`(%rsp),%xmm10
1252         movaps  `$framesz+16*5`(%rsp),%xmm11
1253         movaps  `$framesz+16*6`(%rsp),%xmm12
1254         movaps  `$framesz+16*7`(%rsp),%xmm13
1255         movaps  `$framesz+16*8`(%rsp),%xmm14
1256         movaps  `$framesz+16*9`(%rsp),%xmm15
1257 ___
1258 $code.=<<___;
1259         mov     -48(%rsi),%r15
1260 .cfi_restore    %r15
1261         mov     -40(%rsi),%r14
1262 .cfi_restore    %r14
1263         mov     -32(%rsi),%r13
1264 .cfi_restore    %r13
1265         mov     -24(%rsi),%r12
1266 .cfi_restore    %r12
1267         mov     -16(%rsi),%rbp
1268 .cfi_restore    %rbp
1269         mov     -8(%rsi),%rbx
1270 .cfi_restore    %rbx
1271         lea     (%rsi),%rsp
1272 .cfi_def_cfa_register   %rsp
1273 .Lepilogue_avx2:
1274         ret
1275 .cfi_endproc
1276 .size   ${func}_avx2,.-${func}_avx2
1277 ___
1278 }}
1279 }}
1280 {{
1281 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1282
1283 my ($rounds,$Tbl)=("%r11d","%rbx");
1284
1285 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1286 my @rndkey=("%xmm4","%xmm5");
1287 my $r=0;
1288 my $sn=0;
1289
1290 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1291 my @MSG=map("%xmm$_",(10..13));
1292
1293 my $aesenc=sub {
1294   use integer;
1295   my ($n,$k)=($r/10,$r%10);
1296     if ($k==0) {
1297       $code.=<<___;
1298         movups          `16*$n`($in0),$in               # load input
1299         xorps           $rndkey0,$in
1300 ___
1301       $code.=<<___ if ($n);
1302         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1303 ___
1304       $code.=<<___;
1305         xorps           $in,$iv
1306         movups          `32+16*$k-112`($key),$rndkey[1]
1307         aesenc          $rndkey[0],$iv
1308 ___
1309     } elsif ($k==9) {
1310       $sn++;
1311       $code.=<<___;
1312         cmp             \$11,$rounds
1313         jb              .Laesenclast$sn
1314         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1315         aesenc          $rndkey[0],$iv
1316         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1317         aesenc          $rndkey[1],$iv
1318         je              .Laesenclast$sn
1319         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1320         aesenc          $rndkey[0],$iv
1321         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1322         aesenc          $rndkey[1],$iv
1323 .Laesenclast$sn:
1324         aesenclast      $rndkey[0],$iv
1325         movups          16-112($key),$rndkey[1]         # forward reference
1326         nop
1327 ___
1328     } else {
1329       $code.=<<___;
1330         movups          `32+16*$k-112`($key),$rndkey[1]
1331         aesenc          $rndkey[0],$iv
1332 ___
1333     }
1334     $r++;       unshift(@rndkey,pop(@rndkey));
1335 };
1336
1337 if ($shaext) {
1338 my $Tbl="%rax";
1339
1340 $code.=<<___;
1341 .type   ${func}_shaext,\@function,6
1342 .align  32
1343 ${func}_shaext:
1344         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1345 ___
1346 $code.=<<___ if ($win64);
1347         lea     `-8-10*16`(%rsp),%rsp
1348         movaps  %xmm6,-8-10*16(%rax)
1349         movaps  %xmm7,-8-9*16(%rax)
1350         movaps  %xmm8,-8-8*16(%rax)
1351         movaps  %xmm9,-8-7*16(%rax)
1352         movaps  %xmm10,-8-6*16(%rax)
1353         movaps  %xmm11,-8-5*16(%rax)
1354         movaps  %xmm12,-8-4*16(%rax)
1355         movaps  %xmm13,-8-3*16(%rax)
1356         movaps  %xmm14,-8-2*16(%rax)
1357         movaps  %xmm15,-8-1*16(%rax)
1358 .Lprologue_shaext:
1359 ___
1360 $code.=<<___;
1361         lea             K256+0x80(%rip),$Tbl
1362         movdqu          ($ctx),$ABEF            # DCBA
1363         movdqu          16($ctx),$CDGH          # HGFE
1364         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1365
1366         mov             240($key),$rounds
1367         sub             $in0,$out
1368         movups          ($key),$rndkey0         # $key[0]
1369         movups          ($ivp),$iv              # load IV
1370         movups          16($key),$rndkey[0]     # forward reference
1371         lea             112($key),$key          # size optimization
1372
1373         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1374         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1375         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1376         movdqa          $TMP,$BSWAP             # offload
1377         palignr         \$8,$CDGH,$ABEF         # ABEF
1378         punpcklqdq      $Wi,$CDGH               # CDGH
1379
1380         jmp     .Loop_shaext
1381
1382 .align  16
1383 .Loop_shaext:
1384         movdqu          ($inp),@MSG[0]
1385         movdqu          0x10($inp),@MSG[1]
1386         movdqu          0x20($inp),@MSG[2]
1387         pshufb          $TMP,@MSG[0]
1388         movdqu          0x30($inp),@MSG[3]
1389
1390         movdqa          0*32-0x80($Tbl),$Wi
1391         paddd           @MSG[0],$Wi
1392         pshufb          $TMP,@MSG[1]
1393         movdqa          $CDGH,$CDGH_SAVE        # offload
1394         movdqa          $ABEF,$ABEF_SAVE        # offload
1395 ___
1396         &$aesenc();
1397 $code.=<<___;
1398         sha256rnds2     $ABEF,$CDGH             # 0-3
1399         pshufd          \$0x0e,$Wi,$Wi
1400 ___
1401         &$aesenc();
1402 $code.=<<___;
1403         sha256rnds2     $CDGH,$ABEF
1404
1405         movdqa          1*32-0x80($Tbl),$Wi
1406         paddd           @MSG[1],$Wi
1407         pshufb          $TMP,@MSG[2]
1408         lea             0x40($inp),$inp
1409 ___
1410         &$aesenc();
1411 $code.=<<___;
1412         sha256rnds2     $ABEF,$CDGH             # 4-7
1413         pshufd          \$0x0e,$Wi,$Wi
1414 ___
1415         &$aesenc();
1416 $code.=<<___;
1417         sha256rnds2     $CDGH,$ABEF
1418
1419         movdqa          2*32-0x80($Tbl),$Wi
1420         paddd           @MSG[2],$Wi
1421         pshufb          $TMP,@MSG[3]
1422         sha256msg1      @MSG[1],@MSG[0]
1423 ___
1424         &$aesenc();
1425 $code.=<<___;
1426         sha256rnds2     $ABEF,$CDGH             # 8-11
1427         pshufd          \$0x0e,$Wi,$Wi
1428         movdqa          @MSG[3],$TMP
1429         palignr         \$4,@MSG[2],$TMP
1430         paddd           $TMP,@MSG[0]
1431 ___
1432         &$aesenc();
1433 $code.=<<___;
1434         sha256rnds2     $CDGH,$ABEF
1435
1436         movdqa          3*32-0x80($Tbl),$Wi
1437         paddd           @MSG[3],$Wi
1438         sha256msg2      @MSG[3],@MSG[0]
1439         sha256msg1      @MSG[2],@MSG[1]
1440 ___
1441         &$aesenc();
1442 $code.=<<___;
1443         sha256rnds2     $ABEF,$CDGH             # 12-15
1444         pshufd          \$0x0e,$Wi,$Wi
1445 ___
1446         &$aesenc();
1447 $code.=<<___;
1448         movdqa          @MSG[0],$TMP
1449         palignr         \$4,@MSG[3],$TMP
1450         paddd           $TMP,@MSG[1]
1451         sha256rnds2     $CDGH,$ABEF
1452 ___
1453 for($i=4;$i<16-3;$i++) {
1454         &$aesenc()      if (($r%10)==0);
1455 $code.=<<___;
1456         movdqa          $i*32-0x80($Tbl),$Wi
1457         paddd           @MSG[0],$Wi
1458         sha256msg2      @MSG[0],@MSG[1]
1459         sha256msg1      @MSG[3],@MSG[2]
1460 ___
1461         &$aesenc();
1462 $code.=<<___;
1463         sha256rnds2     $ABEF,$CDGH             # 16-19...
1464         pshufd          \$0x0e,$Wi,$Wi
1465         movdqa          @MSG[1],$TMP
1466         palignr         \$4,@MSG[0],$TMP
1467         paddd           $TMP,@MSG[2]
1468 ___
1469         &$aesenc();
1470         &$aesenc()      if ($r==19);
1471 $code.=<<___;
1472         sha256rnds2     $CDGH,$ABEF
1473 ___
1474         push(@MSG,shift(@MSG));
1475 }
1476 $code.=<<___;
1477         movdqa          13*32-0x80($Tbl),$Wi
1478         paddd           @MSG[0],$Wi
1479         sha256msg2      @MSG[0],@MSG[1]
1480         sha256msg1      @MSG[3],@MSG[2]
1481 ___
1482         &$aesenc();
1483 $code.=<<___;
1484         sha256rnds2     $ABEF,$CDGH             # 52-55
1485         pshufd          \$0x0e,$Wi,$Wi
1486         movdqa          @MSG[1],$TMP
1487         palignr         \$4,@MSG[0],$TMP
1488         paddd           $TMP,@MSG[2]
1489 ___
1490         &$aesenc();
1491         &$aesenc();
1492 $code.=<<___;
1493         sha256rnds2     $CDGH,$ABEF
1494
1495         movdqa          14*32-0x80($Tbl),$Wi
1496         paddd           @MSG[1],$Wi
1497         sha256msg2      @MSG[1],@MSG[2]
1498         movdqa          $BSWAP,$TMP
1499 ___
1500         &$aesenc();
1501 $code.=<<___;
1502         sha256rnds2     $ABEF,$CDGH             # 56-59
1503         pshufd          \$0x0e,$Wi,$Wi
1504 ___
1505         &$aesenc();
1506 $code.=<<___;
1507         sha256rnds2     $CDGH,$ABEF
1508
1509         movdqa          15*32-0x80($Tbl),$Wi
1510         paddd           @MSG[2],$Wi
1511 ___
1512         &$aesenc();
1513         &$aesenc();
1514 $code.=<<___;
1515         sha256rnds2     $ABEF,$CDGH             # 60-63
1516         pshufd          \$0x0e,$Wi,$Wi
1517 ___
1518         &$aesenc();
1519 $code.=<<___;
1520         sha256rnds2     $CDGH,$ABEF
1521         #pxor           $CDGH,$rndkey0          # black magic
1522 ___
1523         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1524 $code.=<<___;
1525         #xorps          $CDGH,$rndkey0          # black magic
1526         paddd           $CDGH_SAVE,$CDGH
1527         paddd           $ABEF_SAVE,$ABEF
1528
1529         dec             $len
1530         movups          $iv,48($out,$in0)       # write output
1531         lea             64($in0),$in0
1532         jnz             .Loop_shaext
1533
1534         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1535         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1536         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1537         punpckhqdq      $CDGH,$ABEF             # DCBA
1538         palignr         \$8,$TMP,$CDGH          # HGFE
1539
1540         movups          $iv,($ivp)              # write IV
1541         movdqu          $ABEF,($ctx)
1542         movdqu          $CDGH,16($ctx)
1543 ___
1544 $code.=<<___ if ($win64);
1545         movaps  0*16(%rsp),%xmm6
1546         movaps  1*16(%rsp),%xmm7
1547         movaps  2*16(%rsp),%xmm8
1548         movaps  3*16(%rsp),%xmm9
1549         movaps  4*16(%rsp),%xmm10
1550         movaps  5*16(%rsp),%xmm11
1551         movaps  6*16(%rsp),%xmm12
1552         movaps  7*16(%rsp),%xmm13
1553         movaps  8*16(%rsp),%xmm14
1554         movaps  9*16(%rsp),%xmm15
1555         lea     8+10*16(%rsp),%rsp
1556 .Lepilogue_shaext:
1557 ___
1558 $code.=<<___;
1559         ret
1560 .size   ${func}_shaext,.-${func}_shaext
1561 ___
1562 }
1563 }}}}}
1564
1565 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1566 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1567 if ($win64 && $avx) {
1568 $rec="%rcx";
1569 $frame="%rdx";
1570 $context="%r8";
1571 $disp="%r9";
1572
1573 $code.=<<___;
1574 .extern __imp_RtlVirtualUnwind
1575 .type   se_handler,\@abi-omnipotent
1576 .align  16
1577 se_handler:
1578         push    %rsi
1579         push    %rdi
1580         push    %rbx
1581         push    %rbp
1582         push    %r12
1583         push    %r13
1584         push    %r14
1585         push    %r15
1586         pushfq
1587         sub     \$64,%rsp
1588
1589         mov     120($context),%rax      # pull context->Rax
1590         mov     248($context),%rbx      # pull context->Rip
1591
1592         mov     8($disp),%rsi           # disp->ImageBase
1593         mov     56($disp),%r11          # disp->HanderlData
1594
1595         mov     0(%r11),%r10d           # HandlerData[0]
1596         lea     (%rsi,%r10),%r10        # prologue label
1597         cmp     %r10,%rbx               # context->Rip<prologue label
1598         jb      .Lin_prologue
1599
1600         mov     152($context),%rax      # pull context->Rsp
1601
1602         mov     4(%r11),%r10d           # HandlerData[1]
1603         lea     (%rsi,%r10),%r10        # epilogue label
1604         cmp     %r10,%rbx               # context->Rip>=epilogue label
1605         jae     .Lin_prologue
1606 ___
1607 $code.=<<___ if ($shaext);
1608         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1609         cmp     %r10,%rbx
1610         jb      .Lnot_in_shaext
1611
1612         lea     (%rax),%rsi
1613         lea     512($context),%rdi      # &context.Xmm6
1614         mov     \$20,%ecx
1615         .long   0xa548f3fc              # cld; rep movsq
1616         lea     168(%rax),%rax          # adjust stack pointer
1617         jmp     .Lin_prologue
1618 .Lnot_in_shaext:
1619 ___
1620 $code.=<<___ if ($avx>1);
1621         lea     .Lavx2_shortcut(%rip),%r10
1622         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1623         jb      .Lnot_in_avx2
1624
1625         and     \$-256*$SZ,%rax
1626         add     \$`2*$SZ*($rounds-8)`,%rax
1627 .Lnot_in_avx2:
1628 ___
1629 $code.=<<___;
1630         mov     %rax,%rsi               # put aside Rsp
1631         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1632
1633         mov     -8(%rax),%rbx
1634         mov     -16(%rax),%rbp
1635         mov     -24(%rax),%r12
1636         mov     -32(%rax),%r13
1637         mov     -40(%rax),%r14
1638         mov     -48(%rax),%r15
1639         mov     %rbx,144($context)      # restore context->Rbx
1640         mov     %rbp,160($context)      # restore context->Rbp
1641         mov     %r12,216($context)      # restore context->R12
1642         mov     %r13,224($context)      # restore context->R13
1643         mov     %r14,232($context)      # restore context->R14
1644         mov     %r15,240($context)      # restore context->R15
1645
1646         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1647         lea     512($context),%rdi      # &context.Xmm6
1648         mov     \$20,%ecx
1649         .long   0xa548f3fc              # cld; rep movsq
1650
1651 .Lin_prologue:
1652         mov     8(%rax),%rdi
1653         mov     16(%rax),%rsi
1654         mov     %rax,152($context)      # restore context->Rsp
1655         mov     %rsi,168($context)      # restore context->Rsi
1656         mov     %rdi,176($context)      # restore context->Rdi
1657
1658         mov     40($disp),%rdi          # disp->ContextRecord
1659         mov     $context,%rsi           # context
1660         mov     \$154,%ecx              # sizeof(CONTEXT)
1661         .long   0xa548f3fc              # cld; rep movsq
1662
1663         mov     $disp,%rsi
1664         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1665         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1666         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1667         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1668         mov     40(%rsi),%r10           # disp->ContextRecord
1669         lea     56(%rsi),%r11           # &disp->HandlerData
1670         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1671         mov     %r10,32(%rsp)           # arg5
1672         mov     %r11,40(%rsp)           # arg6
1673         mov     %r12,48(%rsp)           # arg7
1674         mov     %rcx,56(%rsp)           # arg8, (NULL)
1675         call    *__imp_RtlVirtualUnwind(%rip)
1676
1677         mov     \$1,%eax                # ExceptionContinueSearch
1678         add     \$64,%rsp
1679         popfq
1680         pop     %r15
1681         pop     %r14
1682         pop     %r13
1683         pop     %r12
1684         pop     %rbp
1685         pop     %rbx
1686         pop     %rdi
1687         pop     %rsi
1688         ret
1689 .size   se_handler,.-se_handler
1690
1691 .section        .pdata
1692         .rva    .LSEH_begin_${func}_xop
1693         .rva    .LSEH_end_${func}_xop
1694         .rva    .LSEH_info_${func}_xop
1695
1696         .rva    .LSEH_begin_${func}_avx
1697         .rva    .LSEH_end_${func}_avx
1698         .rva    .LSEH_info_${func}_avx
1699 ___
1700 $code.=<<___ if ($avx>1);
1701         .rva    .LSEH_begin_${func}_avx2
1702         .rva    .LSEH_end_${func}_avx2
1703         .rva    .LSEH_info_${func}_avx2
1704 ___
1705 $code.=<<___ if ($shaext);
1706         .rva    .LSEH_begin_${func}_shaext
1707         .rva    .LSEH_end_${func}_shaext
1708         .rva    .LSEH_info_${func}_shaext
1709 ___
1710 $code.=<<___;
1711 .section        .xdata
1712 .align  8
1713 .LSEH_info_${func}_xop:
1714         .byte   9,0,0,0
1715         .rva    se_handler
1716         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1717
1718 .LSEH_info_${func}_avx:
1719         .byte   9,0,0,0
1720         .rva    se_handler
1721         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1722 ___
1723 $code.=<<___ if ($avx>1);
1724 .LSEH_info_${func}_avx2:
1725         .byte   9,0,0,0
1726         .rva    se_handler
1727         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1728 ___
1729 $code.=<<___ if ($shaext);
1730 .LSEH_info_${func}_shaext:
1731         .byte   9,0,0,0
1732         .rva    se_handler
1733         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1734 ___
1735 }
1736
1737 ####################################################################
1738 sub rex {
1739   local *opcode=shift;
1740   my ($dst,$src)=@_;
1741   my $rex=0;
1742
1743     $rex|=0x04                  if($dst>=8);
1744     $rex|=0x01                  if($src>=8);
1745     unshift @opcode,$rex|0x40   if($rex);
1746 }
1747
1748 {
1749   my %opcodelet = (
1750                 "sha256rnds2" => 0xcb,
1751                 "sha256msg1"  => 0xcc,
1752                 "sha256msg2"  => 0xcd   );
1753
1754   sub sha256op38 {
1755     my $instr = shift;
1756
1757     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1758       my @opcode=(0x0f,0x38);
1759         rex(\@opcode,$2,$1);
1760         push @opcode,$opcodelet{$instr};
1761         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1762         return ".byte\t".join(',',@opcode);
1763     } else {
1764         return $instr."\t".@_[0];
1765     }
1766   }
1767 }
1768
1769 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1770 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1771 print $code;
1772 close STDOUT;