crypto/aes/asm/aesni-sha256-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # January 2013
18 #
19 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
20 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
21 # that since AESNI-CBC encryption exhibits *very* low instruction-level
22 # parallelism, interleaving it with another algorithm allows processor
23 # resources to be utilized better, giving better performance.
24 # SHA256 instruction sequences (*) are taken from sha512-x86_64.pl and
25 # the AESNI code is woven into them. As SHA256 dominates execution time,
26 # stitch performance does not depend on AES key length. Below are
27 # performance numbers in cycles per processed byte (less is better)
28 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
29 # subroutine:
30 #
31 #                AES-128/-192/-256+SHA256       this(**)gain
32 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
33 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
34 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
35 # Skylake           2.62/3.14/3.62+7.70         8.10    +27%/34%/40%
36 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
37 #
38 # (*)   there are XOP, AVX1 and AVX2 code paths; a separate Westmere
39 #       code path is omitted because the estimated gain was not high
40 #       enough to justify the effort;
41 # (**)  these are EVP-free results; results obtained with 'speed
42 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
43
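# The "gain" column above is simply the sum of the two standalone costs
# divided by the stitched cost, e.g. Sandy Bridge AES-128:
# (5.05+11.6)/13.0 - 1 = +28%.  A throw-away helper reproducing the column
# (illustration only, not used anywhere by this module):

sub stitch_gain_sketch {
        my ($aes_cpb, $sha_cpb, $stitched_cpb) = @_;    # cycles per byte
        return sprintf("+%.0f%%", 100*(($aes_cpb+$sha_cpb)/$stitched_cpb - 1));
}
# e.g. stitch_gain_sketch(5.05,11.6,13.0) gives "+28%", matching the table.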
44 $flavour = shift;
45 $output  = shift;
46 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
47
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
54
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57         $avx = ($1>=2.19) + ($1>=2.22);
58 }
59
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62         $avx = ($1>=2.09) + ($1>=2.10);
63 }
64
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67         $avx = ($1>=10) + ($1>=12);
68 }
69
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
71         $avx = ($2>=3.0) + ($2>3.0);
72 }
73
74 $shaext=$avx;   ### set to zero if compiling for 1.0.1
75 $avx=1          if (!$shaext && $avx);
76
77 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
78 *STDOUT=*OUT;
79
80 $func="aesni_cbc_sha256_enc";
81 $TABLE="K256";
82 $SZ=4;
83 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
84                                 "%r8d","%r9d","%r10d","%r11d");
85 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
86 @Sigma0=( 2,13,22);
87 @Sigma1=( 6,11,25);
88 @sigma0=( 7,18, 3);
89 @sigma1=(17,19,10);
90 $rounds=64;
91
92 ########################################################################
93 # void aesni_cbc_sha256_enc(const void *inp,
94 #                       void *out,
95 #                       size_t length,
96 #                       const AES_KEY *key,
97 #                       unsigned char *iv,
98 #                       SHA256_CTX *ctx,
99 #                       const void *in0);
100 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
101 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
102
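# Reading the prologues below: $len is given in 64-byte blocks (it is
# shifted left by 6, or decremented once per block in the SHA-EXT path),
# the buffer at $inp is CBC-encrypted into $out, the buffer at $in0 is
# hashed into the SHA256_CTX at $ctx, and the IV at $ivp is both consumed
# and written back.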
103 $Tbl="%rbp";
104
105 $_inp="16*$SZ+0*8(%rsp)";
106 $_out="16*$SZ+1*8(%rsp)";
107 $_end="16*$SZ+2*8(%rsp)";
108 $_key="16*$SZ+3*8(%rsp)";
109 $_ivp="16*$SZ+4*8(%rsp)";
110 $_ctx="16*$SZ+5*8(%rsp)";
111 $_in0="16*$SZ+6*8(%rsp)";
112 $_rsp="`16*$SZ+7*8`(%rsp)";
113 $framesz=16*$SZ+8*8;
114
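# With $SZ==4 (SHA-256) the slots above resolve to inp=64, out=72, end=80,
# key=88, ivp=96, ctx=104, in0=112, rsp=120; the low 16*$SZ=64 bytes of the
# frame hold the current 16 message-schedule words and $framesz comes to 128
# (the Win64 paths reserve a further 16*10 bytes on top as the xmm6-xmm15
# spill area).  A throw-away sketch of the same arithmetic (illustration
# only, nothing below uses these variables):

my @frame_slot_names_sketch = qw(inp out end key ivp ctx in0 rsp);
my %frame_off_sketch;
$frame_off_sketch{$frame_slot_names_sketch[$_]} = 16*$SZ + 8*$_
        for 0 .. $#frame_slot_names_sketch;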
115 $code=<<___;
116 .text
117
118 .extern OPENSSL_ia32cap_P
119 .globl  $func
120 .type   $func,\@abi-omnipotent
121 .align  16
122 $func:
123 ___
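# The entry point below is only a dispatcher.  Called with a NULL first
# argument it acts as a capability probe: it returns 1 if any SIMD path was
# compiled in (0 otherwise), without examining the CPU or touching the other
# arguments.  Called with real data it tail-jumps to the SHA-EXT, XOP, AVX2
# or AVX flavour selected from OPENSSL_ia32cap_P, and traps with ud2 if none
# of those extensions is available.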
124                                                 if ($avx) {
125 $code.=<<___;
126         lea     OPENSSL_ia32cap_P(%rip),%r11
127         mov     \$1,%eax
128         cmp     \$0,`$win64?"%rcx":"%rdi"`
129         je      .Lprobe
130         mov     0(%r11),%eax
131         mov     4(%r11),%r10
132 ___
133 $code.=<<___ if ($shaext);
134         bt      \$61,%r10                       # check for SHA
135         jc      ${func}_shaext
136 ___
137 $code.=<<___;
138         mov     %r10,%r11
139         shr     \$32,%r11
140
141         test    \$`1<<11`,%r10d                 # check for XOP
142         jnz     ${func}_xop
143 ___
144 $code.=<<___ if ($avx>1);
145         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
146         cmp     \$`1<<8|1<<5|1<<3`,%r11d
147         je      ${func}_avx2
148 ___
149 $code.=<<___;
150         and     \$`1<<28`,%r10d                 # check for AVX
151         jnz     ${func}_avx
152         ud2
153 ___
154                                                 }
155 $code.=<<___;
156         xor     %eax,%eax
157         cmp     \$0,`$win64?"%rcx":"%rdi"`
158         je      .Lprobe
159         ud2
160 .Lprobe:
161         ret
162 .size   $func,.-$func
163
164 .align  64
165 .type   $TABLE,\@object
166 $TABLE:
167         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
168         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
169         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
170         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
171         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
172         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
173         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
174         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
175         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
176         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
177         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
178         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
179         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
180         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
181         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
182         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
183         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
184         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
185         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
186         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
187         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
188         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
189         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
190         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
191         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
192         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
193         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
194         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
197         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
198         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
199
200         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
201         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
202         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
203         .long   0,0,0,0,   0,0,0,0
204         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
205 .align  64
206 ___
207
208 ######################################################################
209 # SIMD code paths
210 #
211 {{{
212 ($iv,$inout,$roundkey,$temp,
213  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
214
215 $aesni_cbc_idx=0;
216 @aesni_cbc_block = (
217 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
218 ##      &vmovdqu        ($inout,($inp));
219 ##      &mov            ($_inp,$inp);
220
221         '&vpxor         ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
223
224         '&vpxor         ($inout,$inout,$iv);',
225
226         '&vaesenc       ($inout,$inout,$roundkey);'.
227         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
231
232         '&vaesenc       ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
234
235         '&vaesenc       ($inout,$inout,$roundkey);'.
236         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
237
238         '&vaesenc       ($inout,$inout,$roundkey);'.
239         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
240
241         '&vaesenc       ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
243
244         '&vaesenc       ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
246
247         '&vaesenc       ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
249
250         '&vaesenc       ($inout,$inout,$roundkey);'.
251         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
252
253         '&vaesenclast   ($temp,$inout,$roundkey);'.
254         ' &vaesenc      ($inout,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
256
257         '&vpand         ($iv,$temp,$mask10);'.
258         ' &vaesenc      ($inout,$inout,$roundkey);'.
259         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
260
261         '&vaesenclast   ($temp,$inout,$roundkey);'.
262         ' &vaesenc      ($inout,$inout,$roundkey);'.
263         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
264
265         '&vpand         ($temp,$temp,$mask12);'.
266         ' &vaesenc      ($inout,$inout,$roundkey);'.
267          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
268
269         '&vpor          ($iv,$iv,$temp);'.
270         ' &vaesenclast  ($temp,$inout,$roundkey);'.
271         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
272
273 ##      &mov            ($inp,$_inp);
274 ##      &mov            ($out,$_out);
275 ##      &vpand          ($temp,$temp,$mask14);
276 ##      &vpor           ($iv,$iv,$temp);
277 ##      &vmovdqu        ("($out,$inp)",$iv);
278 ##      &lea            ($inp,"16($inp)");
279 );
280
281 my $a4=$T1;
282 my ($a,$b,$c,$d,$e,$f,$g,$h);
283
284 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
285 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
286   my $arg = pop;
287     $arg = "\$$arg" if ($arg*1 eq $arg);
288     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
289 }
290
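# For example, once the thunk above is in place, evaluating
#       &ror    ($a0,$Sigma1[0]);
# appends "\tror\t\$6,%r13d\n" to $code: the numeric argument becomes an
# immediate and the operands come out in reversed (AT&T) order.  It is shown
# here as a comment only, because actually calling it at this point would
# emit the instruction into $code out of place.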
291 sub body_00_15 () {
292         (
293         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
294
295         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
296         '&mov   ($a,$a1)',
297         '&mov   ($a4,$f)',
298
299         '&xor   ($a0,$e)',
300         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
301         '&xor   ($a4,$g)',                      # f^g
302
303         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
304         '&xor   ($a1,$a)',
305         '&and   ($a4,$e)',                      # (f^g)&e
306
307         @aesni_cbc_block[$aesni_cbc_idx++].
308         '&xor   ($a0,$e)',
309         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
310         '&mov   ($a2,$a)',
311
312         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
313         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
314         '&xor   ($a2,$b)',                      # a^b, b^c in next round
315
316         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
317         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
318         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
319
320         '&xor   ($a1,$a)',
321         '&add   ($h,$a0)',                      # h+=Sigma1(e)
322         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
323
324         '&add   ($d,$h)',                       # d+=h
325         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
326         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
327
328         '&mov   ($a0,$d)',
329         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
330         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
331         );
332 }
333
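# The stitching itself is mechanical: body_00_15() above returns one SHA256
# round as a list of snippets with exactly one entry of @aesni_cbc_block
# (tracked by $aesni_cbc_idx) spliced in, so one 16-byte CBC block is
# carried through its AES rounds for every 16 SHA256 rounds, i.e. four CBC
# blocks per 64-byte SHA256 block.  A stand-alone sketch of that round-robin
# weave (illustration only, with hypothetical snippet lists):

sub weave_sketch {
        my ($sha_rounds, $aes_steps) = @_;      # two array refs of snippets
        my @stitched;
        for my $i (0 .. $#{$sha_rounds}) {
                push @stitched,
                     (defined $aes_steps->[$i] ? $aes_steps->[$i] : ()),
                     $sha_rounds->[$i];
        }
        return @stitched;       # AES step $i rides along with SHA round $i
}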
334 if ($avx) {{
335 ######################################################################
336 # XOP code path
337 #
338 $code.=<<___;
339 .type   ${func}_xop,\@function,6
340 .align  64
341 ${func}_xop:
342 .cfi_startproc
343 .Lxop_shortcut:
344         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
345         mov     %rsp,%rax               # copy %rsp
346 .cfi_def_cfa_register   %rax
347         push    %rbx
348 .cfi_push       %rbx
349         push    %rbp
350 .cfi_push       %rbp
351         push    %r12
352 .cfi_push       %r12
353         push    %r13
354 .cfi_push       %r13
355         push    %r14
356 .cfi_push       %r14
357         push    %r15
358 .cfi_push       %r15
359         sub     \$`$framesz+$win64*16*10`,%rsp
360         and     \$-64,%rsp              # align stack frame
361
362         shl     \$6,$len
363         sub     $inp,$out               # re-bias
364         sub     $inp,$in0
365         add     $inp,$len               # end of input
366
367         #mov    $inp,$_inp              # saved later
368         mov     $out,$_out
369         mov     $len,$_end
370         #mov    $key,$_key              # remains resident in $inp register
371         mov     $ivp,$_ivp
372         mov     $ctx,$_ctx
373         mov     $in0,$_in0
374         mov     %rax,$_rsp
375 .cfi_cfa_expression     $_rsp,deref,+8
376 ___
377 $code.=<<___ if ($win64);
378         movaps  %xmm6,`$framesz+16*0`(%rsp)
379         movaps  %xmm7,`$framesz+16*1`(%rsp)
380         movaps  %xmm8,`$framesz+16*2`(%rsp)
381         movaps  %xmm9,`$framesz+16*3`(%rsp)
382         movaps  %xmm10,`$framesz+16*4`(%rsp)
383         movaps  %xmm11,`$framesz+16*5`(%rsp)
384         movaps  %xmm12,`$framesz+16*6`(%rsp)
385         movaps  %xmm13,`$framesz+16*7`(%rsp)
386         movaps  %xmm14,`$framesz+16*8`(%rsp)
387         movaps  %xmm15,`$framesz+16*9`(%rsp)
388 ___
389 $code.=<<___;
390 .Lprologue_xop:
391         vzeroall
392
393         mov     $inp,%r12               # borrow $a4
394         lea     0x80($key),$inp         # size optimization, reassign
395         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
396         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
397         mov     $ctx,%r15               # borrow $a2
398         mov     $in0,%rsi               # borrow $a3
399         vmovdqu ($ivp),$iv              # load IV
400         sub     \$9,%r14
401
402         mov     $SZ*0(%r15),$A
403         mov     $SZ*1(%r15),$B
404         mov     $SZ*2(%r15),$C
405         mov     $SZ*3(%r15),$D
406         mov     $SZ*4(%r15),$E
407         mov     $SZ*5(%r15),$F
408         mov     $SZ*6(%r15),$G
409         mov     $SZ*7(%r15),$H
410
411         vmovdqa 0x00(%r13,%r14,8),$mask14
412         vmovdqa 0x10(%r13,%r14,8),$mask12
413         vmovdqa 0x20(%r13,%r14,8),$mask10
414         vmovdqu 0x00-0x80($inp),$roundkey
415         jmp     .Lloop_xop
416 ___
417                                         if ($SZ==4) {   # SHA256
418     my @X = map("%xmm$_",(0..3));
419     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
420
421 $code.=<<___;
422 .align  16
423 .Lloop_xop:
424         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
425         vmovdqu 0x00(%rsi,%r12),@X[0]
426         vmovdqu 0x10(%rsi,%r12),@X[1]
427         vmovdqu 0x20(%rsi,%r12),@X[2]
428         vmovdqu 0x30(%rsi,%r12),@X[3]
429         vpshufb $t3,@X[0],@X[0]
430         lea     $TABLE(%rip),$Tbl
431         vpshufb $t3,@X[1],@X[1]
432         vpshufb $t3,@X[2],@X[2]
433         vpaddd  0x00($Tbl),@X[0],$t0
434         vpshufb $t3,@X[3],@X[3]
435         vpaddd  0x20($Tbl),@X[1],$t1
436         vpaddd  0x40($Tbl),@X[2],$t2
437         vpaddd  0x60($Tbl),@X[3],$t3
438         vmovdqa $t0,0x00(%rsp)
439         mov     $A,$a1
440         vmovdqa $t1,0x10(%rsp)
441         mov     $B,$a3
442         vmovdqa $t2,0x20(%rsp)
443         xor     $C,$a3                  # magic
444         vmovdqa $t3,0x30(%rsp)
445         mov     $E,$a0
446         jmp     .Lxop_00_47
447
448 .align  16
449 .Lxop_00_47:
450         sub     \$-16*2*$SZ,$Tbl        # size optimization
451         vmovdqu (%r12),$inout           # $a4
452         mov     %r12,$_inp              # $a4
453 ___
454 sub XOP_256_00_47 () {
455 my $j = shift;
456 my $body = shift;
457 my @X = @_;
458 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
459
460         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
461           eval(shift(@insns));
462           eval(shift(@insns));
463          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
464           eval(shift(@insns));
465           eval(shift(@insns));
466         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
467           eval(shift(@insns));
468           eval(shift(@insns));
469         &vpsrld         ($t0,$t0,$sigma0[2]);
470           eval(shift(@insns));
471           eval(shift(@insns));
472          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
473           eval(shift(@insns));
474           eval(shift(@insns));
475           eval(shift(@insns));
476           eval(shift(@insns));
477         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vpxor          ($t0,$t0,$t1);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
486           eval(shift(@insns));
487           eval(shift(@insns));
488         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
489           eval(shift(@insns));
490           eval(shift(@insns));
491          &vpsrld        ($t2,@X[3],$sigma1[2]);
492           eval(shift(@insns));
493           eval(shift(@insns));
494         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
495           eval(shift(@insns));
496           eval(shift(@insns));
497          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
498           eval(shift(@insns));
499           eval(shift(@insns));
500          &vpxor         ($t3,$t3,$t2);
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504           eval(shift(@insns));
505          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
506           eval(shift(@insns));
507           eval(shift(@insns));
508           eval(shift(@insns));
509           eval(shift(@insns));
510         &vpsrldq        ($t3,$t3,8);
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514           eval(shift(@insns));
515         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519           eval(shift(@insns));
520          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
521           eval(shift(@insns));
522           eval(shift(@insns));
523          &vpsrld        ($t2,@X[0],$sigma1[2]);
524           eval(shift(@insns));
525           eval(shift(@insns));
526          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
527           eval(shift(@insns));
528           eval(shift(@insns));
529          &vpxor         ($t3,$t3,$t2);
530           eval(shift(@insns));
531           eval(shift(@insns));
532           eval(shift(@insns));
533           eval(shift(@insns));
534          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
535           eval(shift(@insns));
536           eval(shift(@insns));
537           eval(shift(@insns));
538           eval(shift(@insns));
539         &vpslldq        ($t3,$t3,8);            # 22 instructions
540           eval(shift(@insns));
541           eval(shift(@insns));
542           eval(shift(@insns));
543           eval(shift(@insns));
544         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
545           eval(shift(@insns));
546           eval(shift(@insns));
547           eval(shift(@insns));
548           eval(shift(@insns));
549         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
550           foreach (@insns) { eval; }            # remaining instructions
551         &vmovdqa        (16*$j."(%rsp)",$t2);
552 }
553
554     $aesni_cbc_idx=0;
555     for ($i=0,$j=0; $j<4; $j++) {
556         &XOP_256_00_47($j,\&body_00_15,@X);
557         push(@X,shift(@X));                     # rotate(@X)
558     }
559         &mov            ("%r12",$_inp);         # borrow $a4
560         &vpand          ($temp,$temp,$mask14);
561         &mov            ("%r15",$_out);         # borrow $a2
562         &vpor           ($iv,$iv,$temp);
563         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
564         &lea            ("%r12","16(%r12)");    # inp++
565
566         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
567         &jne    (".Lxop_00_47");
568
569         &vmovdqu        ($inout,"(%r12)");
570         &mov            ($_inp,"%r12");
571
572     $aesni_cbc_idx=0;
573     for ($i=0; $i<16; ) {
574         foreach(body_00_15()) { eval; }
575     }
576                                         }
577 $code.=<<___;
578         mov     $_inp,%r12              # borrow $a4
579         mov     $_out,%r13              # borrow $a0
580         mov     $_ctx,%r15              # borrow $a2
581         mov     $_in0,%rsi              # borrow $a3
582
583         vpand   $mask14,$temp,$temp
584         mov     $a1,$A
585         vpor    $temp,$iv,$iv
586         vmovdqu $iv,(%r13,%r12)         # write output
587         lea     16(%r12),%r12           # inp++
588
589         add     $SZ*0(%r15),$A
590         add     $SZ*1(%r15),$B
591         add     $SZ*2(%r15),$C
592         add     $SZ*3(%r15),$D
593         add     $SZ*4(%r15),$E
594         add     $SZ*5(%r15),$F
595         add     $SZ*6(%r15),$G
596         add     $SZ*7(%r15),$H
597
598         cmp     $_end,%r12
599
600         mov     $A,$SZ*0(%r15)
601         mov     $B,$SZ*1(%r15)
602         mov     $C,$SZ*2(%r15)
603         mov     $D,$SZ*3(%r15)
604         mov     $E,$SZ*4(%r15)
605         mov     $F,$SZ*5(%r15)
606         mov     $G,$SZ*6(%r15)
607         mov     $H,$SZ*7(%r15)
608
609         jb      .Lloop_xop
610
611         mov     $_ivp,$ivp
612         mov     $_rsp,%rsi
613 .cfi_def_cfa    %rsi,8
614         vmovdqu $iv,($ivp)              # output IV
615         vzeroall
616 ___
617 $code.=<<___ if ($win64);
618         movaps  `$framesz+16*0`(%rsp),%xmm6
619         movaps  `$framesz+16*1`(%rsp),%xmm7
620         movaps  `$framesz+16*2`(%rsp),%xmm8
621         movaps  `$framesz+16*3`(%rsp),%xmm9
622         movaps  `$framesz+16*4`(%rsp),%xmm10
623         movaps  `$framesz+16*5`(%rsp),%xmm11
624         movaps  `$framesz+16*6`(%rsp),%xmm12
625         movaps  `$framesz+16*7`(%rsp),%xmm13
626         movaps  `$framesz+16*8`(%rsp),%xmm14
627         movaps  `$framesz+16*9`(%rsp),%xmm15
628 ___
629 $code.=<<___;
630         mov     -48(%rsi),%r15
631 .cfi_restore    %r15
632         mov     -40(%rsi),%r14
633 .cfi_restore    %r14
634         mov     -32(%rsi),%r13
635 .cfi_restore    %r13
636         mov     -24(%rsi),%r12
637 .cfi_restore    %r12
638         mov     -16(%rsi),%rbp
639 .cfi_restore    %rbp
640         mov     -8(%rsi),%rbx
641 .cfi_restore    %rbx
642         lea     (%rsi),%rsp
643 .cfi_def_cfa_register   %rsp
644 .Lepilogue_xop:
645         ret
646 .cfi_endproc
647 .size   ${func}_xop,.-${func}_xop
648 ___
649 ######################################################################
650 # AVX+shrd code path
651 #
652 local *ror = sub { &shrd(@_[0],@_) };
653
654 $code.=<<___;
655 .type   ${func}_avx,\@function,6
656 .align  64
657 ${func}_avx:
658 .cfi_startproc
659 .Lavx_shortcut:
660         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
661         mov     %rsp,%rax               # copy %rsp
662 .cfi_def_cfa_register   %rax
663         push    %rbx
664 .cfi_push       %rbx
665         push    %rbp
666 .cfi_push       %rbp
667         push    %r12
668 .cfi_push       %r12
669         push    %r13
670 .cfi_push       %r13
671         push    %r14
672 .cfi_push       %r14
673         push    %r15
674 .cfi_push       %r15
675         sub     \$`$framesz+$win64*16*10`,%rsp
676         and     \$-64,%rsp              # align stack frame
677
678         shl     \$6,$len
679         sub     $inp,$out               # re-bias
680         sub     $inp,$in0
681         add     $inp,$len               # end of input
682
683         #mov    $inp,$_inp              # saved later
684         mov     $out,$_out
685         mov     $len,$_end
686         #mov    $key,$_key              # remains resident in $inp register
687         mov     $ivp,$_ivp
688         mov     $ctx,$_ctx
689         mov     $in0,$_in0
690         mov     %rax,$_rsp
691 .cfi_cfa_expression     $_rsp,deref,+8
692 ___
693 $code.=<<___ if ($win64);
694         movaps  %xmm6,`$framesz+16*0`(%rsp)
695         movaps  %xmm7,`$framesz+16*1`(%rsp)
696         movaps  %xmm8,`$framesz+16*2`(%rsp)
697         movaps  %xmm9,`$framesz+16*3`(%rsp)
698         movaps  %xmm10,`$framesz+16*4`(%rsp)
699         movaps  %xmm11,`$framesz+16*5`(%rsp)
700         movaps  %xmm12,`$framesz+16*6`(%rsp)
701         movaps  %xmm13,`$framesz+16*7`(%rsp)
702         movaps  %xmm14,`$framesz+16*8`(%rsp)
703         movaps  %xmm15,`$framesz+16*9`(%rsp)
704 ___
705 $code.=<<___;
706 .Lprologue_avx:
707         vzeroall
708
709         mov     $inp,%r12               # borrow $a4
710         lea     0x80($key),$inp         # size optimization, reassign
711         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
712         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
713         mov     $ctx,%r15               # borrow $a2
714         mov     $in0,%rsi               # borrow $a3
715         vmovdqu ($ivp),$iv              # load IV
716         sub     \$9,%r14
717
718         mov     $SZ*0(%r15),$A
719         mov     $SZ*1(%r15),$B
720         mov     $SZ*2(%r15),$C
721         mov     $SZ*3(%r15),$D
722         mov     $SZ*4(%r15),$E
723         mov     $SZ*5(%r15),$F
724         mov     $SZ*6(%r15),$G
725         mov     $SZ*7(%r15),$H
726
727         vmovdqa 0x00(%r13,%r14,8),$mask14
728         vmovdqa 0x10(%r13,%r14,8),$mask12
729         vmovdqa 0x20(%r13,%r14,8),$mask10
730         vmovdqu 0x00-0x80($inp),$roundkey
731 ___
732                                         if ($SZ==4) {   # SHA256
733     my @X = map("%xmm$_",(0..3));
734     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
735
736 $code.=<<___;
737         jmp     .Lloop_avx
738 .align  16
739 .Lloop_avx:
740         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
741         vmovdqu 0x00(%rsi,%r12),@X[0]
742         vmovdqu 0x10(%rsi,%r12),@X[1]
743         vmovdqu 0x20(%rsi,%r12),@X[2]
744         vmovdqu 0x30(%rsi,%r12),@X[3]
745         vpshufb $t3,@X[0],@X[0]
746         lea     $TABLE(%rip),$Tbl
747         vpshufb $t3,@X[1],@X[1]
748         vpshufb $t3,@X[2],@X[2]
749         vpaddd  0x00($Tbl),@X[0],$t0
750         vpshufb $t3,@X[3],@X[3]
751         vpaddd  0x20($Tbl),@X[1],$t1
752         vpaddd  0x40($Tbl),@X[2],$t2
753         vpaddd  0x60($Tbl),@X[3],$t3
754         vmovdqa $t0,0x00(%rsp)
755         mov     $A,$a1
756         vmovdqa $t1,0x10(%rsp)
757         mov     $B,$a3
758         vmovdqa $t2,0x20(%rsp)
759         xor     $C,$a3                  # magic
760         vmovdqa $t3,0x30(%rsp)
761         mov     $E,$a0
762         jmp     .Lavx_00_47
763
764 .align  16
765 .Lavx_00_47:
766         sub     \$-16*2*$SZ,$Tbl        # size optimization
767         vmovdqu (%r12),$inout           # $a4
768         mov     %r12,$_inp              # $a4
769 ___
770 sub Xupdate_256_AVX () {
771         (
772         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
773          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
774         '&vpsrld        ($t2,$t0,$sigma0[0]);',
775          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
776         '&vpsrld        ($t3,$t0,$sigma0[2])',
777         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
778         '&vpxor         ($t0,$t3,$t2)',
779          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
780         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
781         '&vpxor         ($t0,$t0,$t1)',
782         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
783         '&vpxor         ($t0,$t0,$t2)',
784          '&vpsrld       ($t2,$t3,$sigma1[2]);',
785         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
786          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
787         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
788          '&vpxor        ($t2,$t2,$t3);',
789          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
790          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
791          '&vpshufd      ($t2,$t2,0b10000100)',
792          '&vpsrldq      ($t2,$t2,8)',
793         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
794          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
795          '&vpsrld       ($t2,$t3,$sigma1[2])',
796          '&vpsrlq       ($t3,$t3,$sigma1[0])',
797          '&vpxor        ($t2,$t2,$t3);',
798          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
799          '&vpxor        ($t2,$t2,$t3)',
800          '&vpshufd      ($t2,$t2,0b11101000)',
801          '&vpslldq      ($t2,$t2,8)',
802         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
803         );
804 }
805
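# For reference, the scalar functions which the vector sequence above
# reproduces, expressed with the same @sigma0/@sigma1 constants (the rotates
# are decomposed into shift pairs because AVX1 has no vector rotate; the XOP
# path uses vprotd instead).  Illustration only, nothing calls these:

sub ref_ror32_sketch  { my ($x,$n)=@_; (($x>>$n) | ($x<<(32-$n))) & 0xffffffff }
sub ref_sigma0_sketch { my $x=shift;
        ref_ror32_sketch($x,$sigma0[0]) ^ ref_ror32_sketch($x,$sigma0[1]) ^ ($x>>$sigma0[2]) }
sub ref_sigma1_sketch { my $x=shift;
        ref_ror32_sketch($x,$sigma1[0]) ^ ref_ror32_sketch($x,$sigma1[1]) ^ ($x>>$sigma1[2]) }
# X[i] = X[i-16] + ref_sigma0_sketch(X[i-15]) + X[i-7] + ref_sigma1_sketch(X[i-2]) mod 2^32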
806 sub AVX_256_00_47 () {
807 my $j = shift;
808 my $body = shift;
809 my @X = @_;
810 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
811
812         foreach (Xupdate_256_AVX()) {           # 29 instructions
813             eval;
814             eval(shift(@insns));
815             eval(shift(@insns));
816             eval(shift(@insns));
817         }
818         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
819           foreach (@insns) { eval; }            # remaining instructions
820         &vmovdqa        (16*$j."(%rsp)",$t2);
821 }
822
823     $aesni_cbc_idx=0;
824     for ($i=0,$j=0; $j<4; $j++) {
825         &AVX_256_00_47($j,\&body_00_15,@X);
826         push(@X,shift(@X));                     # rotate(@X)
827     }
828         &mov            ("%r12",$_inp);         # borrow $a4
829         &vpand          ($temp,$temp,$mask14);
830         &mov            ("%r15",$_out);         # borrow $a2
831         &vpor           ($iv,$iv,$temp);
832         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
833         &lea            ("%r12","16(%r12)");    # inp++
834
835         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
836         &jne    (".Lavx_00_47");
837
838         &vmovdqu        ($inout,"(%r12)");
839         &mov            ($_inp,"%r12");
840
841     $aesni_cbc_idx=0;
842     for ($i=0; $i<16; ) {
843         foreach(body_00_15()) { eval; }
844     }
845
846                                         }
847 $code.=<<___;
848         mov     $_inp,%r12              # borrow $a4
849         mov     $_out,%r13              # borrow $a0
850         mov     $_ctx,%r15              # borrow $a2
851         mov     $_in0,%rsi              # borrow $a3
852
853         vpand   $mask14,$temp,$temp
854         mov     $a1,$A
855         vpor    $temp,$iv,$iv
856         vmovdqu $iv,(%r13,%r12)         # write output
857         lea     16(%r12),%r12           # inp++
858
859         add     $SZ*0(%r15),$A
860         add     $SZ*1(%r15),$B
861         add     $SZ*2(%r15),$C
862         add     $SZ*3(%r15),$D
863         add     $SZ*4(%r15),$E
864         add     $SZ*5(%r15),$F
865         add     $SZ*6(%r15),$G
866         add     $SZ*7(%r15),$H
867
868         cmp     $_end,%r12
869
870         mov     $A,$SZ*0(%r15)
871         mov     $B,$SZ*1(%r15)
872         mov     $C,$SZ*2(%r15)
873         mov     $D,$SZ*3(%r15)
874         mov     $E,$SZ*4(%r15)
875         mov     $F,$SZ*5(%r15)
876         mov     $G,$SZ*6(%r15)
877         mov     $H,$SZ*7(%r15)
878         jb      .Lloop_avx
879
880         mov     $_ivp,$ivp
881         mov     $_rsp,%rsi
882 .cfi_def_cfa    %rsi,8
883         vmovdqu $iv,($ivp)              # output IV
884         vzeroall
885 ___
886 $code.=<<___ if ($win64);
887         movaps  `$framesz+16*0`(%rsp),%xmm6
888         movaps  `$framesz+16*1`(%rsp),%xmm7
889         movaps  `$framesz+16*2`(%rsp),%xmm8
890         movaps  `$framesz+16*3`(%rsp),%xmm9
891         movaps  `$framesz+16*4`(%rsp),%xmm10
892         movaps  `$framesz+16*5`(%rsp),%xmm11
893         movaps  `$framesz+16*6`(%rsp),%xmm12
894         movaps  `$framesz+16*7`(%rsp),%xmm13
895         movaps  `$framesz+16*8`(%rsp),%xmm14
896         movaps  `$framesz+16*9`(%rsp),%xmm15
897 ___
898 $code.=<<___;
899         mov     -48(%rsi),%r15
900 .cfi_restore    %r15
901         mov     -40(%rsi),%r14
902 .cfi_restore    %r14
903         mov     -32(%rsi),%r13
904 .cfi_restore    %r13
905         mov     -24(%rsi),%r12
906 .cfi_restore    %r12
907         mov     -16(%rsi),%rbp
908 .cfi_restore    %rbp
909         mov     -8(%rsi),%rbx
910 .cfi_restore    %rbx
911         lea     (%rsi),%rsp
912 .cfi_def_cfa_register   %rsp
913 .Lepilogue_avx:
914         ret
915 .cfi_endproc
916 .size   ${func}_avx,.-${func}_avx
917 ___
918
919 if ($avx>1) {{
920 ######################################################################
921 # AVX2+BMI code path
922 #
923 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
924 my $PUSH8=8*2*$SZ;
925 use integer;
926
927 sub bodyx_00_15 () {
928         # at start $a1 should be zero, $a3 should hold $b^$c and $a4 a copy of $f
929         (
930         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
931
932         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
933         '&and   ($a4,$e)',              # f&e
934         '&rorx  ($a0,$e,$Sigma1[2])',
935         '&rorx  ($a2,$e,$Sigma1[1])',
936
937         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
938         '&lea   ($h,"($h,$a4)")',
939         '&andn  ($a4,$e,$g)',           # ~e&g
940         '&xor   ($a0,$a2)',
941
942         '&rorx  ($a1,$e,$Sigma1[0])',
943         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
944         '&xor   ($a0,$a1)',             # Sigma1(e)
945         '&mov   ($a2,$a)',
946
947         '&rorx  ($a4,$a,$Sigma0[2])',
948         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
949         '&xor   ($a2,$b)',              # a^b, b^c in next round
950         '&rorx  ($a1,$a,$Sigma0[1])',
951
952         '&rorx  ($a0,$a,$Sigma0[0])',
953         '&lea   ($d,"($d,$h)")',        # d+=h
954         '&and   ($a3,$a2)',             # (b^c)&(a^b)
955         @aesni_cbc_block[$aesni_cbc_idx++].
956         '&xor   ($a1,$a4)',
957
958         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
959         '&xor   ($a1,$a0)',             # Sigma0(a)
960         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
961         '&mov   ($a4,$e)',              # copy of f in future
962
963         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
964         );
965         # and at the finish one has to do $a+=$a1 (the Sigma0 add deferred from the last round)
966 }
967
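# The body above accumulates Ch(e,f,g) as two lea additions, (e&f) plus
# andn(e,g) = (~e&g), instead of the xor form ((f^g)&e)^g used by
# body_00_15.  The two agree because (e&f) and (~e&g) can never have a bit
# set in the same position, so the addition cannot carry.  A tiny self-check
# sketch (illustration only):

sub ref_ch_xor_sketch { my ($e,$f,$g)=@_; (($f^$g)&$e)^$g }     # body_00_15
sub ref_ch_add_sketch { my ($e,$f,$g)=@_; ($e&$f) + (~$e&$g) }  # bodyx_00_15
# ref_ch_xor_sketch($e,$f,$g) == ref_ch_add_sketch($e,$f,$g) for 32-bit inputs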
968 $code.=<<___;
969 .type   ${func}_avx2,\@function,6
970 .align  64
971 ${func}_avx2:
972 .cfi_startproc
973 .Lavx2_shortcut:
974         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
975         mov     %rsp,%rax               # copy %rsp
976 .cfi_def_cfa_register   %rax
977         push    %rbx
978 .cfi_push       %rbx
979         push    %rbp
980 .cfi_push       %rbp
981         push    %r12
982 .cfi_push       %r12
983         push    %r13
984 .cfi_push       %r13
985         push    %r14
986 .cfi_push       %r14
987         push    %r15
988 .cfi_push       %r15
989         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
990         and     \$-256*$SZ,%rsp         # align stack frame
991         add     \$`2*$SZ*($rounds-8)`,%rsp
992
993         shl     \$6,$len
994         sub     $inp,$out               # re-bias
995         sub     $inp,$in0
996         add     $inp,$len               # end of input
997
998         #mov    $inp,$_inp              # saved later
999         #mov    $out,$_out              # kept in $offload
1000         mov     $len,$_end
1001         #mov    $key,$_key              # remains resident in $inp register
1002         mov     $ivp,$_ivp
1003         mov     $ctx,$_ctx
1004         mov     $in0,$_in0
1005         mov     %rax,$_rsp
1006 .cfi_cfa_expression     $_rsp,deref,+8
1007 ___
1008 $code.=<<___ if ($win64);
1009         movaps  %xmm6,`$framesz+16*0`(%rsp)
1010         movaps  %xmm7,`$framesz+16*1`(%rsp)
1011         movaps  %xmm8,`$framesz+16*2`(%rsp)
1012         movaps  %xmm9,`$framesz+16*3`(%rsp)
1013         movaps  %xmm10,`$framesz+16*4`(%rsp)
1014         movaps  %xmm11,`$framesz+16*5`(%rsp)
1015         movaps  %xmm12,`$framesz+16*6`(%rsp)
1016         movaps  %xmm13,`$framesz+16*7`(%rsp)
1017         movaps  %xmm14,`$framesz+16*8`(%rsp)
1018         movaps  %xmm15,`$framesz+16*9`(%rsp)
1019 ___
1020 $code.=<<___;
1021 .Lprologue_avx2:
1022         vzeroall
1023
1024         mov     $inp,%r13               # borrow $a0
1025         vpinsrq \$1,$out,$offload,$offload
1026         lea     0x80($key),$inp         # size optimization, reassign
1027         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
1028         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
1029         mov     $ctx,%r15               # borrow $a2
1030         mov     $in0,%rsi               # borrow $a3
1031         vmovdqu ($ivp),$iv              # load IV
1032         lea     -9(%r14),%r14
1033
1034         vmovdqa 0x00(%r12,%r14,8),$mask14
1035         vmovdqa 0x10(%r12,%r14,8),$mask12
1036         vmovdqa 0x20(%r12,%r14,8),$mask10
1037
1038         sub     \$-16*$SZ,%r13          # inp++, size optimization
1039         mov     $SZ*0(%r15),$A
1040         lea     (%rsi,%r13),%r12        # borrow $a0
1041         mov     $SZ*1(%r15),$B
1042         cmp     $len,%r13               # $_end
1043         mov     $SZ*2(%r15),$C
1044         cmove   %rsp,%r12               # next block or random data
1045         mov     $SZ*3(%r15),$D
1046         mov     $SZ*4(%r15),$E
1047         mov     $SZ*5(%r15),$F
1048         mov     $SZ*6(%r15),$G
1049         mov     $SZ*7(%r15),$H
1050         vmovdqu 0x00-0x80($inp),$roundkey
1051 ___
1052                                         if ($SZ==4) {   # SHA256
1053     my @X = map("%ymm$_",(0..3));
1054     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
1055
1056 $code.=<<___;
1057         jmp     .Loop_avx2
1058 .align  16
1059 .Loop_avx2:
1060         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1061         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1062         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1063         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1064         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1065
1066         vinserti128     \$1,(%r12),@X[0],@X[0]
1067         vinserti128     \$1,16(%r12),@X[1],@X[1]
1068          vpshufb        $t3,@X[0],@X[0]
1069         vinserti128     \$1,32(%r12),@X[2],@X[2]
1070          vpshufb        $t3,@X[1],@X[1]
1071         vinserti128     \$1,48(%r12),@X[3],@X[3]
1072
1073         lea     $TABLE(%rip),$Tbl
1074         vpshufb $t3,@X[2],@X[2]
1075         lea     -16*$SZ(%r13),%r13
1076         vpaddd  0x00($Tbl),@X[0],$t0
1077         vpshufb $t3,@X[3],@X[3]
1078         vpaddd  0x20($Tbl),@X[1],$t1
1079         vpaddd  0x40($Tbl),@X[2],$t2
1080         vpaddd  0x60($Tbl),@X[3],$t3
1081         vmovdqa $t0,0x00(%rsp)
1082         xor     $a1,$a1
1083         vmovdqa $t1,0x20(%rsp)
1084         lea     -$PUSH8(%rsp),%rsp
1085         mov     $B,$a3
1086         vmovdqa $t2,0x00(%rsp)
1087         xor     $C,$a3                  # magic
1088         vmovdqa $t3,0x20(%rsp)
1089         mov     $F,$a4
1090         sub     \$-16*2*$SZ,$Tbl        # size optimization
1091         jmp     .Lavx2_00_47
1092
1093 .align  16
1094 .Lavx2_00_47:
1095         vmovdqu (%r13),$inout
1096         vpinsrq \$0,%r13,$offload,$offload
1097 ___
1098
1099 sub AVX2_256_00_47 () {
1100 my $j = shift;
1101 my $body = shift;
1102 my @X = @_;
1103 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1104 my $base = "+2*$PUSH8(%rsp)";
1105
1106         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1107         foreach (Xupdate_256_AVX()) {           # 29 instructions
1108             eval;
1109             eval(shift(@insns));
1110             eval(shift(@insns));
1111             eval(shift(@insns));
1112         }
1113         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1114           foreach (@insns) { eval; }            # remaining instructions
1115         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1116 }
1117     $aesni_cbc_idx=0;
1118     for ($i=0,$j=0; $j<4; $j++) {
1119         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1120         push(@X,shift(@X));                     # rotate(@X)
1121     }
1122         &vmovq          ("%r13",$offload);      # borrow $a0
1123         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1124         &vpand          ($temp,$temp,$mask14);
1125         &vpor           ($iv,$iv,$temp);
1126         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1127         &lea            ("%r13","16(%r13)");    # inp++
1128
1129         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1130         &cmpb   (($SZ-1)."($Tbl)",0);
1131         &jne    (".Lavx2_00_47");
1132
1133         &vmovdqu        ($inout,"(%r13)");
1134         &vpinsrq        ($offload,$offload,"%r13",0);
1135
1136     $aesni_cbc_idx=0;
1137     for ($i=0; $i<16; ) {
1138         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1139         foreach(bodyx_00_15()) { eval; }
1140     }
1141                                         }
1142 $code.=<<___;
1143         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1144         vmovq   $offload,%r13                   # $_inp, borrow $a0
1145         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1146         add     $a1,$A
1147         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1148
1149         vpand   $mask14,$temp,$temp
1150         vpor    $temp,$iv,$iv
1151         vmovdqu $iv,(%r12,%r13)                 # write output
1152         lea     16(%r13),%r13
1153
1154         add     $SZ*0(%r15),$A
1155         add     $SZ*1(%r15),$B
1156         add     $SZ*2(%r15),$C
1157         add     $SZ*3(%r15),$D
1158         add     $SZ*4(%r15),$E
1159         add     $SZ*5(%r15),$F
1160         add     $SZ*6(%r15),$G
1161         add     $SZ*7(%r15),$H
1162
1163         mov     $A,$SZ*0(%r15)
1164         mov     $B,$SZ*1(%r15)
1165         mov     $C,$SZ*2(%r15)
1166         mov     $D,$SZ*3(%r15)
1167         mov     $E,$SZ*4(%r15)
1168         mov     $F,$SZ*5(%r15)
1169         mov     $G,$SZ*6(%r15)
1170         mov     $H,$SZ*7(%r15)
1171
1172         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1173         je      .Ldone_avx2
1174
1175         xor     $a1,$a1
1176         mov     $B,$a3
1177         mov     $F,$a4
1178         xor     $C,$a3                  # magic
1179         jmp     .Lower_avx2
1180 .align  16
1181 .Lower_avx2:
1182         vmovdqu (%r13),$inout
1183         vpinsrq \$0,%r13,$offload,$offload
1184 ___
1185     $aesni_cbc_idx=0;
1186     for ($i=0; $i<16; ) {
1187         my $base="+16($Tbl)";
1188         foreach(bodyx_00_15()) { eval; }
1189         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1190     }
1191 $code.=<<___;
1192         vmovq   $offload,%r13                   # borrow $a0
1193         vpextrq \$1,$offload,%r15               # borrow $a2
1194         vpand   $mask14,$temp,$temp
1195         vpor    $temp,$iv,$iv
1196         lea     -$PUSH8($Tbl),$Tbl
1197         vmovdqu $iv,(%r15,%r13)                 # write output
1198         lea     16(%r13),%r13                   # inp++
1199         cmp     %rsp,$Tbl
1200         jae     .Lower_avx2
1201
1202         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1203         lea     16*$SZ(%r13),%r13
1204         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1205         add     $a1,$A
1206         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1207
1208         add     $SZ*0(%r15),$A
1209         add     $SZ*1(%r15),$B
1210         add     $SZ*2(%r15),$C
1211         add     $SZ*3(%r15),$D
1212         add     $SZ*4(%r15),$E
1213         add     $SZ*5(%r15),$F
1214         add     $SZ*6(%r15),$G
1215         lea     (%rsi,%r13),%r12
1216         add     $SZ*7(%r15),$H
1217
1218         cmp     $_end,%r13
1219
1220         mov     $A,$SZ*0(%r15)
1221         cmove   %rsp,%r12               # next block or stale data
1222         mov     $B,$SZ*1(%r15)
1223         mov     $C,$SZ*2(%r15)
1224         mov     $D,$SZ*3(%r15)
1225         mov     $E,$SZ*4(%r15)
1226         mov     $F,$SZ*5(%r15)
1227         mov     $G,$SZ*6(%r15)
1228         mov     $H,$SZ*7(%r15)
1229
1230         jbe     .Loop_avx2
1231         lea     (%rsp),$Tbl
1232
1233 .Ldone_avx2:
1234         lea     ($Tbl),%rsp
1235         mov     $_ivp,$ivp
1236         mov     $_rsp,%rsi
1237 .cfi_def_cfa    %rsi,8
1238         vmovdqu $iv,($ivp)              # output IV
1239         vzeroall
1240 ___
1241 $code.=<<___ if ($win64);
1242         movaps  `$framesz+16*0`(%rsp),%xmm6
1243         movaps  `$framesz+16*1`(%rsp),%xmm7
1244         movaps  `$framesz+16*2`(%rsp),%xmm8
1245         movaps  `$framesz+16*3`(%rsp),%xmm9
1246         movaps  `$framesz+16*4`(%rsp),%xmm10
1247         movaps  `$framesz+16*5`(%rsp),%xmm11
1248         movaps  `$framesz+16*6`(%rsp),%xmm12
1249         movaps  `$framesz+16*7`(%rsp),%xmm13
1250         movaps  `$framesz+16*8`(%rsp),%xmm14
1251         movaps  `$framesz+16*9`(%rsp),%xmm15
1252 ___
1253 $code.=<<___;
1254         mov     -48(%rsi),%r15
1255 .cfi_restore    %r15
1256         mov     -40(%rsi),%r14
1257 .cfi_restore    %r14
1258         mov     -32(%rsi),%r13
1259 .cfi_restore    %r13
1260         mov     -24(%rsi),%r12
1261 .cfi_restore    %r12
1262         mov     -16(%rsi),%rbp
1263 .cfi_restore    %rbp
1264         mov     -8(%rsi),%rbx
1265 .cfi_restore    %rbx
1266         lea     (%rsi),%rsp
1267 .cfi_def_cfa_register   %rsp
1268 .Lepilogue_avx2:
1269         ret
1270 .cfi_endproc
1271 .size   ${func}_avx2,.-${func}_avx2
1272 ___
1273 }}
1274 }}
1275 {{
1276 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1277
1278 my ($rounds,$Tbl)=("%r11d","%rbx");
1279
1280 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1281 my @rndkey=("%xmm4","%xmm5");
1282 my $r=0;
1283 my $sn=0;
1284
1285 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1286 my @MSG=map("%xmm$_",(10..13));
1287
1288 my $aesenc=sub {
1289   use integer;
1290   my ($n,$k)=($r/10,$r%10);
1291     if ($k==0) {
1292       $code.=<<___;
1293         movups          `16*$n`($in0),$in               # load input
1294         xorps           $rndkey0,$in
1295 ___
1296       $code.=<<___ if ($n);
1297         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1298 ___
1299       $code.=<<___;
1300         xorps           $in,$iv
1301         movups          `32+16*$k-112`($key),$rndkey[1]
1302         aesenc          $rndkey[0],$iv
1303 ___
1304     } elsif ($k==9) {
1305       $sn++;
1306       $code.=<<___;
1307         cmp             \$11,$rounds
1308         jb              .Laesenclast$sn
1309         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1310         aesenc          $rndkey[0],$iv
1311         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1312         aesenc          $rndkey[1],$iv
1313         je              .Laesenclast$sn
1314         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1315         aesenc          $rndkey[0],$iv
1316         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1317         aesenc          $rndkey[1],$iv
1318 .Laesenclast$sn:
1319         aesenclast      $rndkey[0],$iv
1320         movups          16-112($key),$rndkey[1]         # forward reference
1321         nop
1322 ___
1323     } else {
1324       $code.=<<___;
1325         movups          `32+16*$k-112`($key),$rndkey[1]
1326         aesenc          $rndkey[0],$iv
1327 ___
1328     }
1329     $r++;       unshift(@rndkey,pop(@rndkey));
1330 };
1331
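# The closure above is interleaved with the sha256rnds2 sequence below; $r
# counts its invocations and ($n,$k) = ($r/10, $r%10) names the CBC block
# and the slot within it: slot 0 loads the next plaintext block, folds it
# into the CBC state, writes out the previous ciphertext block and issues
# the first aesenc; slots 1-8 are plain movups+aesenc rounds; and slot 9
# compares $rounds (loaded from 240($key)) against 11 and issues up to four
# more aesenc before the closing aesenclast, so AES-128 stops after ten
# rounds while 256-bit keys get all fourteen.  The generated .Loop_shaext
# body contains exactly 40 such slots -- the trailing
# "while ($r<40) { &$aesenc(); }" pads to that count -- i.e. four 16-byte
# CBC blocks for every 64-byte SHA256 block.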
1332 if ($shaext) {
1333 my $Tbl="%rax";
1334
1335 $code.=<<___;
1336 .type   ${func}_shaext,\@function,6
1337 .align  32
1338 ${func}_shaext:
1339         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1340 ___
1341 $code.=<<___ if ($win64);
1342         lea     `-8-10*16`(%rsp),%rsp
1343         movaps  %xmm6,-8-10*16(%rax)
1344         movaps  %xmm7,-8-9*16(%rax)
1345         movaps  %xmm8,-8-8*16(%rax)
1346         movaps  %xmm9,-8-7*16(%rax)
1347         movaps  %xmm10,-8-6*16(%rax)
1348         movaps  %xmm11,-8-5*16(%rax)
1349         movaps  %xmm12,-8-4*16(%rax)
1350         movaps  %xmm13,-8-3*16(%rax)
1351         movaps  %xmm14,-8-2*16(%rax)
1352         movaps  %xmm15,-8-1*16(%rax)
1353 .Lprologue_shaext:
1354 ___
1355 $code.=<<___;
1356         lea             K256+0x80(%rip),$Tbl
1357         movdqu          ($ctx),$ABEF            # DCBA
1358         movdqu          16($ctx),$CDGH          # HGFE
1359         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1360
1361         mov             240($key),$rounds
1362         sub             $in0,$out
1363         movups          ($key),$rndkey0         # $key[0]
1364         movups          16($key),$rndkey[0]     # forward reference
1365         lea             112($key),$key          # size optimization
1366
1367         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1368         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1369         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1370         movdqa          $TMP,$BSWAP             # offload
1371         palignr         \$8,$CDGH,$ABEF         # ABEF
1372         punpcklqdq      $Wi,$CDGH               # CDGH
1373
1374         jmp     .Loop_shaext
1375
1376 .align  16
1377 .Loop_shaext:
1378         movdqu          ($inp),@MSG[0]
1379         movdqu          0x10($inp),@MSG[1]
1380         movdqu          0x20($inp),@MSG[2]
1381         pshufb          $TMP,@MSG[0]
1382         movdqu          0x30($inp),@MSG[3]
1383
1384         movdqa          0*32-0x80($Tbl),$Wi
1385         paddd           @MSG[0],$Wi
1386         pshufb          $TMP,@MSG[1]
1387         movdqa          $CDGH,$CDGH_SAVE        # offload
1388         movdqa          $ABEF,$ABEF_SAVE        # offload
1389 ___
1390         &$aesenc();
1391 $code.=<<___;
1392         sha256rnds2     $ABEF,$CDGH             # 0-3
1393         pshufd          \$0x0e,$Wi,$Wi
1394 ___
1395         &$aesenc();
1396 $code.=<<___;
1397         sha256rnds2     $CDGH,$ABEF
1398
1399         movdqa          1*32-0x80($Tbl),$Wi
1400         paddd           @MSG[1],$Wi
1401         pshufb          $TMP,@MSG[2]
1402         lea             0x40($inp),$inp
1403 ___
1404         &$aesenc();
1405 $code.=<<___;
1406         sha256rnds2     $ABEF,$CDGH             # 4-7
1407         pshufd          \$0x0e,$Wi,$Wi
1408 ___
1409         &$aesenc();
1410 $code.=<<___;
1411         sha256rnds2     $CDGH,$ABEF
1412
1413         movdqa          2*32-0x80($Tbl),$Wi
1414         paddd           @MSG[2],$Wi
1415         pshufb          $TMP,@MSG[3]
1416         sha256msg1      @MSG[1],@MSG[0]
1417 ___
1418         &$aesenc();
1419 $code.=<<___;
1420         sha256rnds2     $ABEF,$CDGH             # 8-11
1421         pshufd          \$0x0e,$Wi,$Wi
1422         movdqa          @MSG[3],$TMP
1423         palignr         \$4,@MSG[2],$TMP
1424         paddd           $TMP,@MSG[0]
1425 ___
1426         &$aesenc();
1427 $code.=<<___;
1428         sha256rnds2     $CDGH,$ABEF
1429
1430         movdqa          3*32-0x80($Tbl),$Wi
1431         paddd           @MSG[3],$Wi
1432         sha256msg2      @MSG[3],@MSG[0]
1433         sha256msg1      @MSG[2],@MSG[1]
1434 ___
1435         &$aesenc();
1436 $code.=<<___;
1437         sha256rnds2     $ABEF,$CDGH             # 12-15
1438         pshufd          \$0x0e,$Wi,$Wi
1439 ___
1440         &$aesenc();
1441 $code.=<<___;
1442         movdqa          @MSG[0],$TMP
1443         palignr         \$4,@MSG[3],$TMP
1444         paddd           $TMP,@MSG[1]
1445         sha256rnds2     $CDGH,$ABEF
1446 ___
1447 for($i=4;$i<16-3;$i++) {
1448         &$aesenc()      if (($r%10)==0);
1449 $code.=<<___;
1450         movdqa          $i*32-0x80($Tbl),$Wi
1451         paddd           @MSG[0],$Wi
1452         sha256msg2      @MSG[0],@MSG[1]
1453         sha256msg1      @MSG[3],@MSG[2]
1454 ___
1455         &$aesenc();
1456 $code.=<<___;
1457         sha256rnds2     $ABEF,$CDGH             # 16-19...
1458         pshufd          \$0x0e,$Wi,$Wi
1459         movdqa          @MSG[1],$TMP
1460         palignr         \$4,@MSG[0],$TMP
1461         paddd           $TMP,@MSG[2]
1462 ___
1463         &$aesenc();
1464         &$aesenc()      if ($r==19);
1465 $code.=<<___;
1466         sha256rnds2     $CDGH,$ABEF
1467 ___
1468         push(@MSG,shift(@MSG));
1469 }
1470 $code.=<<___;
1471         movdqa          13*32-0x80($Tbl),$Wi
1472         paddd           @MSG[0],$Wi
1473         sha256msg2      @MSG[0],@MSG[1]
1474         sha256msg1      @MSG[3],@MSG[2]
1475 ___
1476         &$aesenc();
1477 $code.=<<___;
1478         sha256rnds2     $ABEF,$CDGH             # 52-55
1479         pshufd          \$0x0e,$Wi,$Wi
1480         movdqa          @MSG[1],$TMP
1481         palignr         \$4,@MSG[0],$TMP
1482         paddd           $TMP,@MSG[2]
1483 ___
1484         &$aesenc();
1485         &$aesenc();
1486 $code.=<<___;
1487         sha256rnds2     $CDGH,$ABEF
1488
1489         movdqa          14*32-0x80($Tbl),$Wi
1490         paddd           @MSG[1],$Wi
1491         sha256msg2      @MSG[1],@MSG[2]
1492         movdqa          $BSWAP,$TMP
1493 ___
1494         &$aesenc();
1495 $code.=<<___;
1496         sha256rnds2     $ABEF,$CDGH             # 56-59
1497         pshufd          \$0x0e,$Wi,$Wi
1498 ___
1499         &$aesenc();
1500 $code.=<<___;
1501         sha256rnds2     $CDGH,$ABEF
1502
1503         movdqa          15*32-0x80($Tbl),$Wi
1504         paddd           @MSG[2],$Wi
1505 ___
1506         &$aesenc();
1507         &$aesenc();
1508 $code.=<<___;
1509         sha256rnds2     $ABEF,$CDGH             # 60-63
1510         pshufd          \$0x0e,$Wi,$Wi
1511 ___
1512         &$aesenc();
1513 $code.=<<___;
1514         sha256rnds2     $CDGH,$ABEF
1515         #pxor           $CDGH,$rndkey0          # black magic
1516 ___
1517         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1518 $code.=<<___;
1519         #xorps          $CDGH,$rndkey0          # black magic
1520         paddd           $CDGH_SAVE,$CDGH
1521         paddd           $ABEF_SAVE,$ABEF
1522
1523         dec             $len
1524         movups          $iv,48($out,$in0)       # write output
1525         lea             64($in0),$in0
1526         jnz             .Loop_shaext
1527
	pshufd		\$0xb1,$CDGH,$CDGH	# DCHG
	pshufd		\$0x1b,$ABEF,$TMP	# FEBA
	pshufd		\$0xb1,$ABEF,$ABEF	# BAFE
	punpckhqdq	$CDGH,$ABEF		# DCBA
	palignr		\$8,$TMP,$CDGH		# HGFE

	movups		$iv,($ivp)		# write IV
	movdqu		$ABEF,($ctx)
	movdqu		$CDGH,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	0*16(%rsp),%xmm6
	movaps	1*16(%rsp),%xmm7
	movaps	2*16(%rsp),%xmm8
	movaps	3*16(%rsp),%xmm9
	movaps	4*16(%rsp),%xmm10
	movaps	5*16(%rsp),%xmm11
	movaps	6*16(%rsp),%xmm12
	movaps	7*16(%rsp),%xmm13
	movaps	8*16(%rsp),%xmm14
	movaps	9*16(%rsp),%xmm15
	lea	8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
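# se_handler locates context->Rip relative to the prologue/epilogue labels
# recorded in HandlerData, restores the saved non-volatile GPRs and
# xmm6-xmm15 from the stack frame when the body of the routine was
# interrupted, and then hands the frame to RtlVirtualUnwind to continue
# the unwind.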
if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($shaext);
	lea	aesni_cbc_sha256_enc_shaext(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lnot_in_shaext

	lea	(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	lea	168(%rax),%rax		# adjust stack pointer
	jmp	.Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2

	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+7*8(%rax),%rax	# pull $_rsp

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	16*$SZ+8*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler

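# Each .pdata triple below registers one entry point (begin, end, unwind
# info) with the Win64 exception machinery; the matching .xdata records
# point at se_handler and the prologue/epilogue labels it consults.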
.section	.pdata
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop

	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]

.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_shaext,.Lepilogue_shaext	# HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if($dst>=8);
    $rex|=0x01			if($src>=8);
    unshift @opcode,$rex|0x40	if($rex);
}
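# rex() prepends a REX prefix only when an extended register (xmm8-xmm15) is
# involved: 0x40 is the base, bit 2 (REX.R, 0x04) extends the ModR/M reg
# field (the destination here) and bit 0 (REX.B, 0x01) extends the r/m field
# (the source).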

{
  my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
  }
}

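# sha256op38() hand-assembles the SHA-NI instructions into .byte sequences so
# that the module still builds with assemblers that predate SHA extension
# support; it is applied by the second substitution below, after the first
# one evaluates the backticked constant expressions embedded in $code.  As a
# sketch of the encoding (assuming the usual AT&T src,dst operand order),
# "sha256rnds2 %xmm0,%xmm1" comes out as ".byte 0x0f,0x38,0xcb,0xc8".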
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;