1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as
13 # spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
14 # is that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and hence better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte (less is better)
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and the stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
31 #       Westmere is left out; this is because the gain was not
32 #       estimated to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
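#
# A worked reading of the "gain" column above (derived from the table,
# not an extra measurement): for Sandy Bridge with AES-128, running the
# two primitives back to back costs 5.05+11.6 = 16.65 cycles per byte,
# while the stitched subroutine costs 13.0, i.e. 16.65/13.0 ~= 1.28,
# hence the quoted +28%.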
35
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=11);
60 }
61
62 open OUT,"| \"$^X\" $xlate $flavour $output";
63 *STDOUT=*OUT;
64
65 $func="aesni_cbc_sha256_enc";
66 $TABLE="K256";
67 $SZ=4;
68 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
69                                 "%r8d","%r9d","%r10d","%r11d");
70 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
71 @Sigma0=( 2,13,22);
72 @Sigma1=( 6,11,25);
73 @sigma0=( 7,18, 3);
74 @sigma1=(17,19,10);
75 $rounds=64;
76
77 ########################################################################
78 # void aesni_cbc_sha256_enc(const void *inp,
79 #                       void *out,
80 #                       size_t length,
81 #                       const AES_KEY *key,
82 #                       unsigned char *iv,
83 #                       SHA256_CTX *ctx,
84 #                       const void *in0);
85 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
86 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
87
88 $Tbl="%rbp";
89
90 $_inp="16*$SZ+0*8(%rsp)";
91 $_out="16*$SZ+1*8(%rsp)";
92 $_end="16*$SZ+2*8(%rsp)";
93 $_key="16*$SZ+3*8(%rsp)";
94 $_ivp="16*$SZ+4*8(%rsp)";
95 $_ctx="16*$SZ+5*8(%rsp)";
96 $_in0="16*$SZ+6*8(%rsp)";
97 $_rsp="16*$SZ+7*8(%rsp)";
98 $framesz=16*$SZ+8*8;
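
# A sketch of the stack frame implied by the definitions above, for the
# XOP and AVX1 paths (the AVX2 path lays its frame out differently):
#
#       0 .. 16*$SZ-1      X[i]+K[i], the pre-added message schedule
#                          words spilled to the stack by the SIMD code
#       16*$SZ+0*8 .. +6*8 slots for the $inp, $out, end-of-input,
#                          $key, $ivp, $ctx and $in0 arguments (some
#                          stay resident in registers, see prologues)
#       16*$SZ+7*8         %rsp as left by the push sequence, used by
#                          the epilogue and the SEH handler
#       $framesz ..        %xmm6-%xmm15 save area (Win64 only)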
99
100 $code=<<___;
101 .text
102
103 .extern OPENSSL_ia32cap_P
104 .globl  $func
105 .type   $func,\@abi-omnipotent
106 .align  16
107 $func:
108 ___
109 $code.=<<___ if ($avx);
110         lea     OPENSSL_ia32cap_P(%rip),%r11
111         mov     \$1,%eax
112         cmp     \$0,`$win64?"%rcx":"%rdi"`
113         je      .Lprobe
114         mov     0(%r11),%eax
115         mov     4(%r11),%r10d
116         mov     8(%r11),%r11d
117
118         test    \$`1<<11`,%r10d                 # check for XOP
119         jnz     ${func}_xop
120 ___
121 $code.=<<___ if ($avx>1);
122         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
123         cmp     \$`1<<8|1<<5|1<<3`,%r11d
124         je      ${func}_avx2
125 ___
126 $code.=<<___ if ($avx);
127         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
128         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
129         or      %eax,%r10d
130         cmp     \$`1<<28|1<<9|1<<30`,%r10d
131         je      ${func}_avx
132         ud2
133 ___
134 $code.=<<___;
135         xor     %eax,%eax
136         cmp     \$0,`$win64?"%rcx":"%rdi"`
137         je      .Lprobe
138         ud2
139 .Lprobe:
140         ret
141 .size   $func,.-$func
142
143 .align  64
144 .type   $TABLE,\@object
145 $TABLE:
146         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
147         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
148         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
149         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
150         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
151         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
152         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
153         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
154         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
155         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
156         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
158         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
159         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
160         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
161         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
162         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
163         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
164         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
165         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
166         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
167         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
168         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
169         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
170         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
172         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
173         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
174         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
175         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
176         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
178
179         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
180         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
181         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
182         .long   0,0,0,0,   0,0,0,0
183         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
184 .align  64
185 ___
186
187 ######################################################################
188 # SIMD code paths
189 #
190 {{{
191 ($iv,$inout,$roundkey,$temp,
192  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
193
194 $aesni_cbc_idx=0;
195 @aesni_cbc_block = (
196 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
197 ##      &vmovdqu        ($inout,($inp));
198 ##      &mov            ($_inp,$inp);
199
200         '&vpxor         ($inout,$inout,$roundkey);'.
201         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
202
203         '&vpxor         ($inout,$inout,$iv);',
204
205         '&vaesenc       ($inout,$inout,$roundkey);'.
206         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
207
208         '&vaesenc       ($inout,$inout,$roundkey);'.
209         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
210
211         '&vaesenc       ($inout,$inout,$roundkey);'.
212         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
213
214         '&vaesenc       ($inout,$inout,$roundkey);'.
215         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
216
217         '&vaesenc       ($inout,$inout,$roundkey);'.
218         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
219
220         '&vaesenc       ($inout,$inout,$roundkey);'.
221         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
222
223         '&vaesenc       ($inout,$inout,$roundkey);'.
224         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
225
226         '&vaesenc       ($inout,$inout,$roundkey);'.
227         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
228
229         '&vaesenc       ($inout,$inout,$roundkey);'.
230         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
231
232         '&vaesenclast   ($temp,$inout,$roundkey);'.
233         ' &vaesenc      ($inout,$inout,$roundkey);'.
234         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
235
236         '&vpand         ($iv,$temp,$mask10);'.
237         ' &vaesenc      ($inout,$inout,$roundkey);'.
238         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
239
240         '&vaesenclast   ($temp,$inout,$roundkey);'.
241         ' &vaesenc      ($inout,$inout,$roundkey);'.
242         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
243
244         '&vpand         ($temp,$temp,$mask12);'.
245         ' &vaesenc      ($inout,$inout,$roundkey);'.
246          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
247
248         '&vpor          ($iv,$iv,$temp);'.
249         ' &vaesenclast  ($temp,$inout,$roundkey);'.
250         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
251
252 ##      &mov            ($inp,$_inp);
253 ##      &mov            ($out,$_out);
254 ##      &vpand          ($temp,$temp,$mask14);
255 ##      &vpor           ($iv,$iv,$temp);
256 ##      &vmovdqu        ("($out,$inp)",$iv);
257 ##      &lea            ($inp,"16($inp)");
258 );
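
# A note on the schedule above: it holds 16 code fragments and
# body_00_15()/bodyx_00_15() below splice exactly one of them,
# @aesni_cbc_block[$aesni_cbc_idx++], into each SHA256 round, so one
# CBC block is encrypted per 16 rounds. vaesenclast is issued
# speculatively at the points where a 10-, 12- or 14-round key schedule
# would end, and $mask10/$mask12/$mask14 (selected according to the key
# schedule's round count) blend the three candidate results so that the
# correct ciphertext block ends up in $iv for any key length.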
259
260 my $a4=$T1;
261 my ($a,$b,$c,$d,$e,$f,$g,$h);
262
263 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
264 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
265   my $arg = pop;
266     $arg = "\$$arg" if ($arg*1 eq $arg);
267     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
268 }
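
# For instance, in the XOP path a call such as
# &ror($a0,$Sigma1[2]-$Sigma1[1]) falls through to AUTOLOAD above and
# appends "\tror\t\$14,%r13d\n" to $code: a numeric operand gets a '$'
# prefix and operands are emitted in AT&T order, i.e. reversed relative
# to the Perl argument list. (The AVX path later overrides &ror with
# &shrd, see below.)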
269
270 sub body_00_15 () {
271         (
272         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
273
274         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
275         '&mov   ($a,$a1)',
276         '&mov   ($a4,$f)',
277
278         '&xor   ($a0,$e)',
279         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
280         '&xor   ($a4,$g)',                      # f^g
281
282         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
283         '&xor   ($a1,$a)',
284         '&and   ($a4,$e)',                      # (f^g)&e
285
286         @aesni_cbc_block[$aesni_cbc_idx++].
287         '&xor   ($a0,$e)',
288         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
289         '&mov   ($a2,$a)',
290
291         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
292         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
293         '&xor   ($a2,$b)',                      # a^b, b^c in next round
294
295         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
296         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
297         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
298
299         '&xor   ($a1,$a)',
300         '&add   ($h,$a0)',                      # h+=Sigma1(e)
301         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
302
303         '&add   ($d,$h)',                       # d+=h
304         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
305         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
306
307         '&mov   ($a0,$d)',
308         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
309         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
310         );
311 }
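
# body_00_15() does not emit code directly; it returns a list of
# strings that the XOP/AVX Xupdate loops below eval a few at a time,
# interleaving these scalar SHA256 round instructions (plus one
# @aesni_cbc_block fragment per round) between the vector message
# schedule instructions.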
312
313 if ($avx) {{
314 ######################################################################
315 # XOP code path
316 #
317 $code.=<<___;
318 .type   ${func}_xop,\@function,6
319 .align  64
320 ${func}_xop:
321 .Lxop_shortcut:
322         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
323         push    %rbx
324         push    %rbp
325         push    %r12
326         push    %r13
327         push    %r14
328         push    %r15
329         mov     %rsp,%r11               # copy %rsp
330         sub     \$`$framesz+$win64*16*10`,%rsp
331         and     \$-64,%rsp              # align stack frame
332
333         shl     \$6,$len
334         sub     $inp,$out               # re-bias
335         sub     $inp,$in0
336         add     $inp,$len               # end of input
337
338         #mov    $inp,$_inp              # saved later
339         mov     $out,$_out
340         mov     $len,$_end
341         #mov    $key,$_key              # remains resident in $inp register
342         mov     $ivp,$_ivp
343         mov     $ctx,$_ctx
344         mov     $in0,$_in0
345         mov     %r11,$_rsp
346 ___
347 $code.=<<___ if ($win64);
348         movaps  %xmm6,`$framesz+16*0`(%rsp)
349         movaps  %xmm7,`$framesz+16*1`(%rsp)
350         movaps  %xmm8,`$framesz+16*2`(%rsp)
351         movaps  %xmm9,`$framesz+16*3`(%rsp)
352         movaps  %xmm10,`$framesz+16*4`(%rsp)
353         movaps  %xmm11,`$framesz+16*5`(%rsp)
354         movaps  %xmm12,`$framesz+16*6`(%rsp)
355         movaps  %xmm13,`$framesz+16*7`(%rsp)
356         movaps  %xmm14,`$framesz+16*8`(%rsp)
357         movaps  %xmm15,`$framesz+16*9`(%rsp)
358 ___
359 $code.=<<___;
360 .Lprologue_xop:
361         vzeroall
362
363         mov     $inp,%r12               # borrow $a4
364         lea     0x80($key),$inp         # size optimization, reassign
365         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
366         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
367         mov     $ctx,%r15               # borrow $a2
368         mov     $in0,%rsi               # borrow $a3
369         vmovdqu ($ivp),$iv              # load IV
370         sub     \$9,%r14
371
372         mov     $SZ*0(%r15),$A
373         mov     $SZ*1(%r15),$B
374         mov     $SZ*2(%r15),$C
375         mov     $SZ*3(%r15),$D
376         mov     $SZ*4(%r15),$E
377         mov     $SZ*5(%r15),$F
378         mov     $SZ*6(%r15),$G
379         mov     $SZ*7(%r15),$H
380
381         vmovdqa 0x00(%r13,%r14,8),$mask14
382         vmovdqa 0x10(%r13,%r14,8),$mask12
383         vmovdqa 0x20(%r13,%r14,8),$mask10
384         vmovdqu 0x00-0x80($inp),$roundkey
385         jmp     .Lloop_xop
386 ___
387                                         if ($SZ==4) {   # SHA256
388     my @X = map("%xmm$_",(0..3));
389     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
390
391 $code.=<<___;
392 .align  16
393 .Lloop_xop:
394         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
395         vmovdqu 0x00(%rsi,%r12),@X[0]
396         vmovdqu 0x10(%rsi,%r12),@X[1]
397         vmovdqu 0x20(%rsi,%r12),@X[2]
398         vmovdqu 0x30(%rsi,%r12),@X[3]
399         vpshufb $t3,@X[0],@X[0]
400         lea     $TABLE(%rip),$Tbl
401         vpshufb $t3,@X[1],@X[1]
402         vpshufb $t3,@X[2],@X[2]
403         vpaddd  0x00($Tbl),@X[0],$t0
404         vpshufb $t3,@X[3],@X[3]
405         vpaddd  0x20($Tbl),@X[1],$t1
406         vpaddd  0x40($Tbl),@X[2],$t2
407         vpaddd  0x60($Tbl),@X[3],$t3
408         vmovdqa $t0,0x00(%rsp)
409         mov     $A,$a1
410         vmovdqa $t1,0x10(%rsp)
411         mov     $B,$a3
412         vmovdqa $t2,0x20(%rsp)
413         xor     $C,$a3                  # magic
414         vmovdqa $t3,0x30(%rsp)
415         mov     $E,$a0
416         jmp     .Lxop_00_47
417
418 .align  16
419 .Lxop_00_47:
420         sub     \$-16*2*$SZ,$Tbl        # size optimization
421         vmovdqu (%r12),$inout           # $a4
422         mov     %r12,$_inp              # $a4
423 ___
424 sub XOP_256_00_47 () {
425 my $j = shift;
426 my $body = shift;
427 my @X = @_;
428 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
429
430         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
431           eval(shift(@insns));
432           eval(shift(@insns));
433          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
434           eval(shift(@insns));
435           eval(shift(@insns));
436         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
437           eval(shift(@insns));
438           eval(shift(@insns));
439         &vpsrld         ($t0,$t0,$sigma0[2]);
440           eval(shift(@insns));
441           eval(shift(@insns));
442          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
443           eval(shift(@insns));
444           eval(shift(@insns));
445           eval(shift(@insns));
446           eval(shift(@insns));
447         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
448           eval(shift(@insns));
449           eval(shift(@insns));
450         &vpxor          ($t0,$t0,$t1);
451           eval(shift(@insns));
452           eval(shift(@insns));
453           eval(shift(@insns));
454           eval(shift(@insns));
455          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
456           eval(shift(@insns));
457           eval(shift(@insns));
458         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
459           eval(shift(@insns));
460           eval(shift(@insns));
461          &vpsrld        ($t2,@X[3],$sigma1[2]);
462           eval(shift(@insns));
463           eval(shift(@insns));
464         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
465           eval(shift(@insns));
466           eval(shift(@insns));
467          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
468           eval(shift(@insns));
469           eval(shift(@insns));
470          &vpxor         ($t3,$t3,$t2);
471           eval(shift(@insns));
472           eval(shift(@insns));
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479           eval(shift(@insns));
480         &vpsrldq        ($t3,$t3,8);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
491           eval(shift(@insns));
492           eval(shift(@insns));
493          &vpsrld        ($t2,@X[0],$sigma1[2]);
494           eval(shift(@insns));
495           eval(shift(@insns));
496          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
497           eval(shift(@insns));
498           eval(shift(@insns));
499          &vpxor         ($t3,$t3,$t2);
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503           eval(shift(@insns));
504          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508           eval(shift(@insns));
509         &vpslldq        ($t3,$t3,8);            # 22 instructions
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
520           foreach (@insns) { eval; }            # remaining instructions
521         &vmovdqa        (16*$j."(%rsp)",$t2);
522 }
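
# The XOP schedule above relies on vprotd, XOP's packed rotate, so each
# sigma rotation is a single instruction; the plain-AVX path below has
# to synthesize the same rotations from shift/shift/xor sequences.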
523
524     $aesni_cbc_idx=0;
525     for ($i=0,$j=0; $j<4; $j++) {
526         &XOP_256_00_47($j,\&body_00_15,@X);
527         push(@X,shift(@X));                     # rotate(@X)
528     }
529         &mov            ("%r12",$_inp);         # borrow $a4
530         &vpand          ($temp,$temp,$mask14);
531         &mov            ("%r15",$_out);         # borrow $a2
532         &vpor           ($iv,$iv,$temp);
533         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
534         &lea            ("%r12","16(%r12)");    # inp++
535
536         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
537         &jne    (".Lxop_00_47");
538
539         &vmovdqu        ($inout,"(%r12)");
540         &mov            ($_inp,"%r12");
541
542     $aesni_cbc_idx=0;
543     for ($i=0; $i<16; ) {
544         foreach(body_00_15()) { eval; }
545     }
546                                         }
547 $code.=<<___;
548         mov     $_inp,%r12              # borrow $a4
549         mov     $_out,%r13              # borrow $a0
550         mov     $_ctx,%r15              # borrow $a2
551         mov     $_in0,%rsi              # borrow $a3
552
553         vpand   $mask14,$temp,$temp
554         mov     $a1,$A
555         vpor    $temp,$iv,$iv
556         vmovdqu $iv,(%r13,%r12)         # write output
557         lea     16(%r12),%r12           # inp++
558
559         add     $SZ*0(%r15),$A
560         add     $SZ*1(%r15),$B
561         add     $SZ*2(%r15),$C
562         add     $SZ*3(%r15),$D
563         add     $SZ*4(%r15),$E
564         add     $SZ*5(%r15),$F
565         add     $SZ*6(%r15),$G
566         add     $SZ*7(%r15),$H
567
568         cmp     $_end,%r12
569
570         mov     $A,$SZ*0(%r15)
571         mov     $B,$SZ*1(%r15)
572         mov     $C,$SZ*2(%r15)
573         mov     $D,$SZ*3(%r15)
574         mov     $E,$SZ*4(%r15)
575         mov     $F,$SZ*5(%r15)
576         mov     $G,$SZ*6(%r15)
577         mov     $H,$SZ*7(%r15)
578
579         jb      .Lloop_xop
580
581         mov     $_ivp,$ivp
582         mov     $_rsp,%rsi
583         vmovdqu $iv,($ivp)              # output IV
584         vzeroall
585 ___
586 $code.=<<___ if ($win64);
587         movaps  `$framesz+16*0`(%rsp),%xmm6
588         movaps  `$framesz+16*1`(%rsp),%xmm7
589         movaps  `$framesz+16*2`(%rsp),%xmm8
590         movaps  `$framesz+16*3`(%rsp),%xmm9
591         movaps  `$framesz+16*4`(%rsp),%xmm10
592         movaps  `$framesz+16*5`(%rsp),%xmm11
593         movaps  `$framesz+16*6`(%rsp),%xmm12
594         movaps  `$framesz+16*7`(%rsp),%xmm13
595         movaps  `$framesz+16*8`(%rsp),%xmm14
596         movaps  `$framesz+16*9`(%rsp),%xmm15
597 ___
598 $code.=<<___;
599         mov     (%rsi),%r15
600         mov     8(%rsi),%r14
601         mov     16(%rsi),%r13
602         mov     24(%rsi),%r12
603         mov     32(%rsi),%rbp
604         mov     40(%rsi),%rbx
605         lea     48(%rsi),%rsp
606 .Lepilogue_xop:
607         ret
608 .size   ${func}_xop,.-${func}_xop
609 ___
610 ######################################################################
611 # AVX+shrd code path
612 #
613 local *ror = sub { &shrd(@_[0],@_) };
614
615 $code.=<<___;
616 .type   ${func}_avx,\@function,6
617 .align  64
618 ${func}_avx:
619 .Lavx_shortcut:
620         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
621         push    %rbx
622         push    %rbp
623         push    %r12
624         push    %r13
625         push    %r14
626         push    %r15
627         mov     %rsp,%r11               # copy %rsp
628         sub     \$`$framesz+$win64*16*10`,%rsp
629         and     \$-64,%rsp              # align stack frame
630
631         shl     \$6,$len
632         sub     $inp,$out               # re-bias
633         sub     $inp,$in0
634         add     $inp,$len               # end of input
635
636         #mov    $inp,$_inp              # saved later
637         mov     $out,$_out
638         mov     $len,$_end
639         #mov    $key,$_key              # remains resident in $inp register
640         mov     $ivp,$_ivp
641         mov     $ctx,$_ctx
642         mov     $in0,$_in0
643         mov     %r11,$_rsp
644 ___
645 $code.=<<___ if ($win64);
646         movaps  %xmm6,`$framesz+16*0`(%rsp)
647         movaps  %xmm7,`$framesz+16*1`(%rsp)
648         movaps  %xmm8,`$framesz+16*2`(%rsp)
649         movaps  %xmm9,`$framesz+16*3`(%rsp)
650         movaps  %xmm10,`$framesz+16*4`(%rsp)
651         movaps  %xmm11,`$framesz+16*5`(%rsp)
652         movaps  %xmm12,`$framesz+16*6`(%rsp)
653         movaps  %xmm13,`$framesz+16*7`(%rsp)
654         movaps  %xmm14,`$framesz+16*8`(%rsp)
655         movaps  %xmm15,`$framesz+16*9`(%rsp)
656 ___
657 $code.=<<___;
658 .Lprologue_avx:
659         vzeroall
660
661         mov     $inp,%r12               # borrow $a4
662         lea     0x80($key),$inp         # size optimization, reassign
663         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
664         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
665         mov     $ctx,%r15               # borrow $a2
666         mov     $in0,%rsi               # borrow $a3
667         vmovdqu ($ivp),$iv              # load IV
668         sub     \$9,%r14
669
670         mov     $SZ*0(%r15),$A
671         mov     $SZ*1(%r15),$B
672         mov     $SZ*2(%r15),$C
673         mov     $SZ*3(%r15),$D
674         mov     $SZ*4(%r15),$E
675         mov     $SZ*5(%r15),$F
676         mov     $SZ*6(%r15),$G
677         mov     $SZ*7(%r15),$H
678
679         vmovdqa 0x00(%r13,%r14,8),$mask14
680         vmovdqa 0x10(%r13,%r14,8),$mask12
681         vmovdqa 0x20(%r13,%r14,8),$mask10
682         vmovdqu 0x00-0x80($inp),$roundkey
683 ___
684                                         if ($SZ==4) {   # SHA256
685     my @X = map("%xmm$_",(0..3));
686     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
687
688 $code.=<<___;
689         jmp     .Lloop_avx
690 .align  16
691 .Lloop_avx:
692         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
693         vmovdqu 0x00(%rsi,%r12),@X[0]
694         vmovdqu 0x10(%rsi,%r12),@X[1]
695         vmovdqu 0x20(%rsi,%r12),@X[2]
696         vmovdqu 0x30(%rsi,%r12),@X[3]
697         vpshufb $t3,@X[0],@X[0]
698         lea     $TABLE(%rip),$Tbl
699         vpshufb $t3,@X[1],@X[1]
700         vpshufb $t3,@X[2],@X[2]
701         vpaddd  0x00($Tbl),@X[0],$t0
702         vpshufb $t3,@X[3],@X[3]
703         vpaddd  0x20($Tbl),@X[1],$t1
704         vpaddd  0x40($Tbl),@X[2],$t2
705         vpaddd  0x60($Tbl),@X[3],$t3
706         vmovdqa $t0,0x00(%rsp)
707         mov     $A,$a1
708         vmovdqa $t1,0x10(%rsp)
709         mov     $B,$a3
710         vmovdqa $t2,0x20(%rsp)
711         xor     $C,$a3                  # magic
712         vmovdqa $t3,0x30(%rsp)
713         mov     $E,$a0
714         jmp     .Lavx_00_47
715
716 .align  16
717 .Lavx_00_47:
718         sub     \$-16*2*$SZ,$Tbl        # size optimization
719         vmovdqu (%r12),$inout           # $a4
720         mov     %r12,$_inp              # $a4
721 ___
722 sub Xupdate_256_AVX () {
723         (
724         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
725          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
726         '&vpsrld        ($t2,$t0,$sigma0[0]);',
727          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
728         '&vpsrld        ($t3,$t0,$sigma0[2])',
729         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
730         '&vpxor         ($t0,$t3,$t2)',
731          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
732         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
733         '&vpxor         ($t0,$t0,$t1)',
734         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
735         '&vpxor         ($t0,$t0,$t2)',
736          '&vpsrld       ($t2,$t3,$sigma1[2]);',
737         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
738          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
739         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
740          '&vpxor        ($t2,$t2,$t3);',
741          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
742          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
743          '&vpshufd      ($t2,$t2,0b10000100)',
744          '&vpsrldq      ($t2,$t2,8)',
745         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
746          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
747          '&vpsrld       ($t2,$t3,$sigma1[2])',
748          '&vpsrlq       ($t3,$t3,$sigma1[0])',
749          '&vpxor        ($t2,$t2,$t3);',
750          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
751          '&vpxor        ($t2,$t2,$t3)',
752          '&vpshufd      ($t2,$t2,0b11101000)',
753          '&vpslldq      ($t2,$t2,8)',
754         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
755         );
756 }
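
# Xupdate_256_AVX() computes four message schedule words per call,
# X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16], with the
# rotations synthesized from shifts and XORs (no XOP rotates here).
# sigma1 is applied in two passes because only two of the four lanes
# (X[14..15], then the freshly computed X[16..17]) are available at a
# time.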
757
758 sub AVX_256_00_47 () {
759 my $j = shift;
760 my $body = shift;
761 my @X = @_;
762 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
763
764         foreach (Xupdate_256_AVX()) {           # 29 instructions
765             eval;
766             eval(shift(@insns));
767             eval(shift(@insns));
768             eval(shift(@insns));
769         }
770         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
771           foreach (@insns) { eval; }            # remaining instructions
772         &vmovdqa        (16*$j."(%rsp)",$t2);
773 }
774
775     $aesni_cbc_idx=0;
776     for ($i=0,$j=0; $j<4; $j++) {
777         &AVX_256_00_47($j,\&body_00_15,@X);
778         push(@X,shift(@X));                     # rotate(@X)
779     }
780         &mov            ("%r12",$_inp);         # borrow $a4
781         &vpand          ($temp,$temp,$mask14);
782         &mov            ("%r15",$_out);         # borrow $a2
783         &vpor           ($iv,$iv,$temp);
784         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
785         &lea            ("%r12","16(%r12)");    # inp++
786
787         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
788         &jne    (".Lavx_00_47");
789
790         &vmovdqu        ($inout,"(%r12)");
791         &mov            ($_inp,"%r12");
792
793     $aesni_cbc_idx=0;
794     for ($i=0; $i<16; ) {
795         foreach(body_00_15()) { eval; }
796     }
797
798                                         }
799 $code.=<<___;
800         mov     $_inp,%r12              # borrow $a4
801         mov     $_out,%r13              # borrow $a0
802         mov     $_ctx,%r15              # borrow $a2
803         mov     $_in0,%rsi              # borrow $a3
804
805         vpand   $mask14,$temp,$temp
806         mov     $a1,$A
807         vpor    $temp,$iv,$iv
808         vmovdqu $iv,(%r13,%r12)         # write output
809         lea     16(%r12),%r12           # inp++
810
811         add     $SZ*0(%r15),$A
812         add     $SZ*1(%r15),$B
813         add     $SZ*2(%r15),$C
814         add     $SZ*3(%r15),$D
815         add     $SZ*4(%r15),$E
816         add     $SZ*5(%r15),$F
817         add     $SZ*6(%r15),$G
818         add     $SZ*7(%r15),$H
819
820         cmp     $_end,%r12
821
822         mov     $A,$SZ*0(%r15)
823         mov     $B,$SZ*1(%r15)
824         mov     $C,$SZ*2(%r15)
825         mov     $D,$SZ*3(%r15)
826         mov     $E,$SZ*4(%r15)
827         mov     $F,$SZ*5(%r15)
828         mov     $G,$SZ*6(%r15)
829         mov     $H,$SZ*7(%r15)
830         jb      .Lloop_avx
831
832         mov     $_ivp,$ivp
833         mov     $_rsp,%rsi
834         vmovdqu $iv,($ivp)              # output IV
835         vzeroall
836 ___
837 $code.=<<___ if ($win64);
838         movaps  `$framesz+16*0`(%rsp),%xmm6
839         movaps  `$framesz+16*1`(%rsp),%xmm7
840         movaps  `$framesz+16*2`(%rsp),%xmm8
841         movaps  `$framesz+16*3`(%rsp),%xmm9
842         movaps  `$framesz+16*4`(%rsp),%xmm10
843         movaps  `$framesz+16*5`(%rsp),%xmm11
844         movaps  `$framesz+16*6`(%rsp),%xmm12
845         movaps  `$framesz+16*7`(%rsp),%xmm13
846         movaps  `$framesz+16*8`(%rsp),%xmm14
847         movaps  `$framesz+16*9`(%rsp),%xmm15
848 ___
849 $code.=<<___;
850         mov     (%rsi),%r15
851         mov     8(%rsi),%r14
852         mov     16(%rsi),%r13
853         mov     24(%rsi),%r12
854         mov     32(%rsi),%rbp
855         mov     40(%rsi),%rbx
856         lea     48(%rsi),%rsp
857 .Lepilogue_avx:
858         ret
859 .size   ${func}_avx,.-${func}_avx
860 ___
861
862 if ($avx>1) {{
863 ######################################################################
864 # AVX2+BMI code path
865 #
866 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
867 my $PUSH8=8*2*$SZ;
868 use integer;
869
870 sub bodyx_00_15 () {
871         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
872         (
873         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
874
875         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
876         '&and   ($a4,$e)',              # f&e
877         '&rorx  ($a0,$e,$Sigma1[2])',
878         '&rorx  ($a2,$e,$Sigma1[1])',
879
880         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
881         '&lea   ($h,"($h,$a4)")',
882         '&andn  ($a4,$e,$g)',           # ~e&g
883         '&xor   ($a0,$a2)',
884
885         '&rorx  ($a1,$e,$Sigma1[0])',
886         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
887         '&xor   ($a0,$a1)',             # Sigma1(e)
888         '&mov   ($a2,$a)',
889
890         '&rorx  ($a4,$a,$Sigma0[2])',
891         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
892         '&xor   ($a2,$b)',              # a^b, b^c in next round
893         '&rorx  ($a1,$a,$Sigma0[1])',
894
895         '&rorx  ($a0,$a,$Sigma0[0])',
896         '&lea   ($d,"($d,$h)")',        # d+=h
897         '&and   ($a3,$a2)',             # (b^c)&(a^b)
898         @aesni_cbc_block[$aesni_cbc_idx++].
899         '&xor   ($a1,$a4)',
900
901         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
902         '&xor   ($a1,$a0)',             # Sigma0(a)
903         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
904         '&mov   ($a4,$e)',              # copy of f in future
905
906         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
907         );
908         # and at the finish one has to $a+=$a1
909 }
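
# A note on bodyx_00_15(): h+=Sigma0(a) is deferred; $a1 carries
# Sigma0(a) "from the past" and is folded in by the next round's
# '&lea ($a,"($a,$a1)")', which is why $a1 must be zero on entry (see
# the comment above) and why the callers finish with 'add $a1,$A'.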
910
911 $code.=<<___;
912 .type   ${func}_avx2,\@function,6
913 .align  64
914 ${func}_avx2:
915 .Lavx2_shortcut:
916         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
917         push    %rbx
918         push    %rbp
919         push    %r12
920         push    %r13
921         push    %r14
922         push    %r15
923         mov     %rsp,%r11               # copy %rsp
924         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
925         and     \$-256*$SZ,%rsp         # align stack frame
926         add     \$`2*$SZ*($rounds-8)`,%rsp
927
928         shl     \$6,$len
929         sub     $inp,$out               # re-bias
930         sub     $inp,$in0
931         add     $inp,$len               # end of input
932
933         #mov    $inp,$_inp              # saved later
934         #mov    $out,$_out              # kept in $offload
935         mov     $len,$_end
936         #mov    $key,$_key              # remains resident in $inp register
937         mov     $ivp,$_ivp
938         mov     $ctx,$_ctx
939         mov     $in0,$_in0
940         mov     %r11,$_rsp
941 ___
942 $code.=<<___ if ($win64);
943         movaps  %xmm6,`$framesz+16*0`(%rsp)
944         movaps  %xmm7,`$framesz+16*1`(%rsp)
945         movaps  %xmm8,`$framesz+16*2`(%rsp)
946         movaps  %xmm9,`$framesz+16*3`(%rsp)
947         movaps  %xmm10,`$framesz+16*4`(%rsp)
948         movaps  %xmm11,`$framesz+16*5`(%rsp)
949         movaps  %xmm12,`$framesz+16*6`(%rsp)
950         movaps  %xmm13,`$framesz+16*7`(%rsp)
951         movaps  %xmm14,`$framesz+16*8`(%rsp)
952         movaps  %xmm15,`$framesz+16*9`(%rsp)
953 ___
954 $code.=<<___;
955 .Lprologue_avx2:
956         vzeroall
957
958         mov     $inp,%r13               # borrow $a0
959         vpinsrq \$1,$out,$offload,$offload
960         lea     0x80($key),$inp         # size optimization, reassign
961         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
962         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
963         mov     $ctx,%r15               # borrow $a2
964         mov     $in0,%rsi               # borrow $a3
965         vmovdqu ($ivp),$iv              # load IV
966         lea     -9(%r14),%r14
967
968         vmovdqa 0x00(%r12,%r14,8),$mask14
969         vmovdqa 0x10(%r12,%r14,8),$mask12
970         vmovdqa 0x20(%r12,%r14,8),$mask10
971
972         sub     \$-16*$SZ,%r13          # inp++, size optimization
973         mov     $SZ*0(%r15),$A
974         lea     (%rsi,%r13),%r12        # borrow $a0
975         mov     $SZ*1(%r15),$B
976         cmp     $len,%r13               # $_end
977         mov     $SZ*2(%r15),$C
978         cmove   %rsp,%r12               # next block or random data
979         mov     $SZ*3(%r15),$D
980         mov     $SZ*4(%r15),$E
981         mov     $SZ*5(%r15),$F
982         mov     $SZ*6(%r15),$G
983         mov     $SZ*7(%r15),$H
984         vmovdqu 0x00-0x80($inp),$roundkey
985 ___
986                                         if ($SZ==4) {   # SHA256
987     my @X = map("%ymm$_",(0..3));
988     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
989
990 $code.=<<___;
991         jmp     .Loop_avx2
992 .align  16
993 .Loop_avx2:
994         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
995         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
996         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
997         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
998         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
999
1000         vinserti128     \$1,(%r12),@X[0],@X[0]
1001         vinserti128     \$1,16(%r12),@X[1],@X[1]
1002          vpshufb        $t3,@X[0],@X[0]
1003         vinserti128     \$1,32(%r12),@X[2],@X[2]
1004          vpshufb        $t3,@X[1],@X[1]
1005         vinserti128     \$1,48(%r12),@X[3],@X[3]
1006
1007         lea     $TABLE(%rip),$Tbl
1008         vpshufb $t3,@X[2],@X[2]
1009         lea     -16*$SZ(%r13),%r13
1010         vpaddd  0x00($Tbl),@X[0],$t0
1011         vpshufb $t3,@X[3],@X[3]
1012         vpaddd  0x20($Tbl),@X[1],$t1
1013         vpaddd  0x40($Tbl),@X[2],$t2
1014         vpaddd  0x60($Tbl),@X[3],$t3
1015         vmovdqa $t0,0x00(%rsp)
1016         xor     $a1,$a1
1017         vmovdqa $t1,0x20(%rsp)
1018         lea     -$PUSH8(%rsp),%rsp
1019         mov     $B,$a3
1020         vmovdqa $t2,0x00(%rsp)
1021         xor     $C,$a3                  # magic
1022         vmovdqa $t3,0x20(%rsp)
1023         mov     $F,$a4
1024         sub     \$-16*2*$SZ,$Tbl        # size optimization
1025         jmp     .Lavx2_00_47
1026
1027 .align  16
1028 .Lavx2_00_47:
1029         vmovdqu (%r13),$inout
1030         vpinsrq \$0,%r13,$offload,$offload
1031 ___
1032
1033 sub AVX2_256_00_47 () {
1034 my $j = shift;
1035 my $body = shift;
1036 my @X = @_;
1037 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1038 my $base = "+2*$PUSH8(%rsp)";
1039
1040         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1041         foreach (Xupdate_256_AVX()) {           # 29 instructions
1042             eval;
1043             eval(shift(@insns));
1044             eval(shift(@insns));
1045             eval(shift(@insns));
1046         }
1047         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1048           foreach (@insns) { eval; }            # remaining instructions
1049         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1050 }
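
# A note on the AVX2 flow: each %ymm schedule register holds the
# current 64-byte block in its low 128-bit lane and the *next* block in
# the high lane (vinserti128 from %r12, which points either at the next
# input block or, past the end, at harmless stack data). The main
# .Lavx2_00_47 pass stacks the pre-added X[i]+K[i] words for both
# lanes, and the .Lower_avx2 pass below replays the stacked words to
# finish the second block without recomputing the schedule.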
1051     $aesni_cbc_idx=0;
1052     for ($i=0,$j=0; $j<4; $j++) {
1053         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1054         push(@X,shift(@X));                     # rotate(@X)
1055     }
1056         &vmovq          ("%r13",$offload);      # borrow $a0
1057         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1058         &vpand          ($temp,$temp,$mask14);
1059         &vpor           ($iv,$iv,$temp);
1060         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1061         &lea            ("%r13","16(%r13)");    # inp++
1062
1063         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1064         &cmpb   (($SZ-1)."($Tbl)",0);
1065         &jne    (".Lavx2_00_47");
1066
1067         &vmovdqu        ($inout,"(%r13)");
1068         &vpinsrq        ($offload,$offload,"%r13",0);
1069
1070     $aesni_cbc_idx=0;
1071     for ($i=0; $i<16; ) {
1072         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1073         foreach(bodyx_00_15()) { eval; }
1074     }
1075                                         }
1076 $code.=<<___;
1077         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1078         vmovq   $offload,%r13                   # $_inp, borrow $a0
1079         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1080         add     $a1,$A
1081         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1082
1083         vpand   $mask14,$temp,$temp
1084         vpor    $temp,$iv,$iv
1085         vmovdqu $iv,(%r12,%r13)                 # write output
1086         lea     16(%r13),%r13
1087
1088         add     $SZ*0(%r15),$A
1089         add     $SZ*1(%r15),$B
1090         add     $SZ*2(%r15),$C
1091         add     $SZ*3(%r15),$D
1092         add     $SZ*4(%r15),$E
1093         add     $SZ*5(%r15),$F
1094         add     $SZ*6(%r15),$G
1095         add     $SZ*7(%r15),$H
1096
1097         mov     $A,$SZ*0(%r15)
1098         mov     $B,$SZ*1(%r15)
1099         mov     $C,$SZ*2(%r15)
1100         mov     $D,$SZ*3(%r15)
1101         mov     $E,$SZ*4(%r15)
1102         mov     $F,$SZ*5(%r15)
1103         mov     $G,$SZ*6(%r15)
1104         mov     $H,$SZ*7(%r15)
1105
1106         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1107         je      .Ldone_avx2
1108
1109         xor     $a1,$a1
1110         mov     $B,$a3
1111         mov     $F,$a4
1112         xor     $C,$a3                  # magic
1113         jmp     .Lower_avx2
1114 .align  16
1115 .Lower_avx2:
1116         vmovdqu (%r13),$inout
1117         vpinsrq \$0,%r13,$offload,$offload
1118 ___
1119     $aesni_cbc_idx=0;
1120     for ($i=0; $i<16; ) {
1121         my $base="+16($Tbl)";
1122         foreach(bodyx_00_15()) { eval; }
1123         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1124     }
1125 $code.=<<___;
1126         vmovq   $offload,%r13                   # borrow $a0
1127         vpextrq \$1,$offload,%r15               # borrow $a2
1128         vpand   $mask14,$temp,$temp
1129         vpor    $temp,$iv,$iv
1130         lea     -$PUSH8($Tbl),$Tbl
1131         vmovdqu $iv,(%r15,%r13)                 # write output
1132         lea     16(%r13),%r13                   # inp++
1133         cmp     %rsp,$Tbl
1134         jae     .Lower_avx2
1135
1136         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1137         lea     16*$SZ(%r13),%r13
1138         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1139         add     $a1,$A
1140         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1141
1142         add     $SZ*0(%r15),$A
1143         add     $SZ*1(%r15),$B
1144         add     $SZ*2(%r15),$C
1145         add     $SZ*3(%r15),$D
1146         add     $SZ*4(%r15),$E
1147         add     $SZ*5(%r15),$F
1148         add     $SZ*6(%r15),$G
1149         lea     (%rsi,%r13),%r12
1150         add     $SZ*7(%r15),$H
1151
1152         cmp     $_end,%r13
1153
1154         mov     $A,$SZ*0(%r15)
1155         cmove   %rsp,%r12               # next block or stale data
1156         mov     $B,$SZ*1(%r15)
1157         mov     $C,$SZ*2(%r15)
1158         mov     $D,$SZ*3(%r15)
1159         mov     $E,$SZ*4(%r15)
1160         mov     $F,$SZ*5(%r15)
1161         mov     $G,$SZ*6(%r15)
1162         mov     $H,$SZ*7(%r15)
1163
1164         jbe     .Loop_avx2
1165         lea     (%rsp),$Tbl
1166
1167 .Ldone_avx2:
1168         lea     ($Tbl),%rsp
1169         mov     $_ivp,$ivp
1170         mov     $_rsp,%rsi
1171         vmovdqu $iv,($ivp)              # output IV
1172         vzeroall
1173 ___
1174 $code.=<<___ if ($win64);
1175         movaps  `$framesz+16*0`(%rsp),%xmm6
1176         movaps  `$framesz+16*1`(%rsp),%xmm7
1177         movaps  `$framesz+16*2`(%rsp),%xmm8
1178         movaps  `$framesz+16*3`(%rsp),%xmm9
1179         movaps  `$framesz+16*4`(%rsp),%xmm10
1180         movaps  `$framesz+16*5`(%rsp),%xmm11
1181         movaps  `$framesz+16*6`(%rsp),%xmm12
1182         movaps  `$framesz+16*7`(%rsp),%xmm13
1183         movaps  `$framesz+16*8`(%rsp),%xmm14
1184         movaps  `$framesz+16*9`(%rsp),%xmm15
1185 ___
1186 $code.=<<___;
1187         mov     (%rsi),%r15
1188         mov     8(%rsi),%r14
1189         mov     16(%rsi),%r13
1190         mov     24(%rsi),%r12
1191         mov     32(%rsi),%rbp
1192         mov     40(%rsi),%rbx
1193         lea     48(%rsi),%rsp
1194 .Lepilogue_avx2:
1195         ret
1196 .size   ${func}_avx2,.-${func}_avx2
1197 ___
1198 }}
1199 }}}}}
1200
1201 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1202 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1203 if ($win64) {
1204 $rec="%rcx";
1205 $frame="%rdx";
1206 $context="%r8";
1207 $disp="%r9";
1208
1209 $code.=<<___ if ($avx);
1210 .extern __imp_RtlVirtualUnwind
1211 .type   se_handler,\@abi-omnipotent
1212 .align  16
1213 se_handler:
1214         push    %rsi
1215         push    %rdi
1216         push    %rbx
1217         push    %rbp
1218         push    %r12
1219         push    %r13
1220         push    %r14
1221         push    %r15
1222         pushfq
1223         sub     \$64,%rsp
1224
1225         mov     120($context),%rax      # pull context->Rax
1226         mov     248($context),%rbx      # pull context->Rip
1227
1228         mov     8($disp),%rsi           # disp->ImageBase
1229         mov     56($disp),%r11          # disp->HandlerData
1230
1231         mov     0(%r11),%r10d           # HandlerData[0]
1232         lea     (%rsi,%r10),%r10        # prologue label
1233         cmp     %r10,%rbx               # context->Rip<prologue label
1234         jb      .Lin_prologue
1235
1236         mov     152($context),%rax      # pull context->Rsp
1237
1238         mov     4(%r11),%r10d           # HandlerData[1]
1239         lea     (%rsi,%r10),%r10        # epilogue label
1240         cmp     %r10,%rbx               # context->Rip>=epilogue label
1241         jae     .Lin_prologue
1242 ___
1243 $code.=<<___ if ($avx>1);
1244         lea     .Lavx2_shortcut(%rip),%r10
1245         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1246         jb      .Lnot_in_avx2
1247
1248         and     \$-256*$SZ,%rax
1249         add     \$`2*$SZ*($rounds-8)`,%rax
1250 .Lnot_in_avx2:
1251 ___
1252 $code.=<<___;
1253         mov     %rax,%rsi               # put aside Rsp
1254         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1255         lea     48(%rax),%rax
1256
1257         mov     -8(%rax),%rbx
1258         mov     -16(%rax),%rbp
1259         mov     -24(%rax),%r12
1260         mov     -32(%rax),%r13
1261         mov     -40(%rax),%r14
1262         mov     -48(%rax),%r15
1263         mov     %rbx,144($context)      # restore context->Rbx
1264         mov     %rbp,160($context)      # restore context->Rbp
1265         mov     %r12,216($context)      # restore context->R12
1266         mov     %r13,224($context)      # restore context->R13
1267         mov     %r14,232($context)      # restore context->R14
1268         mov     %r15,240($context)      # restore context->R15
1269
1273
1274         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6..Xmm15 save area
1275         lea     512($context),%rdi      # &context.Xmm6
1276         mov     \$20,%ecx
1277         .long   0xa548f3fc              # cld; rep movsq
1278
1279 .Lin_prologue:
1280         mov     8(%rax),%rdi
1281         mov     16(%rax),%rsi
1282         mov     %rax,152($context)      # restore context->Rsp
1283         mov     %rsi,168($context)      # restore context->Rsi
1284         mov     %rdi,176($context)      # restore context->Rdi
1285
1286         mov     40($disp),%rdi          # disp->ContextRecord
1287         mov     $context,%rsi           # context
1288         mov     \$154,%ecx              # sizeof(CONTEXT)
1289         .long   0xa548f3fc              # cld; rep movsq
1290
1291         mov     $disp,%rsi
1292         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1293         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1294         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1295         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1296         mov     40(%rsi),%r10           # disp->ContextRecord
1297         lea     56(%rsi),%r11           # &disp->HandlerData
1298         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1299         mov     %r10,32(%rsp)           # arg5
1300         mov     %r11,40(%rsp)           # arg6
1301         mov     %r12,48(%rsp)           # arg7
1302         mov     %rcx,56(%rsp)           # arg8, (NULL)
1303         call    *__imp_RtlVirtualUnwind(%rip)
1304
1305         mov     \$1,%eax                # ExceptionContinueSearch
1306         add     \$64,%rsp
1307         popfq
1308         pop     %r15
1309         pop     %r14
1310         pop     %r13
1311         pop     %r12
1312         pop     %rbp
1313         pop     %rbx
1314         pop     %rdi
1315         pop     %rsi
1316         ret
1317 .size   se_handler,.-se_handler
1318
1319 .section        .pdata
1320         .rva    .LSEH_begin_${func}_xop
1321         .rva    .LSEH_end_${func}_xop
1322         .rva    .LSEH_info_${func}_xop
1323
1324         .rva    .LSEH_begin_${func}_avx
1325         .rva    .LSEH_end_${func}_avx
1326         .rva    .LSEH_info_${func}_avx
1327 ___
1328 $code.=<<___ if ($avx>1);
1329         .rva    .LSEH_begin_${func}_avx2
1330         .rva    .LSEH_end_${func}_avx2
1331         .rva    .LSEH_info_${func}_avx2
1332 ___
1333 $code.=<<___ if ($avx);
1334 .section        .xdata
1335 .align  8
1336 .LSEH_info_${func}_xop:
1337         .byte   9,0,0,0
1338         .rva    se_handler
1339         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1340
1341 .LSEH_info_${func}_avx:
1342         .byte   9,0,0,0
1343         .rva    se_handler
1344         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1345 ___
1346 $code.=<<___ if ($avx>1);
1347 .LSEH_info_${func}_avx2:
1348         .byte   9,0,0,0
1349         .rva    se_handler
1350         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1351 ___
1352 }
1353
1354 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1355 print $code;
1356 close STDOUT;