1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and better overall performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into it. Since SHA256 dominates execution time,
19 # the stitch's performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)        gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
28 #
29 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
30 #       Westmere is omitted from the loop, because the estimated gain
31 #       was not high enough to justify the effort;
32 # (**)  these are EVP-free results; results obtained with 'speed
33 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
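#
#       The interleave granularity, as implemented below: one 16-byte
#       AES-CBC block is pushed through its round sequence (see
#       @aesni_cbc_block) while 16 SHA256 rounds are computed, i.e. four
#       cipher blocks per 64-byte message block, so both input streams
#       advance at the same rate.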
34
35 $flavour = shift;
36 $output  = shift;
37 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
38
39 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
40
41 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
42 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
43 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
44 die "can't locate x86_64-xlate.pl";
45
46 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
47                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
48         $avx = ($1>=2.19) + ($1>=2.22);
49 }
50
51 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
52            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
53         $avx = ($1>=2.09) + ($1>=2.10);
54 }
55
56 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
57            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
58         $avx = ($1>=10) + ($1>=11);
59 }
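
# $avx selects the widest code path the assembler can handle: 0 - no AVX
# path is emitted at all, 1 - XOP and AVX1, 2 - additionally AVX2+BMI;
# the version thresholds above correspond to assemblers that understand
# the respective instruction sets.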
60
61 open OUT,"| \"$^X\" $xlate $flavour $output";
62 *STDOUT=*OUT;
63
64 $func="aesni_cbc_sha256_enc";
65 $TABLE="K256";
66 $SZ=4;
67 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
68                                 "%r8d","%r9d","%r10d","%r11d");
69 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
70 @Sigma0=( 2,13,22);
71 @Sigma1=( 6,11,25);
72 @sigma0=( 7,18, 3);
73 @sigma1=(17,19,10);
74 $rounds=64;
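
# The four arrays above hold the rotate/shift counts of the SHA256 Sigma
# and sigma functions (FIPS 180-4); the third element of @sigma0/@sigma1
# is a plain right shift.  For reference, a scalar C rendition of what
# the counts encode (illustration only, not used by this module):
#
#       #include <stdint.h>
#       static uint32_t rotr(uint32_t x, int n)
#       {       return (x >> n) | (x << (32 - n));      }
#       #define Sigma0(x) (rotr(x, 2) ^ rotr(x,13) ^ rotr(x,22))
#       #define Sigma1(x) (rotr(x, 6) ^ rotr(x,11) ^ rotr(x,25))
#       #define sigma0(x) (rotr(x, 7) ^ rotr(x,18) ^ ((x) >>  3))
#       #define sigma1(x) (rotr(x,17) ^ rotr(x,19) ^ ((x) >> 10))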
75
76 ########################################################################
77 # void aesni_cbc_sha256_enc(const void *inp,
78 #                       void *out,
79 #                       size_t length,
80 #                       const AES_KEY *key,
81 #                       unsigned char *iv,
82 #                       SHA256_CTX *ctx,
83 #                       const void *in0);
84 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
85 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
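
# Hypothetical caller sketch, for illustration only.  Two assumptions are
# made: that length counts 64-byte blocks (suggested by the 'shl $6,$len'
# in the prologues below), and that the routine may be declared with an
# int return so that the NULL-input probe at the top of $func, which
# reports in %eax whether any SIMD path was compiled in, can be used.
# The caller is still expected to check CPU capabilities itself, since an
# unsupported CPU reaches ud2.  raw_key, ptxt, ctxt, to_hash and blocks
# are placeholders.
#
#       #include <openssl/aes.h>
#       #include <openssl/sha.h>
#
#       extern int aesni_cbc_sha256_enc(const void *inp, void *out,
#                                       size_t blocks, const AES_KEY *key,
#                                       unsigned char *iv, SHA256_CTX *ctx,
#                                       const void *in0);
#
#       AES_KEY ks; SHA256_CTX md; unsigned char iv[16] = {0};
#       AES_set_encrypt_key(raw_key, 128, &ks);
#       SHA256_Init(&md);
#       if (aesni_cbc_sha256_enc(NULL, NULL, 0, NULL, NULL, NULL, NULL))
#               /* encrypt inp->out and hash in0, 'blocks' blocks of each */
#               aesni_cbc_sha256_enc(ptxt, ctxt, blocks, &ks, iv, &md, to_hash);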
86
87 $Tbl="%rbp";
88
89 $_inp="16*$SZ+0*8(%rsp)";
90 $_out="16*$SZ+1*8(%rsp)";
91 $_end="16*$SZ+2*8(%rsp)";
92 $_key="16*$SZ+3*8(%rsp)";
93 $_ivp="16*$SZ+4*8(%rsp)";
94 $_ctx="16*$SZ+5*8(%rsp)";
95 $_in0="16*$SZ+6*8(%rsp)";
96 $_rsp="16*$SZ+7*8(%rsp)";
97 $framesz=16*$SZ+8*8;
98
99 $code=<<___;
100 .text
101
102 .extern OPENSSL_ia32cap_P
103 .globl  $func
104 .type   $func,\@abi-omnipotent
105 .align  16
106 $func:
107 ___
108 $code.=<<___ if ($avx);
109         lea     OPENSSL_ia32cap_P(%rip),%r11
110         mov     \$1,%eax
111         cmp     \$0,`$win64?"%rcx":"%rdi"`
112         je      .Lprobe
113         mov     0(%r11),%eax
114         mov     4(%r11),%r10d
115         mov     8(%r11),%r11d
116
117         test    \$`1<<11`,%r10d                 # check for XOP
118         jnz     ${func}_xop
119 ___
120 $code.=<<___ if ($avx>1);
121         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
122         cmp     \$`1<<8|1<<5|1<<3`,%r11d
123         je      ${func}_avx2
124 ___
125 $code.=<<___ if ($avx);
126         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
127         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
128         or      %eax,%r10d
129         cmp     \$`1<<28|1<<9|1<<30`,%r10d
130         je      ${func}_avx
131         ud2
132 ___
133 $code.=<<___;
134         xor     %eax,%eax
135         cmp     \$0,`$win64?"%rcx":"%rdi"`
136         je      .Lprobe
137         ud2
138 .Lprobe:
139         ret
140 .size   $func,.-$func
141
142 .align  64
143 .type   $TABLE,\@object
144 $TABLE:
145         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
146         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
147         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
148         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
149         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
150         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
151         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
152         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
153         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
154         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
155         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
156         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
157         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
158         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
159         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
160         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
161         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
162         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
163         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
164         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
165         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
166         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
167         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
168         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
169         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
170         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
171         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
172         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
173         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
174         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
175         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
176         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
177
178         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
179         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
180         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
181         .long   0,0,0,0,   0,0,0,0
182         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
183 .align  64
184 ___
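
# Note that every 16-byte row of $TABLE is stored twice: the AVX2 path
# adds the same four constants to both 128-bit lanes of a %ymm register
# with a single vpaddd, while the xmm paths simply step over the
# duplicates.  The trailing rows are the big-endian byte-swap mask (also
# duplicated) and the zero/all-ones words from which $mask14, $mask12 and
# $mask10 are loaded according to the AES round count.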
185
186 ######################################################################
187 # SIMD code paths
188 #
189 {{{
190 ($iv,$inout,$roundkey,$temp,
191  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
192
193 $aesni_cbc_idx=0;
194 @aesni_cbc_block = (
195 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
196 ##      &vmovdqu        ($inout,($inp));
197 ##      &mov            ($_inp,$inp);
198
199         '&vpxor         ($inout,$inout,$roundkey);'.
200         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
201
202         '&vpxor         ($inout,$inout,$iv);',
203
204         '&vaesenc       ($inout,$inout,$roundkey);'.
205         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
206
207         '&vaesenc       ($inout,$inout,$roundkey);'.
208         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
209
210         '&vaesenc       ($inout,$inout,$roundkey);'.
211         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
212
213         '&vaesenc       ($inout,$inout,$roundkey);'.
214         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
215
216         '&vaesenc       ($inout,$inout,$roundkey);'.
217         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
218
219         '&vaesenc       ($inout,$inout,$roundkey);'.
220         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
230
231         '&vaesenclast   ($temp,$inout,$roundkey);'.
232         ' &vaesenc      ($inout,$inout,$roundkey);'.
233         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
234
235         '&vpand         ($iv,$temp,$mask10);'.
236         ' &vaesenc      ($inout,$inout,$roundkey);'.
237         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
238
239         '&vaesenclast   ($temp,$inout,$roundkey);'.
240         ' &vaesenc      ($inout,$inout,$roundkey);'.
241         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
242
243         '&vpand         ($temp,$temp,$mask12);'.
244         ' &vaesenc      ($inout,$inout,$roundkey);'.
245         ' &vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
246
247         '&vpor          ($iv,$iv,$temp);'.
248         ' &vaesenclast  ($temp,$inout,$roundkey);'.
249         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
250
251 ##      &mov            ($inp,$_inp);
252 ##      &mov            ($out,$_out);
253 ##      &vpand          ($temp,$temp,$mask14);
254 ##      &vpor           ($iv,$iv,$temp);
255 ##      &vmovdqu        ("($out,$inp)",$iv);
256 ##      &lea            ($inp,"16($inp)");
257 );
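
# One fragment of the list above is issued per SHA256 round (indexed by
# $aesni_cbc_idx in body_00_15), so a full CBC encryption of one 16-byte
# block is spread over 16 rounds.  vaesenclast is executed speculatively
# with the final round keys of AES-128, AES-192 and AES-256 (key schedule
# offsets 0xa0, 0xc0 and 0xe0), and $mask10/$mask12/$mask14 blend the
# correct result into $iv, conceptually
#
#       iv = (last10 & mask10) | (last12 & mask12) | (last14 & mask14);
#
# which is how one instruction schedule serves all AES key lengths.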
258
259 my $a4=$T1;
260 my ($a,$b,$c,$d,$e,$f,$g,$h);
261
262 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
263 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
264   my $arg = pop;
265     $arg = "\$$arg" if ($arg*1 eq $arg);
266     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
267 }
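
# Any '&opcode (dst,src)' call without an explicit handler falls through
# to AUTOLOAD above, which appends "\topcode\tsrc,dst" to $code, turning
# numeric arguments into immediates, e.g. '&ror ($a0,14)' emits
# "ror $14,%r13d".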
268
269 sub body_00_15 () {
270         (
271         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
272
273         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
274         '&mov   ($a,$a1)',
275         '&mov   ($a4,$f)',
276
277         '&xor   ($a0,$e)',
278         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
279         '&xor   ($a4,$g)',                      # f^g
280
281         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
282         '&xor   ($a1,$a)',
283         '&and   ($a4,$e)',                      # (f^g)&e
284
285         @aesni_cbc_block[$aesni_cbc_idx++].
286         '&xor   ($a0,$e)',
287         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
288         '&mov   ($a2,$a)',
289
290         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
291         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
292         '&xor   ($a2,$b)',                      # a^b, b^c in next round
293
294         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
295         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
296         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
297
298         '&xor   ($a1,$a)',
299         '&add   ($h,$a0)',                      # h+=Sigma1(e)
300         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
301
302         '&add   ($d,$h)',                       # d+=h
303         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
304         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
305
306         '&mov   ($a0,$d)',
307         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
308         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
309         );
310 }
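
# For reference, one round of the SHA256 compression function that the
# sequence above implements (scalar pseudo-C, illustration only):
#
#       T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i];
#       T2 = Sigma0(a) + Maj(a,b,c);
#       h = g; g = f; f = e; e = d + T1;
#       d = c; c = b; b = a; a = T1 + T2;
#
# body_00_15 avoids the variable shuffle by rotating @ROT instead, keeps
# W[i]+K[i] pre-added on the stack, computes Ch as ((f^g)&e)^g and Maj as
# ((b^c)&(a^b))^b, so that this round's a^b can serve as the next round's
# b^c.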
311
312 if ($avx) {{
313 ######################################################################
314 # XOP code path
315 #
316 $code.=<<___;
317 .type   ${func}_xop,\@function,6
318 .align  64
319 ${func}_xop:
320 .Lxop_shortcut:
321         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
322         push    %rbx
323         push    %rbp
324         push    %r12
325         push    %r13
326         push    %r14
327         push    %r15
328         mov     %rsp,%r11               # copy %rsp
329         sub     \$`$framesz+$win64*16*10`,%rsp
330         and     \$-64,%rsp              # align stack frame
331
332         shl     \$6,$len
333         sub     $inp,$out               # re-bias
334         sub     $inp,$in0
335         add     $inp,$len               # end of input
336
337         #mov    $inp,$_inp              # saved later
338         mov     $out,$_out
339         mov     $len,$_end
340         #mov    $key,$_key              # remains resident in $inp register
341         mov     $ivp,$_ivp
342         mov     $ctx,$_ctx
343         mov     $in0,$_in0
344         mov     %r11,$_rsp
345 ___
346 $code.=<<___ if ($win64);
347         movaps  %xmm6,`$framesz+16*0`(%rsp)
348         movaps  %xmm7,`$framesz+16*1`(%rsp)
349         movaps  %xmm8,`$framesz+16*2`(%rsp)
350         movaps  %xmm9,`$framesz+16*3`(%rsp)
351         movaps  %xmm10,`$framesz+16*4`(%rsp)
352         movaps  %xmm11,`$framesz+16*5`(%rsp)
353         movaps  %xmm12,`$framesz+16*6`(%rsp)
354         movaps  %xmm13,`$framesz+16*7`(%rsp)
355         movaps  %xmm14,`$framesz+16*8`(%rsp)
356         movaps  %xmm15,`$framesz+16*9`(%rsp)
357 ___
358 $code.=<<___;
359 .Lprologue_xop:
360         vzeroall
361
362         mov     $inp,%r12               # borrow $a4
363         lea     0x80($key),$inp         # size optimization, reassign
364         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
365         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
366         mov     $ctx,%r15               # borrow $a2
367         mov     $in0,%rsi               # borrow $a3
368         vmovdqu ($ivp),$iv              # load IV
369         sub     \$9,%r14
370
371         mov     $SZ*0(%r15),$A
372         mov     $SZ*1(%r15),$B
373         mov     $SZ*2(%r15),$C
374         mov     $SZ*3(%r15),$D
375         mov     $SZ*4(%r15),$E
376         mov     $SZ*5(%r15),$F
377         mov     $SZ*6(%r15),$G
378         mov     $SZ*7(%r15),$H
379
380         vmovdqa 0x00(%r13,%r14,8),$mask14
381         vmovdqa 0x10(%r13,%r14,8),$mask12
382         vmovdqa 0x20(%r13,%r14,8),$mask10
383         vmovdqu 0x00-0x80($inp),$roundkey
384         jmp     .Lloop_xop
385 ___
386                                         if ($SZ==4) {   # SHA256
387     my @X = map("%xmm$_",(0..3));
388     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
389
390 $code.=<<___;
391 .align  16
392 .Lloop_xop:
393         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
394         vmovdqu 0x00(%rsi,%r12),@X[0]
395         vmovdqu 0x10(%rsi,%r12),@X[1]
396         vmovdqu 0x20(%rsi,%r12),@X[2]
397         vmovdqu 0x30(%rsi,%r12),@X[3]
398         vpshufb $t3,@X[0],@X[0]
399         lea     $TABLE(%rip),$Tbl
400         vpshufb $t3,@X[1],@X[1]
401         vpshufb $t3,@X[2],@X[2]
402         vpaddd  0x00($Tbl),@X[0],$t0
403         vpshufb $t3,@X[3],@X[3]
404         vpaddd  0x20($Tbl),@X[1],$t1
405         vpaddd  0x40($Tbl),@X[2],$t2
406         vpaddd  0x60($Tbl),@X[3],$t3
407         vmovdqa $t0,0x00(%rsp)
408         mov     $A,$a1
409         vmovdqa $t1,0x10(%rsp)
410         mov     $B,$a3
411         vmovdqa $t2,0x20(%rsp)
412         xor     $C,$a3                  # magic
413         vmovdqa $t3,0x30(%rsp)
414         mov     $E,$a0
415         jmp     .Lxop_00_47
416
417 .align  16
418 .Lxop_00_47:
419         sub     \$-16*2*$SZ,$Tbl        # size optimization
420         vmovdqu (%r12),$inout           # $a4
421         mov     %r12,$_inp              # $a4
422 ___
423 sub XOP_256_00_47 () {
424 my $j = shift;
425 my $body = shift;
426 my @X = @_;
427 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
428
429         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
430           eval(shift(@insns));
431           eval(shift(@insns));
432          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
433           eval(shift(@insns));
434           eval(shift(@insns));
435         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
436           eval(shift(@insns));
437           eval(shift(@insns));
438         &vpsrld         ($t0,$t0,$sigma0[2]);
439           eval(shift(@insns));
440           eval(shift(@insns));
441          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
442           eval(shift(@insns));
443           eval(shift(@insns));
444           eval(shift(@insns));
445           eval(shift(@insns));
446         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
447           eval(shift(@insns));
448           eval(shift(@insns));
449         &vpxor          ($t0,$t0,$t1);
450           eval(shift(@insns));
451           eval(shift(@insns));
452           eval(shift(@insns));
453           eval(shift(@insns));
454          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
455           eval(shift(@insns));
456           eval(shift(@insns));
457         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
458           eval(shift(@insns));
459           eval(shift(@insns));
460          &vpsrld        ($t2,@X[3],$sigma1[2]);
461           eval(shift(@insns));
462           eval(shift(@insns));
463         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
464           eval(shift(@insns));
465           eval(shift(@insns));
466          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
467           eval(shift(@insns));
468           eval(shift(@insns));
469          &vpxor         ($t3,$t3,$t2);
470           eval(shift(@insns));
471           eval(shift(@insns));
472           eval(shift(@insns));
473           eval(shift(@insns));
474          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
475           eval(shift(@insns));
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479         &vpsrldq        ($t3,$t3,8);
480           eval(shift(@insns));
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
485           eval(shift(@insns));
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
490           eval(shift(@insns));
491           eval(shift(@insns));
492          &vpsrld        ($t2,@X[0],$sigma1[2]);
493           eval(shift(@insns));
494           eval(shift(@insns));
495          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
496           eval(shift(@insns));
497           eval(shift(@insns));
498          &vpxor         ($t3,$t3,$t2);
499           eval(shift(@insns));
500           eval(shift(@insns));
501           eval(shift(@insns));
502           eval(shift(@insns));
503          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
504           eval(shift(@insns));
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508         &vpslldq        ($t3,$t3,8);            # 22 instructions
509           eval(shift(@insns));
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
514           eval(shift(@insns));
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
519           foreach (@insns) { eval; }            # remaining instructions
520         &vmovdqa        (16*$j."(%rsp)",$t2);
521 }
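
# Same message-schedule recurrence as Xupdate_256_AVX further below; the
# difference is that XOP's vprotd (packed rotate) replaces the
# vpslld/vpsrld/vpxor emulation of rotates.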
522
523     $aesni_cbc_idx=0;
524     for ($i=0,$j=0; $j<4; $j++) {
525         &XOP_256_00_47($j,\&body_00_15,@X);
526         push(@X,shift(@X));                     # rotate(@X)
527     }
528         &mov            ("%r12",$_inp);         # borrow $a4
529         &vpand          ($temp,$temp,$mask14);
530         &mov            ("%r15",$_out);         # borrow $a2
531         &vpor           ($iv,$iv,$temp);
532         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
533         &lea            ("%r12","16(%r12)");    # inp++
534
535         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
536         &jne    (".Lxop_00_47");
537
538         &vmovdqu        ($inout,"(%r12)");
539         &mov            ($_inp,"%r12");
540
541     $aesni_cbc_idx=0;
542     for ($i=0; $i<16; ) {
543         foreach(body_00_15()) { eval; }
544     }
545                                         }
546 $code.=<<___;
547         mov     $_inp,%r12              # borrow $a4
548         mov     $_out,%r13              # borrow $a0
549         mov     $_ctx,%r15              # borrow $a2
550         mov     $_in0,%rsi              # borrow $a3
551
552         vpand   $mask14,$temp,$temp
553         mov     $a1,$A
554         vpor    $temp,$iv,$iv
555         vmovdqu $iv,(%r13,%r12)         # write output
556         lea     16(%r12),%r12           # inp++
557
558         add     $SZ*0(%r15),$A
559         add     $SZ*1(%r15),$B
560         add     $SZ*2(%r15),$C
561         add     $SZ*3(%r15),$D
562         add     $SZ*4(%r15),$E
563         add     $SZ*5(%r15),$F
564         add     $SZ*6(%r15),$G
565         add     $SZ*7(%r15),$H
566
567         cmp     $_end,%r12
568
569         mov     $A,$SZ*0(%r15)
570         mov     $B,$SZ*1(%r15)
571         mov     $C,$SZ*2(%r15)
572         mov     $D,$SZ*3(%r15)
573         mov     $E,$SZ*4(%r15)
574         mov     $F,$SZ*5(%r15)
575         mov     $G,$SZ*6(%r15)
576         mov     $H,$SZ*7(%r15)
577
578         jb      .Lloop_xop
579
580         mov     $_ivp,$ivp
581         mov     $_rsp,%rsi
582         vmovdqu $iv,($ivp)              # output IV
583         vzeroall
584 ___
585 $code.=<<___ if ($win64);
586         movaps  `$framesz+16*0`(%rsp),%xmm6
587         movaps  `$framesz+16*1`(%rsp),%xmm7
588         movaps  `$framesz+16*2`(%rsp),%xmm8
589         movaps  `$framesz+16*3`(%rsp),%xmm9
590         movaps  `$framesz+16*4`(%rsp),%xmm10
591         movaps  `$framesz+16*5`(%rsp),%xmm11
592         movaps  `$framesz+16*6`(%rsp),%xmm12
593         movaps  `$framesz+16*7`(%rsp),%xmm13
594         movaps  `$framesz+16*8`(%rsp),%xmm14
595         movaps  `$framesz+16*9`(%rsp),%xmm15
596 ___
597 $code.=<<___;
598         mov     (%rsi),%r15
599         mov     8(%rsi),%r14
600         mov     16(%rsi),%r13
601         mov     24(%rsi),%r12
602         mov     32(%rsi),%rbp
603         mov     40(%rsi),%rbx
604         lea     48(%rsi),%rsp
605 .Lepilogue_xop:
606         ret
607 .size   ${func}_xop,.-${func}_xop
608 ___
609 ######################################################################
610 # AVX+shrd code path
611 #
612 local *ror = sub { &shrd(@_[0],@_) };
613
614 $code.=<<___;
615 .type   ${func}_avx,\@function,6
616 .align  64
617 ${func}_avx:
618 .Lavx_shortcut:
619         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
620         push    %rbx
621         push    %rbp
622         push    %r12
623         push    %r13
624         push    %r14
625         push    %r15
626         mov     %rsp,%r11               # copy %rsp
627         sub     \$`$framesz+$win64*16*10`,%rsp
628         and     \$-64,%rsp              # align stack frame
629
630         shl     \$6,$len
631         sub     $inp,$out               # re-bias
632         sub     $inp,$in0
633         add     $inp,$len               # end of input
634
635         #mov    $inp,$_inp              # saved later
636         mov     $out,$_out
637         mov     $len,$_end
638         #mov    $key,$_key              # remains resident in $inp register
639         mov     $ivp,$_ivp
640         mov     $ctx,$_ctx
641         mov     $in0,$_in0
642         mov     %r11,$_rsp
643 ___
644 $code.=<<___ if ($win64);
645         movaps  %xmm6,`$framesz+16*0`(%rsp)
646         movaps  %xmm7,`$framesz+16*1`(%rsp)
647         movaps  %xmm8,`$framesz+16*2`(%rsp)
648         movaps  %xmm9,`$framesz+16*3`(%rsp)
649         movaps  %xmm10,`$framesz+16*4`(%rsp)
650         movaps  %xmm11,`$framesz+16*5`(%rsp)
651         movaps  %xmm12,`$framesz+16*6`(%rsp)
652         movaps  %xmm13,`$framesz+16*7`(%rsp)
653         movaps  %xmm14,`$framesz+16*8`(%rsp)
654         movaps  %xmm15,`$framesz+16*9`(%rsp)
655 ___
656 $code.=<<___;
657 .Lprologue_avx:
658         vzeroall
659
660         mov     $inp,%r12               # borrow $a4
661         lea     0x80($key),$inp         # size optimization, reassign
662         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
663         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
664         mov     $ctx,%r15               # borrow $a2
665         mov     $in0,%rsi               # borrow $a3
666         vmovdqu ($ivp),$iv              # load IV
667         sub     \$9,%r14
668
669         mov     $SZ*0(%r15),$A
670         mov     $SZ*1(%r15),$B
671         mov     $SZ*2(%r15),$C
672         mov     $SZ*3(%r15),$D
673         mov     $SZ*4(%r15),$E
674         mov     $SZ*5(%r15),$F
675         mov     $SZ*6(%r15),$G
676         mov     $SZ*7(%r15),$H
677
678         vmovdqa 0x00(%r13,%r14,8),$mask14
679         vmovdqa 0x10(%r13,%r14,8),$mask12
680         vmovdqa 0x20(%r13,%r14,8),$mask10
681         vmovdqu 0x00-0x80($inp),$roundkey
682 ___
683                                         if ($SZ==4) {   # SHA256
684     my @X = map("%xmm$_",(0..3));
685     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
686
687 $code.=<<___;
688         jmp     .Lloop_avx
689 .align  16
690 .Lloop_avx:
691         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
692         vmovdqu 0x00(%rsi,%r12),@X[0]
693         vmovdqu 0x10(%rsi,%r12),@X[1]
694         vmovdqu 0x20(%rsi,%r12),@X[2]
695         vmovdqu 0x30(%rsi,%r12),@X[3]
696         vpshufb $t3,@X[0],@X[0]
697         lea     $TABLE(%rip),$Tbl
698         vpshufb $t3,@X[1],@X[1]
699         vpshufb $t3,@X[2],@X[2]
700         vpaddd  0x00($Tbl),@X[0],$t0
701         vpshufb $t3,@X[3],@X[3]
702         vpaddd  0x20($Tbl),@X[1],$t1
703         vpaddd  0x40($Tbl),@X[2],$t2
704         vpaddd  0x60($Tbl),@X[3],$t3
705         vmovdqa $t0,0x00(%rsp)
706         mov     $A,$a1
707         vmovdqa $t1,0x10(%rsp)
708         mov     $B,$a3
709         vmovdqa $t2,0x20(%rsp)
710         xor     $C,$a3                  # magic
711         vmovdqa $t3,0x30(%rsp)
712         mov     $E,$a0
713         jmp     .Lavx_00_47
714
715 .align  16
716 .Lavx_00_47:
717         sub     \$-16*2*$SZ,$Tbl        # size optimization
718         vmovdqu (%r12),$inout           # $a4
719         mov     %r12,$_inp              # $a4
720 ___
721 sub Xupdate_256_AVX () {
722         (
723         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
724          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
725         '&vpsrld        ($t2,$t0,$sigma0[0]);',
726          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
727         '&vpsrld        ($t3,$t0,$sigma0[2])',
728         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
729         '&vpxor         ($t0,$t3,$t2)',
730          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
731         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
732         '&vpxor         ($t0,$t0,$t1)',
733         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
734         '&vpxor         ($t0,$t0,$t2)',
735          '&vpsrld       ($t2,$t3,$sigma1[2]);',
736         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
737          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
738         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
739          '&vpxor        ($t2,$t2,$t3);',
740          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
741          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
742          '&vpshufd      ($t2,$t2,0b10000100)',
743          '&vpsrldq      ($t2,$t2,8)',
744         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
745          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
746          '&vpsrld       ($t2,$t3,$sigma1[2])',
747          '&vpsrlq       ($t3,$t3,$sigma1[0])',
748          '&vpxor        ($t2,$t2,$t3);',
749          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
750          '&vpxor        ($t2,$t2,$t3)',
751          '&vpshufd      ($t2,$t2,0b11101000)',
752          '&vpslldq      ($t2,$t2,8)',
753         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
754         );
755 }
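
# Xupdate_256_AVX vectorizes the SHA256 message schedule four elements at
# a time; the scalar recurrence it implements is (illustration only)
#
#       W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16];
#
# with the sigma functions of FIPS 180-4 (rotate counts in @sigma0 and
# @sigma1 above).  Because W[i-2] of the last two lanes is itself being
# computed in the same step, the sigma1 contribution is applied in two
# halves: first from X[14..15], then from the freshly formed X[16..17].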
756
757 sub AVX_256_00_47 () {
758 my $j = shift;
759 my $body = shift;
760 my @X = @_;
761 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
762
763         foreach (Xupdate_256_AVX()) {           # 29 instructions
764             eval;
765             eval(shift(@insns));
766             eval(shift(@insns));
767             eval(shift(@insns));
768         }
769         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
770           foreach (@insns) { eval; }            # remaining instructions
771         &vmovdqa        (16*$j."(%rsp)",$t2);
772 }
773
774     $aesni_cbc_idx=0;
775     for ($i=0,$j=0; $j<4; $j++) {
776         &AVX_256_00_47($j,\&body_00_15,@X);
777         push(@X,shift(@X));                     # rotate(@X)
778     }
779         &mov            ("%r12",$_inp);         # borrow $a4
780         &vpand          ($temp,$temp,$mask14);
781         &mov            ("%r15",$_out);         # borrow $a2
782         &vpor           ($iv,$iv,$temp);
783         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
784         &lea            ("%r12","16(%r12)");    # inp++
785
786         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
787         &jne    (".Lavx_00_47");
788
789         &vmovdqu        ($inout,"(%r12)");
790         &mov            ($_inp,"%r12");
791
792     $aesni_cbc_idx=0;
793     for ($i=0; $i<16; ) {
794         foreach(body_00_15()) { eval; }
795     }
796
797                                         }
798 $code.=<<___;
799         mov     $_inp,%r12              # borrow $a4
800         mov     $_out,%r13              # borrow $a0
801         mov     $_ctx,%r15              # borrow $a2
802         mov     $_in0,%rsi              # borrow $a3
803
804         vpand   $mask14,$temp,$temp
805         mov     $a1,$A
806         vpor    $temp,$iv,$iv
807         vmovdqu $iv,(%r13,%r12)         # write output
808         lea     16(%r12),%r12           # inp++
809
810         add     $SZ*0(%r15),$A
811         add     $SZ*1(%r15),$B
812         add     $SZ*2(%r15),$C
813         add     $SZ*3(%r15),$D
814         add     $SZ*4(%r15),$E
815         add     $SZ*5(%r15),$F
816         add     $SZ*6(%r15),$G
817         add     $SZ*7(%r15),$H
818
819         cmp     $_end,%r12
820
821         mov     $A,$SZ*0(%r15)
822         mov     $B,$SZ*1(%r15)
823         mov     $C,$SZ*2(%r15)
824         mov     $D,$SZ*3(%r15)
825         mov     $E,$SZ*4(%r15)
826         mov     $F,$SZ*5(%r15)
827         mov     $G,$SZ*6(%r15)
828         mov     $H,$SZ*7(%r15)
829         jb      .Lloop_avx
830
831         mov     $_ivp,$ivp
832         mov     $_rsp,%rsi
833         vmovdqu $iv,($ivp)              # output IV
834         vzeroall
835 ___
836 $code.=<<___ if ($win64);
837         movaps  `$framesz+16*0`(%rsp),%xmm6
838         movaps  `$framesz+16*1`(%rsp),%xmm7
839         movaps  `$framesz+16*2`(%rsp),%xmm8
840         movaps  `$framesz+16*3`(%rsp),%xmm9
841         movaps  `$framesz+16*4`(%rsp),%xmm10
842         movaps  `$framesz+16*5`(%rsp),%xmm11
843         movaps  `$framesz+16*6`(%rsp),%xmm12
844         movaps  `$framesz+16*7`(%rsp),%xmm13
845         movaps  `$framesz+16*8`(%rsp),%xmm14
846         movaps  `$framesz+16*9`(%rsp),%xmm15
847 ___
848 $code.=<<___;
849         mov     (%rsi),%r15
850         mov     8(%rsi),%r14
851         mov     16(%rsi),%r13
852         mov     24(%rsi),%r12
853         mov     32(%rsi),%rbp
854         mov     40(%rsi),%rbx
855         lea     48(%rsi),%rsp
856 .Lepilogue_avx:
857         ret
858 .size   ${func}_avx,.-${func}_avx
859 ___
860
861 if ($avx>1) {{
862 ######################################################################
863 # AVX2+BMI code path
864 #
865 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
866 my $PUSH8=8*2*$SZ;
867 use integer;
868
869 sub bodyx_00_15 () {
870         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
871         (
872         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
873
874         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
875         '&and   ($a4,$e)',              # f&e
876         '&rorx  ($a0,$e,$Sigma1[2])',
877         '&rorx  ($a2,$e,$Sigma1[1])',
878
879         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
880         '&lea   ($h,"($h,$a4)")',
881         '&andn  ($a4,$e,$g)',           # ~e&g
882         '&xor   ($a0,$a2)',
883
884         '&rorx  ($a1,$e,$Sigma1[0])',
885         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
886         '&xor   ($a0,$a1)',             # Sigma1(e)
887         '&mov   ($a2,$a)',
888
889         '&rorx  ($a4,$a,$Sigma0[2])',
890         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
891         '&xor   ($a2,$b)',              # a^b, b^c in next round
892         '&rorx  ($a1,$a,$Sigma0[1])',
893
894         '&rorx  ($a0,$a,$Sigma0[0])',
895         '&lea   ($d,"($d,$h)")',        # d+=h
896         '&and   ($a3,$a2)',             # (b^c)&(a^b)
897         @aesni_cbc_block[$aesni_cbc_idx++].
898         '&xor   ($a1,$a4)',
899
900         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
901         '&xor   ($a1,$a0)',             # Sigma0(a)
902         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
903         '&mov   ($a4,$e)',              # copy of f in future
904
905         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
906         );
907         # and at the finish one has to $a+=$a1
908 }
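
# bodyx_00_15 is the BMI flavour of body_00_15: rorx computes the rotates
# without touching the flags, andn forms ~e&g directly, Ch(e,f,g) is
# summed as (e&f)+(~e&g) via lea, and each round's h+=Sigma0(a) is
# deferred into the following round (hence $a1 must start at zero and be
# added back at the end, see the 'add $a1,$A' after the loops).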
909
910 $code.=<<___;
911 .type   ${func}_avx2,\@function,6
912 .align  64
913 ${func}_avx2:
914 .Lavx2_shortcut:
915         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
916         push    %rbx
917         push    %rbp
918         push    %r12
919         push    %r13
920         push    %r14
921         push    %r15
922         mov     %rsp,%r11               # copy %rsp
923         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
924         and     \$-256*$SZ,%rsp         # align stack frame
925         add     \$`2*$SZ*($rounds-8)`,%rsp
926
927         shl     \$6,$len
928         sub     $inp,$out               # re-bias
929         sub     $inp,$in0
930         add     $inp,$len               # end of input
931
932         #mov    $inp,$_inp              # saved later
933         #mov    $out,$_out              # kept in $offload
934         mov     $len,$_end
935         #mov    $key,$_key              # remains resident in $inp register
936         mov     $ivp,$_ivp
937         mov     $ctx,$_ctx
938         mov     $in0,$_in0
939         mov     %r11,$_rsp
940 ___
941 $code.=<<___ if ($win64);
942         movaps  %xmm6,`$framesz+16*0`(%rsp)
943         movaps  %xmm7,`$framesz+16*1`(%rsp)
944         movaps  %xmm8,`$framesz+16*2`(%rsp)
945         movaps  %xmm9,`$framesz+16*3`(%rsp)
946         movaps  %xmm10,`$framesz+16*4`(%rsp)
947         movaps  %xmm11,`$framesz+16*5`(%rsp)
948         movaps  %xmm12,`$framesz+16*6`(%rsp)
949         movaps  %xmm13,`$framesz+16*7`(%rsp)
950         movaps  %xmm14,`$framesz+16*8`(%rsp)
951         movaps  %xmm15,`$framesz+16*9`(%rsp)
952 ___
953 $code.=<<___;
954 .Lprologue_avx2:
955         vzeroall
956
957         mov     $inp,%r13               # borrow $a0
958         vpinsrq \$1,$out,$offload,$offload
959         lea     0x80($key),$inp         # size optimization, reassign
960         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
961         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
962         mov     $ctx,%r15               # borrow $a2
963         mov     $in0,%rsi               # borrow $a3
964         vmovdqu ($ivp),$iv              # load IV
965         lea     -9(%r14),%r14
966
967         vmovdqa 0x00(%r12,%r14,8),$mask14
968         vmovdqa 0x10(%r12,%r14,8),$mask12
969         vmovdqa 0x20(%r12,%r14,8),$mask10
970
971         sub     \$-16*$SZ,%r13          # inp++, size optimization
972         mov     $SZ*0(%r15),$A
973         xor     %r12,%r12               # borrow $a0
974         mov     $SZ*1(%r15),$B
975         cmp     $len,%r13               # $_end
976         mov     $SZ*2(%r15),$C
977         sete    %r12b
978         mov     $SZ*3(%r15),$D
979         mov     $SZ*4(%r15),$E
980         mov     $SZ*5(%r15),$F
981         mov     $SZ*6(%r15),$G
982         mov     $SZ*7(%r15),$H
983         vmovdqu 0x00-0x80($inp),$roundkey
984 ___
985                                         if ($SZ==4) {   # SHA256
986     my @X = map("%ymm$_",(0..3));
987     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
988
989 $code.=<<___;
990         jmp     .Loop_avx2
991 .align  16
992 .Loop_avx2:
993         shl     \$`log(16*$SZ)/log(2)`,%r12
994         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
995         neg     %r12
996         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
997         add     %rsi,%r12               # next or same input block
998         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
999         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1000         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1001
1002         vinserti128     \$1,(%r12,%r13),@X[0],@X[0]
1003         vinserti128     \$1,16(%r12,%r13),@X[1],@X[1]
1004          vpshufb        $t3,@X[0],@X[0]
1005         vinserti128     \$1,32(%r12,%r13),@X[2],@X[2]
1006          vpshufb        $t3,@X[1],@X[1]
1007         vinserti128     \$1,48(%r12,%r13),@X[3],@X[3]
1008
1009         lea     $TABLE(%rip),$Tbl
1010         vpshufb $t3,@X[2],@X[2]
1011         lea     -16*$SZ(%r13),%r13
1012         vpaddd  0x00($Tbl),@X[0],$t0
1013         vpshufb $t3,@X[3],@X[3]
1014         vpaddd  0x20($Tbl),@X[1],$t1
1015         vpaddd  0x40($Tbl),@X[2],$t2
1016         vpaddd  0x60($Tbl),@X[3],$t3
1017         vmovdqa $t0,0x00(%rsp)
1018         xor     $a1,$a1
1019         vmovdqa $t1,0x20(%rsp)
1020         lea     -$PUSH8(%rsp),%rsp
1021         mov     $B,$a3
1022         vmovdqa $t2,0x00(%rsp)
1023         xor     $C,$a3                  # magic
1024         vmovdqa $t3,0x20(%rsp)
1025         mov     $F,$a4
1026         sub     \$-16*2*$SZ,$Tbl        # size optimization
1027         jmp     .Lavx2_00_47
1028
1029 .align  16
1030 .Lavx2_00_47:
1031         vmovdqu (%r13),$inout
1032         vpinsrq \$0,%r13,$offload,$offload
1033 ___
1034
1035 sub AVX2_256_00_47 () {
1036 my $j = shift;
1037 my $body = shift;
1038 my @X = @_;
1039 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1040 my $base = "+2*$PUSH8(%rsp)";
1041
1042         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1043         foreach (Xupdate_256_AVX()) {           # 29 instructions
1044             eval;
1045             eval(shift(@insns));
1046             eval(shift(@insns));
1047             eval(shift(@insns));
1048         }
1049         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1050           foreach (@insns) { eval; }            # remaining instructions
1051         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1052 }
1053     $aesni_cbc_idx=0;
1054     for ($i=0,$j=0; $j<4; $j++) {
1055         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1056         push(@X,shift(@X));                     # rotate(@X)
1057     }
1058         &vmovq          ("%r13",$offload);      # borrow $a0
1059         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1060         &vpand          ($temp,$temp,$mask14);
1061         &vpor           ($iv,$iv,$temp);
1062         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1063         &lea            ("%r13","16(%r13)");    # inp++
1064
1065         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1066         &cmpb   (($SZ-1)."($Tbl)",0);
1067         &jne    (".Lavx2_00_47");
1068
1069         &vmovdqu        ($inout,"(%r13)");
1070         &vpinsrq        ($offload,$offload,"%r13",0);
1071
1072     $aesni_cbc_idx=0;
1073     for ($i=0; $i<16; ) {
1074         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1075         foreach(bodyx_00_15()) { eval; }
1076     }
1077                                         }
1078 $code.=<<___;
1079         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1080         vmovq   $offload,%r13                   # $_inp, borrow $a0
1081         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1082         add     $a1,$A
1083         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1084
1085         vpand   $mask14,$temp,$temp
1086         vpor    $temp,$iv,$iv
1087         vmovdqu $iv,(%r12,%r13)                 # write output
1088         lea     16(%r13),%r13
1089
1090         add     $SZ*0(%r15),$A
1091         add     $SZ*1(%r15),$B
1092         add     $SZ*2(%r15),$C
1093         add     $SZ*3(%r15),$D
1094         add     $SZ*4(%r15),$E
1095         add     $SZ*5(%r15),$F
1096         add     $SZ*6(%r15),$G
1097         add     $SZ*7(%r15),$H
1098
1099         mov     $A,$SZ*0(%r15)
1100         mov     $B,$SZ*1(%r15)
1101         mov     $C,$SZ*2(%r15)
1102         mov     $D,$SZ*3(%r15)
1103         mov     $E,$SZ*4(%r15)
1104         mov     $F,$SZ*5(%r15)
1105         mov     $G,$SZ*6(%r15)
1106         mov     $H,$SZ*7(%r15)
1107
1108         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1109         je      .Ldone_avx2
1110
1111         xor     $a1,$a1
1112         mov     $B,$a3
1113         mov     $F,$a4
1114         xor     $C,$a3                  # magic
1115         jmp     .Lower_avx2
1116 .align  16
1117 .Lower_avx2:
1118         vmovdqu (%r13),$inout
1119         vpinsrq \$0,%r13,$offload,$offload
1120 ___
1121     $aesni_cbc_idx=0;
1122     for ($i=0; $i<16; ) {
1123         my $base="+16($Tbl)";
1124         foreach(bodyx_00_15()) { eval; }
1125         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1126     }
1127 $code.=<<___;
1128         vmovq   $offload,%r13                   # borrow $a0
1129         vpextrq \$1,$offload,%r15               # borrow $a2
1130         vpand   $mask14,$temp,$temp
1131         vpor    $temp,$iv,$iv
1132         lea     -$PUSH8($Tbl),$Tbl
1133         vmovdqu $iv,(%r15,%r13)                 # write output
1134         lea     16(%r13),%r13                   # inp++
1135         cmp     %rsp,$Tbl
1136         jae     .Lower_avx2
1137
1138         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1139         lea     16*$SZ(%r13),%r13
1140         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1141         add     $a1,$A
1142         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1143
1144         add     $SZ*0(%r15),$A
1145         add     $SZ*1(%r15),$B
1146         add     $SZ*2(%r15),$C
1147         add     $SZ*3(%r15),$D
1148         add     $SZ*4(%r15),$E
1149         add     $SZ*5(%r15),$F
1150         add     $SZ*6(%r15),$G
1151         xor     %r12,%r12
1152         add     $SZ*7(%r15),$H
1153
1154         cmp     $_end,%r13
1155
1156         mov     $A,$SZ*0(%r15)
1157         mov     $B,$SZ*1(%r15)
1158         mov     $C,$SZ*2(%r15)
1159         mov     $D,$SZ*3(%r15)
1160         mov     $E,$SZ*4(%r15)
1161         mov     $F,$SZ*5(%r15)
1162         mov     $G,$SZ*6(%r15)
1163         mov     $H,$SZ*7(%r15)
1164
1165         sete    %r12b
1166         jbe     .Loop_avx2
1167         lea     (%rsp),$Tbl
1168
1169 .Ldone_avx2:
1170         lea     ($Tbl),%rsp
1171         mov     $_ivp,$ivp
1172         mov     $_rsp,%rsi
1173         vmovdqu $iv,($ivp)              # output IV
1174         vzeroall
1175 ___
1176 $code.=<<___ if ($win64);
1177         movaps  `$framesz+16*0`(%rsp),%xmm6
1178         movaps  `$framesz+16*1`(%rsp),%xmm7
1179         movaps  `$framesz+16*2`(%rsp),%xmm8
1180         movaps  `$framesz+16*3`(%rsp),%xmm9
1181         movaps  `$framesz+16*4`(%rsp),%xmm10
1182         movaps  `$framesz+16*5`(%rsp),%xmm11
1183         movaps  `$framesz+16*6`(%rsp),%xmm12
1184         movaps  `$framesz+16*7`(%rsp),%xmm13
1185         movaps  `$framesz+16*8`(%rsp),%xmm14
1186         movaps  `$framesz+16*9`(%rsp),%xmm15
1187 ___
1188 $code.=<<___;
1189         mov     (%rsi),%r15
1190         mov     8(%rsi),%r14
1191         mov     16(%rsi),%r13
1192         mov     24(%rsi),%r12
1193         mov     32(%rsi),%rbp
1194         mov     40(%rsi),%rbx
1195         lea     48(%rsi),%rsp
1196 .Lepilogue_avx2:
1197         ret
1198 .size   ${func}_avx2,.-${func}_avx2
1199 ___
1200 }}
1201 }}}}}
1202
1203 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1204 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1205 if ($win64) {
1206 $rec="%rcx";
1207 $frame="%rdx";
1208 $context="%r8";
1209 $disp="%r9";
1210
1211 $code.=<<___ if ($avx);
1212 .extern __imp_RtlVirtualUnwind
1213 .type   se_handler,\@abi-omnipotent
1214 .align  16
1215 se_handler:
1216         push    %rsi
1217         push    %rdi
1218         push    %rbx
1219         push    %rbp
1220         push    %r12
1221         push    %r13
1222         push    %r14
1223         push    %r15
1224         pushfq
1225         sub     \$64,%rsp
1226
1227         mov     120($context),%rax      # pull context->Rax
1228         mov     248($context),%rbx      # pull context->Rip
1229
1230         mov     8($disp),%rsi           # disp->ImageBase
1231         mov     56($disp),%r11          # disp->HandlerData
1232
1233         mov     0(%r11),%r10d           # HandlerData[0]
1234         lea     (%rsi,%r10),%r10        # prologue label
1235         cmp     %r10,%rbx               # context->Rip<prologue label
1236         jb      .Lin_prologue
1237
1238         mov     152($context),%rax      # pull context->Rsp
1239
1240         mov     4(%r11),%r10d           # HandlerData[1]
1241         lea     (%rsi,%r10),%r10        # epilogue label
1242         cmp     %r10,%rbx               # context->Rip>=epilogue label
1243         jae     .Lin_prologue
1244 ___
1245 $code.=<<___ if ($avx>1);
1246         lea     .Lavx2_shortcut(%rip),%r10
1247         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1248         jb      .Lnot_in_avx2
1249
1250         and     \$-256*$SZ,%rax
1251         add     \$`2*$SZ*($rounds-8)`,%rax
1252 .Lnot_in_avx2:
1253 ___
1254 $code.=<<___;
1255         mov     %rax,%rsi               # put aside Rsp
1256         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1257         lea     48(%rax),%rax
1258
1259         mov     -8(%rax),%rbx
1260         mov     -16(%rax),%rbp
1261         mov     -24(%rax),%r12
1262         mov     -32(%rax),%r13
1263         mov     -40(%rax),%r14
1264         mov     -48(%rax),%r15
1265         mov     %rbx,144($context)      # restore context->Rbx
1266         mov     %rbp,160($context)      # restore context->Rbp
1267         mov     %r12,216($context)      # restore context->R12
1268         mov     %r13,224($context)      # restore context->R13
1269         mov     %r14,232($context)      # restore context->R14
1270         mov     %r15,240($context)      # restore context->R15
1271
1272         lea     .Lepilogue(%rip),%r10
1273         cmp     %r10,%rbx
1274         jb      .Lin_prologue           # non-AVX code
1275
1276         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1277         lea     512($context),%rdi      # &context.Xmm6
1278         mov     \$20,%ecx
1279         .long   0xa548f3fc              # cld; rep movsq
1280
1281 .Lin_prologue:
1282         mov     8(%rax),%rdi
1283         mov     16(%rax),%rsi
1284         mov     %rax,152($context)      # restore context->Rsp
1285         mov     %rsi,168($context)      # restore context->Rsi
1286         mov     %rdi,176($context)      # restore context->Rdi
1287
1288         mov     40($disp),%rdi          # disp->ContextRecord
1289         mov     $context,%rsi           # context
1290         mov     \$154,%ecx              # sizeof(CONTEXT)
1291         .long   0xa548f3fc              # cld; rep movsq
1292
1293         mov     $disp,%rsi
1294         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1295         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1296         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1297         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1298         mov     40(%rsi),%r10           # disp->ContextRecord
1299         lea     56(%rsi),%r11           # &disp->HandlerData
1300         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1301         mov     %r10,32(%rsp)           # arg5
1302         mov     %r11,40(%rsp)           # arg6
1303         mov     %r12,48(%rsp)           # arg7
1304         mov     %rcx,56(%rsp)           # arg8, (NULL)
1305         call    *__imp_RtlVirtualUnwind(%rip)
1306
1307         mov     \$1,%eax                # ExceptionContinueSearch
1308         add     \$64,%rsp
1309         popfq
1310         pop     %r15
1311         pop     %r14
1312         pop     %r13
1313         pop     %r12
1314         pop     %rbp
1315         pop     %rbx
1316         pop     %rdi
1317         pop     %rsi
1318         ret
1319 .size   se_handler,.-se_handler
1320
1321 .section        .pdata
1322         .rva    .LSEH_begin_${func}_xop
1323         .rva    .LSEH_end_${func}_xop
1324         .rva    .LSEH_info_${func}_xop
1325
1326         .rva    .LSEH_begin_${func}_avx
1327         .rva    .LSEH_end_${func}_avx
1328         .rva    .LSEH_info_${func}_avx
1329 ___
1330 $code.=<<___ if ($avx>1);
1331         .rva    .LSEH_begin_${func}_avx2
1332         .rva    .LSEH_end_${func}_avx2
1333         .rva    .LSEH_info_${func}_avx2
1334 ___
1335 $code.=<<___ if ($avx);
1336 .section        .xdata
1337 .align  8
1338 .LSEH_info_${func}_xop:
1339         .byte   9,0,0,0
1340         .rva    se_handler
1341         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1342
1343 .LSEH_info_${func}_avx:
1344         .byte   9,0,0,0
1345         .rva    se_handler
1346         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1347 ___
1348 $code.=<<___ if ($avx>1);
1349 .LSEH_info_${func}_avx2:
1350         .byte   9,0,0,0
1351         .rva    se_handler
1352         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1353 ___
1354 }
1355
1356 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1357 print $code;
1358 close STDOUT;