crypto/aes/asm/aesni-sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # June 2011
  11 #
  12 # This is AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
  13 # in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
  15 # parallelism, interleaving it with another algorithm allows better
  16 # utilization of processor resources and hence better performance.
  17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
  18 # AESNI code is weaved into it. Below are performance numbers in
  19 # cycles per processed byte, less is better, for standalone AESNI-CBC
  20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
  21 # subroutine:
  22 #
  23 #               AES-128-CBC     +SHA1           stitch      gain
  24 # Westmere      3.77[+5.3]      9.07            6.55        +38%
  25 # Sandy Bridge  5.05[+5.0(6.1)] 10.06(11.15)    5.98(7.05)  +68%(+58%)
  26 # Ivy Bridge    5.05[+4.6]      9.65            5.54        +74%
  27 # Haswell       4.43[+3.6(4.2)] 8.00(8.58)      4.55(5.21)  +75%(+65%)
  28 # Bulldozer     5.77[+6.0]      11.72           6.37        +84%
  29 #
  30 #               AES-192-CBC
  31 # Westmere      4.51            9.81            6.80        +44%
  32 # Sandy Bridge  6.05            11.06(12.15)    6.11(7.19)  +81%(+69%)
  33 # Ivy Bridge    6.05            10.65           6.07        +75%
  34 # Haswell       5.29            8.86(9.44)      5.32(5.32)  +67%(+77%)
  35 # Bulldozer     6.89            12.84           6.96        +84%
  36 #
  37 #               AES-256-CBC
  38 # Westmere      5.25            10.55           7.21        +46%
  39 # Sandy Bridge  7.05            12.06(13.15)    7.12(7.72)  +69%(+70%)
  40 # Ivy Bridge    7.05            11.65           7.12        +64%
  41 # Haswell       6.19            9.76(10.34)     6.21(6.25)  +57%(+65%)
  42 # Bulldozer     8.00            13.95           8.25        +69%
  43 #
  44 # (*)   There are two code paths: SSSE3 and AVX. See sha1-586.pl for
  45 #       background information. Above numbers in parentheses are SSSE3
  46 #       results collected on AVX-capable CPU, i.e. apply on OSes that
  47 #       don't support AVX.
  48 #
  49 # It makes little sense to implement a "stitched" *decrypt* subroutine,
  50 # because *both* AESNI-CBC decrypt and SHA1 fully utilize instruction-
  51 # level parallelism, so stitching would not give any gain anyway.
  52 # Well, there might be some, e.g. because of better cache
  53 # locality... For reference, here are performance results for
  54 # standalone AESNI-CBC decrypt:
  55 #
  56 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
  57 # Westmere      1.25            1.50            1.75
  58 # Sandy Bridge  0.74            0.91            1.09
  59 # Ivy Bridge    0.74            0.90            1.11
  60 # Haswell       0.63            0.76            0.88
  61 # Bulldozer     0.70            0.85            0.99
  62
  63 # And indeed:
  64 #
  65 #               AES-256-CBC     +SHA1           stitch      gain
  66 # Westmere      1.75            7.20            6.68        +7.8%
  67 # Sandy Bridge  1.09            6.09(7.22)      5.82(6.95)  +4.6%(+3.9%)
  68 # Ivy Bridge    1.11            5.70            5.45        +4.6%
  69 # Haswell       0.88            4.45(5.00)      4.39(4.69)  +1.4%(*)(+6.6%)
  70 # Bulldozer     0.99            6.95            5.95        +17%(**)
  71 #
  72 # (*)   Tiny improvement coefficient on Haswell is because we compare
  73 #       AVX1 stitch to sum with AVX2 SHA1.
  74 # (**)  Execution is fully dominated by integer code sequence and
  75 #       SIMD still hardly shows [in single-process benchmark;-]
  76
  77 $flavour = shift;       # first command-line argument: perlasm flavour
  78 $output  = shift;       # second command-line argument: output file name
     # A first argument containing a dot is taken to be the output file name,
     # in which case no flavour was supplied.
  79 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  80
     # Win64 mode is selected by an nasm/masm/mingw64 flavour or by an output
     # file name ending in .asm. It controls the stack offsets and the
     # xmm6-xmm15 save/restore code emitted further down.
  81 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  82
     # Locate the x86_64-xlate.pl translator, first next to this script and
     # then under ../../perlasm relative to it.
  83 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  84 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  85 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  86 die "can't locate x86_64-xlate.pl";
  87
     # Enable generation of AVX-related code only when the assembler in use
     # reports a new enough version: GNU as >= 2.19, or on Win64 nasm >= 2.09
     # or ml64 major version >= 10.
  88 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  89                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  90            $1>=2.19);
  91 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  92            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  93            $1>=2.09);
  94 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  95            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  96            $1>=10);
  97
  98 $stitched_decrypt=0;    # 0 skips the stitched AES-256-CBC decrypt + SHA1 generator below
  99
     # Everything printed to STDOUT from here on is piped through the
     # translator. Fail loudly if the pipe cannot be started instead of
     # silently producing no assembly output.
 100 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
 101 *STDOUT=*OUT;
 102
 103 # void aesni_cbc_sha1_enc(const void *inp,
 104 #                       void *out,
 105 #                       size_t length,
 106 #                       const AES_KEY *key,
 107 #                       unsigned char *iv,
 108 #                       SHA_CTX *ctx,
 109 #                       const void *in0);
 110
 111 $code.=<<___;   # emit the public entry point aesni_cbc_sha1_enc, a pure dispatcher
 112 .text
 113 .extern OPENSSL_ia32cap_P
 114
 115 .globl  aesni_cbc_sha1_enc
 116 .type   aesni_cbc_sha1_enc,\@abi-omnipotent
 117 .align  32
 118 aesni_cbc_sha1_enc:
 119         # caller should check for SSSE3 and AES-NI bits
 120         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
 121         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
 122 ___
     # Only when the assembler can handle AVX: branch to the _avx variant if
     # both bit 28 of the second capability word and bit 30 of the first one
     # are set. The _avx routine itself is not defined in this part of the file.
 123 $code.=<<___ if ($avx);
 124         and     \$`1<<28`,%r11d         # mask AVX bit
 125         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
 126         or      %r11d,%r10d
 127         cmp     \$`1<<28|1<<30`,%r10d
 128         je      aesni_cbc_sha1_enc_avx
 129 ___
     # Otherwise fall through to an unconditional jump to the SSSE3 variant.
     # NOTE(review): the ret after the jmp looks unreachable - confirm it is
     # intentional before removing it.
 130 $code.=<<___;
 131         jmp     aesni_cbc_sha1_enc_ssse3
 132         ret
 133 .size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
 134 ___
 135
 136 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
     # Register names for the six register arguments plus %r10, which the
     # generated prologue loads with the 7th (stack) argument.
 137
 138 my $Xi=4;       # index of the next message-schedule vector to produce
 139 my @X=map("%xmm$_",(4..7,0..3));        # ring of eight message-schedule vectors, rotated by the Xupdate subs
 140 my @Tx=map("%xmm$_",(8..10));           # three SIMD temporaries, also rotated by the Xupdate subs
 141 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
 142 my @T=("%esi","%edi");                  # two scalar temporaries, swapped after every round
     # Generator state: $j counts emitted SHA1 rounds, $jj drives placement of
     # AES steps inside the round bodies, $r counts emitted AES steps, $sn
     # numbers the .Laesenclast labels, $rx counts calls to the body subs.
 143 my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
 144 my $K_XX_XX="%r11";                     # holds the address of the K_XX_XX constant table
 145 my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));                   # for enc
 146 my @rndkey=("%xmm14","%xmm15");                                 # for enc
 147 my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec
 148
 149 if (1) {        # reassign for Atom Silvermont
 150     # The goal is to minimize amount of instructions with more than
 151     # 3 prefix bytes. Or in more practical terms to keep AES-NI *and*
 152     # SSSE3 instructions to upper half of the register bank.
     # The branch is unconditional, so the assignments below override the
     # @X, @Tx, $iv, $in, $rndkey0 and @rndkey values chosen above.
 153     @X=map("%xmm$_",(8..11,4..7));
 154     @Tx=map("%xmm$_",(12,13,3));
 155     ($iv,$in,$rndkey0)=map("%xmm$_",(2,14,15));
 156     @rndkey=("%xmm0","%xmm1");
 157 }
 158
 159 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
     # Invoked for any call to an undefined sub, e.g. &pxor(...) or &add(...).
     # The called name, with its package prefix stripped, becomes the
     # mnemonic, and one line of assembly is appended to $code. Callers pass
     # operands destination-first; they are written out in reverse order.
 160 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 161   my $arg = pop;        # last argument is emitted first
 162     $arg = "\$$arg" if ($arg*1 eq $arg);        # a purely numeric operand becomes an immediate
 163     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 164 }
 165
 166 my $_rol=sub { &rol(@_) };
 167 my $_ror=sub { &ror(@_) };
     # The round templates call rotates through these two code refs
     # (&$_rol / &$_ror). Here they simply forward to rol/ror; no such subs
     # are defined in the visible part of the file, so the calls presumably
     # resolve through AUTOLOAD - TODO confirm.
 168
     # Entry of the SSSE3 variant: fetch the 7th argument from the stack,
     # push six callee-saved registers and reserve the stack frame
     # (104 bytes, plus 160 bytes on Win64).
 169 $code.=<<___;
 170 .type   aesni_cbc_sha1_enc_ssse3,\@function,6
 171 .align  32
 172 aesni_cbc_sha1_enc_ssse3:
 173         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 174         #shr    \$6,$len                        # debugging artefact
 175         #jz     .Lepilogue_ssse3                # debugging artefact
 176         push    %rbx
 177         push    %rbp
 178         push    %r12
 179         push    %r13
 180         push    %r14
 181         push    %r15
 182         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 183         #mov    $in0,$inp                       # debugging artefact
 184         #lea    64(%rsp),$ctx                   # debugging artefact
 185 ___
     # Win64 only: save xmm6-xmm15 in the frame starting at 96(%rsp).
 186 $code.=<<___ if ($win64);
 187         movaps  %xmm6,96+0(%rsp)
 188         movaps  %xmm7,96+16(%rsp)
 189         movaps  %xmm8,96+32(%rsp)
 190         movaps  %xmm9,96+48(%rsp)
 191         movaps  %xmm10,96+64(%rsp)
 192         movaps  %xmm11,96+80(%rsp)
 193         movaps  %xmm12,96+96(%rsp)
 194         movaps  %xmm13,96+112(%rsp)
 195         movaps  %xmm14,96+128(%rsp)
 196         movaps  %xmm15,96+144(%rsp)
 197 .Lprologue_ssse3:
 198 ___
     # Copy the first four arguments into %r12-%r15, load the IV and park the
     # IV pointer at 88(%rsp) so that its register can be reused.
 199 $code.=<<___;
 200         mov     $in0,%r12                       # reassign arguments
 201         mov     $out,%r13
 202         mov     $len,%r14
 203         mov     $key,%r15
 204         movdqu  ($ivp),$iv                      # load IV
 205         mov     $ivp,88(%rsp)                   # save $ivp
 206 ___
     # From here on the Perl-side names refer to the new homes %r12-%r15.
 207 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
 208 my $rounds="${ivp}d";   # 32-bit form of the $ivp register, reused for the value loaded from 240($key)
     # Convert $len from 64-byte blocks to the end-of-input address, turn $out
     # into an offset relative to $in0, load the SHA1 state and prepare the
     # first 64 bytes of hash input (byte swap, add K, store X+K on the stack).
 209 $code.=<<___;
 210         shl     \$6,$len
 211         sub     $in0,$out
 212         mov     240($key),$rounds
 213         add     $inp,$len               # end of input
 214
 215         lea     K_XX_XX(%rip),$K_XX_XX
 216         mov     0($ctx),$A              # load context
 217         mov     4($ctx),$B
 218         mov     8($ctx),$C
 219         mov     12($ctx),$D
 220         mov     $B,@T[0]                # magic seed
 221         mov     16($ctx),$E
 222         mov     $C,@T[1]
 223         xor     $D,@T[1]
 224         and     @T[1],@T[0]
 225
 226         movdqa  64($K_XX_XX),@Tx[2]     # pbswap mask
 227         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
 228         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
 229         movdqu  16($inp),@X[-3&7]
 230         movdqu  32($inp),@X[-2&7]
 231         movdqu  48($inp),@X[-1&7]
 232         pshufb  @Tx[2],@X[-4&7]         # byte swap
 233         pshufb  @Tx[2],@X[-3&7]
 234         pshufb  @Tx[2],@X[-2&7]
 235         add     \$64,$inp
 236         paddd   @Tx[1],@X[-4&7]         # add K_00_19
 237         pshufb  @Tx[2],@X[-1&7]
 238         paddd   @Tx[1],@X[-3&7]
 239         paddd   @Tx[1],@X[-2&7]
 240         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
 241         psubd   @Tx[1],@X[-4&7]         # restore X[]
 242         movdqa  @X[-3&7],16(%rsp)
 243         psubd   @Tx[1],@X[-3&7]
 244         movdqa  @X[-2&7],32(%rsp)
 245         psubd   @Tx[1],@X[-2&7]
 246         movups  ($key),$rndkey0         # $key[0]
 247         movups  16($key),$rndkey[0]     # forward reference
 248         jmp     .Loop_ssse3
 249 ___
 250
 251 my $aesenc=sub {
     # Emits one step of the AES-CBC encryption and advances the step
     # counter $r. Ten steps make up one 16-byte block: $n is the block index
     # derived from $r and $k the step within that block. The round bodies
     # append calls to this closure to selected round templates.
 252   use integer;
 253   my ($n,$k)=($r/10,$r%10);
 254     if ($k==0) {
     # First step of a block: load the plaintext, xor in round key 0, store
     # the previous ciphertext block (if there is one), chain with $iv and
     # run the first aesenc while fetching the next round key.
 255       $code.=<<___;
 256         movups          `16*$n`($in0),$in               # load input
 257         xorps           $rndkey0,$in
 258 ___
 259       $code.=<<___ if ($n);
 260         movups          $iv,`16*($n-1)`($out,$in0)      # write output
 261 ___
 262       $code.=<<___;
 263         xorps           $in,$iv
 264         aesenc          $rndkey[0],$iv
 265         movups          `32+16*$k`($key),$rndkey[1]
 266 ___
 267     } elsif ($k==9) {
     # Last step of a block: depending on how $rounds compares with 11 the
     # emitted code goes straight to aesenclast or runs two or four more
     # aesenc steps first. Presumably this covers the three AES key sizes -
     # TODO confirm against the key schedule layout. Each block gets its own
     # numbered .Laesenclast label.
 268       $sn++;
 269       $code.=<<___;
 270         cmp             \$11,$rounds
 271         jb              .Laesenclast$sn
 272         movups          `32+16*($k+0)`($key),$rndkey[1]
 273         aesenc          $rndkey[0],$iv
 274         movups          `32+16*($k+1)`($key),$rndkey[0]
 275         aesenc          $rndkey[1],$iv
 276         je              .Laesenclast$sn
 277         movups          `32+16*($k+2)`($key),$rndkey[1]
 278         aesenc          $rndkey[0],$iv
 279         movups          `32+16*($k+3)`($key),$rndkey[0]
 280         aesenc          $rndkey[1],$iv
 281 .Laesenclast$sn:
 282         aesenclast      $rndkey[0],$iv
 283         movups          16($key),$rndkey[1]             # forward reference
 284 ___
 285     } else {
     # Middle step: one aesenc with the current key while loading the next.
 286       $code.=<<___;
 287         aesenc          $rndkey[0],$iv
 288         movups          `32+16*$k`($key),$rndkey[1]
 289 ___
 290     }
 291     $r++;       unshift(@rndkey,pop(@rndkey));  # swap the two round-key registers for the next step
 292 };
 293
 294 sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
     # Emits the SIMD code that produces one more message-schedule vector and
     # interleaves it with the scalar instructions of four SHA1 rounds.
     # Argument: a code ref that returns the list of instruction strings for
     # one round; it is called four times. On exit $Xi is incremented and the
     # @X and @Tx register rings are rotated by one position.
 295 { use integer;
 296   my $body = shift;
 297   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
 298   my ($a,$b,$c,$d,$e);  # assigned by the evaluated round strings, which start with ($a,$b,$c,$d,$e)=@V
 299
 300          eval(shift(@insns));           # ror
 301         &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa   (@X[0],@X[-3&7]);
 302          eval(shift(@insns));
 303         &movdqa (@Tx[0],@X[-1&7]);
 304           &paddd        (@Tx[1],@X[-1&7]);
 305          eval(shift(@insns));
 306          eval(shift(@insns));
 307
 308         &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
 309          eval(shift(@insns));
 310          eval(shift(@insns));           # rol
 311          eval(shift(@insns));
 312         &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
 313          eval(shift(@insns));
 314          eval(shift(@insns));
 315
 316         &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
 317          eval(shift(@insns));
 318          eval(shift(@insns));           # ror
 319         &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
 320          eval(shift(@insns));
 321          eval(shift(@insns));
 322          eval(shift(@insns));
 323
 324         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
 325          eval(shift(@insns));
 326          eval(shift(@insns));           # rol
 327           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 328          eval(shift(@insns));
 329          eval(shift(@insns));
 330
 331         &movdqa (@Tx[2],@X[0]);
 332          eval(shift(@insns));
 333          eval(shift(@insns));
 334          eval(shift(@insns));           # ror
 335         &movdqa (@Tx[0],@X[0]);
 336          eval(shift(@insns));
 337
 338         &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
 339         &paddd  (@X[0],@X[0]);
 340          eval(shift(@insns));
 341          eval(shift(@insns));
 342
 343         &psrld  (@Tx[0],31);
 344          eval(shift(@insns));
 345          eval(shift(@insns));           # rol
 346          eval(shift(@insns));
 347         &movdqa (@Tx[1],@Tx[2]);
 348          eval(shift(@insns));
 349          eval(shift(@insns));
 350
 351         &psrld  (@Tx[2],30);
 352          eval(shift(@insns));
 353          eval(shift(@insns));           # ror
 354         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
 355          eval(shift(@insns));
 356          eval(shift(@insns));
 357          eval(shift(@insns));
 358
 359         &pslld  (@Tx[1],2);
 360         &pxor   (@X[0],@Tx[2]);
 361          eval(shift(@insns));
 362           &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
 363          eval(shift(@insns));           # rol
 364          eval(shift(@insns));
 365          eval(shift(@insns));
 366
 367         &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
 368         &pshufd (@Tx[1],@X[-1&7],0xee)  if ($Xi==7);    # was &movdqa   (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
 369
 370          foreach (@insns) { eval; }     # remaining instructions [if any]
 371
 372   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 373                 push(@Tx,shift(@Tx));
 374 }
 375
 376 sub Xupdate_ssse3_32_79()
     # Same job as Xupdate_ssse3_16_31 but with a different SIMD sequence for
     # the message schedule: emits one new vector interleaved with the scalar
     # instructions of four SHA1 rounds. Argument: code ref returning the
     # instruction strings of one round. Several scalar slots are emitted
     # conditionally, depending on $Xi or on whether an upcoming queued
     # instruction string is a rotate, so the interleave adapts to the
     # differing lengths of the round templates. On exit $Xi is incremented
     # and @X and @Tx are rotated.
 377 { use integer;
 378   my $body = shift;
 379   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 44 instructions
 380   my ($a,$b,$c,$d,$e);  # assigned by the evaluated round strings
 381
 382          eval(shift(@insns))            if ($Xi==8);
 383         &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
 384          eval(shift(@insns))            if ($Xi==8);
 385          eval(shift(@insns));           # body_20_39
 386          eval(shift(@insns));
 387          eval(shift(@insns))            if (@insns[1] =~ /_ror/);
 388          eval(shift(@insns))            if (@insns[0] =~ /_ror/);
 389         &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
 390          eval(shift(@insns));
 391          eval(shift(@insns));           # rol
 392
 393         &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
 394          eval(shift(@insns));
 395          eval(shift(@insns));
 396         if ($Xi%5) {
 397           &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
 398         } else {                        # ... or load next one
 399           &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
 400         }
 401          eval(shift(@insns));           # ror
 402           &paddd        (@Tx[1],@X[-1&7]);
 403          eval(shift(@insns));
 404
 405         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
 406          eval(shift(@insns));           # body_20_39
 407          eval(shift(@insns));
 408          eval(shift(@insns));
 409          eval(shift(@insns));           # rol
 410          eval(shift(@insns))            if (@insns[0] =~ /_ror/);
 411
 412         &movdqa (@Tx[0],@X[0]);
 413          eval(shift(@insns));
 414          eval(shift(@insns));
 415           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 416          eval(shift(@insns));           # ror
 417          eval(shift(@insns));
 418          eval(shift(@insns));           # body_20_39
 419
 420         &pslld  (@X[0],2);
 421          eval(shift(@insns));
 422          eval(shift(@insns));
 423         &psrld  (@Tx[0],30);
 424          eval(shift(@insns))            if (@insns[0] =~ /_rol/);# rol
 425          eval(shift(@insns));
 426          eval(shift(@insns));
 427          eval(shift(@insns));           # ror
 428
 429         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
 430          eval(shift(@insns));
 431          eval(shift(@insns));           # body_20_39
 432          eval(shift(@insns))            if (@insns[1] =~ /_rol/);
 433          eval(shift(@insns))            if (@insns[0] =~ /_rol/);
 434           &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
 435          eval(shift(@insns));
 436          eval(shift(@insns));           # rol
 437          eval(shift(@insns));
 438          eval(shift(@insns));
 439          eval(shift(@insns));           # rol
 440          eval(shift(@insns));
 441
 442          foreach (@insns) { eval; }     # remaining instructions
 443
 444   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 445                 push(@Tx,shift(@Tx));
 446 }
 447
 448 sub Xuplast_ssse3_80()
     # Emits four SHA1 rounds together with the final X+K transfer, then the
     # end-of-input test and the start of loading the next 64-byte block.
     # Arguments: a code ref returning the instruction strings of one round,
     # followed by the label to jump to when $inp equals $len (end of input).
     # Resets $Xi to 0 so that the following Xloop_ssse3 calls start afresh.
 449 { use integer;
 450   my $body = shift;
 451   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 452   my ($a,$b,$c,$d,$e);  # assigned by the evaluated round strings
 453
 454          eval(shift(@insns));
 455          eval(shift(@insns));
 456          eval(shift(@insns));
 457          eval(shift(@insns));
 458           &paddd        (@Tx[1],@X[-1&7]);
 459          eval(shift(@insns));
 460          eval(shift(@insns));
 461
 462           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 463
 464          foreach (@insns) { eval; }             # remaining instructions
 465
 466         &cmp    ($inp,$len);
 467         &je     (shift);        # second argument of this sub: the branch target
 468
 469         unshift(@Tx,pop(@Tx));  # rotate @Tx one step back
 470
 471         &movdqa (@Tx[2],"64($K_XX_XX)");        # pbswap mask
 472         &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
 473         &movdqu (@X[-4&7],"0($inp)");           # load input
 474         &movdqu (@X[-3&7],"16($inp)");
 475         &movdqu (@X[-2&7],"32($inp)");
 476         &movdqu (@X[-1&7],"48($inp)");
 477         &pshufb (@X[-4&7],@Tx[2]);              # byte swap
 478         &add    ($inp,64);
 479
 480   $Xi=0;
 481 }
 482
 483 sub Xloop_ssse3()
     # Emits four SHA1 rounds of the current block interleaved with the
     # preparation of one vector of the next block, which Xuplast_ssse3_80 has
     # already loaded: byte swap, add the constant held in @Tx[1], store the
     # sum at 16*$Xi(%rsp) and subtract the constant again. Argument: code ref
     # returning the instruction strings of one round. Increments $Xi.
 484 { use integer;
 485   my $body = shift;
 486   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 487   my ($a,$b,$c,$d,$e);  # assigned by the evaluated round strings
 488
 489          eval(shift(@insns));
 490          eval(shift(@insns));
 491          eval(shift(@insns));
 492         &pshufb (@X[($Xi-3)&7],@Tx[2]);
 493          eval(shift(@insns));
 494          eval(shift(@insns));
 495          eval(shift(@insns));
 496          eval(shift(@insns));
 497         &paddd  (@X[($Xi-4)&7],@Tx[1]);
 498          eval(shift(@insns));
 499          eval(shift(@insns));
 500          eval(shift(@insns));
 501          eval(shift(@insns));
 502         &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
 503          eval(shift(@insns));
 504          eval(shift(@insns));
 505          eval(shift(@insns));
 506          eval(shift(@insns));
 507         &psubd  (@X[($Xi-4)&7],@Tx[1]);
 508
 509         foreach (@insns) { eval; }      # remaining instructions
 510   $Xi++;
 511 }
 512
 513 sub Xtail_ssse3()
     # Emits four SHA1 rounds with no SIMD work mixed in. Used after the
     # .Ldone_ssse3 label, where there is no further input block to prepare.
     # Argument: code ref returning the instruction strings of one round.
 514 { use integer;
 515   my $body = shift;
 516   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 517   my ($a,$b,$c,$d,$e);  # assigned by the evaluated round strings
 518
 519         foreach (@insns) { eval; }
 520 }
 521
 522 my @body_00_19 = (
     # Template of one SHA1 round for the first group of rounds. Each element
     # is a string of Perl code that the Xupdate/Xloop/Xtail subs evaluate one
     # at a time, so every element corresponds to one scheduling slot. The
     # first element binds $a..$e to the current order of @V; the last one
     # advances $j and rotates @V and @T for the following round.
     # The rotate count is 2 in the very first round and 7 afterwards: from
     # the second round on, $b is the register that was rotated left by 5 as
     # $a in the preceding round, so 7 undoes that and applies the rotate by 2.
 523         '($a,$b,$c,$d,$e)=@V;'.
 524         '&$_ror ($b,$j?7:2);',  # $b>>>2
 525         '&xor   (@T[0],$d);',
 526         '&mov   (@T[1],$a);',   # $b for next round
 527
 528         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 529         '&xor   ($b,$c);',      # $c^$d for next round
 530
 531         '&$_rol ($a,5);',
 532         '&add   ($e,@T[0]);',
 533         '&and   (@T[1],$b);',   # ($b&($c^$d)) for next round
 534
 535         '&xor   ($b,$c);',      # restore $b
 536         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 537         );
 538
 539 sub body_00_19 () {     # ((c^d)&b)^d
 540     # on start @T[0]=(c^d)&b
     # Returns a copy of the @body_00_19 instruction strings for one round,
     # with a call to $aesenc appended to one of them whenever the placement
     # formula below selects the current round. The 20th call is handed over
     # to body_20_39, whose template special-cases $j==19.
 541     return &body_20_39() if ($rx==19); $rx++;
 542
 543     use integer;
 544     my ($k,$n);
 545     my @r=@body_00_19;
 546
 547         $n = scalar(@r);        # number of scheduling slots per round
 548         $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
 549         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
 550         $jj++;
 551
 552     return @r;
 553 }
 554
 555 my @body_20_39 = (
     # Template of one SHA1 round for the rounds using b^c^d. Same layout as
     # @body_00_19: one string per scheduling slot, the first binding $a..$e
     # and the last advancing $j and rotating @V and @T. The conditions on $j
     # inside the strings are decided when a string is evaluated, i.e. while
     # the assembly is generated, not when it runs.
 556         '($a,$b,$c,$d,$e)=@V;'.
 557         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 558         '&xor   (@T[0],$d)      if($j==19);'.
 559         '&xor   (@T[0],$c)      if($j> 19);',   # ($b^$d^$c)
 560         '&mov   (@T[1],$a);',   # $b for next round
 561
 562         '&$_rol ($a,5);',
 563         '&add   ($e,@T[0]);',
 564         '&xor   (@T[1],$c)      if ($j< 79);',  # $b^$d for next round
 565
 566         '&$_ror ($b,7);',       # $b>>>2
 567         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 568         );
 569
 570 sub body_20_39 () {     # b^d^c
 571     # on entry @T[0]=b^d
     # Returns a copy of the @body_20_39 instruction strings for one round,
     # optionally with a call to $aesenc appended to one of them. When $rx is
     # 39 the call is handed over to body_40_59, whose template guards its
     # extra steps with $j>=40. No $aesenc is appended when $rx is 20 after
     # the increment, i.e. on the call that was forwarded from body_00_19.
 572     return &body_40_59() if ($rx==39); $rx++;
 573
 574     use integer;
 575     my ($k,$n);
 576     my @r=@body_20_39;
 577
 578         $n = scalar(@r);        # number of scheduling slots per round
 579         $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
 580         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=20);
 581         $jj++;
 582
 583     return @r;
 584 }
 585
 586 my @body_40_59 = (
     # Template of one SHA1 round for the rounds using ((b^c)&(c^d))^c. Same
     # layout as the other templates: one string per scheduling slot, the
     # first binding $a..$e and the last advancing $j and rotating @V and @T.
     # The $j conditions switch individual steps off in the first and last
     # round handled by this template.
 587         '($a,$b,$c,$d,$e)=@V;'.
 588         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 589         '&and   (@T[0],$c)      if ($j>=40);',  # (b^c)&(c^d)
 590         '&xor   ($c,$d)         if ($j>=40);',  # restore $c
 591
 592         '&$_ror ($b,7);',       # $b>>>2
 593         '&mov   (@T[1],$a);',   # $b for next round
 594         '&xor   (@T[0],$c);',
 595
 596         '&$_rol ($a,5);',
 597         '&add   ($e,@T[0]);',
 598         '&xor   (@T[1],$c)      if ($j==59);'.
 599         '&xor   (@T[1],$b)      if ($j< 59);',  # b^c for next round
 600
 601         '&xor   ($b,$c)         if ($j< 59);',  # c^d for next round
 602         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 603         );
 604
 605 sub body_40_59 () {     # ((b^c)&(c^d))^c
 606     # on entry @T[0]=(b^c), (c^=d)
     # Returns a copy of the @body_40_59 instruction strings for one round,
     # optionally with a call to $aesenc appended to one of them. No $aesenc
     # is appended when $rx is 40 after the increment, i.e. on the call that
     # was forwarded from body_20_39.
 607     $rx++;
 608
 609     use integer;
 610     my ($k,$n);
 611     my @r=@body_40_59;
 612
 613         $n = scalar(@r);        # number of scheduling slots per round
 614         $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
 615         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=40);
 616         $jj++;
 617
 618     return @r;
 619 }
 620 $code.=<<___;   # start of the per-64-byte-block loop of the SSSE3 variant
 621 .align  32
 622 .Loop_ssse3:
 623 ___
     # Seventeen calls of four rounds each emit SHA1 rounds 0..67 together
     # with the message schedule and the AES steps appended by the body subs.
     # The last call also emits the end-of-input test that branches to
     # .Ldone_ssse3.
 624         &Xupdate_ssse3_16_31(\&body_00_19);
 625         &Xupdate_ssse3_16_31(\&body_00_19);
 626         &Xupdate_ssse3_16_31(\&body_00_19);
 627         &Xupdate_ssse3_16_31(\&body_00_19);
 628         &Xupdate_ssse3_32_79(\&body_00_19);
 629         &Xupdate_ssse3_32_79(\&body_20_39);
 630         &Xupdate_ssse3_32_79(\&body_20_39);
 631         &Xupdate_ssse3_32_79(\&body_20_39);
 632         &Xupdate_ssse3_32_79(\&body_20_39);
 633         &Xupdate_ssse3_32_79(\&body_20_39);
 634         &Xupdate_ssse3_32_79(\&body_40_59);
 635         &Xupdate_ssse3_32_79(\&body_40_59);
 636         &Xupdate_ssse3_32_79(\&body_40_59);
 637         &Xupdate_ssse3_32_79(\&body_40_59);
 638         &Xupdate_ssse3_32_79(\&body_40_59);
 639         &Xupdate_ssse3_32_79(\&body_20_39);
 640         &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
 641
     # The remaining twelve rounds are generated twice, once for the path that
     # continues with another block and once for the final block. Snapshot
     # the generator state so that both copies start from the same point.
 642                                 $saved_j=$j; @saved_V=@V;
 643                                 $saved_r=$r; @saved_rndkey=@rndkey;
 644
 645         &Xloop_ssse3(\&body_20_39);
 646         &Xloop_ssse3(\&body_20_39);
 647         &Xloop_ssse3(\&body_20_39);
 648
     # Loop-continuation path: store the last ciphertext block of this chunk,
     # advance the input pointer, fold the working variables into the SHA1
     # context, recompute the seed values for the first round and loop.
 649 $code.=<<___;
 650         movups  $iv,48($out,$in0)               # write output
 651         lea     64($in0),$in0
 652
 653         add     0($ctx),$A                      # update context
 654         add     4($ctx),@T[0]
 655         add     8($ctx),$C
 656         add     12($ctx),$D
 657         mov     $A,0($ctx)
 658         add     16($ctx),$E
 659         mov     @T[0],4($ctx)
 660         mov     @T[0],$B                        # magic seed
 661         mov     $C,8($ctx)
 662         mov     $C,@T[1]
 663         mov     $D,12($ctx)
 664         xor     $D,@T[1]
 665         mov     $E,16($ctx)
 666         and     @T[1],@T[0]
 667         jmp     .Loop_ssse3
 668
 669 .Ldone_ssse3:
 670 ___
     # Final-block path: rewind the generator to the snapshot taken above and
     # emit the last twelve rounds again, this time without preparing a
     # following block.
 671                                 $jj=$j=$saved_j; @V=@saved_V;
 672                                 $r=$saved_r;     @rndkey=@saved_rndkey;
 673
 674         &Xtail_ssse3(\&body_20_39);
 675         &Xtail_ssse3(\&body_20_39);
 676         &Xtail_ssse3(\&body_20_39);
 677
     # Store the last ciphertext block, update the SHA1 context and write the
     # final chaining value back through the saved IV pointer.
 678 $code.=<<___;
 679         movups  $iv,48($out,$in0)               # write output
 680         mov     88(%rsp),$ivp                   # restore $ivp
 681
 682         add     0($ctx),$A                      # update context
 683         add     4($ctx),@T[0]
 684         add     8($ctx),$C
 685         mov     $A,0($ctx)
 686         add     12($ctx),$D
 687         mov     @T[0],4($ctx)
 688         add     16($ctx),$E
 689         mov     $C,8($ctx)
 690         mov     $D,12($ctx)
 691         mov     $E,16($ctx)
 692         movups  $iv,($ivp)                      # write IV
 693 ___
     # Win64 only: reload xmm6-xmm15 from the slots filled by the prologue.
 694 $code.=<<___ if ($win64);
 695         movaps  96+0(%rsp),%xmm6
 696         movaps  96+16(%rsp),%xmm7
 697         movaps  96+32(%rsp),%xmm8
 698         movaps  96+48(%rsp),%xmm9
 699         movaps  96+64(%rsp),%xmm10
 700         movaps  96+80(%rsp),%xmm11
 701         movaps  96+96(%rsp),%xmm12
 702         movaps  96+112(%rsp),%xmm13
 703         movaps  96+128(%rsp),%xmm14
 704         movaps  96+144(%rsp),%xmm15
 705 ___
     # Reload the six registers pushed by the prologue, release the frame and
     # return.
 706 $code.=<<___;
 707         lea     `104+($win64?10*16:0)`(%rsp),%rsi
 708         mov     0(%rsi),%r15
 709         mov     8(%rsi),%r14
 710         mov     16(%rsi),%r13
 711         mov     24(%rsi),%r12
 712         mov     32(%rsi),%rbp
 713         mov     40(%rsi),%rbx
 714         lea     48(%rsi),%rsp
 715 .Lepilogue_ssse3:
 716         ret
 717 .size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
 718 ___
 719
 720                                                 if ($stitched_decrypt) {{{
 721 # reset register names and generator counters to their initial values
 722 ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 723 $j=$jj=$r=$sn=$rx=0;
 724 $Xi=4;
 725
 726 # reassign for Atom Silvermont (see above)
 727 ($inout0,$inout1,$inout2,$inout3,$rndkey0)=map("%xmm$_",(0..4));
 728 @X=map("%xmm$_",(8..13,6,7));
 729 @Tx=map("%xmm$_",(14,15,5));
 730
     # Schedule of the AES decryption work, one entry per SHA1 round. The
     # body_*_dec subs look up the entry at index $rx and, when it is defined,
     # put it in front of the instruction strings of that round. An undef
     # entry means the round carries no AES instructions. Four blocks
     # ($inout0..$inout3) are processed side by side.
 731 my @aes256_dec = (
 732         '&movdqu($inout0,"0x00($in0)");',
 733         '&movdqu($inout1,"0x10($in0)"); &pxor   ($inout0,$rndkey0);',
 734         '&movdqu($inout2,"0x20($in0)"); &pxor   ($inout1,$rndkey0);',
 735         '&movdqu($inout3,"0x30($in0)"); &pxor   ($inout2,$rndkey0);',
 736
 737         '&pxor  ($inout3,$rndkey0);     &movups ($rndkey0,"16-112($key)");',
 738         '&movaps("64(%rsp)",@X[2]);',   # save IV, originally @X[3]
 739         undef,undef
 740         );
     # Thirteen groups of four aesdec steps, each group using one round key
     # and loading the next one in its last entry. Idle rounds are padded in
     # after some of the groups.
 741 for ($i=0;$i<13;$i++) {
 742     push (@aes256_dec,(
 743         '&aesdec        ($inout0,$rndkey0);',
 744         '&aesdec        ($inout1,$rndkey0);',
 745         '&aesdec        ($inout2,$rndkey0);',
 746         '&aesdec        ($inout3,$rndkey0);     &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
 747         ));
 748     push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
 749     push (@aes256_dec,(undef,undef))    if ($i==5);
 750 }
     # Final round: aesdeclast on all four blocks while reloading the four
     # ciphertext blocks, then xor with the saved IV or the preceding
     # ciphertext block and store the results.
 751 push(@aes256_dec,(
 752         '&aesdeclast    ($inout0,$rndkey0);     &movups (@X[0],"0x00($in0)");',
 753         '&aesdeclast    ($inout1,$rndkey0);     &movups (@X[1],"0x10($in0)");',
 754         '&aesdeclast    ($inout2,$rndkey0);     &movups (@X[2],"0x20($in0)");',
 755         '&aesdeclast    ($inout3,$rndkey0);     &movups (@X[3],"0x30($in0)");',
 756
 757         '&xorps         ($inout0,"64(%rsp)");   &movdqu ($rndkey0,"-112($key)");',
 758         '&xorps         ($inout1,@X[0]);        &movups ("0x00($out,$in0)",$inout0);',
 759         '&xorps         ($inout2,@X[1]);        &movups ("0x10($out,$in0)",$inout1);',
 760         '&xorps         ($inout3,@X[2]);        &movups ("0x20($out,$in0)",$inout2);',
 761
 762         '&movups        ("0x30($out,$in0)",$inout3);'
 763         ));
 764
 765 sub body_00_19_dec () { # ((c^d)&b)^d
 766     # on start @T[0]=(c^d)&b
     # Returns the @body_00_19 instruction strings for one round, preceded by
     # the @aes256_dec entry for round $rx when that entry is defined. When
     # $rx is 19 the call is handed over to body_20_39_dec.
 767     return &body_20_39_dec() if ($rx==19);
 768
 769     my @r=@body_00_19;
 770
 771         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
 772         $rx++;
 773
 774     return @r;
 775 }
 776
 777 sub body_20_39_dec () { # b^d^c
 778     # on entry @T[0]=b^d
     # Returns the @body_20_39 instruction strings for one round, preceded by
     # the @aes256_dec entry for round $rx when that entry is defined. When
     # $rx is 39 the call is handed over to body_40_59_dec.
 779     return &body_40_59_dec() if ($rx==39);
 780
 781     my @r=@body_20_39;
 782
 783         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
 784         $rx++;
 785
 786     return @r;
 787 }
 788
 789 sub body_40_59_dec () { # ((b^c)&(c^d))^c
 790     # on entry @T[0]=(b^c), (c^=d)
     # Returns the @body_40_59 instruction strings for one round, preceded by
     # the @aes256_dec entry for round $rx when that entry is defined.
 791
 792     my @r=@body_40_59;
 793
 794         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
 795         $rx++;
 796
 797     return @r;
 798 }
 799
 800 $code.=<<___;      # aesni256_cbc_sha1_dec: public entry, loads both capability words
 801 .globl  aesni256_cbc_sha1_dec
 802 .type   aesni256_cbc_sha1_dec,\@abi-omnipotent
 803 .align  32
 804 aesni256_cbc_sha1_dec:
 805         # caller should check for SSSE3 and AES-NI bits
 806         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
 807         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
 808 ___
     # Emitted only when the generator has $avx set: branch to the AVX
     # flavour when bit 28 of the second word and bit 30 of the first word
     # are both set.
 809 $code.=<<___ if ($avx);
 810         and     \$`1<<28`,%r11d         # mask AVX bit
 811         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
 812         or      %r11d,%r10d
 813         cmp     \$`1<<28|1<<30`,%r10d
 814         je      aesni256_cbc_sha1_dec_avx
 815 ___
     # Always emitted: jump to the SSSE3 flavour and close the dispatcher.
     # The same heredoc then opens aesni256_cbc_sha1_dec_ssse3, which loads
     # the 7th argument from the stack, pushes six general-purpose registers
     # and allocates a 104-byte frame (plus 10*16 bytes on Win64).
 816 $code.=<<___;
 817         jmp     aesni256_cbc_sha1_dec_ssse3
 818         ret
 819 .size   aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
 820
 821 .type   aesni256_cbc_sha1_dec_ssse3,\@function,6
 822 .align  32
 823 aesni256_cbc_sha1_dec_ssse3:
 824         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 825         push    %rbx
 826         push    %rbp
 827         push    %r12
 828         push    %r13
 829         push    %r14
 830         push    %r15
 831         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 832 ___
 833 $code.=<<___ if ($win64);  # Win64 only: save %xmm6-%xmm15 at 96(%rsp), emit prologue label
 834         movaps  %xmm6,96+0(%rsp)
 835         movaps  %xmm7,96+16(%rsp)
 836         movaps  %xmm8,96+32(%rsp)
 837         movaps  %xmm9,96+48(%rsp)
 838         movaps  %xmm10,96+64(%rsp)
 839         movaps  %xmm11,96+80(%rsp)
 840         movaps  %xmm12,96+96(%rsp)
 841         movaps  %xmm13,96+112(%rsp)
 842         movaps  %xmm14,96+128(%rsp)
 843         movaps  %xmm15,96+144(%rsp)
 844 .Lprologue_dec_ssse3:
 845 ___
     # Copy the first four arguments into %r12-%r15 (the key pointer biased
     # by +112) and load the IV into @X[3].
 846 $code.=<<___;
 847         mov     $in0,%r12                       # reassign arguments
 848         mov     $out,%r13
 849         mov     $len,%r14
 850         lea     112($key),%r15                  # size optimization
 851         movdqu  ($ivp),@X[3]                    # load IV
 852         #mov    $ivp,88(%rsp)                   # save $ivp
 853 ___
     # From here on the Perl-side names refer to the registers just loaded.
 854 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
     # Turn $len into an end-of-input pointer ($len*64 + $inp), make $out
     # relative to $in0, load the five SHA1 state words, byte-swap the first
     # 64 input bytes, store the first three X[]+K vectors on the stack and
     # fetch the first round key before entering the loop.
 855 $code.=<<___;
 856         shl     \$6,$len
 857         sub     $in0,$out
 858         add     $inp,$len               # end of input
 859
 860         lea     K_XX_XX(%rip),$K_XX_XX
 861         mov     0($ctx),$A              # load context
 862         mov     4($ctx),$B
 863         mov     8($ctx),$C
 864         mov     12($ctx),$D
 865         mov     $B,@T[0]                # magic seed
 866         mov     16($ctx),$E
 867         mov     $C,@T[1]
 868         xor     $D,@T[1]
 869         and     @T[1],@T[0]
 870
 871         movdqa  64($K_XX_XX),@Tx[2]     # pbswap mask
 872         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
 873         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
 874         movdqu  16($inp),@X[-3&7]
 875         movdqu  32($inp),@X[-2&7]
 876         movdqu  48($inp),@X[-1&7]
 877         pshufb  @Tx[2],@X[-4&7]         # byte swap
 878         add     \$64,$inp
 879         pshufb  @Tx[2],@X[-3&7]
 880         pshufb  @Tx[2],@X[-2&7]
 881         pshufb  @Tx[2],@X[-1&7]
 882         paddd   @Tx[1],@X[-4&7]         # add K_00_19
 883         paddd   @Tx[1],@X[-3&7]
 884         paddd   @Tx[1],@X[-2&7]
 885         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
 886         psubd   @Tx[1],@X[-4&7]         # restore X[]
 887         movdqa  @X[-3&7],16(%rsp)
 888         psubd   @Tx[1],@X[-3&7]
 889         movdqa  @X[-2&7],32(%rsp)
 890         psubd   @Tx[1],@X[-2&7]
 891         movdqu  -112($key),$rndkey0     # $key[0]
 892         jmp     .Loop_dec_ssse3
 893
 894 .align  32
 895 .Loop_dec_ssse3:
 896 ___
 897         &Xupdate_ssse3_16_31(\&body_00_19_dec);        # loop body starts here
     # Each call below emits one message-schedule step; the code ref it is
     # given supplies the round instructions, with the decrypt wrappers
     # prepending entries of @aes256_dec as $rx advances.
 898         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 899         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 900         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 901         &Xupdate_ssse3_32_79(\&body_00_19_dec);
 902         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 903         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 904         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 905         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 906         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 907         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 908         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 909         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 910         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 911         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 912         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 913         &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
 914
     # Snapshot the generator state; it is restored before the code that
     # follows the .Ldone_dec_ssse3 label is generated.
 915                                 $saved_j=$j;   @saved_V=@V;
 916                                 $saved_rx=$rx;
 917
 918         &Xloop_ssse3(\&body_20_39_dec);
 919         &Xloop_ssse3(\&body_20_39_dec);
 920         &Xloop_ssse3(\&body_20_39_dec);
 921
     # The final table entry is evaluated explicitly rather than through a
     # round body.
 922         eval(@aes256_dec[-1]);                  # last store
     # Advance the input pointer, fold the working variables into the
     # context, re-seed @T[0]/@T[1] for the next block and loop.
 923 $code.=<<___;
 924         lea     64($in0),$in0
 925
 926         add     0($ctx),$A                      # update context
 927         add     4($ctx),@T[0]
 928         add     8($ctx),$C
 929         add     12($ctx),$D
 930         mov     $A,0($ctx)
 931         add     16($ctx),$E
 932         mov     @T[0],4($ctx)
 933         mov     @T[0],$B                        # magic seed
 934         mov     $C,8($ctx)
 935         mov     $C,@T[1]
 936         mov     $D,12($ctx)
 937         xor     $D,@T[1]
 938         mov     $E,16($ctx)
 939         and     @T[1],@T[0]
 940         jmp     .Loop_dec_ssse3
 941
 942 .Ldone_dec_ssse3:
 943 ___
 944                                 $jj=$j=$saved_j; @V=@saved_V;
 945                                 $rx=$saved_rx;
     # The assignments above rewind the generator to the state saved right
     # after Xuplast_ssse3_80, so the tail is generated from the same point
     # as the loop-continuation code.
 946
 947         &Xtail_ssse3(\&body_20_39_dec);
 948         &Xtail_ssse3(\&body_20_39_dec);
 949         &Xtail_ssse3(\&body_20_39_dec);
 950
     # The final table entry is evaluated explicitly rather than through a
     # round body.
 951         eval(@aes256_dec[-1]);                  # last store
     # Fold the working variables into the context and write @X[3] back
     # through $ivp.
 952 $code.=<<___;
 953         add     0($ctx),$A                      # update context
 954         add     4($ctx),@T[0]
 955         add     8($ctx),$C
 956         mov     $A,0($ctx)
 957         add     12($ctx),$D
 958         mov     @T[0],4($ctx)
 959         add     16($ctx),$E
 960         mov     $C,8($ctx)
 961         mov     $D,12($ctx)
 962         mov     $E,16($ctx)
 963         movups  @X[3],($ivp)                    # write IV
 964 ___
     # Win64 only: reload %xmm6-%xmm15 from the slots used by the prologue.
 965 $code.=<<___ if ($win64);
 966         movaps  96+0(%rsp),%xmm6
 967         movaps  96+16(%rsp),%xmm7
 968         movaps  96+32(%rsp),%xmm8
 969         movaps  96+48(%rsp),%xmm9
 970         movaps  96+64(%rsp),%xmm10
 971         movaps  96+80(%rsp),%xmm11
 972         movaps  96+96(%rsp),%xmm12
 973         movaps  96+112(%rsp),%xmm13
 974         movaps  96+128(%rsp),%xmm14
 975         movaps  96+144(%rsp),%xmm15
 976 ___
     # Step over the frame, reload the six pushed registers in reverse push
     # order, restore %rsp and return.
 977 $code.=<<___;
 978         lea     `104+($win64?10*16:0)`(%rsp),%rsi
 979         mov     0(%rsi),%r15
 980         mov     8(%rsi),%r14
 981         mov     16(%rsi),%r13
 982         mov     24(%rsi),%r12
 983         mov     32(%rsi),%rbp
 984         mov     40(%rsi),%rbx
 985         lea     48(%rsi),%rsp
 986 .Lepilogue_dec_ssse3:
 987         ret
 988 .size   aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
 989 ___
 990                                                 }}}
 991 $j=$jj=$r=$sn=$rx=0;       # zero the generator counters before the AVX code is emitted
 992
 993 if ($avx) {
 994 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
     # Register map for the AVX code paths.  The line above names the
     # argument registers as the emitted functions see them on entry; $inp
     # (%r10) is loaded from the stack by each function.
 995
     # $Xi is the message-schedule step counter used by the Xupdate_avx_*
     # generators; @X is the eight-register window they rotate.
 996 my $Xi=4;
 997 my @X=map("%xmm$_",(4..7,0..3));
 998 my @Tx=map("%xmm$_",(8..10));
     # Note that $A..$E are assigned here without "my"; only @V is lexical.
 999 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
1000 my @T=("%esi","%edi");
1001 my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
1002 my @rndkey=("%xmm14","%xmm15");
     # The decrypt block registers overlap $iv, $in and @rndkey (%xmm12-15).
1003 my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec
     # The round-constant register is the same register as @Tx[2].
1004 my $Kx=@Tx[2];
1005
     # Rotates are emitted as shld/shrd with the first operand repeated,
     # i.e. a double shift of the register with itself.
1006 my $_rol=sub { &shld(@_[0],@_) };
1007 my $_ror=sub { &shrd(@_[0],@_) };
1008
1009 $code.=<<___;      # aesni_cbc_sha1_enc_avx: entry, register pushes and frame allocation
1010 .type   aesni_cbc_sha1_enc_avx,\@function,6
1011 .align  32
1012 aesni_cbc_sha1_enc_avx:
1013         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1014         #shr    \$6,$len                        # debugging artefact
1015         #jz     .Lepilogue_avx                  # debugging artefact
1016         push    %rbx
1017         push    %rbp
1018         push    %r12
1019         push    %r13
1020         push    %r14
1021         push    %r15
1022         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
1023         #mov    $in0,$inp                       # debugging artefact
1024         #lea    64(%rsp),$ctx                   # debugging artefact
1025 ___
     # Win64 only: save %xmm6-%xmm15 at 96(%rsp) and emit .Lprologue_avx.
1026 $code.=<<___ if ($win64);
1027         movaps  %xmm6,96+0(%rsp)
1028         movaps  %xmm7,96+16(%rsp)
1029         movaps  %xmm8,96+32(%rsp)
1030         movaps  %xmm9,96+48(%rsp)
1031         movaps  %xmm10,96+64(%rsp)
1032         movaps  %xmm11,96+80(%rsp)
1033         movaps  %xmm12,96+96(%rsp)
1034         movaps  %xmm13,96+112(%rsp)
1035         movaps  %xmm14,96+128(%rsp)
1036         movaps  %xmm15,96+144(%rsp)
1037 .Lprologue_avx:
1038 ___
     # Copy the first four arguments into %r12-%r15, load the IV into $iv
     # and keep $ivp in the frame at 88(%rsp).
1039 $code.=<<___;
1040         vzeroall
1041         mov     $in0,%r12                       # reassign arguments
1042         mov     $out,%r13
1043         mov     $len,%r14
1044         mov     $key,%r15
1045         vmovdqu ($ivp),$iv                      # load IV
1046         mov     $ivp,88(%rsp)                   # save $ivp
1047 ___
     # From here on the Perl-side names refer to the registers just loaded.
1048 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
     # $rounds is $ivp's register name with a "d" suffix; $ivp itself was
     # stored in the frame above.
1049 my $rounds="${ivp}d";
     # Turn $len into an end-of-input pointer, make $out relative to $in0,
     # read the round count from offset 240 of the key structure, bias $key
     # by +112, load the SHA1 state, byte-swap the first 64 input bytes,
     # store the first three X[]+K vectors and preload two round keys.
1050 $code.=<<___;
1051         shl     \$6,$len
1052         sub     $in0,$out
1053         mov     240($key),$rounds
1054         add     \$112,$key              # size optimization
1055         add     $inp,$len               # end of input
1056
1057         lea     K_XX_XX(%rip),$K_XX_XX
1058         mov     0($ctx),$A              # load context
1059         mov     4($ctx),$B
1060         mov     8($ctx),$C
1061         mov     12($ctx),$D
1062         mov     $B,@T[0]                # magic seed
1063         mov     16($ctx),$E
1064         mov     $C,@T[1]
1065         xor     $D,@T[1]
1066         and     @T[1],@T[0]
1067
1068         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
1069         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
1070         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
1071         vmovdqu 16($inp),@X[-3&7]
1072         vmovdqu 32($inp),@X[-2&7]
1073         vmovdqu 48($inp),@X[-1&7]
1074         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
1075         add     \$64,$inp
1076         vpshufb @X[2],@X[-3&7],@X[-3&7]
1077         vpshufb @X[2],@X[-2&7],@X[-2&7]
1078         vpshufb @X[2],@X[-1&7],@X[-1&7]
1079         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
1080         vpaddd  $Kx,@X[-3&7],@X[1]
1081         vpaddd  $Kx,@X[-2&7],@X[2]
1082         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
1083         vmovdqa @X[1],16(%rsp)
1084         vmovdqa @X[2],32(%rsp)
1085         vmovups -112($key),$rndkey[1]   # $key[0]
1086         vmovups 16-112($key),$rndkey[0] # forward reference
1087         jmp     .Loop_avx
1088 ___
1089
1090 my $aesenc=sub {
     # Emits one step of the AVX AES-CBC encryption and advances the step
     # counter $r.  With integer division, $n=$r/10 selects the 16-byte
     # block within the current 64-byte chunk and $k=$r%10 the step within
     # that block.  Every call ends by swapping the two @rndkey registers.
     # NOTE(review): the callers are outside this view - presumably the SHA1
     # round bodies; confirm.
1091   use integer;
1092   my ($n,$k)=($r/10,$r%10);
     # Step 0: load input block $n, XOR it with the key held in $rndkey[1]
     # and then into $iv, store the previous block's result when $n>0, run
     # the first vaesenc and fetch the next round key.
1093     if ($k==0) {
1094       $code.=<<___;
1095         vmovdqu         `16*$n`($in0),$in               # load input
1096         vpxor           $rndkey[1],$in,$in
1097 ___
1098       $code.=<<___ if ($n);
1099         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
1100 ___
1101       $code.=<<___;
1102         vpxor           $in,$iv,$iv
1103         vaesenc         $rndkey[0],$iv,$iv
1104         vmovups         `32+16*$k-112`($key),$rndkey[1]
1105 ___
     # Step 9: branch on $rounds - below 11 goes straight to vaesenclast,
     # exactly 11 runs two more rounds first, anything larger runs four
     # more.  $sn makes the local label unique.  Afterwards the first two
     # round keys are reloaded for the next block.
1106     } elsif ($k==9) {
1107       $sn++;
1108       $code.=<<___;
1109         cmp             \$11,$rounds
1110         jb              .Lvaesenclast$sn
1111         vaesenc         $rndkey[0],$iv,$iv
1112         vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
1113         vaesenc         $rndkey[1],$iv,$iv
1114         vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
1115         je              .Lvaesenclast$sn
1116         vaesenc         $rndkey[0],$iv,$iv
1117         vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
1118         vaesenc         $rndkey[1],$iv,$iv
1119         vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
1120 .Lvaesenclast$sn:
1121         vaesenclast     $rndkey[0],$iv,$iv
1122         vmovups         -112($key),$rndkey[0]
1123         vmovups         16-112($key),$rndkey[1]         # forward reference
1124 ___
     # Steps 1..8: one vaesenc plus the load of the following round key.
1125     } else {
1126       $code.=<<___;
1127         vaesenc         $rndkey[0],$iv,$iv
1128         vmovups         `32+16*$k-112`($key),$rndkey[1]
1129 ___
1130     }
1131     $r++;       unshift(@rndkey,pop(@rndkey));
1132 };
1133
1134 sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
1135 { use integer;
     # Emits one AVX message-schedule step of the 16..31 kind.  The single
     # argument is a code ref; it is called four times and the returned
     # code strings are eval'ed a few at a time between the vector
     # instructions, so both instruction streams end up interleaved.  On
     # exit $Xi is incremented and @X is rotated by one register.
1136   my $body = shift;
1137   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
     # NOTE(review): $a..$e are not used directly here; presumably they are
     # referenced by the eval'ed strings - confirm.
1138   my ($a,$b,$c,$d,$e);
1139
1140          eval(shift(@insns));
1141          eval(shift(@insns));
1142         &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
1143          eval(shift(@insns));
1144          eval(shift(@insns));
1145
1146           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
1147          eval(shift(@insns));
1148          eval(shift(@insns));
1149         &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
1150          eval(shift(@insns));
1151          eval(shift(@insns));
1152         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
1153          eval(shift(@insns));
1154          eval(shift(@insns));
1155
1156         &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
1157          eval(shift(@insns));
1158          eval(shift(@insns));
1159          eval(shift(@insns));
1160          eval(shift(@insns));
1161
1162         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
1163          eval(shift(@insns));
1164          eval(shift(@insns));
1165           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1166          eval(shift(@insns));
1167          eval(shift(@insns));
1168
1169         &vpsrld (@Tx[0],@X[0],31);
1170          eval(shift(@insns));
1171          eval(shift(@insns));
1172          eval(shift(@insns));
1173          eval(shift(@insns));
1174
1175         &vpslldq(@Tx[1],@X[0],12);              # "X[0]"<<96, extract one dword
1176         &vpaddd (@X[0],@X[0],@X[0]);
1177          eval(shift(@insns));
1178          eval(shift(@insns));
1179          eval(shift(@insns));
1180          eval(shift(@insns));
1181
1182         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
1183         &vpsrld (@Tx[0],@Tx[1],30);
1184          eval(shift(@insns));
1185          eval(shift(@insns));
1186          eval(shift(@insns));
1187          eval(shift(@insns));
1188
1189         &vpslld (@Tx[1],@Tx[1],2);
1190         &vpxor  (@X[0],@X[0],@Tx[0]);
1191          eval(shift(@insns));
1192          eval(shift(@insns));
1193          eval(shift(@insns));
1194          eval(shift(@insns));
1195
1196         &vpxor  (@X[0],@X[0],@Tx[1]);           # "X[0]"^=("X[0]">>96)<<<2
1197          eval(shift(@insns));
1198          eval(shift(@insns));
     # every fifth step loads the next 16-byte row of the K_XX_XX table
1199           &vmovdqa      ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
1200          eval(shift(@insns));
1201          eval(shift(@insns));
1202
1203
1204          foreach (@insns) { eval; }     # remaining instructions [if any]
1205
1206   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
1207 }
1208
1209 sub Xupdate_avx_32_79()
1210 { use integer;
     # Emits one AVX message-schedule step of the 32..79 kind.  The single
     # argument is a code ref; it is called four times and the returned
     # code strings are eval'ed a few at a time between the vector
     # instructions, so both instruction streams end up interleaved.  On
     # exit $Xi is incremented and @X is rotated by one register.
1211   my $body = shift;
1212   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
     # NOTE(review): $a..$e are not used directly here; presumably they are
     # referenced by the eval'ed strings - confirm.
1213   my ($a,$b,$c,$d,$e);
1214
1215         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
1216         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
1217          eval(shift(@insns));           # body_20_39
1218          eval(shift(@insns));
1219          eval(shift(@insns));
1220          eval(shift(@insns));           # rol
1221
1222         &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
1223          eval(shift(@insns));
     # the next string is held back when it matches /&ro[rl]/
1224          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
1225           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
     # every fifth step loads the next 16-byte row of the K_XX_XX table
1226           &vmovdqa      ($Kx,eval(16*($Xi/5))."($K_XX_XX)")     if ($Xi%5==0);
1227          eval(shift(@insns));           # ror
1228          eval(shift(@insns));
1229
1230         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
1231          eval(shift(@insns));           # body_20_39
1232          eval(shift(@insns));
1233          eval(shift(@insns));
1234          eval(shift(@insns));           # rol
1235
1236         &vpsrld (@Tx[0],@X[0],30);
1237           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1238          eval(shift(@insns));
1239          eval(shift(@insns));
1240          eval(shift(@insns));           # ror
1241          eval(shift(@insns));
1242
1243         &vpslld (@X[0],@X[0],2);
1244          eval(shift(@insns));           # body_20_39
1245          eval(shift(@insns));
1246          eval(shift(@insns));
1247          eval(shift(@insns));           # rol
1248          eval(shift(@insns));
1249          eval(shift(@insns));
1250          eval(shift(@insns));           # ror
1251          eval(shift(@insns));
1252
1253         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
1254          eval(shift(@insns));           # body_20_39
1255          eval(shift(@insns));
1256          eval(shift(@insns));
1257          eval(shift(@insns));           # rol
1258          eval(shift(@insns));
1259          eval(shift(@insns));
1260          eval(shift(@insns));           # rol
1261          eval(shift(@insns));
1262
1263          foreach (@insns) { eval; }     # remaining instructions
1264
1265   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
1266 }
1267
1268 sub Xuplast_avx_80()
1269 { use integer;
     # Emits the last schedule step of a block.  Arguments: a code ref
     # supplying the round code strings (called four times) and the label
     # to branch to when $inp equals $len.  After the branch it emits the
     # loads for the next 64 input bytes and resets $Xi to 0.
1270   my $body = shift;
1271   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # NOTE(review): $a..$e are not used directly here; presumably they are
     # referenced by the eval'ed strings - confirm.
1272   my ($a,$b,$c,$d,$e);
1273
1274          eval(shift(@insns));
1275           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
1276          eval(shift(@insns));
1277          eval(shift(@insns));
1278          eval(shift(@insns));
1279          eval(shift(@insns));
1280
1281           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
1282
1283          foreach (@insns) { eval; }             # remaining instructions
1284
     # the second argument (the "done" label) is consumed by the shift below
1285         &cmp    ($inp,$len);
1286         &je     (shift);
1287
1288         &vmovdqa(@Tx[1],"64($K_XX_XX)");        # pbswap mask
1289         &vmovdqa($Kx,"0($K_XX_XX)");            # K_00_19
1290         &vmovdqu(@X[-4&7],"0($inp)");           # load input
1291         &vmovdqu(@X[-3&7],"16($inp)");
1292         &vmovdqu(@X[-2&7],"32($inp)");
1293         &vmovdqu(@X[-1&7],"48($inp)");
1294         &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);     # byte swap
1295         &add    ($inp,64);
1296
1297   $Xi=0;
1298 }
1299
1300 sub Xloop_avx()
1301 { use integer;
     # Emits one step that prepares the next block while rounds of the
     # current one are still running: byte-swaps one freshly loaded input
     # vector, adds $Kx to the previously swapped one and stores the sum at
     # 16*$Xi(%rsp).  The code-ref argument is called four times for the
     # interleaved round code strings.  Increments $Xi; @X is not rotated.
1302   my $body = shift;
1303   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # NOTE(review): $a..$e are not used directly here; presumably they are
     # referenced by the eval'ed strings - confirm.
1304   my ($a,$b,$c,$d,$e);
1305
1306          eval(shift(@insns));
1307          eval(shift(@insns));
1308         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
1309          eval(shift(@insns));
1310          eval(shift(@insns));
1311         &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
1312          eval(shift(@insns));
1313          eval(shift(@insns));
1314          eval(shift(@insns));
1315          eval(shift(@insns));
1316         &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
1317          eval(shift(@insns));
1318          eval(shift(@insns));
1319
1320         foreach (@insns) { eval; }
1321   $Xi++;
1322 }
1323
1324 sub Xtail_avx()
1325 { use integer;
     # Emits round code only: the code-ref argument is called four times
     # and every returned string is eval'ed in order, with no vector
     # instructions added in between.
1326   my $body = shift;
1327   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # NOTE(review): $a..$e are not used directly here; presumably they are
     # referenced by the eval'ed strings - confirm.
1328   my ($a,$b,$c,$d,$e);
1329
1330         foreach (@insns) { eval; }
1331 }
1332
1333 $code.=<<___;      # top of the AVX encrypt loop
1334 .align  32
1335 .Loop_avx:
1336 ___
     # Each call below emits one message-schedule step, with the round code
     # supplied by the body_* routine passed in.
1337         &Xupdate_avx_16_31(\&body_00_19);
1338         &Xupdate_avx_16_31(\&body_00_19);
1339         &Xupdate_avx_16_31(\&body_00_19);
1340         &Xupdate_avx_16_31(\&body_00_19);
1341         &Xupdate_avx_32_79(\&body_00_19);
1342         &Xupdate_avx_32_79(\&body_20_39);
1343         &Xupdate_avx_32_79(\&body_20_39);
1344         &Xupdate_avx_32_79(\&body_20_39);
1345         &Xupdate_avx_32_79(\&body_20_39);
1346         &Xupdate_avx_32_79(\&body_20_39);
1347         &Xupdate_avx_32_79(\&body_40_59);
1348         &Xupdate_avx_32_79(\&body_40_59);
1349         &Xupdate_avx_32_79(\&body_40_59);
1350         &Xupdate_avx_32_79(\&body_40_59);
1351         &Xupdate_avx_32_79(\&body_40_59);
1352         &Xupdate_avx_32_79(\&body_20_39);
1353         &Xuplast_avx_80(\&body_20_39,".Ldone_avx");     # can jump to "done"
1354
     # Snapshot the generator state (including the AES step counter and the
     # round-key register order); it is restored before the code that
     # follows the .Ldone_avx label is generated.
1355                                 $saved_j=$j; @saved_V=@V;
1356                                 $saved_r=$r; @saved_rndkey=@rndkey;
1357
1358         &Xloop_avx(\&body_20_39);
1359         &Xloop_avx(\&body_20_39);
1360         &Xloop_avx(\&body_20_39);
1361
     # Store the fourth output block, advance the input pointer, fold the
     # working variables into the context, re-seed @T[0]/@T[1] and loop.
1362 $code.=<<___;
1363         vmovups $iv,48($out,$in0)               # write output
1364         lea     64($in0),$in0
1365
1366         add     0($ctx),$A                      # update context
1367         add     4($ctx),@T[0]
1368         add     8($ctx),$C
1369         add     12($ctx),$D
1370         mov     $A,0($ctx)
1371         add     16($ctx),$E
1372         mov     @T[0],4($ctx)
1373         mov     @T[0],$B                        # magic seed
1374         mov     $C,8($ctx)
1375         mov     $C,@T[1]
1376         mov     $D,12($ctx)
1377         xor     $D,@T[1]
1378         mov     $E,16($ctx)
1379         and     @T[1],@T[0]
1380         jmp     .Loop_avx
1381
1382 .Ldone_avx:
1383 ___
1384                                 $jj=$j=$saved_j; @V=@saved_V;
1385                                 $r=$saved_r;     @rndkey=@saved_rndkey;
     # The assignments above rewind the generator to the state saved right
     # after Xuplast_avx_80, so the tail is generated from the same point as
     # the loop-continuation code.
1386
1387         &Xtail_avx(\&body_20_39);
1388         &Xtail_avx(\&body_20_39);
1389         &Xtail_avx(\&body_20_39);
1390
     # Store the fourth output block, reload $ivp from the frame, fold the
     # working variables into the context, write $iv back through $ivp and
     # clear the vector registers.
1391 $code.=<<___;
1392         vmovups $iv,48($out,$in0)               # write output
1393         mov     88(%rsp),$ivp                   # restore $ivp
1394
1395         add     0($ctx),$A                      # update context
1396         add     4($ctx),@T[0]
1397         add     8($ctx),$C
1398         mov     $A,0($ctx)
1399         add     12($ctx),$D
1400         mov     @T[0],4($ctx)
1401         add     16($ctx),$E
1402         mov     $C,8($ctx)
1403         mov     $D,12($ctx)
1404         mov     $E,16($ctx)
1405         vmovups $iv,($ivp)                      # write IV
1406         vzeroall
1407 ___
     # Win64 only: reload %xmm6-%xmm15 from the slots used by the prologue.
1408 $code.=<<___ if ($win64);
1409         movaps  96+0(%rsp),%xmm6
1410         movaps  96+16(%rsp),%xmm7
1411         movaps  96+32(%rsp),%xmm8
1412         movaps  96+48(%rsp),%xmm9
1413         movaps  96+64(%rsp),%xmm10
1414         movaps  96+80(%rsp),%xmm11
1415         movaps  96+96(%rsp),%xmm12
1416         movaps  96+112(%rsp),%xmm13
1417         movaps  96+128(%rsp),%xmm14
1418         movaps  96+144(%rsp),%xmm15
1419 ___
     # Step over the frame, reload the six pushed registers in reverse push
     # order, restore %rsp and return.
1420 $code.=<<___;
1421         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1422         mov     0(%rsi),%r15
1423         mov     8(%rsi),%r14
1424         mov     16(%rsi),%r13
1425         mov     24(%rsi),%r12
1426         mov     32(%rsi),%rbp
1427         mov     40(%rsi),%rbx
1428         lea     48(%rsi),%rsp
1429 .Lepilogue_avx:
1430         ret
1431 .size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1432 ___
1433
1434                                                 if ($stitched_decrypt) {{{
1435 # reset
     # The encrypt code above re-pointed $in0,$out,$len,$key at %r12-%r15;
     # put the argument-register names back for the decrypt function.
1436 ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1437
1438 $j=$jj=$r=$sn=$rx=0;
1439 $Xi=4;
1440
     # AVX version of the decrypt schedule: a list of code strings that the
     # body_*_dec wrappers prepend to the round code one at a time, indexed
     # by $rx.  undef entries are slots in which nothing is prepended.
     # The opening entries XOR the four input blocks with the round key in
     # $rndkey0, fetch the next round key and save the IV at 64(%rsp).
1441 @aes256_dec = (
1442         '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
1443         '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
1444         '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
1445         '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
1446
1447         '&vmovups($rndkey0,"16-112($key)");',
1448         '&vmovups("64(%rsp)",@X[2]);',          # save IV, originally @X[3]
1449         undef,undef
1450         );
     # Thirteen vaesdec rounds over the four blocks; the fourth string of
     # each round also loads the following round key.  Extra undef pairs are
     # appended after rounds 3, 4, 5 (twice), 11 and 12.
     # NOTE(review): the padding presumably controls where the AES code
     # lands relative to the SHA1 rounds - confirm.
1451 for ($i=0;$i<13;$i++) {
1452     push (@aes256_dec,(
1453         '&vaesdec       ($inout0,$inout0,$rndkey0);',
1454         '&vaesdec       ($inout1,$inout1,$rndkey0);',
1455         '&vaesdec       ($inout2,$inout2,$rndkey0);',
1456         '&vaesdec       ($inout3,$inout3,$rndkey0);     &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
1457         ));
1458     push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
1459     push (@aes256_dec,(undef,undef))    if ($i==5);
1460 }
     # Final round: vaesdeclast while reloading the four input blocks into
     # @X[0..3], then XOR with the saved IV and the preceding input blocks
     # and store the results.  The very last entry (store of the fourth
     # block) is the one referenced as @aes256_dec[-1].
1461 push(@aes256_dec,(
1462         '&vaesdeclast   ($inout0,$inout0,$rndkey0);     &vmovups(@X[0],"0x00($in0)");',
1463         '&vaesdeclast   ($inout1,$inout1,$rndkey0);     &vmovups(@X[1],"0x10($in0)");',
1464         '&vaesdeclast   ($inout2,$inout2,$rndkey0);     &vmovups(@X[2],"0x20($in0)");',
1465         '&vaesdeclast   ($inout3,$inout3,$rndkey0);     &vmovups(@X[3],"0x30($in0)");',
1466
1467         '&vxorps        ($inout0,$inout0,"64(%rsp)");   &vmovdqu($rndkey0,"-112($key)");',
1468         '&vxorps        ($inout1,$inout1,@X[0]);        &vmovups("0x00($out,$in0)",$inout0);',
1469         '&vxorps        ($inout2,$inout2,@X[1]);        &vmovups("0x10($out,$in0)",$inout1);',
1470         '&vxorps        ($inout3,$inout3,@X[2]);        &vmovups("0x20($out,$in0)",$inout2);',
1471
1472         '&vmovups       ("0x30($out,$in0)",$inout3);'
1473         ));
1474
1475 $code.=<<___;      # aesni256_cbc_sha1_dec_avx: entry, register pushes and frame allocation
1476 .type   aesni256_cbc_sha1_dec_avx,\@function,6
1477 .align  32
1478 aesni256_cbc_sha1_dec_avx:
1479         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1480         push    %rbx
1481         push    %rbp
1482         push    %r12
1483         push    %r13
1484         push    %r14
1485         push    %r15
1486         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
1487 ___
     # Win64 only: save %xmm6-%xmm15 at 96(%rsp) and emit .Lprologue_dec_avx.
1488 $code.=<<___ if ($win64);
1489         movaps  %xmm6,96+0(%rsp)
1490         movaps  %xmm7,96+16(%rsp)
1491         movaps  %xmm8,96+32(%rsp)
1492         movaps  %xmm9,96+48(%rsp)
1493         movaps  %xmm10,96+64(%rsp)
1494         movaps  %xmm11,96+80(%rsp)
1495         movaps  %xmm12,96+96(%rsp)
1496         movaps  %xmm13,96+112(%rsp)
1497         movaps  %xmm14,96+128(%rsp)
1498         movaps  %xmm15,96+144(%rsp)
1499 .Lprologue_dec_avx:
1500 ___
     # Copy the first four arguments into %r12-%r15 (the key pointer biased
     # by +112) and load the IV into @X[3].
1501 $code.=<<___;
1502         vzeroall
1503         mov     $in0,%r12                       # reassign arguments
1504         mov     $out,%r13
1505         mov     $len,%r14
1506         lea     112($key),%r15                  # size optimization
1507         vmovdqu ($ivp),@X[3]                    # load IV
1508 ___
     # From here on the Perl-side names refer to the registers just loaded.
1509 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
     # Turn $len into an end-of-input pointer, make $out relative to $in0,
     # load the SHA1 state, byte-swap the first 64 input bytes, store the
     # first three X[]+K vectors and fetch the first round key.
1510 $code.=<<___;
1511         shl     \$6,$len
1512         sub     $in0,$out
1513         add     $inp,$len               # end of input
1514
1515         lea     K_XX_XX(%rip),$K_XX_XX
1516         mov     0($ctx),$A              # load context
1517         mov     4($ctx),$B
1518         mov     8($ctx),$C
1519         mov     12($ctx),$D
1520         mov     $B,@T[0]                # magic seed
1521         mov     16($ctx),$E
1522         mov     $C,@T[1]
1523         xor     $D,@T[1]
1524         and     @T[1],@T[0]
1525
1526         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
1527         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
1528         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
1529         vmovdqu 16($inp),@X[-3&7]
1530         vmovdqu 32($inp),@X[-2&7]
1531         vmovdqu 48($inp),@X[-1&7]
1532         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
1533         add     \$64,$inp
1534         vpshufb @X[2],@X[-3&7],@X[-3&7]
1535         vpshufb @X[2],@X[-2&7],@X[-2&7]
1536         vpshufb @X[2],@X[-1&7],@X[-1&7]
1537         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
1538         vpaddd  $Kx,@X[-3&7],@X[1]
1539         vpaddd  $Kx,@X[-2&7],@X[2]
1540         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
1541         vmovdqa @X[1],16(%rsp)
1542         vmovdqa @X[2],32(%rsp)
1543         vmovups -112($key),$rndkey0     # $key[0]
1544         jmp     .Loop_dec_avx
1545
1546 .align  32
1547 .Loop_dec_avx:
1548 ___
1549         &Xupdate_avx_16_31(\&body_00_19_dec);  # loop body starts here
     # Each call below emits one message-schedule step; the decrypt wrappers
     # passed in prepend entries of @aes256_dec to the round code as $rx
     # advances.
1550         &Xupdate_avx_16_31(\&body_00_19_dec);
1551         &Xupdate_avx_16_31(\&body_00_19_dec);
1552         &Xupdate_avx_16_31(\&body_00_19_dec);
1553         &Xupdate_avx_32_79(\&body_00_19_dec);
1554         &Xupdate_avx_32_79(\&body_20_39_dec);
1555         &Xupdate_avx_32_79(\&body_20_39_dec);
1556         &Xupdate_avx_32_79(\&body_20_39_dec);
1557         &Xupdate_avx_32_79(\&body_20_39_dec);
1558         &Xupdate_avx_32_79(\&body_20_39_dec);
1559         &Xupdate_avx_32_79(\&body_40_59_dec);
1560         &Xupdate_avx_32_79(\&body_40_59_dec);
1561         &Xupdate_avx_32_79(\&body_40_59_dec);
1562         &Xupdate_avx_32_79(\&body_40_59_dec);
1563         &Xupdate_avx_32_79(\&body_40_59_dec);
1564         &Xupdate_avx_32_79(\&body_20_39_dec);
1565         &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");     # can jump to "done"
1566
     # Snapshot the generator state; it is restored before the code that
     # follows the .Ldone_dec_avx label is generated.
1567                                 $saved_j=$j; @saved_V=@V;
1568                                 $saved_rx=$rx;
1569
1570         &Xloop_avx(\&body_20_39_dec);
1571         &Xloop_avx(\&body_20_39_dec);
1572         &Xloop_avx(\&body_20_39_dec);
1573
     # The final table entry is evaluated explicitly rather than through a
     # round body.
1574         eval(@aes256_dec[-1]);                  # last store
     # Advance the input pointer, fold the working variables into the
     # context, re-seed @T[0]/@T[1] for the next block and loop.
1575 $code.=<<___;
1576         lea     64($in0),$in0
1577
1578         add     0($ctx),$A                      # update context
1579         add     4($ctx),@T[0]
1580         add     8($ctx),$C
1581         add     12($ctx),$D
1582         mov     $A,0($ctx)
1583         add     16($ctx),$E
1584         mov     @T[0],4($ctx)
1585         mov     @T[0],$B                        # magic seed
1586         mov     $C,8($ctx)
1587         mov     $C,@T[1]
1588         mov     $D,12($ctx)
1589         xor     $D,@T[1]
1590         mov     $E,16($ctx)
1591         and     @T[1],@T[0]
1592         jmp     .Loop_dec_avx
1593
1594 .Ldone_dec_avx:
1595 ___
1596                                 $jj=$j=$saved_j; @V=@saved_V;
1597                                 $rx=$saved_rx;
     # The assignments above rewind the generator to the state saved right
     # after Xuplast_avx_80, so the tail is generated from the same point as
     # the loop-continuation code.
1598
1599         &Xtail_avx(\&body_20_39_dec);
1600         &Xtail_avx(\&body_20_39_dec);
1601         &Xtail_avx(\&body_20_39_dec);
1602
     # The final table entry is evaluated explicitly rather than through a
     # round body.
1603         eval(@aes256_dec[-1]);                  # last store
     # Fold the working variables into the context, write @X[3] back through
     # $ivp and clear the vector registers.
1604 $code.=<<___;
1605
1606         add     0($ctx),$A                      # update context
1607         add     4($ctx),@T[0]
1608         add     8($ctx),$C
1609         mov     $A,0($ctx)
1610         add     12($ctx),$D
1611         mov     @T[0],4($ctx)
1612         add     16($ctx),$E
1613         mov     $C,8($ctx)
1614         mov     $D,12($ctx)
1615         mov     $E,16($ctx)
1616         vmovups @X[3],($ivp)                    # write IV
1617         vzeroall
1618 ___
     # Win64 only: reload %xmm6-%xmm15 from the slots used by the prologue.
1619 $code.=<<___ if ($win64);
1620         movaps  96+0(%rsp),%xmm6
1621         movaps  96+16(%rsp),%xmm7
1622         movaps  96+32(%rsp),%xmm8
1623         movaps  96+48(%rsp),%xmm9
1624         movaps  96+64(%rsp),%xmm10
1625         movaps  96+80(%rsp),%xmm11
1626         movaps  96+96(%rsp),%xmm12
1627         movaps  96+112(%rsp),%xmm13
1628         movaps  96+128(%rsp),%xmm14
1629         movaps  96+144(%rsp),%xmm15
1630 ___
     # Step over the frame, reload the six pushed registers in reverse push
     # order, restore %rsp and return.
1631 $code.=<<___;
1632         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1633         mov     0(%rsi),%r15
1634         mov     8(%rsi),%r14
1635         mov     16(%rsi),%r13
1636         mov     24(%rsi),%r12
1637         mov     32(%rsi),%rbp
1638         mov     40(%rsi),%rbx
1639         lea     48(%rsi),%rsp
1640 .Lepilogue_dec_avx:
1641         ret
1642 .size   aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
1643 ___
1644                                                 }}}
1645 }
1646 $code.=<<___;      # constant table: four replicated round constants, then the byte-swap mask at offset 64
1647 .align  64
1648 K_XX_XX:
1649 .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1650 .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1651 .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1652 .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1653 .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
1654
1655 .asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1656 .align  64
1657 ___
1658
1659 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1660 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Win64-only section: emits ssse3_handler together with the .pdata/.xdata
# records that attach it to the stitched encrypt routines.  The four
# variables name the registers carrying the handler's arguments, in the
# order of the prototype above; only $context and $disp are referenced in
# the handler body below.
1661 if ($win64) {
1662 $rec="%rcx";
1663 $frame="%rdx";
1664 $context="%r8";
1665 $disp="%r9";
1666
# Handler body.  HandlerData[0] and HandlerData[1] are image-relative
# prologue and epilogue labels.  When context->Rip lies at or after the
# prologue label and before the epilogue label, the handler copies 20
# quadwords from 96(frame) into CONTEXT starting at Xmm6 and reloads
# %rbx, %rbp and %r12-%r15 from the frame into CONTEXT.  On every path it
# then stores Rsp/Rsi/Rdi, copies CONTEXT (154 quadwords) to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
1667 $code.=<<___;
1668 .extern __imp_RtlVirtualUnwind
1669 .type   ssse3_handler,\@abi-omnipotent
1670 .align  16
1671 ssse3_handler:
1672         push    %rsi
1673         push    %rdi
1674         push    %rbx
1675         push    %rbp
1676         push    %r12
1677         push    %r13
1678         push    %r14
1679         push    %r15
1680         pushfq
1681         sub     \$64,%rsp
1682
1683         mov     120($context),%rax      # pull context->Rax
1684         mov     248($context),%rbx      # pull context->Rip
1685
1686         mov     8($disp),%rsi           # disp->ImageBase
1687         mov     56($disp),%r11          # disp->HandlerData
1688
1689         mov     0(%r11),%r10d           # HandlerData[0]
1690         lea     (%rsi,%r10),%r10        # prologue label
1691         cmp     %r10,%rbx               # context->Rip<prologue label
1692         jb      .Lcommon_seh_tail
1693
1694         mov     152($context),%rax      # pull context->Rsp
1695
1696         mov     4(%r11),%r10d           # HandlerData[1]
1697         lea     (%rsi,%r10),%r10        # epilogue label
1698         cmp     %r10,%rbx               # context->Rip>=epilogue label
1699         jae     .Lcommon_seh_tail
1700
1701         lea     96(%rax),%rsi
1702         lea     512($context),%rdi      # &context.Xmm6
1703         mov     \$20,%ecx
1704         .long   0xa548f3fc              # cld; rep movsq
1705         lea     `104+10*16`(%rax),%rax  # adjust stack pointer
1706
1707         mov     0(%rax),%r15
1708         mov     8(%rax),%r14
1709         mov     16(%rax),%r13
1710         mov     24(%rax),%r12
1711         mov     32(%rax),%rbp
1712         mov     40(%rax),%rbx
1713         lea     48(%rax),%rax
1714         mov     %rbx,144($context)      # restore context->Rbx
1715         mov     %rbp,160($context)      # restore context->Rbp
1716         mov     %r12,216($context)      # restore context->R12
1717         mov     %r13,224($context)      # restore context->R13
1718         mov     %r14,232($context)      # restore context->R14
1719         mov     %r15,240($context)      # restore context->R15
1720
1721 .Lcommon_seh_tail:
1722         mov     8(%rax),%rdi
1723         mov     16(%rax),%rsi
1724         mov     %rax,152($context)      # restore context->Rsp
1725         mov     %rsi,168($context)      # restore context->Rsi
1726         mov     %rdi,176($context)      # restore context->Rdi
1727
1728         mov     40($disp),%rdi          # disp->ContextRecord
1729         mov     $context,%rsi           # context
1730         mov     \$154,%ecx              # sizeof(CONTEXT)
1731         .long   0xa548f3fc              # cld; rep movsq
1732
1733         mov     $disp,%rsi
1734         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1735         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1736         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1737         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1738         mov     40(%rsi),%r10           # disp->ContextRecord
1739         lea     56(%rsi),%r11           # &disp->HandlerData
1740         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1741         mov     %r10,32(%rsp)           # arg5
1742         mov     %r11,40(%rsp)           # arg6
1743         mov     %r12,48(%rsp)           # arg7
1744         mov     %rcx,56(%rsp)           # arg8, (NULL)
1745         call    *__imp_RtlVirtualUnwind(%rip)
1746
1747         mov     \$1,%eax                # ExceptionContinueSearch
1748         add     \$64,%rsp
1749         popfq
1750         pop     %r15
1751         pop     %r14
1752         pop     %r13
1753         pop     %r12
1754         pop     %rbp
1755         pop     %rbx
1756         pop     %rdi
1757         pop     %rsi
1758         ret
1759 .size   ssse3_handler,.-ssse3_handler
1760
1761 .section        .pdata
1762 .align  4
1763         .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1764         .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
1765         .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
1766 ___
# .pdata entry (begin/end/info triple) for the AVX routine, emitted only
# when $avx is set.
1767 $code.=<<___ if ($avx);
1768         .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
1769         .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
1770         .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
1771 ___
# .xdata record for the SSSE3 routine: it names ssse3_handler and supplies
# the routine's prologue/epilogue labels as the handler's HandlerData[].
1772 $code.=<<___;
1773 .section        .xdata
1774 .align  8
1775 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
1776         .byte   9,0,0,0
1777         .rva    ssse3_handler
1778         .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
1779 ___
# Matching .xdata record for the AVX routine; it reuses ssse3_handler with
# the AVX prologue/epilogue labels.
1780 $code.=<<___ if ($avx);
1781 .LSEH_info_aesni_cbc_sha1_enc_avx:
1782         .byte   9,0,0,0
1783         .rva    ssse3_handler
1784         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1785 ___
1786 }
1787
1788 ####################################################################
# rex(\@bytes,$dst,$src) -- append a REX prefix byte when one is required.
#
# \@bytes is the opcode byte list being assembled; $dst and $src are numeric
# register indices.  An index of 8 or more contributes bit 0x04 (for $dst)
# or bit 0x01 (for $src).  If either bit is set, the bits OR-ed with 0x40
# are pushed onto the list; with both indices below 8 the list is left
# untouched.
sub rex {
  my ($bytes,$dst,$src)=@_;
  my $rex=($dst>=8 ? 0x04 : 0)|($src>=8 ? 0x01 : 0);

    push @$bytes,$rex|0x40      if($rex);
}
1798
# aesni($line) -- hand-assemble a register-to-register AES-NI instruction.
#
# $line is a fragment of generated assembly.  When it has the form
# "aesXXX %xmmN,%xmmM" and the mnemonic is aesenc, aesenclast, aesdec or
# aesdeclast, the return value is an equivalent ".byte" directive made of:
# 0x66, an optional REX prefix supplied by rex(), the 0x0f,0x38 escape,
# the opcode byte from the table, and a register-direct ModR/M byte.
#
# Every other input is returned unchanged.  That includes an aes* mnemonic
# absent from the table: returning undef there would make the caller's
# s///e substitution replace the instruction with an empty string and
# silently drop it from the output, so the text is passed through for the
# assembler to accept or reject.
sub aesni {
  my $line=shift;
  my %opcodelet = (
        "aesenc" => 0xdc,       "aesenclast" => 0xdd,
        "aesdec" => 0xde,       "aesdeclast" => 0xdf
  );

    if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures at once so later code cannot clobber $1..$3.
        my ($mnemonic,$src,$dst)=($1,$2,$3);

        return $line if (!defined($opcodelet{$mnemonic}));

        my @opcode=(0x66);
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1816
# Post-process the accumulated assembly text and write it to STDOUT.
#
# First, every `...` span is evaluated as a Perl expression and replaced by
# its value.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Next, on each line the text from "aes" through the last %xmm operand is
# handed to aesni(); whatever follows that operand on the line is dropped.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;
# Output is buffered, so a failed write (full disk, closed pipe) may only be
# reported by close(); fail loudly instead of leaving a truncated file.
close STDOUT or die "error closing STDOUT: $!";