crypto/aes/asm/aesni-sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # June 2011
  11 #
   12 # This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as
   13 # spelled out in http://download.intel.com/design/intarch/papers/323686.pdf,
   14 # is that since AESNI-CBC encrypt exhibits *very* low instruction-level
   15 # parallelism, interleaving it with another algorithm allows processor
   16 # resources to be utilized better and achieves better performance.
   17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
   18 # AESNI code is woven into them. Below are performance numbers in
  19 # cycles per processed byte, less is better, for standalone AESNI-CBC
  20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
  21 # subroutine:
  22 #
  23 #               AES-128-CBC     +SHA1           stitch      gain
  24 # Westmere      3.77[+5.5]      9.26            6.66        +39%
  25 # Sandy Bridge  5.05[+5.0(6.2)] 10.06(11.21)    5.98(7.01)  +68%(+60%)
  26 # Ivy Bridge    5.05[+4.6]      9.65            5.54        +74%
  27 # Haswell       4.43[+3.6(4.1)] 8.00(8.55)      4.55(5.21)  +75%(+64%)
  28 # Bulldozer     5.77[+6.0]      11.72           6.37        +84%
  29 #
  30 #               AES-192-CBC
  31 # Westmere      4.51            10.00           6.87        +46%
  32 # Sandy Bridge  6.05            11.06(12.21)    6.11(7.20)  +81%(+70%)
  33 # Ivy Bridge    6.05            10.65           6.07        +75%
  34 # Haswell       5.29            8.86(9.42)      5.32(5.32)  +67%(+77%)
  35 # Bulldozer     6.89            12.84           6.96        +84%
  36 #
  37 #               AES-256-CBC
  38 # Westmere      5.25            10.74           7.24        +48%
  39 # Sandy Bridge  7.05            12.06(13.21)    7.12(7.63)  +69%(+73%)
  40 # Ivy Bridge    7.05            11.65           7.12        +64%
  41 # Haswell       6.19            9.76(10.3)      6.21(6.25)  +57%(+65%)
  42 # Bulldozer     8.00            13.95           8.25        +69%
  43 #
   44 # (*)   There are two code paths: SSSE3 and AVX. See sha1-586.pl for
  45 #       background information. Above numbers in parentheses are SSSE3
  46 #       results collected on AVX-capable CPU, i.e. apply on OSes that
  47 #       don't support AVX.
  48 #
   49 # Needless to mention, it makes no sense to implement a "stitched"
   50 # *decrypt* subroutine: *both* AESNI-CBC decrypt and SHA1 fully
   51 # utilize parallelism, so stitching would not give any gain
   52 # anyway. Well, there might be some, e.g. because of better cache
  53 # locality... For reference, here are performance results for
  54 # standalone AESNI-CBC decrypt:
  55 #
  56 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
  57 # Westmere      1.25            1.50            1.75
  58 # Sandy Bridge  0.74            0.91            1.09
  59 # Ivy Bridge    0.74            0.90            1.11
  60 # Haswell       0.63            0.76            0.88
  61 # Bulldozer     0.70            0.85            0.99
  62
  63 # And indeed:
  64 #
  65 #               AES-256-CBC     +SHA1           stitch      gain
  66 # Westmere      1.75            7.20            6.68        +7.8%
  67 # Sandy Bridge  1.09            6.09(7.22)      5.82(6.95)  +4.6%(+3.9%)
  68 # Ivy Bridge    1.11            5.70            5.45        +4.6%
  69 # Haswell       0.88            4.45(5.00)      4.39(4.69)  +1.4%(+6.6%)
  70 # Bulldozer     0.99            6.95            5.95        +17%
   71
      # Command line: [flavour] [output].  If the first argument contains a
      # dot it is taken to be the output file name and $flavour is undefined.
   72 $flavour = shift;
   73 $output  = shift;
   74 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
   75
      # Win64 mode is selected by a nasm/masm/mingw64 flavour or by an output
      # file name ending in .asm.
   76 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
   77
      # Locate the x86_64-xlate.pl translator: first in this script's own
      # directory, then in ../../perlasm/ relative to it; give up otherwise.
   78 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
   79 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
   80 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
   81 die "can't locate x86_64-xlate.pl";
   82
      # Enable generation of the AVX code path only if the assembler reports
      # a new enough version: GNU as >= 2.19 (probed through $ENV{CC}), or,
      # in Win64 mode only, nasm >= 2.09 or ml64 >= 10.
      # NOTE(review): the version is compared as a number, so e.g. "2.9"
      # compares greater than 2.19.
   83 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
   84                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
   85            $1>=2.19);
   86 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
   87            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
   88            $1>=2.09);
   89 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
   90            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
   91            $1>=10);
   92
      # Set to non-zero to also generate the stitched AES-256 decrypt code
      # that is guarded by "if ($stitched_decrypt)" further down.
   93 $stitched_decrypt=0;
  94
  95 open OUT,"| \"$^X\" $xlate $flavour $output";
  96 *STDOUT=*OUT;
  97
  98 # void aesni_cbc_sha1_enc(const void *inp,
  99 #                       void *out,
 100 #                       size_t length,
 101 #                       const AES_KEY *key,
 102 #                       unsigned char *iv,
 103 #                       SHA_CTX *ctx,
 104 #                       const void *in0);
  105
      # Emit the public entry point aesni_cbc_sha1_enc.  It only dispatches:
      # when AVX code is being generated ($avx) and both bit 28 of
      # OPENSSL_ia32cap_P+4 (AVX) and bit 30 of OPENSSL_ia32cap_P+0 ("Intel
      # CPU") are set, it jumps to aesni_cbc_sha1_enc_avx; otherwise it jumps
      # to aesni_cbc_sha1_enc_ssse3.  The trailing "ret" follows an
      # unconditional jmp and is never reached.
  106 $code.=<<___;
  107 .text
  108 .extern OPENSSL_ia32cap_P
  109
  110 .globl  aesni_cbc_sha1_enc
  111 .type   aesni_cbc_sha1_enc,\@abi-omnipotent
  112 .align  32
  113 aesni_cbc_sha1_enc:
  114         # caller should check for SSSE3 and AES-NI bits
  115         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
  116         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
  117 ___
  118 $code.=<<___ if ($avx);
  119         and     \$`1<<28`,%r11d         # mask AVX bit
  120         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
  121         or      %r11d,%r10d
  122         cmp     \$`1<<28|1<<30`,%r10d
  123         je      aesni_cbc_sha1_enc_avx
  124 ___
  125 $code.=<<___;
  126         jmp     aesni_cbc_sha1_enc_ssse3
  127         ret
  128 .size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
  129 ___
  130
      # Register allocation shared by the code generators below.
      #
      # Argument registers.  Note that the Perl names are swapped relative to
      # the C prototype comment above: $in0 is the FIRST argument (the data
      # the AES code reads, called "inp" in the prototype) and $inp receives
      # the SEVENTH argument (the data the SHA1 code reads, called "in0" in
      # the prototype), which the function prologue loads from the stack.
  131 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
  132
      # $Xi  - index of the next group of four schedule words to produce
      # @X   - rotating window of eight xmm registers holding schedule words
      # @Tx  - three rotating xmm temporaries (also carry the K constant)
      # @V   - the five SHA1 working variables, rotated after every round
      # @T   - two rotating scalar temporaries
  133 my $Xi=4;
  134 my @X=map("%xmm$_",(4..7,0..3));
  135 my @Tx=map("%xmm$_",(8..10));
  136 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
  137 my @T=("%esi","%edi");
      # $j  - SHA1 round counter advanced by the round templates
      # $jj - counter used by the body_* generators to space out AES steps
      # $r  - number of $aesenc invocations so far
      # $sn - serial number for the .Laesenclast<N> labels
      # $rx - generator call counter (selects template / @aes256_dec entry)
  138 my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
  139 my $K_XX_XX="%r11";
  140 my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));                   # for enc
  141 my @rndkey=("%xmm14","%xmm15");                                 # for enc
  142 my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec
  143
      # This block is unconditional, so the xmm assignments made above are
      # always replaced by the ones below.
  144 if (1) {        # reassign for Atom Silvermont
  145     @X=map("%xmm$_",(8..15));
  146     @Tx=map("%xmm$_",(5..7));
  147     ($iv,$in,$rndkey0)=map("%xmm$_",(2..4));                    # for enc
  148     @rndkey=("%xmm0","%xmm1");                                  # for enc
  149     ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(0..3));     # for dec
  150 }
  151
      # Catch-all for calls to undefined subs such as &movdqa(dst,src): the
      # sub name (package prefix stripped) becomes the opcode and one line of
      # assembly is appended to $code.  Callers pass operands destination
      # first; they are emitted in reverse, i.e. source first.  The last
      # argument gets a "$" immediate prefix if it looks like a number.
  152 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
  153 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  154   my $arg = pop;
  155     $arg = "\$$arg" if ($arg*1 eq $arg);
  156     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  157 }
  158
      # Rotate emitters, called indirectly from the round templates as
      # &$_rol(...) / &$_ror(...).
      # NOTE(review): presumably held in variables so that another code path
      # can substitute different implementations -- confirm.
  159 my $_rol=sub { &rol(@_) };
  160 my $_ror=sub { &ror(@_) };
  161
      # Emit the prologue of aesni_cbc_sha1_enc_ssse3: fetch the 7th argument
      # from the stack into $inp, push six general-purpose registers and
      # reserve a 104-byte frame (plus 10*16 bytes on Win64, used just below
      # to save %xmm6-%xmm15 at 96(%rsp) and up).
  162 $code.=<<___;
  163 .type   aesni_cbc_sha1_enc_ssse3,\@function,6
  164 .align  32
  165 aesni_cbc_sha1_enc_ssse3:
  166         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
  167         #shr    \$6,$len                        # debugging artefact
  168         #jz     .Lepilogue_ssse3                # debugging artefact
  169         push    %rbx
  170         push    %rbp
  171         push    %r12
  172         push    %r13
  173         push    %r14
  174         push    %r15
  175         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
  176         #mov    $in0,$inp                       # debugging artefact
  177         #lea    64(%rsp),$ctx                   # debugging artefact
  178 ___
  179 $code.=<<___ if ($win64);
  180         movaps  %xmm6,96+0(%rsp)
  181         movaps  %xmm7,96+16(%rsp)
  182         movaps  %xmm8,96+32(%rsp)
  183         movaps  %xmm9,96+48(%rsp)
  184         movaps  %xmm10,96+64(%rsp)
  185         movaps  %xmm11,96+80(%rsp)
  186         movaps  %xmm12,96+96(%rsp)
  187         movaps  %xmm13,96+112(%rsp)
  188         movaps  %xmm14,96+128(%rsp)
  189         movaps  %xmm15,96+144(%rsp)
  190 .Lprologue_ssse3:
  191 ___
      # Move the first four arguments into %r12-%r15, load the IV and stash
      # the IV pointer at 88(%rsp) so that its register can be reused.
  192 $code.=<<___;
  193         mov     $in0,%r12                       # reassign arguments
  194         mov     $out,%r13
  195         mov     $len,%r14
  196         mov     $key,%r15
  197         movdqu  ($ivp),$iv                      # load IV
  198         mov     $ivp,88(%rsp)                   # save $ivp
  199 ___
      # From here on the Perl names refer to the new homes %r12-%r15, and
      # $rounds is the 32-bit form of the former IV-pointer register.
  200 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
  201 my $rounds="${ivp}d";
      # Turn $len (a count of 64-byte blocks, hence the shift by 6) into the
      # end-of-input pointer, make $out an offset relative to $in0, load the
      # round count from 240($key) and the five SHA1 state words from $ctx,
      # then byte-swap the first 64 input bytes and store X[]+K for the first
      # twelve rounds at 0..47(%rsp).
  202 $code.=<<___;
  203         shl     \$6,$len
  204         sub     $in0,$out
  205         mov     240($key),$rounds
  206         add     $inp,$len               # end of input
  207
  208         lea     K_XX_XX(%rip),$K_XX_XX
  209         mov     0($ctx),$A              # load context
  210         mov     4($ctx),$B
  211         mov     8($ctx),$C
  212         mov     12($ctx),$D
  213         mov     $B,@T[0]                # magic seed
  214         mov     16($ctx),$E
  215         mov     $C,@T[1]
  216         xor     $D,@T[1]
  217         and     @T[1],@T[0]
  218
  219         movdqa  64($K_XX_XX),@X[2]      # pbswap mask
  220         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
  221         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
  222         movdqu  16($inp),@X[-3&7]
  223         movdqu  32($inp),@X[-2&7]
  224         movdqu  48($inp),@X[-1&7]
  225         pshufb  @X[2],@X[-4&7]          # byte swap
  226         add     \$64,$inp
  227         pshufb  @X[2],@X[-3&7]
  228         pshufb  @X[2],@X[-2&7]
  229         pshufb  @X[2],@X[-1&7]
  230         paddd   @Tx[1],@X[-4&7]         # add K_00_19
  231         paddd   @Tx[1],@X[-3&7]
  232         paddd   @Tx[1],@X[-2&7]
  233         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
  234         psubd   @Tx[1],@X[-4&7]         # restore X[]
  235         movdqa  @X[-3&7],16(%rsp)
  236         psubd   @Tx[1],@X[-3&7]
  237         movdqa  @X[-2&7],32(%rsp)
  238         psubd   @Tx[1],@X[-2&7]
  239         movups  ($key),$rndkey0         # $key[0]
  240         movups  16($key),$rndkey[0]     # forward reference
  241         jmp     .Loop_ssse3
  242 ___
  243
      # Emits one step of AES-CBC encryption per call; the body_* generators
      # sprinkle these calls between SHA1 rounds.  $r counts the calls:
      # $n=$r/10 is the 16-byte block index and $k=$r%10 the step within it.
      # After every call $r is incremented and the two @rndkey registers are
      # swapped, so "$rndkey[0]" always names the key loaded one step earlier.
  244 my $aesenc=sub {
  245   use integer;
  246   my ($n,$k)=($r/10,$r%10);
      #   Step 0: load the next input block, xor in round key 0, store the
      #   previous ciphertext block (if any), chain into $iv and do round 1.
  247     if ($k==0) {
  248       $code.=<<___;
  249         movups          `16*$n`($in0),$in               # load input
  250         xorps           $rndkey0,$in
  251 ___
  252       $code.=<<___ if ($n);
  253         movups          $iv,`16*($n-1)`($out,$in0)      # write output
  254 ___
  255       $code.=<<___;
  256         xorps           $in,$iv
  257         aesenc          $rndkey[0],$iv
  258         movups          `32+16*$k`($key),$rndkey[1]
  259 ___
      #   Step 9: finish the block.  $rounds below 11 goes straight to
      #   aesenclast (10 rounds in total), equal to 11 after two extra
      #   rounds (12 in total), otherwise after four extra rounds (14 in
      #   total).  Then the round-1 key is reloaded for the next block.
  260     } elsif ($k==9) {
  261       $sn++;
  262       $code.=<<___;
  263         cmp             \$11,$rounds
  264         jb              .Laesenclast$sn
  265         movups          `32+16*($k+0)`($key),$rndkey[1]
  266         aesenc          $rndkey[0],$iv
  267         movups          `32+16*($k+1)`($key),$rndkey[0]
  268         aesenc          $rndkey[1],$iv
  269         je              .Laesenclast$sn
  270         movups          `32+16*($k+2)`($key),$rndkey[1]
  271         aesenc          $rndkey[0],$iv
  272         movups          `32+16*($k+3)`($key),$rndkey[0]
  273         aesenc          $rndkey[1],$iv
  274 .Laesenclast$sn:
  275         aesenclast      $rndkey[0],$iv
  276         movups          16($key),$rndkey[1]             # forward reference
  277 ___
      #   Steps 1..8: one AES round plus a preload of the next round key.
  278     } else {
  279       $code.=<<___;
  280         aesenc          $rndkey[0],$iv
  281         movups          `32+16*$k`($key),$rndkey[1]
  282 ___
  283     }
  284     $r++;       unshift(@rndkey,pop(@rndkey));
  285 };
  286
      # Emits the SIMD code that produces the next four message-schedule
      # words (used for words 16..31), interleaved with four scalar rounds.
      #   $body - code ref returning a list of Perl snippets for ONE round;
      #           it is called four times and the snippets are eval'ed one
      #           by one between the SIMD instructions.
      # Side effects: appends to $code (through the AUTOLOAD thunk), stores
      # X[]+K for the previous group on the stack, advances $Xi and rotates
      # @X and @Tx.  The exact interleaving is a scheduling decision; do not
      # reorder.
  287 sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
  288 { use integer;
  289   my $body = shift;
  290   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
  291   my ($a,$b,$c,$d,$e);
  292
  293         &pshufd (@X[0],@X[-4&7],0xee);  # was &movdqa(@X[0],@X[-3&7]);
  294          eval(shift(@insns));
  295          eval(shift(@insns));
  296         &movdqa (@Tx[0],@X[-1&7]);
  297         &punpcklqdq(@X[0],@X[-3&7]);    # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
  298          eval(shift(@insns));
  299          eval(shift(@insns));
  300
  301           &paddd        (@Tx[1],@X[-1&7]);
  302          eval(shift(@insns));
  303          eval(shift(@insns));
  304         &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
  305          eval(shift(@insns));
  306          eval(shift(@insns));
  307         &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
  308          eval(shift(@insns));
  309          eval(shift(@insns));
  310
  311         &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
  312          eval(shift(@insns));
  313          eval(shift(@insns));
  314          eval(shift(@insns));
  315          eval(shift(@insns));
  316
  317         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
  318          eval(shift(@insns));
  319          eval(shift(@insns));
  320           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  321          eval(shift(@insns));
  322          eval(shift(@insns));
  323
  324         &movdqa (@Tx[2],@X[0]);
  325         &movdqa (@Tx[0],@X[0]);
  326          eval(shift(@insns));
  327          eval(shift(@insns));
  328          eval(shift(@insns));
  329          eval(shift(@insns));
  330
  331         &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
  332         &paddd  (@X[0],@X[0]);
  333          eval(shift(@insns));
  334          eval(shift(@insns));
  335          eval(shift(@insns));
  336          eval(shift(@insns));
  337
  338         &psrld  (@Tx[0],31);
  339          eval(shift(@insns));
  340          eval(shift(@insns));
  341         &movdqa (@Tx[1],@Tx[2]);
  342          eval(shift(@insns));
  343          eval(shift(@insns));
  344
  345         &psrld  (@Tx[2],30);
  346         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
  347          eval(shift(@insns));
  348          eval(shift(@insns));
  349          eval(shift(@insns));
  350          eval(shift(@insns));
  351
  352         &pslld  (@Tx[1],2);
  353         &pxor   (@X[0],@Tx[2]);
  354          eval(shift(@insns));
  355          eval(shift(@insns));
  356           &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
  357          eval(shift(@insns));
  358          eval(shift(@insns));
  359
  360         &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
  361
  362          foreach (@insns) { eval; }     # remaining instructions [if any]
  363
  364   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
  365                 push(@Tx,shift(@Tx));
  366 }
  367
      # Emits the SIMD code that produces four further message-schedule
      # words (used for words 32..79), interleaved with four scalar rounds.
      #   $body - code ref returning the Perl snippets for ONE round; called
      #           four times.  Rotate snippets are recognised by matching
      #           /&ro[rl]/ so that one of them can be held back a slot.
      # Side effects: appends to $code, stores X[]+K for the previous group
      # on the stack, either keeps the current K constant or loads the next
      # one when $Xi is a multiple of 5, advances $Xi and rotates @X and @Tx.
  368 sub Xupdate_ssse3_32_79()
  369 { use integer;
  370   my $body = shift;
  371   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
  372   my ($a,$b,$c,$d,$e);
  373
  374         &pshufd (@Tx[0],@X[-2&7],0xee)  if ($Xi==8);    # was &movdqa   (@Tx[0],@X[-1&7])
  375          eval(shift(@insns));           # body_20_39
  376         &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
  377         &punpcklqdq(@Tx[0],@X[-1&7]);   # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
  378          eval(shift(@insns));
  379          eval(shift(@insns));
  380          eval(shift(@insns));           # rol
  381
  382         &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
  383          eval(shift(@insns));
  384          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
  385         if ($Xi%5) {
  386           &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
  387         } else {                        # ... or load next one
  388           &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
  389         }
  390           &paddd        (@Tx[1],@X[-1&7]);
  391          eval(shift(@insns));           # ror
  392          eval(shift(@insns));
  393
  394         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
  395          eval(shift(@insns));           # body_20_39
  396          eval(shift(@insns));
  397          eval(shift(@insns));
  398          eval(shift(@insns));           # rol
  399
  400         &movdqa (@Tx[0],@X[0]);
  401           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
  402          eval(shift(@insns));
  403          eval(shift(@insns));
  404          eval(shift(@insns));           # ror
  405          eval(shift(@insns));
  406
  407         &pslld  (@X[0],2);
  408          eval(shift(@insns));           # body_20_39
  409          eval(shift(@insns));
  410         &psrld  (@Tx[0],30);
  411          eval(shift(@insns));
  412          eval(shift(@insns));           # rol
  413          eval(shift(@insns));
  414          eval(shift(@insns));
  415          eval(shift(@insns));           # ror
  416          eval(shift(@insns));
  417
  418         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
  419          eval(shift(@insns));           # body_20_39
  420          eval(shift(@insns));
  421           &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19);    # was &movdqa   (@Tx[1],@X[0])
  422          eval(shift(@insns));
  423          eval(shift(@insns));           # rol
  424          eval(shift(@insns));
  425          eval(shift(@insns));
  426          eval(shift(@insns));           # rol
  427          eval(shift(@insns));
  428
  429          foreach (@insns) { eval; }     # remaining instructions
  430
  431   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
  432                 push(@Tx,shift(@Tx));
  433 }
  434
      # Emits the last schedule-related step for the current 64-byte block:
      # adds K to the final four schedule words and stores them on the stack
      # while four scalar rounds run, then tests for end of input.
      #   $body  - code ref returning the Perl snippets for ONE round
      #   second argument - label to jump to when $inp equals $len (the end
      #           of input); taken with a bare "shift" after $body
      # If the jump is not taken, the emitted code goes on to load the next
      # 64 input bytes and byte-swap the first 16 of them.  Resets $Xi to 0
      # and rotates @Tx back by one.
  435 sub Xuplast_ssse3_80()
  436 { use integer;
  437   my $body = shift;
  438   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  439   my ($a,$b,$c,$d,$e);
  440
  441          eval(shift(@insns));
  442           &paddd        (@Tx[1],@X[-1&7]);
  443          eval(shift(@insns));
  444          eval(shift(@insns));
  445          eval(shift(@insns));
  446          eval(shift(@insns));
  447
  448           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
  449
  450          foreach (@insns) { eval; }             # remaining instructions
  451
  452         &cmp    ($inp,$len);
  453         &je     (shift);
  454
  455         unshift(@Tx,pop(@Tx));
  456
  457         &movdqa (@Tx[2],"64($K_XX_XX)");        # pbswap mask
  458         &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
  459         &movdqu (@X[-4&7],"0($inp)");           # load input
  460         &movdqu (@X[-3&7],"16($inp)");
  461         &movdqu (@X[-2&7],"32($inp)");
  462         &movdqu (@X[-1&7],"48($inp)");
  463         &pshufb (@X[-4&7],@Tx[2]);              # byte swap
  464         &add    ($inp,64);
  465
  466   $Xi=0;
  467 }
  468
      # Emits four scalar rounds of the current block while preparing the
      # NEXT block's input: byte-swaps one more 16-byte group, adds K to the
      # group at index $Xi, stores that sum at 16*$Xi(%rsp) and then
      # subtracts K again to restore the register.
      #   $body - code ref returning the Perl snippets for ONE round
      # Advances $Xi; @X and @Tx are not rotated here.
  469 sub Xloop_ssse3()
  470 { use integer;
  471   my $body = shift;
  472   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  473   my ($a,$b,$c,$d,$e);
  474
  475          eval(shift(@insns));
  476          eval(shift(@insns));
  477         &pshufb (@X[($Xi-3)&7],@Tx[2]);
  478          eval(shift(@insns));
  479          eval(shift(@insns));
  480         &paddd  (@X[($Xi-4)&7],@Tx[1]);
  481          eval(shift(@insns));
  482          eval(shift(@insns));
  483          eval(shift(@insns));
  484          eval(shift(@insns));
  485         &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
  486          eval(shift(@insns));
  487          eval(shift(@insns));
  488         &psubd  (@X[($Xi-4)&7],@Tx[1]);
  489
  490         foreach (@insns) { eval; }
  491   $Xi++;
  492 }
  493
      # Emits four scalar rounds with no SIMD work mixed in; used for the
      # final rounds when there is no further input block to prepare.
      #   $body - code ref returning the Perl snippets for ONE round
  494 sub Xtail_ssse3()
  495 { use integer;
  496   my $body = shift;
  497   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
  498   my ($a,$b,$c,$d,$e);
  499
  500         foreach (@insns) { eval; }
  501 }
  502
      # Template for one round of the first group: a list of Perl snippets,
      # each emitting one instruction when eval'ed by the X* subs.  The
      # first snippet also binds $a..$e to the current @V; the last one
      # advances $j and rotates @V and @T for the next round.
  503 my @body_00_19 = (
  504         '($a,$b,$c,$d,$e)=@V;'.
  505         '&$_ror ($b,$j?7:2);',  # $b>>>2
  506         '&xor   (@T[0],$d);',
  507         '&mov   (@T[1],$a);',   # $b for next round
  508
  509         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  510         '&xor   ($b,$c);',      # $c^$d for next round
  511
  512         '&$_rol ($a,5);',
  513         '&add   ($e,@T[0]);',
  514         '&and   (@T[1],$b);',   # ($b&($c^$d)) for next round
  515
  516         '&xor   ($b,$c);',      # restore $b
  517         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  518         );
  519
      # Returns the snippets for one round.  Once $rx has reached 19 the
      # call is forwarded to body_20_39; otherwise $rx is incremented and a
      # copy of @body_00_19 is returned, with '&$aesenc();' appended to one
      # snippet whenever this call ($jj) is one of the evenly spaced slots
      # computed from the 12-per-20 ratio below.
  520 sub body_00_19 () {     # ((c^d)&b)^d
  521     # on start @T[0]=(c^d)&b
  522     return &body_20_39() if ($rx==19); $rx++;
  523
  524     use integer;
  525     my ($k,$n);
  526     my @r=@body_00_19;
  527
  528         $n = scalar(@r);
  529         $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
  530         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
  531         $jj++;
  532
  533     return @r;
  534 }
  535
      # Template for one round of the b^c^d groups, in the same snippet-list
      # format as @body_00_19.  The "if" modifiers inside the strings are
      # evaluated at generation time against the round counter $j, so the
      # same template also serves the transition round 19 and the last
      # round 79.
  536 my @body_20_39 = (
  537         '($a,$b,$c,$d,$e)=@V;'.
  538         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  539         '&xor   (@T[0],$d)      if($j==19);'.
  540         '&xor   (@T[0],$c)      if($j> 19);',   # ($b^$d^$c)
  541         '&mov   (@T[1],$a);',   # $b for next round
  542
  543         '&$_rol ($a,5);',
  544         '&add   ($e,@T[0]);',
  545         '&xor   (@T[1],$c)      if ($j< 79);',  # $b^$d for next round
  546
  547         '&$_ror ($b,7);',       # $b>>>2
  548         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  549         );
  550
      # Returns the snippets for one round.  Once $rx has reached 39 the
      # call is forwarded to body_40_59; otherwise $rx is incremented and a
      # copy of @body_20_39 is returned, with '&$aesenc();' appended at the
      # slots given by the 8-per-20 ratio below -- except on the call that
      # brings $rx to 20.
  551 sub body_20_39 () {     # b^d^c
  552     # on entry @T[0]=b^d
  553     return &body_40_59() if ($rx==39); $rx++;
  554
  555     use integer;
  556     my ($k,$n);
  557     my @r=@body_20_39;
  558
  559         $n = scalar(@r);
  560         $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
  561         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=20);
  562         $jj++;
  563
  564     return @r;
  565 }
  566
      # Template for one round of the third group, in the same snippet-list
      # format as @body_00_19.  The "if" modifiers inside the strings are
      # evaluated at generation time against $j to handle the rounds at
      # either end of the group.
  567 my @body_40_59 = (
  568         '($a,$b,$c,$d,$e)=@V;'.
  569         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
  570         '&and   (@T[0],$c)      if ($j>=40);',  # (b^c)&(c^d)
  571         '&xor   ($c,$d)         if ($j>=40);',  # restore $c
  572
  573         '&$_ror ($b,7);',       # $b>>>2
  574         '&mov   (@T[1],$a);',   # $b for next round
  575         '&xor   (@T[0],$c);',
  576
  577         '&$_rol ($a,5);',
  578         '&add   ($e,@T[0]);',
  579         '&xor   (@T[1],$c)      if ($j==59);'.
  580         '&xor   (@T[1],$b)      if ($j< 59);',  # b^c for next round
  581
  582         '&xor   ($b,$c)         if ($j< 59);',  # c^d for next round
  583         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
  584         );
  585
      # Returns the snippets for one round: increments $rx and returns a
      # copy of @body_40_59 with '&$aesenc();' appended at the slots given
      # by the 12-per-20 ratio below -- except on the call that brings $rx
      # to 40.  Unlike the two generators before it, this one never
      # forwards to another generator.
  586 sub body_40_59 () {     # ((b^c)&(c^d))^c
  587     # on entry @T[0]=(b^c), (c^=d)
  588     $rx++;
  589
  590     use integer;
  591     my ($k,$n);
  592     my @r=@body_40_59;
  593
  594         $n = scalar(@r);
  595         $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
  596         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=40);
  597         $jj++;
  598
  599     return @r;
  600 }
  601 $code.=<<___;
  602 .align  32
  603 .Loop_ssse3:
  604 ___
      # Generate the loop body for one 64-byte block.  Each call below emits
      # four scalar rounds, so the 17 Xupdate/Xuplast calls plus the three
      # Xloop calls make up 80 rounds.  The code ref passed in only names
      # the generator to start from: body_00_19 and body_20_39 forward to
      # their successor once $rx reaches 19 and 39 respectively.
  605         &Xupdate_ssse3_16_31(\&body_00_19);
  606         &Xupdate_ssse3_16_31(\&body_00_19);
  607         &Xupdate_ssse3_16_31(\&body_00_19);
  608         &Xupdate_ssse3_16_31(\&body_00_19);
  609         &Xupdate_ssse3_32_79(\&body_00_19);
  610         &Xupdate_ssse3_32_79(\&body_20_39);
  611         &Xupdate_ssse3_32_79(\&body_20_39);
  612         &Xupdate_ssse3_32_79(\&body_20_39);
  613         &Xupdate_ssse3_32_79(\&body_20_39);
  614         &Xupdate_ssse3_32_79(\&body_20_39);
  615         &Xupdate_ssse3_32_79(\&body_40_59);
  616         &Xupdate_ssse3_32_79(\&body_40_59);
  617         &Xupdate_ssse3_32_79(\&body_40_59);
  618         &Xupdate_ssse3_32_79(\&body_40_59);
  619         &Xupdate_ssse3_32_79(\&body_40_59);
  620         &Xupdate_ssse3_32_79(\&body_20_39);
  621         &Xuplast_ssse3_80(\&body_20_39,".Ldone_ssse3"); # can jump to "done"
  622
      # Snapshot the generator state here: the code after .Ldone_ssse3
      # emits the same last twelve rounds a second time, starting from this
      # state, for the case where there is no further input.
  623                                 $saved_j=$j; @saved_V=@V;
  624                                 $saved_r=$r; @saved_rndkey=@rndkey;
  625
      # Last twelve rounds, overlapped with preparing the next input block.
  626         &Xloop_ssse3(\&body_20_39);
  627         &Xloop_ssse3(\&body_20_39);
  628         &Xloop_ssse3(\&body_20_39);
  629
      # Loop tail: store the fourth ciphertext block, advance $in0 by 64,
      # fold the working variables into the context and loop again.
  630 $code.=<<___;
  631         movups  $iv,48($out,$in0)               # write output
  632         lea     64($in0),$in0
  633
  634         add     0($ctx),$A                      # update context
  635         add     4($ctx),@T[0]
  636         add     8($ctx),$C
  637         add     12($ctx),$D
  638         mov     $A,0($ctx)
  639         add     16($ctx),$E
  640         mov     @T[0],4($ctx)
  641         mov     @T[0],$B                        # magic seed
  642         mov     $C,8($ctx)
  643         mov     $C,@T[1]
  644         mov     $D,12($ctx)
  645         xor     $D,@T[1]
  646         mov     $E,16($ctx)
  647         and     @T[1],@T[0]
  648         jmp     .Loop_ssse3
  649
  650 .Ldone_ssse3:
  651 ___
      # Rewind the generator state to the snapshot taken above and emit the
      # last twelve rounds again, this time without touching further input.
  652                                 $jj=$j=$saved_j; @V=@saved_V;
  653                                 $r=$saved_r;     @rndkey=@saved_rndkey;
  654
  655         &Xtail_ssse3(\&body_20_39);
  656         &Xtail_ssse3(\&body_20_39);
  657         &Xtail_ssse3(\&body_20_39);
  658
      # Exit path: store the last ciphertext block, update the context and
      # write the final chaining value back through the saved IV pointer.
  659 $code.=<<___;
  660         movups  $iv,48($out,$in0)               # write output
  661         mov     88(%rsp),$ivp                   # restore $ivp
  662
  663         add     0($ctx),$A                      # update context
  664         add     4($ctx),@T[0]
  665         add     8($ctx),$C
  666         mov     $A,0($ctx)
  667         add     12($ctx),$D
  668         mov     @T[0],4($ctx)
  669         add     16($ctx),$E
  670         mov     $C,8($ctx)
  671         mov     $D,12($ctx)
  672         mov     $E,16($ctx)
  673         movups  $iv,($ivp)                      # write IV
  674 ___
      # Win64 only: restore %xmm6-%xmm15 from the slots filled in the
      # prologue.
  675 $code.=<<___ if ($win64);
  676         movaps  96+0(%rsp),%xmm6
  677         movaps  96+16(%rsp),%xmm7
  678         movaps  96+32(%rsp),%xmm8
  679         movaps  96+48(%rsp),%xmm9
  680         movaps  96+64(%rsp),%xmm10
  681         movaps  96+80(%rsp),%xmm11
  682         movaps  96+96(%rsp),%xmm12
  683         movaps  96+112(%rsp),%xmm13
  684         movaps  96+128(%rsp),%xmm14
  685         movaps  96+144(%rsp),%xmm15
  686 ___
      # Epilogue: step over the local frame, reload the six registers pushed
      # in the prologue (in reverse push order) and return.
  687 $code.=<<___;
  688         lea     `104+($win64?10*16:0)`(%rsp),%rsi
  689         mov     0(%rsi),%r15
  690         mov     8(%rsi),%r14
  691         mov     16(%rsi),%r13
  692         mov     24(%rsi),%r12
  693         mov     32(%rsi),%rbp
  694         mov     40(%rsi),%rbx
  695         lea     48(%rsi),%rsp
  696 .Lepilogue_ssse3:
  697         ret
  698 .size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
  699 ___
 700
 701                                                 if ($stitched_decrypt) {{{
 702 # reset
 703 ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 704 $j=$jj=$r=$sn=$rx=0;
 705 $Xi=4;
  706
      # Schedule of AES-CBC decryption steps for four blocks at a time,
      # indexed by the generator call counter $rx: the body_*_dec generators
      # prepend entry $rx to the round they return, and undef entries mark
      # rounds that get no AES instruction.  The table holds the loads and
      # round-0 xor, 13 groups of four aesdec (one per round key), and
      # finally aesdeclast, the CBC xor with the previous ciphertext and
      # the stores.  $key is addressed with a -112 bias because the code
      # that sets it up uses "lea 112($key)".
  707 my @aes256_dec = (
  708         '&movdqu($inout0,"0x00($in0)");',
  709         '&movdqu($inout1,"0x10($in0)"); &pxor   ($inout0,$rndkey0);',
  710         '&movdqu($inout2,"0x20($in0)"); &pxor   ($inout1,$rndkey0);',
  711         '&movdqu($inout3,"0x30($in0)"); &pxor   ($inout2,$rndkey0);',
  712
  713         '&pxor  ($inout3,$rndkey0);     &movups ($rndkey0,"16-112($key)");',
  714         '&movaps("64(%rsp)",@X[2]);',   # save IV, originally @X[3]
  715         undef,undef
  716         );
      # 13 middle rounds; the extra undef pairs stretch the schedule so that
      # some SHA1 rounds carry no AES work.
  717 for ($i=0;$i<13;$i++) {
  718     push (@aes256_dec,(
  719         '&aesdec        ($inout0,$rndkey0);',
  720         '&aesdec        ($inout1,$rndkey0);',
  721         '&aesdec        ($inout2,$rndkey0);',
  722         '&aesdec        ($inout3,$rndkey0);     &movups($rndkey0,"'.(16*($i+2)-112).'($key)");'
  723         ));
  724     push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
  725     push (@aes256_dec,(undef,undef))    if ($i==5);
  726 }
      # Final round, CBC chaining xor and output stores; round key 0 is
      # reloaded into $rndkey0 along the way.
  727 push(@aes256_dec,(
  728         '&aesdeclast    ($inout0,$rndkey0);     &movups (@X[0],"0x00($in0)");',
  729         '&aesdeclast    ($inout1,$rndkey0);     &movups (@X[1],"0x10($in0)");',
  730         '&aesdeclast    ($inout2,$rndkey0);     &movups (@X[2],"0x20($in0)");',
  731         '&aesdeclast    ($inout3,$rndkey0);     &movups (@X[3],"0x30($in0)");',
  732
  733         '&xorps         ($inout0,"64(%rsp)");   &movdqu ($rndkey0,"-112($key)");',
  734         '&xorps         ($inout1,@X[0]);        &movups ("0x00($out,$in0)",$inout0);',
  735         '&xorps         ($inout2,@X[1]);        &movups ("0x10($out,$in0)",$inout1);',
  736         '&xorps         ($inout3,@X[2]);        &movups ("0x20($out,$in0)",$inout2);',
  737
  738         '&movups        ("0x30($out,$in0)",$inout3);'
  739         ));
  740
      # Decrypt flavours of the round generators.  Each returns a copy of
      # the matching SHA1 round template with entry $rx of @aes256_dec (if
      # that entry is defined) prepended as an extra snippet, then advances
      # $rx.  As in the encrypt flavours, the first two forward to their
      # successor once $rx reaches 19 and 39 respectively.
  741 sub body_00_19_dec () { # ((c^d)&b)^d
  742     # on start @T[0]=(c^d)&b
  743     return &body_20_39_dec() if ($rx==19);
  744
  745     my @r=@body_00_19;
  746
  747         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  748         $rx++;
  749
  750     return @r;
  751 }
  752
  753 sub body_20_39_dec () { # b^d^c
  754     # on entry @T[0]=b^d
  755     return &body_40_59_dec() if ($rx==39);
  756
  757     my @r=@body_20_39;
  758
  759         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  760         $rx++;
  761
  762     return @r;
  763 }
  764
  765 sub body_40_59_dec () { # ((b^c)&(c^d))^c
  766     # on entry @T[0]=(b^c), (c^=d)
  767
  768     my @r=@body_40_59;
  769
  770         unshift (@r,@aes256_dec[$rx])   if (@aes256_dec[$rx]);
  771         $rx++;
  772
  773     return @r;
  774 }
 775
 776 $code.=<<___;
 777 .globl  aesni256_cbc_sha1_dec
 778 .type   aesni256_cbc_sha1_dec,\@abi-omnipotent
 779 .align  32
 780 aesni256_cbc_sha1_dec:
 781         # caller should check for SSSE3 and AES-NI bits
 782         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
 783         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
 784 ___
 785 $code.=<<___ if ($avx);
 786         and     \$`1<<28`,%r11d         # mask AVX bit
 787         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
 788         or      %r11d,%r10d
 789         cmp     \$`1<<28|1<<30`,%r10d
 790         je      aesni256_cbc_sha1_dec_avx
 791 ___
 792 $code.=<<___;
 793         jmp     aesni256_cbc_sha1_dec_ssse3
 794         ret
 795 .size   aesni256_cbc_sha1_dec,.-aesni256_cbc_sha1_dec
 796
 797 .type   aesni256_cbc_sha1_dec_ssse3,\@function,6
 798 .align  32
 799 aesni256_cbc_sha1_dec_ssse3:
 800         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 801         push    %rbx
 802         push    %rbp
 803         push    %r12
 804         push    %r13
 805         push    %r14
 806         push    %r15
 807         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 808 ___
 809 $code.=<<___ if ($win64);
 810         movaps  %xmm6,96+0(%rsp)
 811         movaps  %xmm7,96+16(%rsp)
 812         movaps  %xmm8,96+32(%rsp)
 813         movaps  %xmm9,96+48(%rsp)
 814         movaps  %xmm10,96+64(%rsp)
 815         movaps  %xmm11,96+80(%rsp)
 816         movaps  %xmm12,96+96(%rsp)
 817         movaps  %xmm13,96+112(%rsp)
 818         movaps  %xmm14,96+128(%rsp)
 819         movaps  %xmm15,96+144(%rsp)
 820 .Lprologue_dec_ssse3:
 821 ___
     # Copy the first four arguments into %r12-%r15 and load the IV; the
     # key pointer is advanced by 112 while it is being copied.
 822 $code.=<<___;
 823         mov     $in0,%r12                       # reassign arguments
 824         mov     $out,%r13
 825         mov     $len,%r14
 826         lea     112($key),%r15                  # size optimization
 827         movdqu  ($ivp),@X[3]                    # load IV
 828         #mov    $ivp,88(%rsp)                   # save $ivp
 829 ___
     # From here on the Perl-side names stand for the new registers, so
     # $key is the pointer that was advanced by 112 above.
 830 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
 831 $code.=<<___;
 832         shl     \$6,$len
 833         sub     $in0,$out
 834         add     $inp,$len               # end of input
 835
 836         lea     K_XX_XX(%rip),$K_XX_XX
 837         mov     0($ctx),$A              # load context
 838         mov     4($ctx),$B
 839         mov     8($ctx),$C
 840         mov     12($ctx),$D
 841         mov     $B,@T[0]                # magic seed
 842         mov     16($ctx),$E
 843         mov     $C,@T[1]
 844         xor     $D,@T[1]
 845         and     @T[1],@T[0]
 846
 847         movdqa  64($K_XX_XX),@X[2]      # pbswap mask
 848         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
 849         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
 850         movdqu  16($inp),@X[-3&7]
 851         movdqu  32($inp),@X[-2&7]
 852         movdqu  48($inp),@X[-1&7]
 853         pshufb  @X[2],@X[-4&7]          # byte swap
 854         add     \$64,$inp
 855         pshufb  @X[2],@X[-3&7]
 856         pshufb  @X[2],@X[-2&7]
 857         pshufb  @X[2],@X[-1&7]
 858         paddd   @Tx[1],@X[-4&7]         # add K_00_19
 859         paddd   @Tx[1],@X[-3&7]
 860         paddd   @Tx[1],@X[-2&7]
 861         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
 862         psubd   @Tx[1],@X[-4&7]         # restore X[]
 863         movdqa  @X[-3&7],16(%rsp)
 864         psubd   @Tx[1],@X[-3&7]
 865         movdqa  @X[-2&7],32(%rsp)
 866         psubd   @Tx[1],@X[-2&7]
 867         movdqu  -112($key),$rndkey0     # $key[0]
 868         jmp     .Loop_dec_ssse3
 869
 870 .align  32
 871 .Loop_dec_ssse3:
 872 ___
     # Body of one loop iteration: 17 update calls followed by 3 Xloop
     # calls, each handed a round-body generator.
     # NOTE(review): the *_ssse3 helpers and body_*_dec generators are
     # defined outside this chunk; presumably every call interleaves four
     # SHA1 rounds, as the AVX counterparts do (20 x 4 = 80) -- confirm.
 873         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 874         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 875         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 876         &Xupdate_ssse3_16_31(\&body_00_19_dec);
 877         &Xupdate_ssse3_32_79(\&body_00_19_dec);
 878         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 879         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 880         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 881         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 882         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 883         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 884         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 885         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 886         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 887         &Xupdate_ssse3_32_79(\&body_40_59_dec);
 888         &Xupdate_ssse3_32_79(\&body_20_39_dec);
 889         &Xuplast_ssse3_80(\&body_20_39_dec,".Ldone_dec_ssse3"); # can jump to "done"
 890
     # Snapshot the generator state at the point where the emitted code may
     # branch to .Ldone_dec_ssse3; the copies are written back to $j, @V and
     # $rx when the code after that label is generated.
 891                                 $saved_j=$j;   @saved_V=@V;
 892                                 $saved_rx=$rx;
 893
 894         &Xloop_ssse3(\&body_20_39_dec);
 895         &Xloop_ssse3(\&body_20_39_dec);
 896         &Xloop_ssse3(\&body_20_39_dec);
 897
 898         eval(@aes256_dec[-1]);                  # last store
 899 $code.=<<___;
 900         lea     64($in0),$in0
 901
 902         add     0($ctx),$A                      # update context
 903         add     4($ctx),@T[0]
 904         add     8($ctx),$C
 905         add     12($ctx),$D
 906         mov     $A,0($ctx)
 907         add     16($ctx),$E
 908         mov     @T[0],4($ctx)
 909         mov     @T[0],$B                        # magic seed
 910         mov     $C,8($ctx)
 911         mov     $C,@T[1]
 912         mov     $D,12($ctx)
 913         xor     $D,@T[1]
 914         mov     $E,16($ctx)
 915         and     @T[1],@T[0]
 916         jmp     .Loop_dec_ssse3
 917
 918 .Ldone_dec_ssse3:
 919 ___
     # Rewind the generator state to the snapshot taken right after the
     # Xuplast call, i.e. the point from which .Ldone_dec_ssse3 is reached.
 920                                 $jj=$j=$saved_j; @V=@saved_V;
 921                                 $rx=$saved_rx;
 922
     # Same three round-body groups as the Xloop calls in the loop body, but
     # emitted through Xtail instead.
 923         &Xtail_ssse3(\&body_20_39_dec);
 924         &Xtail_ssse3(\&body_20_39_dec);
 925         &Xtail_ssse3(\&body_20_39_dec);
 926
 927         eval(@aes256_dec[-1]);                  # last store
 928 $code.=<<___;
 929         add     0($ctx),$A                      # update context
 930         add     4($ctx),@T[0]
 931         add     8($ctx),$C
 932         mov     $A,0($ctx)
 933         add     12($ctx),$D
 934         mov     @T[0],4($ctx)
 935         add     16($ctx),$E
 936         mov     $C,8($ctx)
 937         mov     $D,12($ctx)
 938         mov     $E,16($ctx)
 939         movups  @X[3],($ivp)                    # write IV
 940 ___
     # Win64 only: reload %xmm6-%xmm15 from the 96(%rsp) area that the
     # prologue filled.
 941 $code.=<<___ if ($win64);
 942         movaps  96+0(%rsp),%xmm6
 943         movaps  96+16(%rsp),%xmm7
 944         movaps  96+32(%rsp),%xmm8
 945         movaps  96+48(%rsp),%xmm9
 946         movaps  96+64(%rsp),%xmm10
 947         movaps  96+80(%rsp),%xmm11
 948         movaps  96+96(%rsp),%xmm12
 949         movaps  96+112(%rsp),%xmm13
 950         movaps  96+128(%rsp),%xmm14
 951         movaps  96+144(%rsp),%xmm15
 952 ___
     # Common epilogue: step over the local frame, reload the six registers
     # pushed on entry, and release the stack.
 953 $code.=<<___;
 954         lea     `104+($win64?10*16:0)`(%rsp),%rsi
 955         mov     0(%rsi),%r15
 956         mov     8(%rsi),%r14
 957         mov     16(%rsi),%r13
 958         mov     24(%rsi),%r12
 959         mov     32(%rsi),%rbp
 960         mov     40(%rsi),%rbx
 961         lea     48(%rsi),%rsp
 962 .Lepilogue_dec_ssse3:
 963         ret
 964 .size   aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
 965 ___
 966                                                 }}}
 967 $j=$jj=$r=$sn=$rx=0;
 968
     # Everything up to the matching closing brace is generated only when
     # $avx is set.  The generator counters were reset just above.
 969 if ($avx) {
 970 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
 971
     # $Xi counts X[] updates and starts at 4; @X is the rotating window of
     # eight xmm registers, @Tx three vector temporaries.
 972 my $Xi=4;
 973 my @X=map("%xmm$_",(4..7,0..3));
 974 my @Tx=map("%xmm$_",(8..10));
 975 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
 976 my @T=("%esi","%edi");
     # Note the overlap: $inout0..$inout3 (%xmm12-%xmm15, decrypt path) share
     # registers with $iv, $in and @rndkey (encrypt path).
 977 my ($rndkey0,$iv,$in)=map("%xmm$_",(11..13));
 978 my @rndkey=("%xmm14","%xmm15");
 979 my ($inout0,$inout1,$inout2,$inout3)=map("%xmm$_",(12..15));    # for dec
     # $Kx (the third @Tx register) holds the current row of K_XX_XX.
 980 my $Kx=@Tx[2];
 981
     # Rotate helpers for the AVX paths: forward to shld/shrd with the first
     # argument duplicated in front of the original argument list.
 982 my $_rol=sub { &shld(@_[0],@_) };
 983 my $_ror=sub { &shrd(@_[0],@_) };
 984
 985 $code.=<<___;
 986 .type   aesni_cbc_sha1_enc_avx,\@function,6
 987 .align  32
 988 aesni_cbc_sha1_enc_avx:
 989         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 990         #shr    \$6,$len                        # debugging artefact
 991         #jz     .Lepilogue_avx                  # debugging artefact
 992         push    %rbx
 993         push    %rbp
 994         push    %r12
 995         push    %r13
 996         push    %r14
 997         push    %r15
 998         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 999         #mov    $in0,$inp                       # debugging artefact
1000         #lea    64(%rsp),$ctx                   # debugging artefact
1001 ___
     # Win64 only: store %xmm6-%xmm15 in the frame starting at 96(%rsp).
1002 $code.=<<___ if ($win64);
1003         movaps  %xmm6,96+0(%rsp)
1004         movaps  %xmm7,96+16(%rsp)
1005         movaps  %xmm8,96+32(%rsp)
1006         movaps  %xmm9,96+48(%rsp)
1007         movaps  %xmm10,96+64(%rsp)
1008         movaps  %xmm11,96+80(%rsp)
1009         movaps  %xmm12,96+96(%rsp)
1010         movaps  %xmm13,96+112(%rsp)
1011         movaps  %xmm14,96+128(%rsp)
1012         movaps  %xmm15,96+144(%rsp)
1013 .Lprologue_avx:
1014 ___
     # Copy the first four arguments into %r12-%r15, load the IV and park
     # the IV pointer at 88(%rsp).
1015 $code.=<<___;
1016         vzeroall
1017         mov     $in0,%r12                       # reassign arguments
1018         mov     $out,%r13
1019         mov     $len,%r14
1020         mov     $key,%r15
1021         vmovdqu ($ivp),$iv                      # load IV
1022         mov     $ivp,88(%rsp)                   # save $ivp
1023 ___
     # The Perl-side names now stand for %r12-%r15.  $rounds is $ivp with a
     # "d" suffix appended, i.e. the register that carried the IV pointer is
     # reused for the round count once the pointer has been saved above.
1024 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
1025 my $rounds="${ivp}d";
1026 $code.=<<___;
1027         shl     \$6,$len
1028         sub     $in0,$out
1029         mov     240($key),$rounds
1030         add     \$112,$key              # size optimization
1031         add     $inp,$len               # end of input
1032
1033         lea     K_XX_XX(%rip),$K_XX_XX
1034         mov     0($ctx),$A              # load context
1035         mov     4($ctx),$B
1036         mov     8($ctx),$C
1037         mov     12($ctx),$D
1038         mov     $B,@T[0]                # magic seed
1039         mov     16($ctx),$E
1040         mov     $C,@T[1]
1041         xor     $D,@T[1]
1042         and     @T[1],@T[0]
1043
1044         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
1045         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
1046         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
1047         vmovdqu 16($inp),@X[-3&7]
1048         vmovdqu 32($inp),@X[-2&7]
1049         vmovdqu 48($inp),@X[-1&7]
1050         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
1051         add     \$64,$inp
1052         vpshufb @X[2],@X[-3&7],@X[-3&7]
1053         vpshufb @X[2],@X[-2&7],@X[-2&7]
1054         vpshufb @X[2],@X[-1&7],@X[-1&7]
1055         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
1056         vpaddd  $Kx,@X[-3&7],@X[1]
1057         vpaddd  $Kx,@X[-2&7],@X[2]
1058         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
1059         vmovdqa @X[1],16(%rsp)
1060         vmovdqa @X[2],32(%rsp)
1061         vmovups -112($key),$rndkey[1]   # $key[0]
1062         vmovups 16-112($key),$rndkey[0] # forward reference
1063         jmp     .Loop_avx
1064 ___
1065
1066 my $aesenc=sub {
       # Emits one step of the AVX CBC encryption and advances the global
       # step counter $r.  Steps come in groups of ten: $n ($r/10) selects
       # the 16-byte block at 16*$n($in0), $k ($r%10) the step inside it.
       # NOTE(review): the callers are outside this chunk -- presumably the
       # SHA1 round bodies; confirm.
1067   use integer;
1068   my ($n,$k)=($r/10,$r%10);
         # First step of a block: load the input and xor it with the key in
         # $rndkey[1], store the previous block's result (skipped for $n==0),
         # fold the input into $iv, and run one vaesenc with $rndkey[0].
1069     if ($k==0) {
1070       $code.=<<___;
1071         vmovdqu         `16*$n`($in0),$in               # load input
1072         vpxor           $rndkey[1],$in,$in
1073 ___
1074       $code.=<<___ if ($n);
1075         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
1076 ___
1077       $code.=<<___;
1078         vpxor           $in,$iv,$iv
1079         vaesenc         $rndkey[0],$iv,$iv
1080         vmovups         `32+16*$k-112`($key),$rndkey[1]
1081 ___
         # Last step of a block: $rounds below 11 jumps straight to the
         # final round, exactly 11 runs two more vaesenc first, anything
         # larger runs four more.  The "je" uses the flags of the earlier
         # "cmp"; no other flag-setting instruction is emitted in between.
         # After vaesenclast the keys at -112($key) and 16-112($key) are
         # reloaded for the next block.  $sn makes the local label unique.
1082     } elsif ($k==9) {
1083       $sn++;
1084       $code.=<<___;
1085         cmp             \$11,$rounds
1086         jb              .Lvaesenclast$sn
1087         vaesenc         $rndkey[0],$iv,$iv
1088         vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
1089         vaesenc         $rndkey[1],$iv,$iv
1090         vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
1091         je              .Lvaesenclast$sn
1092         vaesenc         $rndkey[0],$iv,$iv
1093         vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
1094         vaesenc         $rndkey[1],$iv,$iv
1095         vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
1096 .Lvaesenclast$sn:
1097         vaesenclast     $rndkey[0],$iv,$iv
1098         vmovups         -112($key),$rndkey[0]
1099         vmovups         16-112($key),$rndkey[1]         # forward reference
1100 ___
         # Middle steps: one vaesenc with the current key while the next
         # round key is fetched into the other register.
1101     } else {
1102       $code.=<<___;
1103         vaesenc         $rndkey[0],$iv,$iv
1104         vmovups         `32+16*$k-112`($key),$rndkey[1]
1105 ___
1106     }
         # Swap the two key registers so that the key just fetched into
         # $rndkey[1] is $rndkey[0] for the next step.
1107     $r++;       unshift(@rndkey,pop(@rndkey));
1108 };
1109
1110 sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
1111 { use integer;
       # Emits the AVX computation of the next X[] vector (built in @X[0])
       # and of the X[]+K vector stored on the stack, interleaved with the
       # instruction strings returned by four calls of the code ref passed
       # as the only argument.  On return $Xi is incremented and @X rotated.
       # "use integer" keeps the $Xi/5 row index below an integer.
1112   my $body = shift;
1113   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
       # Lexicals in scope for the string-eval'ed instructions.
       # NOTE(review): presumably assigned by the round bodies -- confirm.
1114   my ($a,$b,$c,$d,$e);
1115
1116          eval(shift(@insns));
1117          eval(shift(@insns));
1118         &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
1119          eval(shift(@insns));
1120          eval(shift(@insns));
1121
1122           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
1123          eval(shift(@insns));
1124          eval(shift(@insns));
1125         &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
1126          eval(shift(@insns));
1127          eval(shift(@insns));
1128         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
1129          eval(shift(@insns));
1130          eval(shift(@insns));
1131
1132         &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
1133          eval(shift(@insns));
1134          eval(shift(@insns));
1135          eval(shift(@insns));
1136          eval(shift(@insns));
1137
1138         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
1139          eval(shift(@insns));
1140          eval(shift(@insns));
1141           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1142          eval(shift(@insns));
1143          eval(shift(@insns));
1144
1145         &vpsrld (@Tx[0],@X[0],31);
1146          eval(shift(@insns));
1147          eval(shift(@insns));
1148          eval(shift(@insns));
1149          eval(shift(@insns));
1150
1151         &vpslldq(@Tx[1],@X[0],12);              # "X[0]"<<96, extract one dword
1152         &vpaddd (@X[0],@X[0],@X[0]);
1153          eval(shift(@insns));
1154          eval(shift(@insns));
1155          eval(shift(@insns));
1156          eval(shift(@insns));
1157
1158         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
1159         &vpsrld (@Tx[0],@Tx[1],30);
1160          eval(shift(@insns));
1161          eval(shift(@insns));
1162          eval(shift(@insns));
1163          eval(shift(@insns));
1164
1165         &vpslld (@Tx[1],@Tx[1],2);
1166         &vpxor  (@X[0],@X[0],@Tx[0]);
1167          eval(shift(@insns));
1168          eval(shift(@insns));
1169          eval(shift(@insns));
1170          eval(shift(@insns));
1171
1172         &vpxor  (@X[0],@X[0],@Tx[1]);           # "X[0]"^=("X[0]">>96)<<<2
1173          eval(shift(@insns));
1174          eval(shift(@insns));
       # Only when $Xi is a multiple of 5: switch $Kx to row $Xi/5 of K_XX_XX.
1175           &vmovdqa      ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
1176          eval(shift(@insns));
1177          eval(shift(@insns));
1178
1179
1180          foreach (@insns) { eval; }     # remaining instructions [if any]
1181
1182   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
1183 }
1184
1185 sub Xupdate_avx_32_79()
1186 { use integer;
       # Same job as the 16_31 variant, using the xor/rotate-by-2 sequence
       # spelled out in the comments below.  Takes a code ref, interleaves
       # the strings from four calls of it with the vector instructions,
       # then increments $Xi and rotates @X.
1187   my $body = shift;
1188   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
       # Lexicals in scope for the string-eval'ed instructions.
       # NOTE(review): presumably assigned by the round bodies -- confirm.
1189   my ($a,$b,$c,$d,$e);
1190
1191         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
1192         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
1193          eval(shift(@insns));           # body_20_39
1194          eval(shift(@insns));
1195          eval(shift(@insns));
1196          eval(shift(@insns));           # rol
1197
1198         &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
1199          eval(shift(@insns));
       # Take one more scalar instruction here unless the next queued string
       # is a rotate (matches "&rol"/"&ror").
1200          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
1201           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
       # Only when $Xi is a multiple of 5: switch $Kx to row $Xi/5 of K_XX_XX.
1202           &vmovdqa      ($Kx,eval(16*($Xi/5))."($K_XX_XX)")     if ($Xi%5==0);
1203          eval(shift(@insns));           # ror
1204          eval(shift(@insns));
1205
1206         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
1207          eval(shift(@insns));           # body_20_39
1208          eval(shift(@insns));
1209          eval(shift(@insns));
1210          eval(shift(@insns));           # rol
1211
1212         &vpsrld (@Tx[0],@X[0],30);
1213           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1214          eval(shift(@insns));
1215          eval(shift(@insns));
1216          eval(shift(@insns));           # ror
1217          eval(shift(@insns));
1218
1219         &vpslld (@X[0],@X[0],2);
1220          eval(shift(@insns));           # body_20_39
1221          eval(shift(@insns));
1222          eval(shift(@insns));
1223          eval(shift(@insns));           # rol
1224          eval(shift(@insns));
1225          eval(shift(@insns));
1226          eval(shift(@insns));           # ror
1227          eval(shift(@insns));
1228
1229         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
1230          eval(shift(@insns));           # body_20_39
1231          eval(shift(@insns));
1232          eval(shift(@insns));
1233          eval(shift(@insns));           # rol
1234          eval(shift(@insns));
1235          eval(shift(@insns));
1236          eval(shift(@insns));           # rol
1237          eval(shift(@insns));
1238
1239          foreach (@insns) { eval; }     # remaining instructions
1240
1241   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
1242 }
1243
1244 sub Xuplast_avx_80()
1245 { use integer;
       # Arguments: a round-body code ref and the label to branch to when
       # the input is exhausted.  Emits the final X[]+K transfer together
       # with four round bodies, then the "$inp reached $len" test, and
       # after it the load of the next 64 input bytes.  Resets $Xi to 0.
1246   my $body = shift;
1247   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals in scope for the string-eval'ed instructions.
       # NOTE(review): presumably assigned by the round bodies -- confirm.
1248   my ($a,$b,$c,$d,$e);
1249
1250          eval(shift(@insns));
1251           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
1252          eval(shift(@insns));
1253          eval(shift(@insns));
1254          eval(shift(@insns));
1255          eval(shift(@insns));
1256
1257           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
1258
1259          foreach (@insns) { eval; }             # remaining instructions
1260
       # "shift" here yields the second argument, the branch target.
1261         &cmp    ($inp,$len);
1262         &je     (shift);
1263
       # Fall-through path: reload the byte-swap mask into @Tx[1] and the
       # first K row into $Kx, fetch four input vectors, byte-swap the first.
1264         &vmovdqa(@Tx[1],"64($K_XX_XX)");        # pbswap mask
1265         &vmovdqa($Kx,"0($K_XX_XX)");            # K_00_19
1266         &vmovdqu(@X[-4&7],"0($inp)");           # load input
1267         &vmovdqu(@X[-3&7],"16($inp)");
1268         &vmovdqu(@X[-2&7],"32($inp)");
1269         &vmovdqu(@X[-1&7],"48($inp)");
1270         &vpshufb(@X[-4&7],@X[-4&7],@Tx[1]);     # byte swap
1271         &add    ($inp,64);
1272
1273   $Xi=0;
1274 }
1275
1276 sub Xloop_avx()
1277 { use integer;
       # Takes a round-body code ref.  Interleaves four round bodies with the
       # preparation of already-loaded input: byte-swap vector ($Xi-3)&7 with
       # the mask in @Tx[1], add $Kx to vector ($Xi-4)&7 and store the sum
       # at 16*$Xi(%rsp).  Increments $Xi; @X is not rotated here.
1278   my $body = shift;
1279   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals in scope for the string-eval'ed instructions.
       # NOTE(review): presumably assigned by the round bodies -- confirm.
1280   my ($a,$b,$c,$d,$e);
1281
1282          eval(shift(@insns));
1283          eval(shift(@insns));
1284         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@Tx[1]);
1285          eval(shift(@insns));
1286          eval(shift(@insns));
1287         &vpaddd (@Tx[0],@X[($Xi-4)&7],$Kx);
1288          eval(shift(@insns));
1289          eval(shift(@insns));
1290          eval(shift(@insns));
1291          eval(shift(@insns));
1292         &vmovdqa(eval(16*$Xi)."(%rsp)",@Tx[0]); # X[]+K xfer to IALU
1293          eval(shift(@insns));
1294          eval(shift(@insns));
1295
1296         foreach (@insns) { eval; }
1297   $Xi++;
1298 }
1299
1300 sub Xtail_avx()
1301 { use integer;
       # Takes a round-body code ref and emits the strings from four calls
       # of it back to back, with no vector instructions in between.
1302   my $body = shift;
1303   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
       # Lexicals in scope for the string-eval'ed instructions.
       # NOTE(review): presumably assigned by the round bodies -- confirm.
1304   my ($a,$b,$c,$d,$e);
1305
1306         foreach (@insns) { eval; }
1307 }
1308
1309 $code.=<<___;
1310 .align  32
1311 .Loop_avx:
1312 ___
     # Body of one loop iteration.  Each helper below pulls four round
     # bodies from the generator it is given, so the 17 update calls plus
     # the 3 Xloop calls emit 80 round bodies: 20 from body_00_19, 20 from
     # body_20_39, 20 from body_40_59 and a final 20 from body_20_39.
1313         &Xupdate_avx_16_31(\&body_00_19);
1314         &Xupdate_avx_16_31(\&body_00_19);
1315         &Xupdate_avx_16_31(\&body_00_19);
1316         &Xupdate_avx_16_31(\&body_00_19);
1317         &Xupdate_avx_32_79(\&body_00_19);
1318         &Xupdate_avx_32_79(\&body_20_39);
1319         &Xupdate_avx_32_79(\&body_20_39);
1320         &Xupdate_avx_32_79(\&body_20_39);
1321         &Xupdate_avx_32_79(\&body_20_39);
1322         &Xupdate_avx_32_79(\&body_20_39);
1323         &Xupdate_avx_32_79(\&body_40_59);
1324         &Xupdate_avx_32_79(\&body_40_59);
1325         &Xupdate_avx_32_79(\&body_40_59);
1326         &Xupdate_avx_32_79(\&body_40_59);
1327         &Xupdate_avx_32_79(\&body_40_59);
1328         &Xupdate_avx_32_79(\&body_20_39);
1329         &Xuplast_avx_80(\&body_20_39,".Ldone_avx");     # can jump to "done"
1330
     # Snapshot the generator state at the point where the emitted code may
     # branch to .Ldone_avx; the copies are written back when the code after
     # that label is generated.  The AES step counter and the key register
     # order are part of the snapshot.
1331                                 $saved_j=$j; @saved_V=@V;
1332                                 $saved_r=$r; @saved_rndkey=@rndkey;
1333
1334         &Xloop_avx(\&body_20_39);
1335         &Xloop_avx(\&body_20_39);
1336         &Xloop_avx(\&body_20_39);
1337
1338 $code.=<<___;
1339         vmovups $iv,48($out,$in0)               # write output
1340         lea     64($in0),$in0
1341
1342         add     0($ctx),$A                      # update context
1343         add     4($ctx),@T[0]
1344         add     8($ctx),$C
1345         add     12($ctx),$D
1346         mov     $A,0($ctx)
1347         add     16($ctx),$E
1348         mov     @T[0],4($ctx)
1349         mov     @T[0],$B                        # magic seed
1350         mov     $C,8($ctx)
1351         mov     $C,@T[1]
1352         mov     $D,12($ctx)
1353         xor     $D,@T[1]
1354         mov     $E,16($ctx)
1355         and     @T[1],@T[0]
1356         jmp     .Loop_avx
1357
1358 .Ldone_avx:
1359 ___
     # Rewind the generator state (round counter, working registers, AES
     # step counter, key register order) to the snapshot taken right after
     # the Xuplast call, i.e. the point from which .Ldone_avx is reached.
1360                                 $jj=$j=$saved_j; @V=@saved_V;
1361                                 $r=$saved_r;     @rndkey=@saved_rndkey;
1362
     # Same three round-body groups as the Xloop calls in the loop body, but
     # without any vector work for a following input block.
1363         &Xtail_avx(\&body_20_39);
1364         &Xtail_avx(\&body_20_39);
1365         &Xtail_avx(\&body_20_39);
1366
1367 $code.=<<___;
1368         vmovups $iv,48($out,$in0)               # write output
1369         mov     88(%rsp),$ivp                   # restore $ivp
1370
1371         add     0($ctx),$A                      # update context
1372         add     4($ctx),@T[0]
1373         add     8($ctx),$C
1374         mov     $A,0($ctx)
1375         add     12($ctx),$D
1376         mov     @T[0],4($ctx)
1377         add     16($ctx),$E
1378         mov     $C,8($ctx)
1379         mov     $D,12($ctx)
1380         mov     $E,16($ctx)
1381         vmovups $iv,($ivp)                      # write IV
1382         vzeroall
1383 ___
     # Win64 only: reload %xmm6-%xmm15 from the 96(%rsp) area that the
     # prologue filled.
1384 $code.=<<___ if ($win64);
1385         movaps  96+0(%rsp),%xmm6
1386         movaps  96+16(%rsp),%xmm7
1387         movaps  96+32(%rsp),%xmm8
1388         movaps  96+48(%rsp),%xmm9
1389         movaps  96+64(%rsp),%xmm10
1390         movaps  96+80(%rsp),%xmm11
1391         movaps  96+96(%rsp),%xmm12
1392         movaps  96+112(%rsp),%xmm13
1393         movaps  96+128(%rsp),%xmm14
1394         movaps  96+144(%rsp),%xmm15
1395 ___
     # Common epilogue: step over the local frame, reload the six registers
     # pushed on entry, and release the stack.
1396 $code.=<<___;
1397         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1398         mov     0(%rsi),%r15
1399         mov     8(%rsi),%r14
1400         mov     16(%rsi),%r13
1401         mov     24(%rsi),%r12
1402         mov     32(%rsi),%rbp
1403         mov     40(%rsi),%rbx
1404         lea     48(%rsi),%rsp
1405 .Lepilogue_avx:
1406         ret
1407 .size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1408 ___
1409
1410                                                 if ($stiched_decrypt) {{{
1411 # reset
1412 ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1413
1414 $j=$jj=$r=$sn=$rx=0;
1415 $Xi=4;
1416
     # @aes256_dec is a list of instruction strings (plus undef slots) that
     # decrypt four 16-byte blocks: xor with the key held in $rndkey0, 13
     # vaesdec passes, vaesdeclast, then the CBC xor and the stores.  The
     # strings are single-quoted, so @X[...], $inout*, $key and friends are
     # resolved when a string is eval'ed, not here.
     # NOTE(review): the consumer is outside this chunk; the undef entries
     # presumably are empty scheduling slots -- confirm.
1417 @aes256_dec = (
1418         '&vpxor ($inout0,$rndkey0,"0x00($in0)");',
1419         '&vpxor ($inout1,$rndkey0,"0x10($in0)");',
1420         '&vpxor ($inout2,$rndkey0,"0x20($in0)");',
1421         '&vpxor ($inout3,$rndkey0,"0x30($in0)");',
1422
1423         '&vmovups($rndkey0,"16-112($key)");',
1424         '&vmovups("64(%rsp)",@X[2]);',          # save IV, originally @X[3]
1425         undef,undef
1426         );
     # Each pass runs vaesdec on all four blocks; the fourth string also
     # fetches the round key for the following pass.
1427 for ($i=0;$i<13;$i++) {
1428     push (@aes256_dec,(
1429         '&vaesdec       ($inout0,$inout0,$rndkey0);',
1430         '&vaesdec       ($inout1,$inout1,$rndkey0);',
1431         '&vaesdec       ($inout2,$inout2,$rndkey0);',
1432         '&vaesdec       ($inout3,$inout3,$rndkey0);     &vmovups($rndkey0,"'.(16*($i+2)-112).'($key)");'
1433         ));
1434     push (@aes256_dec,(undef,undef))    if (($i>=3 && $i<=5) || $i>=11);
1435     push (@aes256_dec,(undef,undef))    if ($i==5);
1436 }
     # Final round plus CBC chaining: the four input blocks are reloaded
     # into @X[0..3]; block 0 is xored with the value spilled to 64(%rsp),
     # blocks 1-3 with @X[0..2].  The last array element is the store of
     # the fourth block, which the callers eval separately as "last store".
1437 push(@aes256_dec,(
1438         '&vaesdeclast   ($inout0,$inout0,$rndkey0);     &vmovups(@X[0],"0x00($in0)");',
1439         '&vaesdeclast   ($inout1,$inout1,$rndkey0);     &vmovups(@X[1],"0x10($in0)");',
1440         '&vaesdeclast   ($inout2,$inout2,$rndkey0);     &vmovups(@X[2],"0x20($in0)");',
1441         '&vaesdeclast   ($inout3,$inout3,$rndkey0);     &vmovups(@X[3],"0x30($in0)");',
1442
1443         '&vxorps        ($inout0,$inout0,"64(%rsp)");   &vmovdqu($rndkey0,"-112($key)");',
1444         '&vxorps        ($inout1,$inout1,@X[0]);        &vmovups("0x00($out,$in0)",$inout0);',
1445         '&vxorps        ($inout2,$inout2,@X[1]);        &vmovups("0x10($out,$in0)",$inout1);',
1446         '&vxorps        ($inout3,$inout3,@X[2]);        &vmovups("0x20($out,$in0)",$inout2);',
1447
1448         '&vmovups       ("0x30($out,$in0)",$inout3);'
1449         ));
1450
1451 $code.=<<___;
1452 .type   aesni256_cbc_sha1_dec_avx,\@function,6
1453 .align  32
1454 aesni256_cbc_sha1_dec_avx:
1455         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1456         push    %rbx
1457         push    %rbp
1458         push    %r12
1459         push    %r13
1460         push    %r14
1461         push    %r15
1462         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
1463 ___
     # Win64 only: store %xmm6-%xmm15 in the frame starting at 96(%rsp).
1464 $code.=<<___ if ($win64);
1465         movaps  %xmm6,96+0(%rsp)
1466         movaps  %xmm7,96+16(%rsp)
1467         movaps  %xmm8,96+32(%rsp)
1468         movaps  %xmm9,96+48(%rsp)
1469         movaps  %xmm10,96+64(%rsp)
1470         movaps  %xmm11,96+80(%rsp)
1471         movaps  %xmm12,96+96(%rsp)
1472         movaps  %xmm13,96+112(%rsp)
1473         movaps  %xmm14,96+128(%rsp)
1474         movaps  %xmm15,96+144(%rsp)
1475 .Lprologue_dec_avx:
1476 ___
     # Copy the first four arguments into %r12-%r15 and load the IV; the
     # key pointer is advanced by 112 while it is being copied.
1477 $code.=<<___;
1478         vzeroall
1479         mov     $in0,%r12                       # reassign arguments
1480         mov     $out,%r13
1481         mov     $len,%r14
1482         lea     112($key),%r15                  # size optimization
1483         vmovdqu ($ivp),@X[3]                    # load IV
1484 ___
     # From here on the Perl-side names stand for the new registers, so
     # $key is the pointer that was advanced by 112 above.
1485 ($in0,$out,$len,$key)=map("%r$_",(12..15));     # reassign arguments
1486 $code.=<<___;
1487         shl     \$6,$len
1488         sub     $in0,$out
1489         add     $inp,$len               # end of input
1490
1491         lea     K_XX_XX(%rip),$K_XX_XX
1492         mov     0($ctx),$A              # load context
1493         mov     4($ctx),$B
1494         mov     8($ctx),$C
1495         mov     12($ctx),$D
1496         mov     $B,@T[0]                # magic seed
1497         mov     16($ctx),$E
1498         mov     $C,@T[1]
1499         xor     $D,@T[1]
1500         and     @T[1],@T[0]
1501
1502         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
1503         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
1504         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
1505         vmovdqu 16($inp),@X[-3&7]
1506         vmovdqu 32($inp),@X[-2&7]
1507         vmovdqu 48($inp),@X[-1&7]
1508         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
1509         add     \$64,$inp
1510         vpshufb @X[2],@X[-3&7],@X[-3&7]
1511         vpshufb @X[2],@X[-2&7],@X[-2&7]
1512         vpshufb @X[2],@X[-1&7],@X[-1&7]
1513         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
1514         vpaddd  $Kx,@X[-3&7],@X[1]
1515         vpaddd  $Kx,@X[-2&7],@X[2]
1516         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
1517         vmovdqa @X[1],16(%rsp)
1518         vmovdqa @X[2],32(%rsp)
1519         vmovups -112($key),$rndkey0     # $key[0]
1520         jmp     .Loop_dec_avx
1521
1522 .align  32
1523 .Loop_dec_avx:
1524 ___
     # Body of one loop iteration.  Each helper pulls four round bodies from
     # the generator it is given, so the 17 update calls plus the 3 Xloop
     # calls emit 80 round bodies.
     # NOTE(review): the body_*_dec generators are defined outside this
     # chunk; presumably they weave in the @aes256_dec strings -- confirm.
1525         &Xupdate_avx_16_31(\&body_00_19_dec);
1526         &Xupdate_avx_16_31(\&body_00_19_dec);
1527         &Xupdate_avx_16_31(\&body_00_19_dec);
1528         &Xupdate_avx_16_31(\&body_00_19_dec);
1529         &Xupdate_avx_32_79(\&body_00_19_dec);
1530         &Xupdate_avx_32_79(\&body_20_39_dec);
1531         &Xupdate_avx_32_79(\&body_20_39_dec);
1532         &Xupdate_avx_32_79(\&body_20_39_dec);
1533         &Xupdate_avx_32_79(\&body_20_39_dec);
1534         &Xupdate_avx_32_79(\&body_20_39_dec);
1535         &Xupdate_avx_32_79(\&body_40_59_dec);
1536         &Xupdate_avx_32_79(\&body_40_59_dec);
1537         &Xupdate_avx_32_79(\&body_40_59_dec);
1538         &Xupdate_avx_32_79(\&body_40_59_dec);
1539         &Xupdate_avx_32_79(\&body_40_59_dec);
1540         &Xupdate_avx_32_79(\&body_20_39_dec);
1541         &Xuplast_avx_80(\&body_20_39_dec,".Ldone_dec_avx");     # can jump to "done"
1542
     # Snapshot the generator state at the point where the emitted code may
     # branch to .Ldone_dec_avx; the copies are written back to $j, @V and
     # $rx when the code after that label is generated.
1543                                 $saved_j=$j; @saved_V=@V;
1544                                 $saved_rx=$rx;
1545
1546         &Xloop_avx(\&body_20_39_dec);
1547         &Xloop_avx(\&body_20_39_dec);
1548         &Xloop_avx(\&body_20_39_dec);
1549
1550         eval(@aes256_dec[-1]);                  # last store
1551 $code.=<<___;
1552         lea     64($in0),$in0
1553
1554         add     0($ctx),$A                      # update context
1555         add     4($ctx),@T[0]
1556         add     8($ctx),$C
1557         add     12($ctx),$D
1558         mov     $A,0($ctx)
1559         add     16($ctx),$E
1560         mov     @T[0],4($ctx)
1561         mov     @T[0],$B                        # magic seed
1562         mov     $C,8($ctx)
1563         mov     $C,@T[1]
1564         mov     $D,12($ctx)
1565         xor     $D,@T[1]
1566         mov     $E,16($ctx)
1567         and     @T[1],@T[0]
1568         jmp     .Loop_dec_avx
1569
1570 .Ldone_dec_avx:
1571 ___
     # Rewind the generator state to the snapshot taken right after the
     # Xuplast call, i.e. the point from which .Ldone_dec_avx is reached.
1572                                 $jj=$j=$saved_j; @V=@saved_V;
1573                                 $rx=$saved_rx;
1574
     # Same three round-body groups as the Xloop calls in the loop body, but
     # without any vector work for a following input block.
1575         &Xtail_avx(\&body_20_39_dec);
1576         &Xtail_avx(\&body_20_39_dec);
1577         &Xtail_avx(\&body_20_39_dec);
1578
1579         eval(@aes256_dec[-1]);                  # last store
1580 $code.=<<___;
1581
1582         add     0($ctx),$A                      # update context
1583         add     4($ctx),@T[0]
1584         add     8($ctx),$C
1585         mov     $A,0($ctx)
1586         add     12($ctx),$D
1587         mov     @T[0],4($ctx)
1588         add     16($ctx),$E
1589         mov     $C,8($ctx)
1590         mov     $D,12($ctx)
1591         mov     $E,16($ctx)
1592         vmovups @X[3],($ivp)                    # write IV
1593         vzeroall
1594 ___
     # Win64 only: reload %xmm6-%xmm15 from the 96(%rsp) area that the
     # prologue filled.
1595 $code.=<<___ if ($win64);
1596         movaps  96+0(%rsp),%xmm6
1597         movaps  96+16(%rsp),%xmm7
1598         movaps  96+32(%rsp),%xmm8
1599         movaps  96+48(%rsp),%xmm9
1600         movaps  96+64(%rsp),%xmm10
1601         movaps  96+80(%rsp),%xmm11
1602         movaps  96+96(%rsp),%xmm12
1603         movaps  96+112(%rsp),%xmm13
1604         movaps  96+128(%rsp),%xmm14
1605         movaps  96+144(%rsp),%xmm15
1606 ___
     # Common epilogue: step over the local frame, reload the six registers
     # pushed on entry, and release the stack.
1607 $code.=<<___;
1608         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1609         mov     0(%rsi),%r15
1610         mov     8(%rsi),%r14
1611         mov     16(%rsi),%r13
1612         mov     24(%rsi),%r12
1613         mov     32(%rsi),%rbp
1614         mov     40(%rsi),%rbx
1615         lea     48(%rsi),%rsp
1616 .Lepilogue_dec_avx:
1617         ret
1618 .size   aesni256_cbc_sha1_dec_avx,.-aesni256_cbc_sha1_dec_avx
1619 ___
1620                                                 }}}
1621 }
1622 $code.=<<___;
1623 .align  64
1624 K_XX_XX:
1625 .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1626 .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1627 .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1628 .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1629 .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
1630
1631 .asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1632 .align  64
1633 ___
     # Layout the generators rely on: four 16-byte K rows at offsets 0, 16,
     # 32 and 48 from K_XX_XX (selected as 16*($Xi/5)), followed by the
     # byte-swap mask at offset 64.
1634
1635 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1636 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
     #
     # Win64 only: append the exception handler ssse3_handler, whose C-level
     # signature is given above, together with the .pdata/.xdata records that
     # attach it to aesni_cbc_sha1_enc_ssse3 (and to aesni_cbc_sha1_enc_avx
     # when $avx is set).
     #
     # What the emitted handler does, per its inline comments:
     #   * context->Rip below HandlerData[0] (prologue label): skip to the
     #     common tail with %rax = context->Rax.
     #   * otherwise %rax = context->Rsp; if context->Rip is at or past
     #     HandlerData[1] (epilogue label): skip to the common tail.
     #   * otherwise copy 20 quadwords from 96(%rax) to &context.Xmm6, advance
     #     %rax by 104+10*16, reload %r15,%r14,%r13,%r12,%rbp,%rbx from there
     #     and store them into the context.
     #   * common tail: store %rax as context->Rsp, restore context->Rsi/Rdi,
     #     copy 154 quadwords of the context to disp->ContextRecord, call
     #     RtlVirtualUnwind with UNW_FLAG_NHANDLER, and return 1
     #     (ExceptionContinueSearch).
1637 if ($win64) {
     # Register names substituted for the four handler arguments in the
     # heredoc below.  Only $context and $disp are actually referenced there.
1638 $rec="%rcx";
1639 $frame="%rdx";
1640 $context="%r8";
1641 $disp="%r9";
1642
     # Handler body followed by the first .pdata entry (begin/end/info RVAs of
     # the SSSE3 routine).  The backtick expression `104+10*16` is left in the
     # text here and evaluated by the substitution pass at the end of the file.
1643 $code.=<<___;
1644 .extern __imp_RtlVirtualUnwind
1645 .type   ssse3_handler,\@abi-omnipotent
1646 .align  16
1647 ssse3_handler:
1648         push    %rsi
1649         push    %rdi
1650         push    %rbx
1651         push    %rbp
1652         push    %r12
1653         push    %r13
1654         push    %r14
1655         push    %r15
1656         pushfq
1657         sub     \$64,%rsp
1658
1659         mov     120($context),%rax      # pull context->Rax
1660         mov     248($context),%rbx      # pull context->Rip
1661
1662         mov     8($disp),%rsi           # disp->ImageBase
1663         mov     56($disp),%r11          # disp->HandlerData
1664
1665         mov     0(%r11),%r10d           # HandlerData[0]
1666         lea     (%rsi,%r10),%r10        # prologue label
1667         cmp     %r10,%rbx               # context->Rip<prologue label
1668         jb      .Lcommon_seh_tail
1669
1670         mov     152($context),%rax      # pull context->Rsp
1671
1672         mov     4(%r11),%r10d           # HandlerData[1]
1673         lea     (%rsi,%r10),%r10        # epilogue label
1674         cmp     %r10,%rbx               # context->Rip>=epilogue label
1675         jae     .Lcommon_seh_tail
1676
1677         lea     96(%rax),%rsi
1678         lea     512($context),%rdi      # &context.Xmm6
1679         mov     \$20,%ecx
1680         .long   0xa548f3fc              # cld; rep movsq
1681         lea     `104+10*16`(%rax),%rax  # adjust stack pointer
1682
1683         mov     0(%rax),%r15
1684         mov     8(%rax),%r14
1685         mov     16(%rax),%r13
1686         mov     24(%rax),%r12
1687         mov     32(%rax),%rbp
1688         mov     40(%rax),%rbx
1689         lea     48(%rax),%rax
1690         mov     %rbx,144($context)      # restore context->Rbx
1691         mov     %rbp,160($context)      # restore context->Rbp
1692         mov     %r12,216($context)      # restore context->R12
1693         mov     %r13,224($context)      # restore context->R13
1694         mov     %r14,232($context)      # restore context->R14
1695         mov     %r15,240($context)      # restore context->R15
1696
1697 .Lcommon_seh_tail:
1698         mov     8(%rax),%rdi
1699         mov     16(%rax),%rsi
1700         mov     %rax,152($context)      # restore context->Rsp
1701         mov     %rsi,168($context)      # restore context->Rsi
1702         mov     %rdi,176($context)      # restore context->Rdi
1703
1704         mov     40($disp),%rdi          # disp->ContextRecord
1705         mov     $context,%rsi           # context
1706         mov     \$154,%ecx              # sizeof(CONTEXT)
1707         .long   0xa548f3fc              # cld; rep movsq
1708
1709         mov     $disp,%rsi
1710         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1711         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1712         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1713         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1714         mov     40(%rsi),%r10           # disp->ContextRecord
1715         lea     56(%rsi),%r11           # &disp->HandlerData
1716         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1717         mov     %r10,32(%rsp)           # arg5
1718         mov     %r11,40(%rsp)           # arg6
1719         mov     %r12,48(%rsp)           # arg7
1720         mov     %rcx,56(%rsp)           # arg8, (NULL)
1721         call    *__imp_RtlVirtualUnwind(%rip)
1722
1723         mov     \$1,%eax                # ExceptionContinueSearch
1724         add     \$64,%rsp
1725         popfq
1726         pop     %r15
1727         pop     %r14
1728         pop     %r13
1729         pop     %r12
1730         pop     %rbp
1731         pop     %rbx
1732         pop     %rdi
1733         pop     %rsi
1734         ret
1735 .size   ssse3_handler,.-ssse3_handler
1736
1737 .section        .pdata
1738 .align  4
1739         .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1740         .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
1741         .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
1742 ___
     # Second .pdata entry, for the AVX routine; emitted only when $avx is set.
1743 $code.=<<___ if ($avx);
1744         .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
1745         .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
1746         .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
1747 ___
     # .xdata record for the SSSE3 routine: names ssse3_handler and supplies
     # the prologue/epilogue labels that the handler reads as HandlerData[0]
     # and HandlerData[1].
     # NOTE(review): ".byte 9,0,0,0" is presumably the UNWIND_INFO header
     # (version 1 with the exception-handler flag, no unwind codes) -- confirm
     # against the Win64 unwind-data documentation.
1748 $code.=<<___;
1749 .section        .xdata
1750 .align  8
1751 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
1752         .byte   9,0,0,0
1753         .rva    ssse3_handler
1754         .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
1755 ___
     # Matching .xdata record for the AVX routine (only when $avx is set); it
     # reuses ssse3_handler with the AVX prologue/epilogue labels.
1756 $code.=<<___ if ($avx);
1757 .LSEH_info_aesni_cbc_sha1_enc_avx:
1758         .byte   9,0,0,0
1759         .rva    ssse3_handler
1760         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1761 ___
1762 }
1763
1764 ####################################################################
# rex(\@opcode, $dst, $src)
#
# Append an x86-64 REX prefix byte to the instruction bytes collected so far,
# but only when one of the two register numbers needs it.
#
#   \@opcode  reference to the array of opcode bytes; modified in place
#   $dst      register number placed in the ModR/M reg field by the caller;
#             8 or above sets bit 0x04 (REX.R)
#   $src      register number placed in the ModR/M r/m field by the caller;
#             8 or above sets bit 0x01 (REX.B)
#
# When both numbers are below 8 nothing is appended.  The array is reached
# through a lexical reference rather than by aliasing the *opcode typeglob,
# so no package variable is touched and the sub also compiles under
# "use strict".
sub rex {
    my ($opcode, $dst, $src) = @_;
    my $rex = 0;

    $rex |= 0x04 if ($dst >= 8);
    $rex |= 0x01 if ($src >= 8);

    # 0x40 is the fixed high nibble that marks the byte as a REX prefix.
    push @{$opcode}, $rex | 0x40 if ($rex);
}
1774
# aesni($line)
#
# Hand-assemble one register-to-register AES-NI instruction so the output
# does not depend on the assembler knowing the mnemonic.
#
#   $line    text of the form "aesXXX %xmmS,%xmmD" (AT&T operand order)
#   returns  ".byte\t<comma-separated decimal bytes>" for aesenc, aesenclast,
#            aesdec and aesdeclast; otherwise $line unchanged
#
# Encoding produced: 0x66, optional REX (added by rex() when a register
# number is 8 or above), 0x0f, 0x38, the per-mnemonic opcode byte, then a
# register-direct ModR/M byte with %xmmD in the reg field and %xmmS in r/m.
#
# A line that matches the two-register pattern but carries a mnemonic not in
# the table is returned as is.  The previous "return undef" made the calling
# s///e replace such an instruction with an empty string, silently dropping
# it from the generated assembly.
sub aesni {
    my $line = shift;

    my %opcodelet = (
        "aesenc" => 0xdc,       "aesenclast" => 0xdd,
        "aesdec" => 0xde,       "aesdeclast" => 0xdf
    );

    if ($line =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        # Copy the captures before calling anything else.
        my ($mnemonic, $src, $dst) = ($1, $2, $3);

        # Unknown mnemonic: leave it for the assembler to accept or reject.
        return $line if (!defined($opcodelet{$mnemonic}));

        my @opcode = (0x66);
        rex(\@opcode, $dst, $src);
        push @opcode, 0x0f, 0x38, $opcodelet{$mnemonic};
        push @opcode, 0xc0 | ($src & 7) | (($dst & 7) << 3);    # ModR/M
        return ".byte\t" . join(',', @opcode);
    }
    return $line;
}
1792
# Final pass over the accumulated assembly text, then emit it.

# Evaluate every `...` span as a Perl expression and splice in the result,
# e.g. `104+10*16` becomes 264.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

# Pass each "aes... %xmmN" instruction through aesni().  Only the text up to
# the last %xmmN operand is handed over; the rest of the line (such as a
# trailing comment) is discarded by the substitution.
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;

print $code;

# Buffered output is only flushed for certain at close, so a write failure
# can surface here; fail loudly instead of leaving truncated output behind
# a zero exit status.
close STDOUT or die "error closing STDOUT: $!";