crypto/aes/asm/aesni-sha1-x86_64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9 #
  10 # June 2011
  11 #
  12 # This is an AESNI-CBC+SHA1 "stitch" implementation. The idea, as spelled
  13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
  14 # that since AESNI-CBC encrypt exhibits *very* low instruction-level
  15 # parallelism, interleaving it with another algorithm allows processor
  16 # resources to be utilized better and yields better performance.
  17 # SHA1 instruction sequences(*) are taken from sha1-x86_64.pl and
  18 # AESNI code is weaved into it. Below are performance numbers in
  19 # cycles per processed byte, less is better, for standalone AESNI-CBC
  20 # encrypt, sum of the latter and standalone SHA1, and "stitched"
  21 # subroutine:
  22 #
  23 #               AES-128-CBC     +SHA1           stitch      gain
  24 # Westmere      3.77[+5.5]      9.26            6.58        +41%
  25 # Sandy Bridge  5.05[+5.0(6.2)] 10.06(11.21)    6.09(7.05)  +65%(+59%)
  26 # Ivy Bridge    5.05[+4.6]      9.65            5.54        +74%
  27 # Bulldozer     5.77[+6.0]      11.72           6.37        +84%
  28 #
  29 #               AES-192-CBC
  30 # Westmere      4.51            10.00           6.87        +46%
  31 # Sandy Bridge  6.05            11.06(12.21)    6.11(7.20)  +81%(+70%)
  32 # Ivy Bridge    6.05            10.65           6.07        +75%
  33 # Bulldozer     6.89            12.84           6.96        +84%
  34 #
  35 #               AES-256-CBC
  36 # Westmere      5.25            10.74           7.19        +49%
  37 # Sandy Bridge  7.05            12.06(13.21)    7.12(7.68)  +69%(+72%)
  38 # Ivy Bridge    7.05            11.65           7.12        +64%
  39 # Bulldozer     8.00            13.95           8.25        +69%
  40 #
  41 # (*)   There are two code paths: SSSE3 and AVX. See sha1-586.pl for
  42 #       background information. Above numbers in parentheses are SSSE3
  43 #       results collected on AVX-capable CPU, i.e. apply on OSes that
  44 #       don't support AVX.
  45 #
  46 # Needless to mention that it makes no sense to implement a "stitched"
  47 # *decrypt* subroutine, because *both* AESNI-CBC decrypt and SHA1
  48 # fully utilize parallelism, so stitching would not give any gain
  49 # anyway. Well, there might be some, e.g. because of better cache
  50 # locality... For reference, here are performance results for
  51 # standalone AESNI-CBC decrypt:
  52 #
  53 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
  54 # Westmere      1.31            1.55            1.80
  55 # Sandy Bridge  0.74            0.91            1.09
  56 # Ivy Bridge    0.74            0.90            1.11
  57 # Bulldozer     0.70            0.85            0.99
  58
  59 $flavour = shift;       # 1st argument: flavour, later passed to x86_64-xlate.pl
  60 $output  = shift;       # 2nd argument: output file name
     # A first argument containing a dot is taken to be the output file name
     # and the flavour is left undefined.
  61 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  62
     # Win64 mode is selected by flavour (nasm/masm/mingw64) or by an .asm
     # output suffix.
  63 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  64
     # Locate the translator either next to this script or in ../../perlasm/
     # relative to it; give up if neither file exists.
  65 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  66 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  67 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  68 die "can't locate x86_64-xlate.pl";
  69
     # Generate the AVX code path only if the assembler reports a sufficient
     # version: GNU assembler >= 2.19 (probed through $ENV{CC}), or, for
     # Win64, nasm >= 2.09 or ml64 with major version >= 10.
  70 $avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  71                 =~ /GNU assembler version ([2-9]\.[0-9]+)/ &&
  72            $1>=2.19);
  73 $avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  74            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ &&
  75            $1>=2.09);
  76 $avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  77            `ml64 2>&1` =~ /Version ([0-9]+)\./ &&
  78            $1>=10);
  79
  80 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
     # Everything printed to STDOUT from here on is piped through the
     # x86_64-xlate.pl translator, run with the same perl binary ($^X).
     # The open is checked so that a failure to start the translator is
     # reported instead of being silently ignored.
  81 *STDOUT=*OUT;
  82
  83 # void aesni_cbc_sha1_enc(const void *inp,
  84 #                       void *out,
  85 #                       size_t length,
  86 #                       const AES_KEY *key,
  87 #                       unsigned char *iv,
  88 #                       SHA_CTX *ctx,
  89 #                       const void *in0);
  90
  91 $code.=<<___;           # public entry point: loads two OPENSSL_ia32cap_P words
  92 .text
  93 .extern OPENSSL_ia32cap_P
  94
  95 .globl  aesni_cbc_sha1_enc
  96 .type   aesni_cbc_sha1_enc,\@abi-omnipotent
  97 .align  16
  98 aesni_cbc_sha1_enc:
  99         # caller should check for SSSE3 and AES-NI bits
 100         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
 101         mov     OPENSSL_ia32cap_P+4(%rip),%r11d
 102 ___
     # Emitted only when the assembler supports AVX ($avx): branch to the AVX
     # routine when both bit 28 of the second word and bit 30 of the first
     # word are set.
 103 $code.=<<___ if ($avx);
 104         and     \$`1<<28`,%r11d         # mask AVX bit
 105         and     \$`1<<30`,%r10d         # mask "Intel CPU" bit
 106         or      %r11d,%r10d
 107         cmp     \$`1<<28|1<<30`,%r10d
 108         je      aesni_cbc_sha1_enc_avx
 109 ___
     # Otherwise fall through to an unconditional jump to the SSSE3 routine.
 110 $code.=<<___;
 111         jmp     aesni_cbc_sha1_enc_ssse3
 112         ret
 113 .size   aesni_cbc_sha1_enc,.-aesni_cbc_sha1_enc
 114 ___
 115
 116 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
     # The first six names are the incoming argument registers; $inp (%r10)
     # receives the 7th argument, which the prologue loads from the stack.
 117
 118 my $Xi=4;               # index of the next X[] group to be computed
 119 my @X=map("%xmm$_",(4..7,0..3));        # SIMD registers holding X[] data
 120 my @Tx=map("%xmm$_",(8..10));           # SIMD temporaries
 121 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
 122 my @T=("%esi","%edi");                  # scalar temporaries for the round function
     # Generator state shared by the subs below:
     #   $j  - SHA1 round number, bumped by each eval'ed round string
     #   $jj - number of body_*() calls, drives the aesenc scheduling
     #   $r  - number of $aesenc invocations so far
     #   $sn - serial number for the .Laesenclast labels
     #   $rx - number of body_*() calls, selects the round function hand-over
 123 my $j=0; my $jj=0; my $r=0; my $sn=0; my $rx=0;
 124 my $K_XX_XX="%r11";     # pointer to the K_XX_XX constant table
 125 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
 126 my @rndkey=("%xmm14","%xmm15");
 127
     # This always-true branch overrides the SIMD allocation above: X[] is
     # spread over %xmm4-%xmm11, the temporaries move to %xmm12-%xmm14, and
     # the AES state/keys use %xmm2, %xmm3, %xmm15, %xmm0 and %xmm1.
 128 if (1) {
 129     @X=map("%xmm$_",(4..11));
 130     @Tx=map("%xmm$_",(12..14));
 131     ($iv,$in,$rndkey0)=map("%xmm$_",(2,3,15));
 132     @rndkey=("%xmm0","%xmm1");
 133 }
 134
 135 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
     # Catches calls to undefined subs such as &mov(dst,src) and appends one
     # assembly line to $code. The sub name (package prefix stripped) becomes
     # the mnemonic. The last perl argument is emitted first, followed by the
     # remaining arguments in reverse order, so a destination-first call
     # yields source-first operand order. A last argument that looks numeric
     # gets a "$" prefix to turn it into an immediate.
 136 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
 137   my $arg = pop;
 138     $arg = "\$$arg" if ($arg*1 eq $arg);
 139     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
 140 }
 141
 142 my $_rol=sub { &rol(@_) };      # rotate helpers used by the round strings; no rol/ror
 143 my $_ror=sub { &ror(@_) };      # sub is visible here, so presumably AUTOLOAD emits them
 144
 145 $code.=<<___;           # SSSE3 routine: fetch 7th argument, save registers, make frame
 146 .type   aesni_cbc_sha1_enc_ssse3,\@function,6
 147 .align  16
 148 aesni_cbc_sha1_enc_ssse3:
 149         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 150         #shr    \$6,$len                        # debugging artefact
 151         #jz     .Lepilogue_ssse3                # debugging artefact
 152         push    %rbx
 153         push    %rbp
 154         push    %r12
 155         push    %r13
 156         push    %r14
 157         push    %r15
 158         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 159         #mov    $in0,$inp                       # debugging artefact
 160         #lea    64(%rsp),$ctx                   # debugging artefact
 161 ___
     # Win64 only: save %xmm6-%xmm15 in the extra 10*16 bytes of the frame.
 162 $code.=<<___ if ($win64);
 163         movaps  %xmm6,96+0(%rsp)
 164         movaps  %xmm7,96+16(%rsp)
 165         movaps  %xmm8,96+32(%rsp)
 166         movaps  %xmm9,96+48(%rsp)
 167         movaps  %xmm10,96+64(%rsp)
 168         movaps  %xmm11,96+80(%rsp)
 169         movaps  %xmm12,96+96(%rsp)
 170         movaps  %xmm13,96+112(%rsp)
 171         movaps  %xmm14,96+128(%rsp)
 172         movaps  %xmm15,96+144(%rsp)
 173 .Lprologue_ssse3:
 174 ___
     # Copy the first four arguments to %r12-%r15, load the IV and stash the
     # IV pointer at 88(%rsp).
 175 $code.=<<___;
 176         mov     $in0,%r12                       # reassign arguments
 177         mov     $out,%r13
 178         mov     $len,%r14
 179         mov     $key,%r15
 180         movdqu  ($ivp),$iv                      # load IV
 181         mov     $ivp,88(%rsp)                   # save $ivp
 182 ___
 183 my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
     # $ivp has been saved on the stack, so its 32-bit form is reused to hold
     # the value loaded from 240($key).
 184 my $rounds="${ivp}d";
     # Turn $len into an end-of-input pointer, make $out relative to $in0,
     # load the five SHA1 state words, then byte-swap the first 64-byte block
     # and store X[]+K for the first rounds on the stack.
 185 $code.=<<___;
 186         shl     \$6,$len
 187         sub     $in0,$out
 188         mov     240($key),$rounds
 189         add     $inp,$len               # end of input
 190
 191         lea     K_XX_XX(%rip),$K_XX_XX
 192         mov     0($ctx),$A              # load context
 193         mov     4($ctx),$B
 194         mov     8($ctx),$C
 195         mov     12($ctx),$D
 196         mov     $B,@T[0]                # magic seed
 197         mov     16($ctx),$E
 198         mov     $C,@T[1]
 199         xor     $D,@T[1]
 200         and     @T[1],@T[0]
 201
 202         movdqa  64($K_XX_XX),@X[2]      # pbswap mask
 203         movdqa  0($K_XX_XX),@Tx[1]      # K_00_19
 204         movdqu  0($inp),@X[-4&7]        # load input to %xmm[0-3]
 205         movdqu  16($inp),@X[-3&7]
 206         movdqu  32($inp),@X[-2&7]
 207         movdqu  48($inp),@X[-1&7]
 208         pshufb  @X[2],@X[-4&7]          # byte swap
 209         add     \$64,$inp
 210         pshufb  @X[2],@X[-3&7]
 211         pshufb  @X[2],@X[-2&7]
 212         pshufb  @X[2],@X[-1&7]
 213         paddd   @Tx[1],@X[-4&7]         # add K_00_19
 214         paddd   @Tx[1],@X[-3&7]
 215         paddd   @Tx[1],@X[-2&7]
 216         movdqa  @X[-4&7],0(%rsp)        # X[]+K xfer to IALU
 217         psubd   @Tx[1],@X[-4&7]         # restore X[]
 218         movdqa  @X[-3&7],16(%rsp)
 219         psubd   @Tx[1],@X[-3&7]
 220         movdqa  @X[-2&7],32(%rsp)
 221         psubd   @Tx[1],@X[-2&7]
 222         movups  ($key),$rndkey0         # $key[0]
 223         movups  16($key),$rndkey[0]     # forward reference
 224         jmp     .Loop_ssse3
 225 ___
 226
 227 my $aesenc=sub {
     # Emits one step of the CBC encryption per invocation; the SHA1 round
     # strings call it so the AES steps end up interleaved with SHA1 code.
     # $r counts invocations: $n=$r/10 selects the 16-byte block within the
     # current 64 bytes, $k=$r%10 the step within that block.
 228   use integer;
 229   my ($n,$k)=($r/10,$r%10);
     # Step 0: load plaintext block $n, xor it with round key 0, write out
     # the previous ciphertext block (if any), chain into $iv, do the first
     # aesenc and prefetch the next round key.
 230     if ($k==0) {
 231       $code.=<<___;
 232         movups          `16*$n`($in0),$in               # load input
 233         xorps           $rndkey0,$in
 234 ___
 235       $code.=<<___ if ($n);
 236         movups          $iv,`16*($n-1)`($out,$in0)      # write output
 237 ___
 238       $code.=<<___;
 239         xorps           $in,$iv
 240         aesenc          $rndkey[0],$iv
 241         movups          `32+16*$k`($key),$rndkey[1]
 242 ___
     # Step 9: the flags from comparing $rounds with 11 pick the tail length:
     # below 11 goes straight to aesenclast, equal to 11 runs two more
     # aesenc, above 11 runs four more. Afterwards the key at 16($key) is
     # reloaded for the next block.
 243     } elsif ($k==9) {
 244       $sn++;
 245       $code.=<<___;
 246         cmp             \$11,$rounds
 247         jb              .Laesenclast$sn
 248         movups          `32+16*($k+0)`($key),$rndkey[1]
 249         aesenc          $rndkey[0],$iv
 250         movups          `32+16*($k+1)`($key),$rndkey[0]
 251         aesenc          $rndkey[1],$iv
 252         je              .Laesenclast$sn
 253         movups          `32+16*($k+2)`($key),$rndkey[1]
 254         aesenc          $rndkey[0],$iv
 255         movups          `32+16*($k+3)`($key),$rndkey[0]
 256         aesenc          $rndkey[1],$iv
 257 .Laesenclast$sn:
 258         aesenclast      $rndkey[0],$iv
 259         movups          16($key),$rndkey[1]             # forward reference
 260 ___
     # Steps 1..8: one aesenc with the current key while loading the next one.
 261     } else {
 262       $code.=<<___;
 263         aesenc          $rndkey[0],$iv
 264         movups          `32+16*$k`($key),$rndkey[1]
 265 ___
 266     }
     # Swap the two round-key registers so the key just loaded becomes
     # $rndkey[0] for the next step.
 267     $r++;       unshift(@rndkey,pop(@rndkey));
 268 };
 269
 270 sub Xupdate_ssse3_16_31()               # recall that $Xi starts with 4
     # Emits the SIMD computation of one new X[] group, interleaved with the
     # scalar code of four SHA1 rounds. $body is a code ref returning the
     # perl statement strings for one round; it is called four times and the
     # strings are eval'ed a few at a time between the SIMD instructions.
     # On exit $Xi is advanced and @X/@Tx are rotated by one position.
 271 { use integer;
 272   my $body = shift;
 273   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 274   my ($a,$b,$c,$d,$e);
 275
 276         &movdqa (@X[0],@X[-3&7]);
 277          eval(shift(@insns));
 278          eval(shift(@insns));
 279         &movdqa (@Tx[0],@X[-1&7]);
 280         &palignr(@X[0],@X[-4&7],8);     # compose "X[-14]" in "X[0]"
 281          eval(shift(@insns));
 282          eval(shift(@insns));
 283
 284           &paddd        (@Tx[1],@X[-1&7]);
 285          eval(shift(@insns));
 286          eval(shift(@insns));
 287         &psrldq (@Tx[0],4);             # "X[-3]", 3 dwords
 288          eval(shift(@insns));
 289          eval(shift(@insns));
 290         &pxor   (@X[0],@X[-4&7]);       # "X[0]"^="X[-16]"
 291          eval(shift(@insns));
 292          eval(shift(@insns));
 293
 294         &pxor   (@Tx[0],@X[-2&7]);      # "X[-3]"^"X[-8]"
 295          eval(shift(@insns));
 296          eval(shift(@insns));
 297          eval(shift(@insns));
 298          eval(shift(@insns));
 299
 300         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-3]"^"X[-8]"
 301          eval(shift(@insns));
 302          eval(shift(@insns));
 303           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 304          eval(shift(@insns));
 305          eval(shift(@insns));
 306
 307         &movdqa (@Tx[2],@X[0]);
 308         &movdqa (@Tx[0],@X[0]);
 309          eval(shift(@insns));
 310          eval(shift(@insns));
 311          eval(shift(@insns));
 312          eval(shift(@insns));
 313
 314         &pslldq (@Tx[2],12);            # "X[0]"<<96, extract one dword
 315         &paddd  (@X[0],@X[0]);
 316          eval(shift(@insns));
 317          eval(shift(@insns));
 318          eval(shift(@insns));
 319          eval(shift(@insns));
 320
 321         &psrld  (@Tx[0],31);
 322          eval(shift(@insns));
 323          eval(shift(@insns));
 324         &movdqa (@Tx[1],@Tx[2]);
 325          eval(shift(@insns));
 326          eval(shift(@insns));
 327
 328         &psrld  (@Tx[2],30);
 329         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=1
 330          eval(shift(@insns));
 331          eval(shift(@insns));
 332          eval(shift(@insns));
 333          eval(shift(@insns));
 334
 335         &pslld  (@Tx[1],2);
 336         &pxor   (@X[0],@Tx[2]);
 337          eval(shift(@insns));
 338          eval(shift(@insns));
 339           &movdqa       (@Tx[2],eval(16*(($Xi)/5))."($K_XX_XX)");       # K_XX_XX
 340          eval(shift(@insns));
 341          eval(shift(@insns));
 342
 343         &pxor   (@X[0],@Tx[1]);         # "X[0]"^=("X[0]">>96)<<<2
 344
 345          foreach (@insns) { eval; }     # remaining instructions [if any]
 346
 347   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 348                 push(@Tx,shift(@Tx));
 349 }
 350
 351 sub Xupdate_ssse3_32_79()
     # Emits the SIMD computation of one new X[] group using the form shown
     # in the comments below (X[-32]^X[-16]^X[-28]^X[-6], rotated left by 2),
     # interleaved with four scalar SHA1 rounds obtained from $body. The K
     # constant is carried over in @Tx or reloaded from the table whenever
     # $Xi is a multiple of 5. On exit $Xi is advanced and @X/@Tx rotated.
 352 { use integer;
 353   my $body = shift;
 354   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 355   my ($a,$b,$c,$d,$e);
 356
 357         &movdqa (@Tx[0],@X[-1&7])       if ($Xi==8);
 358          eval(shift(@insns));           # body_20_39
 359         &pxor   (@X[0],@X[-4&7]);       # "X[0]"="X[-32]"^"X[-16]"
 360         &palignr(@Tx[0],@X[-2&7],8);    # compose "X[-6]"
 361          eval(shift(@insns));
 362          eval(shift(@insns));
 363          eval(shift(@insns));           # rol
 364
 365         &pxor   (@X[0],@X[-7&7]);       # "X[0]"^="X[-28]"
 366          eval(shift(@insns));
 367          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);
 368         if ($Xi%5) {
 369           &movdqa       (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
 370         } else {                        # ... or load next one
 371           &movdqa       (@Tx[2],eval(16*($Xi/5))."($K_XX_XX)");
 372         }
 373           &paddd        (@Tx[1],@X[-1&7]);
 374          eval(shift(@insns));           # ror
 375          eval(shift(@insns));
 376
 377         &pxor   (@X[0],@Tx[0]);         # "X[0]"^="X[-6]"
 378          eval(shift(@insns));           # body_20_39
 379          eval(shift(@insns));
 380          eval(shift(@insns));
 381          eval(shift(@insns));           # rol
 382
 383         &movdqa (@Tx[0],@X[0]);
 384           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 385          eval(shift(@insns));
 386          eval(shift(@insns));
 387          eval(shift(@insns));           # ror
 388          eval(shift(@insns));
 389
 390         &pslld  (@X[0],2);
 391          eval(shift(@insns));           # body_20_39
 392          eval(shift(@insns));
 393         &psrld  (@Tx[0],30);
 394          eval(shift(@insns));
 395          eval(shift(@insns));           # rol
 396          eval(shift(@insns));
 397          eval(shift(@insns));
 398          eval(shift(@insns));           # ror
 399          eval(shift(@insns));
 400
 401         &por    (@X[0],@Tx[0]);         # "X[0]"<<<=2
 402          eval(shift(@insns));           # body_20_39
 403          eval(shift(@insns));
 404           &movdqa       (@Tx[1],@X[0])  if ($Xi<19);
 405          eval(shift(@insns));
 406          eval(shift(@insns));           # rol
 407          eval(shift(@insns));
 408          eval(shift(@insns));
 409          eval(shift(@insns));           # rol
 410          eval(shift(@insns));
 411
 412          foreach (@insns) { eval; }     # remaining instructions
 413
 414   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 415                 push(@Tx,shift(@Tx));
 416 }
 417
 418 sub Xuplast_ssse3_80()
     # Emits the final X[]+K transfer of a block together with four scalar
     # rounds from $body, then the end-of-input test: when $inp equals $len
     # the generated code jumps to .Ldone_ssse3. Otherwise it reloads the
     # byte-swap mask and K_00_19, loads the next 64 input bytes and starts
     # byte-swapping them. $Xi is reset to 0 for the Xloop_ssse3 calls.
 419 { use integer;
 420   my $body = shift;
 421   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 422   my ($a,$b,$c,$d,$e);
 423
 424          eval(shift(@insns));
 425           &paddd        (@Tx[1],@X[-1&7]);
 426          eval(shift(@insns));
 427          eval(shift(@insns));
 428          eval(shift(@insns));
 429          eval(shift(@insns));
 430
 431           &movdqa       (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 432
 433          foreach (@insns) { eval; }             # remaining instructions
 434
 435         &cmp    ($inp,$len);
 436         &je     (".Ldone_ssse3");
 437
 438         unshift(@Tx,pop(@Tx));
 439
 440         &movdqa (@X[2],"64($K_XX_XX)");         # pbswap mask
 441         &movdqa (@Tx[1],"0($K_XX_XX)");         # K_00_19
 442         &movdqu (@X[-4&7],"0($inp)");           # load input
 443         &movdqu (@X[-3&7],"16($inp)");
 444         &movdqu (@X[-2&7],"32($inp)");
 445         &movdqu (@X[-1&7],"48($inp)");
 446         &pshufb (@X[-4&7],@X[2]);               # byte swap
 447         &add    ($inp,64);
 448
 449   $Xi=0;
 450 }
 451
 452 sub Xloop_ssse3()
     # Emits four scalar rounds from $body interleaved with preparation of
     # the next input block: byte-swap one X[] register, add K to the
     # previous one, store X[]+K at 16*$Xi(%rsp) and subtract K again to
     # restore X[]. Advances $Xi by one per call.
 453 { use integer;
 454   my $body = shift;
 455   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 456   my ($a,$b,$c,$d,$e);
 457
 458          eval(shift(@insns));
 459          eval(shift(@insns));
 460         &pshufb (@X[($Xi-3)&7],@X[2]);
 461          eval(shift(@insns));
 462          eval(shift(@insns));
 463         &paddd  (@X[($Xi-4)&7],@Tx[1]);
 464          eval(shift(@insns));
 465          eval(shift(@insns));
 466          eval(shift(@insns));
 467          eval(shift(@insns));
 468         &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);  # X[]+K xfer to IALU
 469          eval(shift(@insns));
 470          eval(shift(@insns));
 471         &psubd  (@X[($Xi-4)&7],@Tx[1]);
 472
 473         foreach (@insns) { eval; }
 474   $Xi++;
 475 }
 476
 477 sub Xtail_ssse3()
     # Emits four scalar rounds from $body with no SIMD code interleaved;
     # used on the .Ldone_ssse3 path, where no further input is prepared.
 478 { use integer;
 479   my $body = shift;
 480   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 481   my ($a,$b,$c,$d,$e);
 482
 483         foreach (@insns) { eval; }
 484 }
 485
 486 sub body_00_19 () {     # ((c^d)&b)^d
 487   # on start @T[0]=(c^d)&b
     # Returns the perl statement strings for one SHA1 round; the caller
     # evals them one by one. The call made when $rx==19 is delegated to
     # body_20_39, every other call increments $rx.
 488   return &body_20_39() if ($rx==19); $rx++;
 489
 490   use integer;
 491   my ($k,$n);
 492   my @r=(
 493         '($a,$b,$c,$d,$e)=@V;'.
 494         '&$_ror ($b,$j?7:2);',  # $b>>>2
 495         '&xor   (@T[0],$d);',
 496         '&mov   (@T[1],$a);',   # $b for next round
 497
 498         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 499         '&xor   ($b,$c);',      # $c^$d for next round
 500
 501         '&$_rol ($a,5);',
 502         '&add   ($e,@T[0]);',
 503         '&and   (@T[1],$b);',   # ($b&($c^$d)) for next round
 504
 505         '&xor   ($b,$c);',      # restore $b
 506         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 507         );
     # Append an $aesenc call to one of this round's strings whenever the
     # schedule computed from $jj selects this round.
 508         $n = scalar(@r);
 509         $k = (($jj+1)*12/20)*20*$n/12;  # 12 aesencs per these 20 rounds
 510         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n);
 511         $jj++;
 512     return @r;
 513 }
 514
 515 sub body_20_39 () {     # b^d^c
 516   # on entry @T[0]=b^d
     # Returns the perl statement strings for one SHA1 round; the caller
     # evals them one by one. The call made when $rx==39 is delegated to
     # body_40_59, every other call increments $rx. Some strings carry
     # conditions on $j that are evaluated when the string is eval'ed.
 517   return &body_40_59() if ($rx==39); $rx++;
 518
 519   use integer;
 520   my ($k,$n);
 521   my @r=(
 522         '($a,$b,$c,$d,$e)=@V;'.
 523         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 524         '&xor   (@T[0],$d)      if($j==19);'.
 525         '&xor   (@T[0],$c)      if($j> 19);',   # ($b^$d^$c)
 526         '&mov   (@T[1],$a);',   # $b for next round
 527
 528         '&$_rol ($a,5);',
 529         '&add   ($e,@T[0]);',
 530         '&xor   (@T[1],$c)      if ($j< 79);',  # $b^$d for next round
 531
 532         '&$_ror ($b,7);',       # $b>>>2
 533         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 534         );
     # Append an $aesenc call to one of this round's strings whenever the
     # schedule computed from $jj selects this round, except when $rx is 20.
 535         $n = scalar(@r);
 536         $k = (($jj+1)*8/20)*20*$n/8;    # 8 aesencs per these 20 rounds
 537         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=20);
 538         $jj++;
 539     return @r;
 540 }
 541
 542 sub body_40_59 () {     # ((b^c)&(c^d))^c
 543   # on entry @T[0]=(b^c), (c^=d)
     # Returns the perl statement strings for one SHA1 round; the caller
     # evals them one by one. Every call increments $rx. Some strings carry
     # conditions on $j that are evaluated when the string is eval'ed.
 544   $rx++;
 545
 546   use integer;
 547   my ($k,$n);
 548   my @r=(
 549         '($a,$b,$c,$d,$e)=@V;'.
 550         '&add   ($e,eval(4*($j&15))."(%rsp)");',# X[]+K xfer
 551         '&and   (@T[0],$c)      if ($j>=40);',  # (b^c)&(c^d)
 552         '&xor   ($c,$d)         if ($j>=40);',  # restore $c
 553
 554         '&$_ror ($b,7);',       # $b>>>2
 555         '&mov   (@T[1],$a);',   # $b for next round
 556         '&xor   (@T[0],$c);',
 557
 558         '&$_rol ($a,5);',
 559         '&add   ($e,@T[0]);',
 560         '&xor   (@T[1],$c)      if ($j==59);'.
 561         '&xor   (@T[1],$b)      if ($j< 59);',  # b^c for next round
 562
 563         '&xor   ($b,$c)         if ($j< 59);',  # c^d for next round
 564         '&add   ($e,$a);'       .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
 565         );
     # Append an $aesenc call to one of this round's strings whenever the
     # schedule computed from $jj selects this round, except when $rx is 40.
 566         $n = scalar(@r);
 567         $k=(($jj+1)*12/20)*20*$n/12;    # 12 aesencs per these 20 rounds
 568         @r[$k%$n].='&$aesenc();'        if ($jj==$k/$n && $rx!=40);
 569         $jj++;
 570     return @r;
 571 }
 572 $code.=<<___;           # top of the per-64-byte-block loop
 573 .align  16
 574 .Loop_ssse3:
 575 ___
     # Each call below emits four SHA1 rounds plus the interleaved SIMD and
     # AES code; the order of these calls defines the generated instruction
     # stream.
 576         &Xupdate_ssse3_16_31(\&body_00_19);
 577         &Xupdate_ssse3_16_31(\&body_00_19);
 578         &Xupdate_ssse3_16_31(\&body_00_19);
 579         &Xupdate_ssse3_16_31(\&body_00_19);
 580         &Xupdate_ssse3_32_79(\&body_00_19);
 581         &Xupdate_ssse3_32_79(\&body_20_39);
 582         &Xupdate_ssse3_32_79(\&body_20_39);
 583         &Xupdate_ssse3_32_79(\&body_20_39);
 584         &Xupdate_ssse3_32_79(\&body_20_39);
 585         &Xupdate_ssse3_32_79(\&body_20_39);
 586         &Xupdate_ssse3_32_79(\&body_40_59);
 587         &Xupdate_ssse3_32_79(\&body_40_59);
 588         &Xupdate_ssse3_32_79(\&body_40_59);
 589         &Xupdate_ssse3_32_79(\&body_40_59);
 590         &Xupdate_ssse3_32_79(\&body_40_59);
 591         &Xupdate_ssse3_32_79(\&body_20_39);
 592         &Xuplast_ssse3_80(\&body_20_39);        # can jump to "done"
 593
     # The remaining rounds are generated twice, once here for the looping
     # path and once after .Ldone_ssse3, so the generator state is saved at
     # the point where the two paths diverge.
 594                                 $saved_j=$j; @saved_V=@V;
 595                                 $saved_r=$r; @saved_rndkey=@rndkey;
 596
 597         &Xloop_ssse3(\&body_20_39);
 598         &Xloop_ssse3(\&body_20_39);
 599         &Xloop_ssse3(\&body_20_39);
 600
 601 $code.=<<___;           # end of loop iteration: store last block, update SHA1 context
 602         movups  $iv,48($out,$in0)               # write output
 603         lea     64($in0),$in0
 604
 605         add     0($ctx),$A                      # update context
 606         add     4($ctx),@T[0]
 607         add     8($ctx),$C
 608         add     12($ctx),$D
 609         mov     $A,0($ctx)
 610         add     16($ctx),$E
 611         mov     @T[0],4($ctx)
 612         mov     @T[0],$B                        # magic seed
 613         mov     $C,8($ctx)
 614         mov     $C,@T[1]
 615         mov     $D,12($ctx)
 616         xor     $D,@T[1]
 617         mov     $E,16($ctx)
 618         and     @T[1],@T[0]
 619         jmp     .Loop_ssse3
 620
 621 .align  16
 622 .Ldone_ssse3:
 623 ___
     # Rewind the generator state to the values saved before the Xloop_ssse3
     # calls, so the final rounds can be generated again for the done path.
     # Note that $jj is set from $saved_j as well.
 624                                 $jj=$j=$saved_j; @V=@saved_V;
 625                                 $r=$saved_r;     @rndkey=@saved_rndkey;
 626
 627         &Xtail_ssse3(\&body_20_39);
 628         &Xtail_ssse3(\&body_20_39);
 629         &Xtail_ssse3(\&body_20_39);
 630
 631 $code.=<<___;           # done path: store last block, final SHA1 context and IV
 632         movups  $iv,48($out,$in0)               # write output
 633         mov     88(%rsp),$ivp                   # restore $ivp
 634
 635         add     0($ctx),$A                      # update context
 636         add     4($ctx),@T[0]
 637         add     8($ctx),$C
 638         mov     $A,0($ctx)
 639         add     12($ctx),$D
 640         mov     @T[0],4($ctx)
 641         add     16($ctx),$E
 642         mov     $C,8($ctx)
 643         mov     $D,12($ctx)
 644         mov     $E,16($ctx)
 645         movups  $iv,($ivp)                      # write IV
 646 ___
     # Win64 only: restore %xmm6-%xmm15 from the frame.
 647 $code.=<<___ if ($win64);
 648         movaps  96+0(%rsp),%xmm6
 649         movaps  96+16(%rsp),%xmm7
 650         movaps  96+32(%rsp),%xmm8
 651         movaps  96+48(%rsp),%xmm9
 652         movaps  96+64(%rsp),%xmm10
 653         movaps  96+80(%rsp),%xmm11
 654         movaps  96+96(%rsp),%xmm12
 655         movaps  96+112(%rsp),%xmm13
 656         movaps  96+128(%rsp),%xmm14
 657         movaps  96+144(%rsp),%xmm15
 658 ___
     # Reload the six pushed registers through %rsi in reverse push order,
     # release the frame and return.
 659 $code.=<<___;
 660         lea     `104+($win64?10*16:0)`(%rsp),%rsi
 661         mov     0(%rsi),%r15
 662         mov     8(%rsi),%r14
 663         mov     16(%rsi),%r13
 664         mov     24(%rsi),%r12
 665         mov     32(%rsi),%rbp
 666         mov     40(%rsi),%rbx
 667         lea     48(%rsi),%rsp
 668 .Lepilogue_ssse3:
 669         ret
 670 .size   aesni_cbc_sha1_enc_ssse3,.-aesni_cbc_sha1_enc_ssse3
 671 ___
 672
 673 $j=$jj=$r=$sn=$rx=0;    # restart the generator counters before emitting the next code path
 674
 675 if ($avx) {
 676 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
     # Fresh lexicals for the AVX path, shadowing the SSSE3 ones: X[] lives
     # in %xmm4-%xmm7 and %xmm0-%xmm3, temporaries in %xmm8-%xmm10.
 677
 678 my $Xi=4;
 679 my @X=map("%xmm$_",(4..7,0..3));
 680 my @Tx=map("%xmm$_",(8..10));
 681 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");    # size optimization
 682 my @T=("%esi","%edi");
 683 my ($iv,$in,$rndkey0)=map("%xmm$_",(11..13));
 684 my @rndkey=("%xmm14","%xmm15");
 685 my $Kx=$rndkey0;        # the $rndkey0 register doubles as holder of the current K constant
 686
     # Rotates are emitted as shld/shrd with the register passed as both of
     # the first two operands.
 687 my $_rol=sub { &shld(@_[0],@_) };
 688 my $_ror=sub { &shrd(@_[0],@_) };
 689
 690 $code.=<<___;           # AVX routine: fetch 7th argument, save registers, make frame
 691 .type   aesni_cbc_sha1_enc_avx,\@function,6
 692 .align  16
 693 aesni_cbc_sha1_enc_avx:
 694         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
 695         #shr    \$6,$len                        # debugging artefact
 696         #jz     .Lepilogue_avx                  # debugging artefact
 697         push    %rbx
 698         push    %rbp
 699         push    %r12
 700         push    %r13
 701         push    %r14
 702         push    %r15
 703         lea     `-104-($win64?10*16:0)`(%rsp),%rsp
 704         #mov    $in0,$inp                       # debugging artefact
 705         #lea    64(%rsp),$ctx                   # debugging artefact
 706 ___
     # Win64 only: save %xmm6-%xmm15 in the extra 10*16 bytes of the frame.
 707 $code.=<<___ if ($win64);
 708         movaps  %xmm6,96+0(%rsp)
 709         movaps  %xmm7,96+16(%rsp)
 710         movaps  %xmm8,96+32(%rsp)
 711         movaps  %xmm9,96+48(%rsp)
 712         movaps  %xmm10,96+64(%rsp)
 713         movaps  %xmm11,96+80(%rsp)
 714         movaps  %xmm12,96+96(%rsp)
 715         movaps  %xmm13,96+112(%rsp)
 716         movaps  %xmm14,96+128(%rsp)
 717         movaps  %xmm15,96+144(%rsp)
 718 .Lprologue_avx:
 719 ___
     # Copy the first four arguments to %r12-%r15, load the IV and stash the
     # IV pointer at 88(%rsp).
 720 $code.=<<___;
 721         vzeroall
 722         mov     $in0,%r12                       # reassign arguments
 723         mov     $out,%r13
 724         mov     $len,%r14
 725         mov     $key,%r15
 726         vmovdqu ($ivp),$iv                      # load IV
 727         mov     $ivp,88(%rsp)                   # save $ivp
 728 ___
 729 my ($in0,$out,$len,$key)=map("%r$_",(12..15));  # reassign arguments
     # $ivp has been saved on the stack, so its 32-bit form is reused to hold
     # the value loaded from 240($key).
 730 my $rounds="${ivp}d";
     # Same set-up as in the SSSE3 path, except that $key is advanced by 112
     # (all later key offsets carry a -112 bias) and X[]+K is computed into
     # separate registers, so no subtraction is needed to restore X[].
 731 $code.=<<___;
 732         shl     \$6,$len
 733         sub     $in0,$out
 734         mov     240($key),$rounds
 735         add     \$112,$key              # size optimization
 736         add     $inp,$len               # end of input
 737
 738         lea     K_XX_XX(%rip),$K_XX_XX
 739         mov     0($ctx),$A              # load context
 740         mov     4($ctx),$B
 741         mov     8($ctx),$C
 742         mov     12($ctx),$D
 743         mov     $B,@T[0]                # magic seed
 744         mov     16($ctx),$E
 745         mov     $C,@T[1]
 746         xor     $D,@T[1]
 747         and     @T[1],@T[0]
 748
 749         vmovdqa 64($K_XX_XX),@X[2]      # pbswap mask
 750         vmovdqa 0($K_XX_XX),$Kx         # K_00_19
 751         vmovdqu 0($inp),@X[-4&7]        # load input to %xmm[0-3]
 752         vmovdqu 16($inp),@X[-3&7]
 753         vmovdqu 32($inp),@X[-2&7]
 754         vmovdqu 48($inp),@X[-1&7]
 755         vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
 756         add     \$64,$inp
 757         vpshufb @X[2],@X[-3&7],@X[-3&7]
 758         vpshufb @X[2],@X[-2&7],@X[-2&7]
 759         vpshufb @X[2],@X[-1&7],@X[-1&7]
 760         vpaddd  $Kx,@X[-4&7],@X[0]      # add K_00_19
 761         vpaddd  $Kx,@X[-3&7],@X[1]
 762         vpaddd  $Kx,@X[-2&7],@X[2]
 763         vmovdqa @X[0],0(%rsp)           # X[]+K xfer to IALU
 764         vmovdqa @X[1],16(%rsp)
 765         vmovdqa @X[2],32(%rsp)
 766         vmovups -112($key),$rndkey[1]   # $key[0]
 767         vmovups 16-112($key),$rndkey[0] # forward reference
 768         jmp     .Loop_avx
 769 ___
 770
 771 my $aesenc=sub {
     # AVX flavour of the AES step emitter: one step of the CBC encryption
     # per invocation. $r counts invocations: $n=$r/10 selects the 16-byte
     # block within the current 64 bytes, $k=$r%10 the step within it. All
     # key offsets carry a -112 bias because the prologue advanced $key by
     # 112. Round key 0 is kept in one of the two rotating @rndkey registers.
 772   use integer;
 773   my ($n,$k)=($r/10,$r%10);
     # Step 0: load plaintext block $n, xor it with round key 0, write out
     # the previous ciphertext block (if any), chain into $iv, do the first
     # vaesenc and prefetch the next round key.
 774     if ($k==0) {
 775       $code.=<<___;
 776         vmovups         `16*$n`($in0),$in               # load input
 777         vxorps          $rndkey[1],$in,$in
 778 ___
 779       $code.=<<___ if ($n);
 780         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
 781 ___
 782       $code.=<<___;
 783         vxorps          $in,$iv,$iv
 784         vaesenc         $rndkey[0],$iv,$iv
 785         vmovups         `32+16*$k-112`($key),$rndkey[1]
 786 ___
     # Step 9: the flags from comparing $rounds with 11 pick the tail length:
     # below 11 goes straight to vaesenclast, equal to 11 runs two more
     # vaesenc, above 11 runs four more. Afterwards round keys 0 and 1 are
     # reloaded for the next block.
 787     } elsif ($k==9) {
 788       $sn++;
 789       $code.=<<___;
 790         cmp             \$11,$rounds
 791         jb              .Lvaesenclast$sn
 792         vaesenc         $rndkey[0],$iv,$iv
 793         vmovups         `32+16*($k+0)-112`($key),$rndkey[1]
 794         vaesenc         $rndkey[1],$iv,$iv
 795         vmovups         `32+16*($k+1)-112`($key),$rndkey[0]
 796         je              .Lvaesenclast$sn
 797         vaesenc         $rndkey[0],$iv,$iv
 798         vmovups         `32+16*($k+2)-112`($key),$rndkey[1]
 799         vaesenc         $rndkey[1],$iv,$iv
 800         vmovups         `32+16*($k+3)-112`($key),$rndkey[0]
 801 .Lvaesenclast$sn:
 802         vaesenclast     $rndkey[0],$iv,$iv
 803         vmovups         -112($key),$rndkey[0]
 804         vmovups         16-112($key),$rndkey[1]         # forward reference
 805 ___
     # Steps 1..8: one vaesenc with the current key while loading the next one.
 806     } else {
 807       $code.=<<___;
 808         vaesenc         $rndkey[0],$iv,$iv
 809         vmovups         `32+16*$k-112`($key),$rndkey[1]
 810 ___
 811     }
     # Swap the two round-key registers for the next step.
 812     $r++;       unshift(@rndkey,pop(@rndkey));
 813 };
 814
 815 sub Xupdate_avx_16_31()         # recall that $Xi starts with 4
     # AVX flavour of the X[] update for rounds 16..31: emits the SIMD
     # computation of one new X[] group in three-operand form, interleaved
     # with four scalar SHA1 rounds obtained from $body. The K constant in
     # $Kx is reloaded from the table only when $Xi is a multiple of 5.
     # On exit $Xi is advanced and @X is rotated; @Tx is not rotated here.
 816 { use integer;
 817   my $body = shift;
 818   my @insns = (&$body,&$body,&$body,&$body);    # 40 instructions
     # Lexicals assigned from @V by the eval'ed round strings.
 819   my ($a,$b,$c,$d,$e);
 820
 821          eval(shift(@insns));
 822          eval(shift(@insns));
 823         &vpalignr(@X[0],@X[-3&7],@X[-4&7],8);   # compose "X[-14]" in "X[0]"
 824          eval(shift(@insns));
 825          eval(shift(@insns));
 826
 827           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);
 828          eval(shift(@insns));
 829          eval(shift(@insns));
 830         &vpsrldq(@Tx[0],@X[-1&7],4);            # "X[-3]", 3 dwords
 831          eval(shift(@insns));
 832          eval(shift(@insns));
 833         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"^="X[-16]"
 834          eval(shift(@insns));
 835          eval(shift(@insns));
 836
 837         &vpxor  (@Tx[0],@Tx[0],@X[-2&7]);       # "X[-3]"^"X[-8]"
 838          eval(shift(@insns));
 839          eval(shift(@insns));
 840          eval(shift(@insns));
 841          eval(shift(@insns));
 842
 843         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-3]"^"X[-8]"
 844          eval(shift(@insns));
 845          eval(shift(@insns));
 846           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 847          eval(shift(@insns));
 848          eval(shift(@insns));
 849
 850         &vpsrld (@Tx[0],@X[0],31);
 851          eval(shift(@insns));
 852          eval(shift(@insns));
 853          eval(shift(@insns));
 854          eval(shift(@insns));
 855
 856         &vpslldq(@Tx[2],@X[0],12);              # "X[0]"<<96, extract one dword
 857         &vpaddd (@X[0],@X[0],@X[0]);
 858          eval(shift(@insns));
 859          eval(shift(@insns));
 860          eval(shift(@insns));
 861          eval(shift(@insns));
 862
 863         &vpsrld (@Tx[1],@Tx[2],30);
 864         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=1
 865          eval(shift(@insns));
 866          eval(shift(@insns));
 867          eval(shift(@insns));
 868          eval(shift(@insns));
 869
 870         &vpslld (@Tx[2],@Tx[2],2);
 871         &vpxor  (@X[0],@X[0],@Tx[1]);
 872          eval(shift(@insns));
 873          eval(shift(@insns));
 874          eval(shift(@insns));
 875          eval(shift(@insns));
 876
 877         &vpxor  (@X[0],@X[0],@Tx[2]);           # "X[0]"^=("X[0]">>96)<<<2
 878          eval(shift(@insns));
 879          eval(shift(@insns));
 880           &vmovdqa      ($Kx,eval(16*(($Xi)/5))."($K_XX_XX)")   if ($Xi%5==0);  # K_XX_XX
 881          eval(shift(@insns));
 882          eval(shift(@insns));
 883
 884
 885          foreach (@insns) { eval; }     # remaining instructions [if any]
 886
 887   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 888 }
 889
 890 sub Xupdate_avx_32_79()         # weave one AVX X[] update step (the <<<2 form) into $body's code
 891 { use integer;                  # integer division is relied on by $Xi/5 below
 892   my $body = shift;             # code ref; each call returns a list of instruction strings
 893   my @insns = (&$body,&$body,&$body,&$body);    # 32 to 48 instructions
 894   my ($a,$b,$c,$d,$e);          # in scope for the eval'd strings; presumably assigned by them (confirm in body_*)
 895
 896         &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);  # compose "X[-6]"
 897         &vpxor  (@X[0],@X[0],@X[-4&7]);         # "X[0]"="X[-32]"^"X[-16]"
 898          eval(shift(@insns));           # body_20_39
 899          eval(shift(@insns));
 900          eval(shift(@insns));
 901          eval(shift(@insns));           # rol
 902
 903         &vpxor  (@X[0],@X[0],@X[-7&7]);         # "X[0]"^="X[-28]"
 904          eval(shift(@insns));
 905          eval(shift(@insns))    if (@insns[0] !~ /&ro[rl]/);   # take one more string unless the next one is a &ror/&rol
 906           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);    # previous vector + K, stored to the stack further down
 907           &vmovdqa      ($Kx,eval(16*($Xi/5))."($K_XX_XX)")     if ($Xi%5==0);    # next 16-byte K_XX_XX row, every 5th step
 908          eval(shift(@insns));           # ror
 909          eval(shift(@insns));
 910
 911         &vpxor  (@X[0],@X[0],@Tx[0]);           # "X[0]"^="X[-6]"
 912          eval(shift(@insns));           # body_20_39
 913          eval(shift(@insns));
 914          eval(shift(@insns));
 915          eval(shift(@insns));           # rol
 916
 917         &vpsrld (@Tx[0],@X[0],30);              # top two bits of each dword, OR-ed back in for the <<<2 below
 918           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
 919          eval(shift(@insns));
 920          eval(shift(@insns));
 921          eval(shift(@insns));           # ror
 922          eval(shift(@insns));
 923
 924         &vpslld (@X[0],@X[0],2);
 925          eval(shift(@insns));           # body_20_39
 926          eval(shift(@insns));
 927          eval(shift(@insns));
 928          eval(shift(@insns));           # rol
 929          eval(shift(@insns));
 930          eval(shift(@insns));
 931          eval(shift(@insns));           # ror
 932          eval(shift(@insns));
 933
 934         &vpor   (@X[0],@X[0],@Tx[0]);           # "X[0]"<<<=2
 935          eval(shift(@insns));           # body_20_39
 936          eval(shift(@insns));
 937          eval(shift(@insns));
 938          eval(shift(@insns));           # rol
 939          eval(shift(@insns));
 940          eval(shift(@insns));
 941          eval(shift(@insns));           # rol
 942          eval(shift(@insns));
 943
 944          foreach (@insns) { eval; }     # remaining instructions
 945
 946   $Xi++;        push(@X,shift(@X));     # "rotate" X[]
 947 }
 948
 949 sub Xuplast_avx_80()            # last X[]+K transfer of a block, loop-exit test, then load of the next input block
 950 { use integer;
 951   my $body = shift;             # code ref; each call returns a list of instruction strings
 952   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 953   my ($a,$b,$c,$d,$e);          # in scope for the eval'd strings; presumably assigned by them (confirm in body_*)
 954
 955          eval(shift(@insns));
 956           &vpaddd       (@Tx[1],$Kx,@X[-1&7]);    # last vector + K
 957          eval(shift(@insns));
 958          eval(shift(@insns));
 959          eval(shift(@insns));
 960          eval(shift(@insns));
 961
 962           &vmovdqa      (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
 963
 964          foreach (@insns) { eval; }             # remaining instructions
 965
 966         &cmp    ($inp,$len);                    # branch to .Ldone_avx when $inp equals $len
 967         &je     (".Ldone_avx");
 968
 969         unshift(@Tx,pop(@Tx));                  # rotate the @Tx list by one
 970
 971         &vmovdqa(@X[2],"64($K_XX_XX)");         # pbswap mask
 972         &vmovdqa($Kx,"0($K_XX_XX)");            # K_00_19
 973         &vmovdqu(@X[-4&7],"0($inp)");           # load input
 974         &vmovdqu(@X[-3&7],"16($inp)");
 975         &vmovdqu(@X[-2&7],"32($inp)");
 976         &vmovdqu(@X[-1&7],"48($inp)");
 977         &vpshufb(@X[-4&7],@X[-4&7],@X[2]);      # byte swap
 978         &add    ($inp,64);                      # advance past the 64 bytes just loaded
 979
 980   $Xi=0;                                        # restart the step counter for the next block
 981 }
 982
 983 sub Xloop_avx()                 # byte-swap one freshly loaded input vector and store the previous one + K on the stack
 984 { use integer;
 985   my $body = shift;             # code ref; each call returns a list of instruction strings
 986   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
 987   my ($a,$b,$c,$d,$e);          # in scope for the eval'd strings; presumably assigned by them (confirm in body_*)
 988
 989          eval(shift(@insns));
 990          eval(shift(@insns));
 991         &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);    # byte swap, mask was loaded into @X[2] by Xuplast_avx_80
 992          eval(shift(@insns));
 993          eval(shift(@insns));
 994         &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);          # already swapped vector + K
 995          eval(shift(@insns));
 996          eval(shift(@insns));
 997          eval(shift(@insns));
 998          eval(shift(@insns));
 999         &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);      # X[]+K xfer to IALU
1000          eval(shift(@insns));
1001          eval(shift(@insns));
1002
1003         foreach (@insns) { eval; }      # remaining instructions
1004   $Xi++;
1005 }
1006
1007 sub Xtail_avx()                 # eval four copies of $body's instruction strings with no vector work woven in
1008 { use integer;
1009   my $body = shift;             # code ref; each call returns a list of instruction strings
1010   my @insns = (&$body,&$body,&$body,&$body);    # 32 instructions
1011   my ($a,$b,$c,$d,$e);          # in scope for the eval'd strings; presumably assigned by them (confirm in body_*)
1012
1013         foreach (@insns) { eval; }      # run every string in order
1014 }
1015
1016 $code.=<<___;   # label at the top of the per-block AVX loop
1017 .align  16
1018 .Loop_avx:
1019 ___
1020         &Xupdate_avx_16_31(\&body_00_19);       # every generator call below evals four copies of its body
1021         &Xupdate_avx_16_31(\&body_00_19);
1022         &Xupdate_avx_16_31(\&body_00_19);
1023         &Xupdate_avx_16_31(\&body_00_19);
1024         &Xupdate_avx_32_79(\&body_00_19);       # fifth and last call using body_00_19
1025         &Xupdate_avx_32_79(\&body_20_39);
1026         &Xupdate_avx_32_79(\&body_20_39);
1027         &Xupdate_avx_32_79(\&body_20_39);
1028         &Xupdate_avx_32_79(\&body_20_39);
1029         &Xupdate_avx_32_79(\&body_20_39);
1030         &Xupdate_avx_32_79(\&body_40_59);
1031         &Xupdate_avx_32_79(\&body_40_59);
1032         &Xupdate_avx_32_79(\&body_40_59);
1033         &Xupdate_avx_32_79(\&body_40_59);
1034         &Xupdate_avx_32_79(\&body_40_59);
1035         &Xupdate_avx_32_79(\&body_20_39);
1036         &Xuplast_avx_80(\&body_20_39);  # can jump to "done"
1037         # Snapshot the generator state here: the .Ldone_avx path is emitted later starting from this same point.
1038                                 $saved_j=$j; @saved_V=@V;
1039                                 $saved_r=$r; @saved_rndkey=@rndkey;
1040
1041         &Xloop_avx(\&body_20_39);       # three calls, each byte-swapping one more input vector
1042         &Xloop_avx(\&body_20_39);
1043         &Xloop_avx(\&body_20_39);
1044
1045 $code.=<<___;   # loop tail: store last cipher block, fold the working variables into the context, loop; then the .Ldone_avx label
1046         vmovups $iv,48($out,$in0)               # write output
1047         lea     64($in0),$in0
1048
1049         add     0($ctx),$A                      # update context
1050         add     4($ctx),@T[0]
1051         add     8($ctx),$C
1052         add     12($ctx),$D
1053         mov     $A,0($ctx)
1054         add     16($ctx),$E
1055         mov     @T[0],4($ctx)
1056         mov     @T[0],$B                        # magic seed
1057         mov     $C,8($ctx)
1058         mov     $C,@T[1]
1059         mov     $D,12($ctx)
1060         xor     $D,@T[1]
1061         mov     $E,16($ctx)
1062         and     @T[1],@T[0]
1063         jmp     .Loop_avx
1064
1065 .align  16
1066 .Ldone_avx:
1067 ___
1068                                 $jj=$j=$saved_j; @V=@saved_V;             # rewind to the state saved after Xuplast_avx_80
1069                                 $r=$saved_r;     @rndkey=@saved_rndkey;
1070
1071         &Xtail_avx(\&body_20_39);       # same three body groups as the Xloop_avx calls, without input preparation
1072         &Xtail_avx(\&body_20_39);
1073         &Xtail_avx(\&body_20_39);
1074
1075 $code.=<<___;   # .Ldone_avx tail: last cipher block, final context update, IV write-back, vector register wipe
1076         vmovups $iv,48($out,$in0)               # write output
1077         mov     88(%rsp),$ivp                   # restore $ivp
1078
1079         add     0($ctx),$A                      # update context
1080         add     4($ctx),@T[0]
1081         add     8($ctx),$C
1082         mov     $A,0($ctx)
1083         add     12($ctx),$D
1084         mov     @T[0],4($ctx)
1085         add     16($ctx),$E
1086         mov     $C,8($ctx)
1087         mov     $D,12($ctx)
1088         mov     $E,16($ctx)
1089         vmovups $iv,($ivp)                      # write IV
1090         vzeroall
1091 ___
1092 $code.=<<___ if ($win64);       # Win64 only: reload xmm6-xmm15 from the frame, starting at 96(%rsp)
1093         movaps  96+0(%rsp),%xmm6
1094         movaps  96+16(%rsp),%xmm7
1095         movaps  96+32(%rsp),%xmm8
1096         movaps  96+48(%rsp),%xmm9
1097         movaps  96+64(%rsp),%xmm10
1098         movaps  96+80(%rsp),%xmm11
1099         movaps  96+96(%rsp),%xmm12
1100         movaps  96+112(%rsp),%xmm13
1101         movaps  96+128(%rsp),%xmm14
1102         movaps  96+144(%rsp),%xmm15
1103 ___
1104 $code.=<<___;   # reload r15,r14,r13,r12,rbp,rbx from above the frame (160 bytes further on Win64), reset %rsp, return
1105         lea     `104+($win64?10*16:0)`(%rsp),%rsi
1106         mov     0(%rsi),%r15
1107         mov     8(%rsi),%r14
1108         mov     16(%rsi),%r13
1109         mov     24(%rsi),%r12
1110         mov     32(%rsi),%rbp
1111         mov     40(%rsi),%rbx
1112         lea     48(%rsi),%rsp
1113 .Lepilogue_avx:
1114         ret
1115 .size   aesni_cbc_sha1_enc_avx,.-aesni_cbc_sha1_enc_avx
1116 ___
1117 }
1118 $code.=<<___;   # constant table: four 16-byte rows of replicated K values, the byte-swap mask at offset 64, and an ident string
1119 .align  64
1120 K_XX_XX:
1121 .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1122 .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1123 .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1124 .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1125 .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap mask
1126
1127 .asciz  "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1128 .align  64
1129 ___
1130
1131 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1132 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1133 if ($win64) {           # Win64 only: exception handler plus .pdata/.xdata unwind records
1134 $rec="%rcx";            # handler argument 1
1135 $frame="%rdx";          # handler argument 2
1136 $context="%r8";         # handler argument 3
1137 $disp="%r9";            # handler argument 4
1138
1139 $code.=<<___;           # handler: recover the caller's non-volatile registers from the frame, then call RtlVirtualUnwind
1140 .extern __imp_RtlVirtualUnwind
1141 .type   ssse3_handler,\@abi-omnipotent
1142 .align  16
1143 ssse3_handler:
1144         push    %rsi
1145         push    %rdi
1146         push    %rbx
1147         push    %rbp
1148         push    %r12
1149         push    %r13
1150         push    %r14
1151         push    %r15
1152         pushfq
1153         sub     \$64,%rsp
1154
1155         mov     120($context),%rax      # pull context->Rax
1156         mov     248($context),%rbx      # pull context->Rip
1157
1158         mov     8($disp),%rsi           # disp->ImageBase
1159         mov     56($disp),%r11          # disp->HandlerData
1160
1161         mov     0(%r11),%r10d           # HandlerData[0]
1162         lea     (%rsi,%r10),%r10        # prologue label
1163         cmp     %r10,%rbx               # context->Rip<prologue label
1164         jb      .Lcommon_seh_tail
1165
1166         mov     152($context),%rax      # pull context->Rsp
1167
1168         mov     4(%r11),%r10d           # HandlerData[1]
1169         lea     (%rsi,%r10),%r10        # epilogue label
1170         cmp     %r10,%rbx               # context->Rip>=epilogue label
1171         jae     .Lcommon_seh_tail
1172
1173         lea     96(%rax),%rsi
1174         lea     512($context),%rdi      # &context.Xmm6
1175         mov     \$20,%ecx
1176         .long   0xa548f3fc              # cld; rep movsq
1177         lea     `104+10*16`(%rax),%rax  # adjust stack pointer
1178
1179         mov     0(%rax),%r15
1180         mov     8(%rax),%r14
1181         mov     16(%rax),%r13
1182         mov     24(%rax),%r12
1183         mov     32(%rax),%rbp
1184         mov     40(%rax),%rbx
1185         lea     48(%rax),%rax
1186         mov     %rbx,144($context)      # restore context->Rbx
1187         mov     %rbp,160($context)      # restore context->Rbp
1188         mov     %r12,216($context)      # restore context->R12
1189         mov     %r13,224($context)      # restore context->R13
1190         mov     %r14,232($context)      # restore context->R14
1191         mov     %r15,240($context)      # restore context->R15
1192
1193 .Lcommon_seh_tail:
1194         mov     8(%rax),%rdi
1195         mov     16(%rax),%rsi
1196         mov     %rax,152($context)      # restore context->Rsp
1197         mov     %rsi,168($context)      # restore context->Rsi
1198         mov     %rdi,176($context)      # restore context->Rdi
1199
1200         mov     40($disp),%rdi          # disp->ContextRecord
1201         mov     $context,%rsi           # context
1202         mov     \$154,%ecx              # sizeof(CONTEXT)
1203         .long   0xa548f3fc              # cld; rep movsq
1204
1205         mov     $disp,%rsi
1206         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1207         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1208         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1209         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1210         mov     40(%rsi),%r10           # disp->ContextRecord
1211         lea     56(%rsi),%r11           # &disp->HandlerData
1212         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1213         mov     %r10,32(%rsp)           # arg5
1214         mov     %r11,40(%rsp)           # arg6
1215         mov     %r12,48(%rsp)           # arg7
1216         mov     %rcx,56(%rsp)           # arg8, (NULL)
1217         call    *__imp_RtlVirtualUnwind(%rip)
1218
1219         mov     \$1,%eax                # ExceptionContinueSearch
1220         add     \$64,%rsp
1221         popfq
1222         pop     %r15
1223         pop     %r14
1224         pop     %r13
1225         pop     %r12
1226         pop     %rbp
1227         pop     %rbx
1228         pop     %rdi
1229         pop     %rsi
1230         ret
1231 .size   ssse3_handler,.-ssse3_handler
1232
1233 .section        .pdata
1234 .align  4
1235         .rva    .LSEH_begin_aesni_cbc_sha1_enc_ssse3
1236         .rva    .LSEH_end_aesni_cbc_sha1_enc_ssse3
1237         .rva    .LSEH_info_aesni_cbc_sha1_enc_ssse3
1238 ___
1239 $code.=<<___ if ($avx);  # .pdata entry (begin, end, info) for the AVX routine
1240         .rva    .LSEH_begin_aesni_cbc_sha1_enc_avx
1241         .rva    .LSEH_end_aesni_cbc_sha1_enc_avx
1242         .rva    .LSEH_info_aesni_cbc_sha1_enc_avx
1243 ___
1244 $code.=<<___;            # .xdata info record for the SSSE3 routine: handler plus the two labels it reads as HandlerData[]
1245 .section        .xdata
1246 .align  8
1247 .LSEH_info_aesni_cbc_sha1_enc_ssse3:
1248         .byte   9,0,0,0
1249         .rva    ssse3_handler
1250         .rva    .Lprologue_ssse3,.Lepilogue_ssse3       # HandlerData[]
1251 ___
1252 $code.=<<___ if ($avx);  # same record for the AVX routine; it reuses ssse3_handler
1253 .LSEH_info_aesni_cbc_sha1_enc_avx:
1254         .byte   9,0,0,0
1255         .rva    ssse3_handler
1256         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1257 ___
1258 }
1259
1260 ####################################################################
1261 sub rex {               # append a REX prefix byte to the caller's opcode array when a register number is 8 or above
1262   local *opcode=shift;  # first argument is an array reference; alias @opcode to it so the push below reaches the caller
1263   my ($dst,$src)=@_;    # register numbers: $dst goes to ModR/M.reg, $src to ModR/M.rm (see the caller, aesni)
1264   my $rex=0;
1265
1266     $rex|=0x04                  if($dst>=8);    # REX.R
1267     $rex|=0x01                  if($src>=8);    # REX.B
1268     push @opcode,$rex|0x40      if($rex);       # nothing is pushed when both registers are below 8
1269 }
1270
1271 sub aesni {             # turn "aesenc|aesenclast %xmmN,%xmmM" into an equivalent .byte directive
1272   my $line=shift;       # instruction text, as captured by the substitution at the end of the file
1273   my @opcode=(0x66);    # every encoding starts with the 0x66 prefix
1274
1275     if ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {   # $1 mnemonic, $2 first operand, $3 second operand
1276         my %opcodelet = (
1277                 "aesenc" => 0xdc,       "aesenclast" => 0xdd
1278         );
1279         return undef if (!defined($opcodelet{$1}));     # mnemonic not in the table above
1280         rex(\@opcode,$3,$2);                            # optional REX byte; rex() runs no regex, so $1..$3 stay valid
1281         push @opcode,0x0f,0x38,$opcodelet{$1};
1282         push @opcode,0xc0|($2&7)|(($3&7)<<3);   # ModR/M
1283         return ".byte\t".join(',',@opcode);
1284     }
1285     return $line;       # not a register-to-register form: hand the text back unchanged
1286 }
1287
1288 $code =~ s/\`([^\`]*)\`/eval($1)/gem;           # replace every `...` span in the generated text with its evaluated value
1289 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;        # mnemonics starting with "aes" at a word boundary go through aesni(); text after the last %xmmN is dropped
1290         # Buffered write errors (full disk, closed pipe) only surface at close, so a failed close must abort the build.
1291 print $code;
1292 close STDOUT or die "error closing STDOUT: $!";