x86_64 assembler pack update from HEAD.
authorAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 21:01:21 +0000 (21:01 +0000)
committerAndy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 21:01:21 +0000 (21:01 +0000)
crypto/aes/asm/aes-x86_64.pl
crypto/aes/asm/bsaes-x86_64.pl [new file with mode: 0644]
crypto/aes/asm/vpaes-x86_64.pl [new file with mode: 0644]
crypto/bn/asm/x86_64-gf2m.pl [new file with mode: 0644]
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl [new file with mode: 0755]
crypto/sha/asm/sha512-x86_64.pl
crypto/x86_64cpuid.pl

index 7d09b35..48fa857 100755 (executable)
@@ -588,6 +588,9 @@ $code.=<<___;
 .globl AES_encrypt
 .type  AES_encrypt,\@function,3
 .align 16
+.globl asm_AES_encrypt
+.hidden        asm_AES_encrypt
+asm_AES_encrypt:
 AES_encrypt:
        push    %rbx
        push    %rbp
@@ -1184,6 +1187,9 @@ $code.=<<___;
 .globl AES_decrypt
 .type  AES_decrypt,\@function,3
 .align 16
+.globl asm_AES_decrypt
+.hidden        asm_AES_decrypt
+asm_AES_decrypt:
 AES_decrypt:
        push    %rbx
        push    %rbp
@@ -1648,6 +1654,9 @@ $code.=<<___;
 .type  AES_cbc_encrypt,\@function,6
 .align 16
 .extern        OPENSSL_ia32cap_P
+.globl asm_AES_cbc_encrypt
+.hidden        asm_AES_cbc_encrypt
+asm_AES_cbc_encrypt:
 AES_cbc_encrypt:
        cmp     \$0,%rdx        # check length
        je      .Lcbc_epilogue
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl
new file mode 100644 (file)
index 0000000..edc70fa
--- /dev/null
@@ -0,0 +1,3003 @@
+#!/usr/bin/env perl
+
+###################################################################
+### AES-128 [originally in CTR mode]                           ###
+### bitsliced implementation for Intel Core 2 processors       ###
+### requires support of SSE extensions up to SSSE3             ###
+### Author: Emilia Käsper and Peter Schwabe                   ###
+### Date: 2009-03-19                                           ###
+### Public domain                                              ###
+###                                                            ###
+### See http://homes.esat.kuleuven.be/~ekasper/#software for   ###
+### further information.                                       ###
+###################################################################
+#
+# September 2011.
+#
+# Started as a transliteration of the original code to "perlasm", it
+# has since undergone the following changes:
+#
+# - code was made position-independent;
+# - rounds were folded into a loop resulting in >5x size reduction
+#   from 12.5KB to 2.2KB;
+# - the above was possible thanks to a mixcolumns() modification that
+#   allowed its output to be fed back to aesenc[last]; this was
+#   achieved at the cost of two additional inter-register moves;
+# - some instruction reordering and interleaving;
+# - this module doesn't implement key setup subroutine, instead it
+#   relies on conversion of "conventional" key schedule as returned
+#   by AES_set_encrypt_key (see discussion below);
+# - first and last round keys are treated differently, which allowed
+#   to skip one shiftrows(), reduce bit-sliced key schedule and
+#   speed-up conversion by 22%;
+# - support for 192- and 256-bit keys was added;
+#
+# Resulting performance in CPU cycles spent to encrypt one byte out
+# of 4096-byte buffer with 128-bit key is:
+#
+#              Emilia's        this(*)         difference
+#
+# Core 2       9.30            8.69            +7%
+# Nehalem(**)  7.63            6.98            +9%
+# Atom         17.1            17.4            -2%(***)
+#
+# (*)  Comparison is not completely fair, because "this" is ECB,
+#      i.e. no extra processing such as counter values calculation
+#      and xor-ing input as in Emilia's CTR implementation is
+#      performed. However, the CTR calculations account for no more
+#      than 1% of total time, so the comparison is *rather* fair.
+#
+# (**) Results were collected on Westmere, which is considered to
+#      be equivalent to Nehalem for this code.
+#
+# (***)        Slowdown on Atom is rather strange per se, because original
+#      implementation has a number of 9+-bytes instructions, which
+#      are bad for Atom front-end, and which I eliminated completely.
+#      In attempt to address deterioration sbox() was tested in FP
+#      SIMD "domain" (movaps instead of movdqa, xorps instead of
+#      pxor, etc.). While it resulted in nominal 4% improvement on
+#      Atom, it hurt Westmere by more than a 2x factor.
+#
+# As for the key schedule conversion subroutine: the OpenSSL interface
+# relies on per-invocation on-the-fly conversion. This naturally
+# has impact on performance, especially for short inputs. Conversion
+# time in CPU cycles and its ratio to CPU cycles spent in 8x block
+# function is:
+#
+#              conversion      conversion/8x block
+# Core 2       410             0.37
+# Nehalem      310             0.35
+# Atom         570             0.26
+#
+# The ratio values mean that 128-byte blocks will be processed
+# 21-27% slower, 256-byte blocks - 12-16%, 384-byte blocks - 8-11%,
+# etc. Then keep in mind that input sizes not divisible by 128 are
+# *effectively* slower, especially shortest ones, e.g. consecutive
+# 144-byte blocks are processed 44% slower than one would expect,
+# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
+# it's still faster than ["hyper-threading-safe" code path in]
+# aes-x86_64.pl on all lengths above 64 bytes...
+#
+# October 2011.
+#
+# Add decryption procedure. Performance in CPU cycles spent to decrypt
+# one byte out of 4096-byte buffer with 128-bit key is:
+#
+# Core 2       11.0
+# Nehalem      9.16
+#
+# November 2011.
+#
+# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
+# suboptimal, but XTS is meant to be used with larger blocks...
+#
+#                                              <appro@openssl.org>
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
+my @XMM=map("%xmm$_",(15,0..14));      # best on Atom, +10% over (0..15)
+my $ecb=0;     # suppress unreferenced ECB subroutines, spare some space...
+
+{
+my ($key,$rounds,$const)=("%rax","%r10d","%r11");
+
+sub Sbox {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InBasisChange  (@b);
+       &Inv_GF256      (@b[6,5,0,3,7,1,4,2],@t,@s);
+       &OutBasisChange (@b[7,1,4,2,6,5,0,3]);
+}
+
+sub InBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb 
+my @b=@_[0..7];
+$code.=<<___;
+       pxor    @b[6], @b[5]
+       pxor    @b[1], @b[2]
+       pxor    @b[0], @b[3]
+       pxor    @b[2], @b[6]
+       pxor    @b[0], @b[5]
+
+       pxor    @b[3], @b[6]
+       pxor    @b[7], @b[3]
+       pxor    @b[5], @b[7]
+       pxor    @b[4], @b[3]
+       pxor    @b[5], @b[4]
+       pxor    @b[1], @b[3]
+
+       pxor    @b[7], @b[2]
+       pxor    @b[5], @b[1]
+___
+}
+
+sub OutBasisChange {
+# input in  lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
+my @b=@_[0..7];
+$code.=<<___;
+       pxor    @b[6], @b[0]
+       pxor    @b[4], @b[1]
+       pxor    @b[0], @b[2]
+       pxor    @b[6], @b[4]
+       pxor    @b[1], @b[6]
+
+       pxor    @b[5], @b[1]
+       pxor    @b[3], @b[5]
+       pxor    @b[7], @b[3]
+       pxor    @b[5], @b[7]
+       pxor    @b[5], @b[2]
+
+       pxor    @b[7], @b[4]
+___
+}
+
+sub InvSbox {
+# input in lsb         > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
+# output in lsb        > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
+my @b=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+       &InvInBasisChange       (@b);
+       &Inv_GF256              (@b[5,1,2,6,3,7,0,4],@t,@s);
+       &InvOutBasisChange      (@b[3,7,0,4,5,1,2,6]);
+}
+
+sub InvInBasisChange {         # OutBasisChange in reverse
+my @b=@_[5,1,2,6,3,7,0,4];
+$code.=<<___
+       pxor    @b[7], @b[4]
+
+       pxor    @b[5], @b[7]
+       pxor    @b[5], @b[2]
+       pxor    @b[7], @b[3]
+       pxor    @b[3], @b[5]
+       pxor    @b[5], @b[1]
+
+       pxor    @b[1], @b[6]
+       pxor    @b[0], @b[2]
+       pxor    @b[6], @b[4]
+       pxor    @b[6], @b[0]
+       pxor    @b[4], @b[1]
+___
+}
+
+sub InvOutBasisChange {                # InBasisChange in reverse
+my @b=@_[2,5,7,3,6,1,0,4];
+$code.=<<___;
+       pxor    @b[5], @b[1]
+       pxor    @b[7], @b[2]
+
+       pxor    @b[1], @b[3]
+       pxor    @b[5], @b[4]
+       pxor    @b[5], @b[7]
+       pxor    @b[4], @b[3]
+        pxor   @b[0], @b[5]
+       pxor    @b[7], @b[3]
+        pxor   @b[2], @b[6]
+        pxor   @b[1], @b[2]
+       pxor    @b[3], @b[6]
+
+       pxor    @b[0], @b[3]
+       pxor    @b[6], @b[5]
+___
+}
+
+sub Mul_GF4 {
+#;*************************************************************
+#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
+#;*************************************************************
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+       movdqa  $y0, $t0
+       pxor    $y1, $t0
+       pand    $x0, $t0
+       pxor    $x1, $x0
+       pand    $y0, $x1
+       pand    $y1, $x0
+       pxor    $x1, $x0
+       pxor    $t0, $x1
+___
+}
+
+sub Mul_GF4_N {                                # not used, see next subroutine
+# multiply and scale by N
+my ($x0,$x1,$y0,$y1,$t0)=@_;
+$code.=<<___;
+       movdqa  $y0, $t0
+       pxor    $y1, $t0
+       pand    $x0, $t0
+       pxor    $x1, $x0
+       pand    $y0, $x1
+       pand    $y1, $x0
+       pxor    $x0, $x1
+       pxor    $t0, $x0
+___
+}
+
+sub Mul_GF4_N_GF4 {
+# interleaved Mul_GF4_N and Mul_GF4
+my ($x0,$x1,$y0,$y1,$t0,
+    $x2,$x3,$y2,$y3,$t1)=@_;
+$code.=<<___;
+       movdqa  $y0, $t0
+        movdqa $y2, $t1
+       pxor    $y1, $t0
+        pxor   $y3, $t1
+       pand    $x0, $t0
+        pand   $x2, $t1
+       pxor    $x1, $x0
+        pxor   $x3, $x2
+       pand    $y0, $x1
+        pand   $y2, $x3
+       pand    $y1, $x0
+        pand   $y3, $x2
+       pxor    $x0, $x1
+        pxor   $x3, $x2
+       pxor    $t0, $x0
+        pxor   $t1, $x3
+___
+}
+sub Mul_GF16_2 {
+my @x=@_[0..7];
+my @y=@_[8..11];
+my @t=@_[12..15];
+$code.=<<___;
+       movdqa  @x[0], @t[0]
+       movdqa  @x[1], @t[1]
+___
+       &Mul_GF4        (@x[0], @x[1], @y[0], @y[1], @t[2]);
+$code.=<<___;
+       pxor    @x[2], @t[0]
+       pxor    @x[3], @t[1]
+       pxor    @y[2], @y[0]
+       pxor    @y[3], @y[1]
+___
+       Mul_GF4_N_GF4   (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[2], @x[3], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       pxor    @t[0], @x[0]
+       pxor    @t[0], @x[2]
+       pxor    @t[1], @x[1]
+       pxor    @t[1], @x[3]
+
+       movdqa  @x[4], @t[0]
+       movdqa  @x[5], @t[1]
+       pxor    @x[6], @t[0]
+       pxor    @x[7], @t[1]
+___
+       &Mul_GF4_N_GF4  (@t[0], @t[1], @y[0], @y[1], @t[3],
+                        @x[6], @x[7], @y[2], @y[3], @t[2]);
+$code.=<<___;
+       pxor    @y[2], @y[0]
+       pxor    @y[3], @y[1]
+___
+       &Mul_GF4        (@x[4], @x[5], @y[0], @y[1], @t[3]);
+$code.=<<___;
+       pxor    @t[0], @x[4]
+       pxor    @t[0], @x[6]
+       pxor    @t[1], @x[5]
+       pxor    @t[1], @x[7]
+___
+}
+sub Inv_GF256 {
+#;********************************************************************
+#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144)       *
+#;********************************************************************
+my @x=@_[0..7];
+my @t=@_[8..11];
+my @s=@_[12..15];
+# direct optimizations from hardware
+$code.=<<___;
+       movdqa  @x[4], @t[3]
+       movdqa  @x[5], @t[2]
+       movdqa  @x[1], @t[1]
+       movdqa  @x[7], @s[1]
+       movdqa  @x[0], @s[0]
+
+       pxor    @x[6], @t[3]
+       pxor    @x[7], @t[2]
+       pxor    @x[3], @t[1]
+        movdqa @t[3], @s[2]
+       pxor    @x[6], @s[1]
+        movdqa @t[2], @t[0]
+       pxor    @x[2], @s[0]
+        movdqa @t[3], @s[3]
+
+       por     @t[1], @t[2]
+       por     @s[0], @t[3]
+       pxor    @t[0], @s[3]
+       pand    @s[0], @s[2]
+       pxor    @t[1], @s[0]
+       pand    @t[1], @t[0]
+       pand    @s[0], @s[3]
+       movdqa  @x[3], @s[0]
+       pxor    @x[2], @s[0]
+       pand    @s[0], @s[1]
+       pxor    @s[1], @t[3]
+       pxor    @s[1], @t[2]
+       movdqa  @x[4], @s[1]
+       movdqa  @x[1], @s[0]
+       pxor    @x[5], @s[1]
+       pxor    @x[0], @s[0]
+       movdqa  @s[1], @t[1]
+       pand    @s[0], @s[1]
+       por     @s[0], @t[1]
+       pxor    @s[1], @t[0]
+       pxor    @s[3], @t[3]
+       pxor    @s[2], @t[2]
+       pxor    @s[3], @t[1]
+       movdqa  @x[7], @s[0]
+       pxor    @s[2], @t[0]
+       movdqa  @x[6], @s[1]
+       pxor    @s[2], @t[1]
+       movdqa  @x[5], @s[2]
+       pand    @x[3], @s[0]
+       movdqa  @x[4], @s[3]
+       pand    @x[2], @s[1]
+       pand    @x[1], @s[2]
+       por     @x[0], @s[3]
+       pxor    @s[0], @t[3]
+       pxor    @s[1], @t[2]
+       pxor    @s[2], @t[1]
+       pxor    @s[3], @t[0] 
+
+       #Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
+
+       # new smaller inversion
+
+       movdqa  @t[3], @s[0]
+       pand    @t[1], @t[3]
+       pxor    @t[2], @s[0]
+
+       movdqa  @t[0], @s[2]
+       movdqa  @s[0], @s[3]
+       pxor    @t[3], @s[2]
+       pand    @s[2], @s[3]
+
+       movdqa  @t[1], @s[1]
+       pxor    @t[2], @s[3]
+       pxor    @t[0], @s[1]
+
+       pxor    @t[2], @t[3]
+
+       pand    @t[3], @s[1]
+
+       movdqa  @s[2], @t[2]
+       pxor    @t[0], @s[1]
+
+       pxor    @s[1], @t[2]
+       pxor    @s[1], @t[1]
+
+       pand    @t[0], @t[2]
+
+       pxor    @t[2], @s[2]
+       pxor    @t[2], @t[1]
+
+       pand    @s[3], @s[2]
+
+       pxor    @s[0], @s[2]
+___
+# output in s3, s2, s1, t1
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
+
+# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
+       &Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
+
+### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
+}
+
+# AES linear components
+
+sub ShiftRows {
+my @x=@_[0..7];
+my $mask=pop;
+$code.=<<___;
+       pxor    0x00($key),@x[0]
+       pxor    0x10($key),@x[1]
+       pshufb  $mask,@x[0]
+       pxor    0x20($key),@x[2]
+       pshufb  $mask,@x[1]
+       pxor    0x30($key),@x[3]
+       pshufb  $mask,@x[2]
+       pxor    0x40($key),@x[4]
+       pshufb  $mask,@x[3]
+       pxor    0x50($key),@x[5]
+       pshufb  $mask,@x[4]
+       pxor    0x60($key),@x[6]
+       pshufb  $mask,@x[5]
+       pxor    0x70($key),@x[7]
+       pshufb  $mask,@x[6]
+       lea     0x80($key),$key
+       pshufb  $mask,@x[7]
+___
+}
+
+sub MixColumns {
+# modified to emit output in order suitable for feeding back to aesenc[last]
+my @x=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+       pshufd  \$0x93, @x[0], @t[0]    # x0 <<< 32
+       pshufd  \$0x93, @x[1], @t[1]
+        pxor   @t[0], @x[0]            # x0 ^ (x0 <<< 32)
+       pshufd  \$0x93, @x[2], @t[2]
+        pxor   @t[1], @x[1]
+       pshufd  \$0x93, @x[3], @t[3]
+        pxor   @t[2], @x[2]
+       pshufd  \$0x93, @x[4], @t[4]
+        pxor   @t[3], @x[3]
+       pshufd  \$0x93, @x[5], @t[5]
+        pxor   @t[4], @x[4]
+       pshufd  \$0x93, @x[6], @t[6]
+        pxor   @t[5], @x[5]
+       pshufd  \$0x93, @x[7], @t[7]
+        pxor   @t[6], @x[6]
+        pxor   @t[7], @x[7]
+
+       pxor    @x[0], @t[1]
+       pxor    @x[7], @t[0]
+       pxor    @x[7], @t[1]
+        pshufd \$0x4E, @x[0], @x[0]    # (x0 ^ (x0 <<< 32)) <<< 64)
+       pxor    @x[1], @t[2]
+        pshufd \$0x4E, @x[1], @x[1]
+       pxor    @x[4], @t[5]
+        pxor   @t[0], @x[0]
+       pxor    @x[5], @t[6]
+        pxor   @t[1], @x[1]
+       pxor    @x[3], @t[4]
+        pshufd \$0x4E, @x[4], @t[0]
+       pxor    @x[6], @t[7]
+        pshufd \$0x4E, @x[5], @t[1]
+       pxor    @x[2], @t[3]
+        pshufd \$0x4E, @x[3], @x[4]
+       pxor    @x[7], @t[3]
+        pshufd \$0x4E, @x[7], @x[5]
+       pxor    @x[7], @t[4]
+        pshufd \$0x4E, @x[6], @x[3]
+       pxor    @t[4], @t[0]
+        pshufd \$0x4E, @x[2], @x[6]
+       pxor    @t[5], @t[1]
+
+       pxor    @t[3], @x[4]
+       pxor    @t[7], @x[5]
+       pxor    @t[6], @x[3]
+        movdqa @t[0], @x[2]
+       pxor    @t[2], @x[6]
+        movdqa @t[1], @x[7]
+___
+}
+
+sub InvMixColumns {
+my @x=@_[0..7];
+my @t=@_[8..15];
+
+$code.=<<___;
+       # multiplication by 0x0e
+       pshufd  \$0x93, @x[7], @t[7]
+       movdqa  @x[2], @t[2]
+       pxor    @x[5], @x[7]            # 7 5
+       pxor    @x[5], @x[2]            # 2 5
+       pshufd  \$0x93, @x[0], @t[0]
+       movdqa  @x[5], @t[5]
+       pxor    @x[0], @x[5]            # 5 0           [1]
+       pxor    @x[1], @x[0]            # 0 1
+       pshufd  \$0x93, @x[1], @t[1]
+       pxor    @x[2], @x[1]            # 1 25
+       pxor    @x[6], @x[0]            # 01 6          [2]
+       pxor    @x[3], @x[1]            # 125 3         [4]
+       pshufd  \$0x93, @x[3], @t[3]
+       pxor    @x[0], @x[2]            # 25 016        [3]
+       pxor    @x[7], @x[3]            # 3 75
+       pxor    @x[6], @x[7]            # 75 6          [0]
+       pshufd  \$0x93, @x[6], @t[6]
+       movdqa  @x[4], @t[4]
+       pxor    @x[4], @x[6]            # 6 4
+       pxor    @x[3], @x[4]            # 4 375         [6]
+       pxor    @x[7], @x[3]            # 375 756=36
+       pxor    @t[5], @x[6]            # 64 5          [7]
+       pxor    @t[2], @x[3]            # 36 2
+       pxor    @t[4], @x[3]            # 362 4         [5]
+       pshufd  \$0x93, @t[5], @t[5]
+___
+                                       my @y = @x[7,5,0,2,1,3,4,6];
+$code.=<<___;
+       # multiplication by 0x0b
+       pxor    @y[0], @y[1]
+       pxor    @t[0], @y[0]
+       pxor    @t[1], @y[1]
+       pshufd  \$0x93, @t[2], @t[2]
+       pxor    @t[5], @y[0]
+       pxor    @t[6], @y[1]
+       pxor    @t[7], @y[0]
+       pshufd  \$0x93, @t[4], @t[4]
+       pxor    @t[6], @t[7]            # clobber t[7]
+       pxor    @y[0], @y[1]
+
+       pxor    @t[0], @y[3]
+       pshufd  \$0x93, @t[0], @t[0]
+       pxor    @t[1], @y[2]
+       pxor    @t[1], @y[4]
+       pxor    @t[2], @y[2]
+       pshufd  \$0x93, @t[1], @t[1]
+       pxor    @t[2], @y[3]
+       pxor    @t[2], @y[5]
+       pxor    @t[7], @y[2]
+       pshufd  \$0x93, @t[2], @t[2]
+       pxor    @t[3], @y[3]
+       pxor    @t[3], @y[6]
+       pxor    @t[3], @y[4]
+       pshufd  \$0x93, @t[3], @t[3]
+       pxor    @t[4], @y[7]
+       pxor    @t[4], @y[5]
+       pxor    @t[7], @y[7]
+       pxor    @t[5], @y[3]
+       pxor    @t[4], @y[4]
+       pxor    @t[5], @t[7]            # clobber t[7] even more
+
+       pxor    @t[7], @y[5]
+       pshufd  \$0x93, @t[4], @t[4]
+       pxor    @t[7], @y[6]
+       pxor    @t[7], @y[4]
+
+       pxor    @t[5], @t[7]
+       pshufd  \$0x93, @t[5], @t[5]
+       pxor    @t[6], @t[7]            # restore t[7]
+
+       # multiplication by 0x0d
+       pxor    @y[7], @y[4]
+       pxor    @t[4], @y[7]
+       pshufd  \$0x93, @t[6], @t[6]
+       pxor    @t[0], @y[2]
+       pxor    @t[5], @y[7]
+       pxor    @t[2], @y[2]
+       pshufd  \$0x93, @t[7], @t[7]
+
+       pxor    @y[1], @y[3]
+       pxor    @t[1], @y[1]
+       pxor    @t[0], @y[0]
+       pxor    @t[0], @y[3]
+       pxor    @t[5], @y[1]
+       pxor    @t[5], @y[0]
+       pxor    @t[7], @y[1]
+       pshufd  \$0x93, @t[0], @t[0]
+       pxor    @t[6], @y[0]
+       pxor    @y[1], @y[3]
+       pxor    @t[1], @y[4]
+       pshufd  \$0x93, @t[1], @t[1]
+
+       pxor    @t[7], @y[7]
+       pxor    @t[2], @y[4]
+       pxor    @t[2], @y[5]
+       pshufd  \$0x93, @t[2], @t[2]
+       pxor    @t[6], @y[2]
+       pxor    @t[3], @t[6]            # clobber t[6]
+       pxor    @y[7], @y[4]
+       pxor    @t[6], @y[3]
+
+       pxor    @t[6], @y[6]
+       pxor    @t[5], @y[5]
+       pxor    @t[4], @y[6]
+       pshufd  \$0x93, @t[4], @t[4]
+       pxor    @t[6], @y[5]
+       pxor    @t[7], @y[6]
+       pxor    @t[3], @t[6]            # restore t[6]
+
+       pshufd  \$0x93, @t[5], @t[5]
+       pshufd  \$0x93, @t[6], @t[6]
+       pshufd  \$0x93, @t[7], @t[7]
+       pshufd  \$0x93, @t[3], @t[3]
+
+       # multiplication by 0x09
+       pxor    @y[1], @y[4]
+       pxor    @y[1], @t[1]            # t[1]=y[1]
+       pxor    @t[5], @t[0]            # clobber t[0]
+       pxor    @t[5], @t[1]
+       pxor    @t[0], @y[3]
+       pxor    @y[0], @t[0]            # t[0]=y[0]
+       pxor    @t[6], @t[1]
+       pxor    @t[7], @t[6]            # clobber t[6]
+       pxor    @t[1], @y[4]
+       pxor    @t[4], @y[7]
+       pxor    @y[4], @t[4]            # t[4]=y[4]
+       pxor    @t[3], @y[6]
+       pxor    @y[3], @t[3]            # t[3]=y[3]
+       pxor    @t[2], @y[5]
+       pxor    @y[2], @t[2]            # t[2]=y[2]
+       pxor    @t[7], @t[3]
+       pxor    @y[5], @t[5]            # t[5]=y[5]
+       pxor    @t[6], @t[2]
+       pxor    @t[6], @t[5]
+       pxor    @y[6], @t[6]            # t[6]=y[6]
+       pxor    @y[7], @t[7]            # t[7]=y[7]
+
+       movdqa  @t[0],@XMM[0]
+       movdqa  @t[1],@XMM[1]
+       movdqa  @t[2],@XMM[2]
+       movdqa  @t[3],@XMM[3]
+       movdqa  @t[4],@XMM[4]
+       movdqa  @t[5],@XMM[5]
+       movdqa  @t[6],@XMM[6]
+       movdqa  @t[7],@XMM[7]
+___
+}
+
+sub aesenc {                           # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+       movdqa  0x30($const),@t[0]      # .LSR
+___
+       &ShiftRows      (@b,@t[0]);
+       &Sbox           (@b,@t);
+       &MixColumns     (@b[0,1,4,6,3,7,2,5],@t);
+}
+
+sub aesenclast {                       # not used
+my @b=@_[0..7];
+my @t=@_[8..15];
+$code.=<<___;
+       movdqa  0x40($const),@t[0]      # .LSRM0
+___
+       &ShiftRows      (@b,@t[0]);
+       &Sbox           (@b,@t);
+$code.=<<___
+       pxor    0x00($key),@b[0]
+       pxor    0x10($key),@b[1]
+       pxor    0x20($key),@b[4]
+       pxor    0x30($key),@b[6]
+       pxor    0x40($key),@b[3]
+       pxor    0x50($key),@b[7]
+       pxor    0x60($key),@b[2]
+       pxor    0x70($key),@b[5]
+___
+}
+
+sub swapmove {
+my ($a,$b,$n,$mask,$t)=@_;
+$code.=<<___;
+       movdqa  $b,$t
+       psrlq   \$$n,$b
+       pxor    $a,$b
+       pand    $mask,$b
+       pxor    $b,$a
+       psllq   \$$n,$b
+       pxor    $t,$b
+___
+}
+sub swapmove2x {
+my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
+$code.=<<___;
+       movdqa  $b0,$t0
+       psrlq   \$$n,$b0
+        movdqa $b1,$t1
+        psrlq  \$$n,$b1
+       pxor    $a0,$b0
+        pxor   $a1,$b1
+       pand    $mask,$b0
+        pand   $mask,$b1
+       pxor    $b0,$a0
+       psllq   \$$n,$b0
+        pxor   $b1,$a1
+        psllq  \$$n,$b1
+       pxor    $t0,$b0
+        pxor   $t1,$b1
+___
+}
+
+sub bitslice {
+my @x=reverse(@_[0..7]);
+my ($t0,$t1,$t2,$t3)=@_[8..11];
+$code.=<<___;
+       movdqa  0x00($const),$t0        # .LBS0
+       movdqa  0x10($const),$t1        # .LBS1
+___
+       &swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
+       &swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+$code.=<<___;
+       movdqa  0x20($const),$t0        # .LBS2
+___
+       &swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
+       &swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+
+       &swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
+       &swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
+}
+
+$code.=<<___;
+.text
+
+.extern        asm_AES_encrypt
+.extern        asm_AES_decrypt
+
+.type  _bsaes_encrypt8,\@abi-omnipotent
+.align 64
+_bsaes_encrypt8:
+       lea     .LBS0(%rip), $const     # constants table
+
+       movdqa  ($key), @XMM[9]         # round 0 key
+       lea     0x10($key), $key
+       movdqa  0x60($const), @XMM[8]   # .LM0SR
+       pxor    @XMM[9], @XMM[0]        # xor with round0 key
+       pxor    @XMM[9], @XMM[1]
+        pshufb @XMM[8], @XMM[0]
+       pxor    @XMM[9], @XMM[2]
+        pshufb @XMM[8], @XMM[1]
+       pxor    @XMM[9], @XMM[3]
+        pshufb @XMM[8], @XMM[2]
+       pxor    @XMM[9], @XMM[4]
+        pshufb @XMM[8], @XMM[3]
+       pxor    @XMM[9], @XMM[5]
+        pshufb @XMM[8], @XMM[4]
+       pxor    @XMM[9], @XMM[6]
+        pshufb @XMM[8], @XMM[5]
+       pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[6]
+        pshufb @XMM[8], @XMM[7]
+_bsaes_encrypt8_bitslice:
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       dec     $rounds
+       jmp     .Lenc_sbox
+.align 16
+.Lenc_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8]);
+$code.=".Lenc_sbox:\n";
+       &Sbox           (@XMM[0..7, 8..15]);
+$code.=<<___;
+       dec     $rounds
+       jl      .Lenc_done
+___
+       &MixColumns     (@XMM[0,1,4,6,3,7,2,5, 8..15]);
+$code.=<<___;
+       movdqa  0x30($const), @XMM[8]   # .LSR
+       jnz     .Lenc_loop
+       movdqa  0x40($const), @XMM[8]   # .LSRM0
+       jmp     .Lenc_loop
+.align 16
+.Lenc_done:
+___
+       # output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
+       &bitslice       (@XMM[0,1,4,6,3,7,2,5, 8..11]);
+$code.=<<___;
+       movdqa  ($key), @XMM[8]         # last round key
+       pxor    @XMM[8], @XMM[4]
+       pxor    @XMM[8], @XMM[6]
+       pxor    @XMM[8], @XMM[3]
+       pxor    @XMM[8], @XMM[7]
+       pxor    @XMM[8], @XMM[2]
+       pxor    @XMM[8], @XMM[5]
+       pxor    @XMM[8], @XMM[0]
+       pxor    @XMM[8], @XMM[1]
+       ret
+.size  _bsaes_encrypt8,.-_bsaes_encrypt8
+
+.type  _bsaes_decrypt8,\@abi-omnipotent
+.align 64
+_bsaes_decrypt8:
+       lea     .LBS0(%rip), $const     # constants table
+
+       movdqa  ($key), @XMM[9]         # round 0 key
+       lea     0x10($key), $key
+       movdqa  -0x30($const), @XMM[8]  # .LM0ISR
+       pxor    @XMM[9], @XMM[0]        # xor with round0 key
+       pxor    @XMM[9], @XMM[1]
+        pshufb @XMM[8], @XMM[0]
+       pxor    @XMM[9], @XMM[2]
+        pshufb @XMM[8], @XMM[1]
+       pxor    @XMM[9], @XMM[3]
+        pshufb @XMM[8], @XMM[2]
+       pxor    @XMM[9], @XMM[4]
+        pshufb @XMM[8], @XMM[3]
+       pxor    @XMM[9], @XMM[5]
+        pshufb @XMM[8], @XMM[4]
+       pxor    @XMM[9], @XMM[6]
+        pshufb @XMM[8], @XMM[5]
+       pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[6]
+        pshufb @XMM[8], @XMM[7]
+___
+       &bitslice       (@XMM[0..7, 8..11]);
+$code.=<<___;
+       dec     $rounds
+       jmp     .Ldec_sbox
+.align 16
+.Ldec_loop:
+___
+       &ShiftRows      (@XMM[0..7, 8]);
+$code.=".Ldec_sbox:\n";
+       &InvSbox        (@XMM[0..7, 8..15]);
+$code.=<<___;
+       dec     $rounds
+       jl      .Ldec_done
+___
+       &InvMixColumns  (@XMM[0,1,6,4,2,7,3,5, 8..15]);
+$code.=<<___;
+       movdqa  -0x10($const), @XMM[8]  # .LISR
+       jnz     .Ldec_loop
+       movdqa  -0x20($const), @XMM[8]  # .LISRM0
+       jmp     .Ldec_loop
+.align 16
+.Ldec_done:
+___
+       &bitslice       (@XMM[0,1,6,4,2,7,3,5, 8..11]);
+$code.=<<___;
+       movdqa  ($key), @XMM[8]         # last round key
+       pxor    @XMM[8], @XMM[6]
+       pxor    @XMM[8], @XMM[4]
+       pxor    @XMM[8], @XMM[2]
+       pxor    @XMM[8], @XMM[7]
+       pxor    @XMM[8], @XMM[3]
+       pxor    @XMM[8], @XMM[5]
+       pxor    @XMM[8], @XMM[0]
+       pxor    @XMM[8], @XMM[1]
+       ret
+.size  _bsaes_decrypt8,.-_bsaes_decrypt8
+___
+}
+{
+my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
+
+sub bitslice_key {
+my @x=reverse(@_[0..7]);
+my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
+
+       &swapmove       (@x[0,1],1,$bs0,$t2,$t3);
+$code.=<<___;
+       #&swapmove(@x[2,3],1,$t0,$t2,$t3);
+       movdqa  @x[0], @x[2]
+       movdqa  @x[1], @x[3]
+___
+       #&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
+
+       &swapmove2x     (@x[0,2,1,3],2,$bs1,$t2,$t3);
+$code.=<<___;
+       #&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
+       movdqa  @x[0], @x[4]
+       movdqa  @x[2], @x[6]
+       movdqa  @x[1], @x[5]
+       movdqa  @x[3], @x[7]
+___
+       &swapmove2x     (@x[0,4,1,5],4,$bs2,$t2,$t3);
+       &swapmove2x     (@x[2,6,3,7],4,$bs2,$t2,$t3);
+}
+
+$code.=<<___;
+.type  _bsaes_key_convert,\@abi-omnipotent
+.align 16
+_bsaes_key_convert:
+       lea     .LBS1(%rip), $const
+       movdqu  ($inp), %xmm7           # load round 0 key
+       movdqa  -0x10($const), %xmm8    # .LBS0
+       movdqa  0x00($const), %xmm9     # .LBS1
+       movdqa  0x10($const), %xmm10    # .LBS2
+       movdqa  0x40($const), %xmm13    # .LM0
+       movdqa  0x60($const), %xmm14    # .LNOT
+
+       movdqu  0x10($inp), %xmm6       # load round 1 key
+       lea     0x10($inp), $inp
+       movdqa  %xmm7, ($out)           # save round 0 key
+       lea     0x10($out), $out
+       dec     $rounds
+       jmp     .Lkey_loop
+.align 16
+.Lkey_loop:
+       pshufb  %xmm13, %xmm6           # .LM0
+       movdqa  %xmm6, %xmm7
+___
+       &bitslice_key   (map("%xmm$_",(0..7, 8..12)));
+$code.=<<___;
+       pxor    %xmm14, %xmm5           # "pnot"
+       pxor    %xmm14, %xmm6
+       pxor    %xmm14, %xmm0
+       pxor    %xmm14, %xmm1
+       lea     0x10($inp), $inp
+       movdqa  %xmm0, 0x00($out)       # write bit-sliced round key
+       movdqa  %xmm1, 0x10($out)
+       movdqa  %xmm2, 0x20($out)
+       movdqa  %xmm3, 0x30($out)
+       movdqa  %xmm4, 0x40($out)
+       movdqa  %xmm5, 0x50($out)
+       movdqa  %xmm6, 0x60($out)
+       movdqa  %xmm7, 0x70($out)
+       lea     0x80($out),$out
+       movdqu  ($inp), %xmm6           # load next round key
+       dec     $rounds
+       jnz     .Lkey_loop
+
+       movdqa  0x70($const), %xmm7     # .L63
+       #movdqa %xmm6, ($out)           # don't save last round key
+       ret
+.size  _bsaes_key_convert,.-_bsaes_key_convert
+___
+}
+
+if (0 && !$win64) {    # following four functions are unsupported interface
+                       # used for benchmarking...
+# NOTE(review): guarded by a literal 0, so this block is never emitted.
+# The entry points below hard-code 10 rounds (AES-128 only) and set up no
+# stack frame or Win64 unwind data, hence "unsupported"/benchmark-only.
+$code.=<<___;
+.globl bsaes_enc_key_convert
+.type  bsaes_enc_key_convert,\@function,2
+.align 16
+bsaes_enc_key_convert:
+       mov     240($inp),%r10d         # pass rounds
+       mov     $inp,%rcx               # pass key
+       mov     $out,%rax               # pass key schedule
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7             # fix up last round key
+       movdqa  %xmm7,(%rax)            # save last round key
+       ret
+.size  bsaes_enc_key_convert,.-bsaes_enc_key_convert
+
+.globl bsaes_encrypt_128
+.type  bsaes_encrypt_128,\@function,4
+.align 16
+bsaes_encrypt_128:
+.Lenc128_loop:
+       movdqu  0x00($inp), @XMM[0]     # load input
+       movdqu  0x10($inp), @XMM[1]
+       movdqu  0x20($inp), @XMM[2]
+       movdqu  0x30($inp), @XMM[3]
+       movdqu  0x40($inp), @XMM[4]
+       movdqu  0x50($inp), @XMM[5]
+       movdqu  0x60($inp), @XMM[6]
+       movdqu  0x70($inp), @XMM[7]
+       mov     $key, %rax              # pass the $key
+       lea     0x80($inp), $inp
+       mov     \$10,%r10d              # AES-128: 10 rounds
+
+       call    _bsaes_encrypt8
+
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)     # note: store order undoes the
+       movdqu  @XMM[4], 0x20($out)     # bit-sliced register permutation
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[2], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       sub     \$0x80,$len
+       ja      .Lenc128_loop
+       ret
+.size  bsaes_encrypt_128,.-bsaes_encrypt_128
+
+.globl bsaes_dec_key_convert
+.type  bsaes_dec_key_convert,\@function,2
+.align 16
+bsaes_dec_key_convert:
+       mov     240($inp),%r10d         # pass rounds
+       mov     $inp,%rcx               # pass key
+       mov     $out,%rax               # pass key schedule
+       call    _bsaes_key_convert
+       pxor    ($out),%xmm7            # fix up round 0 key
+       movdqa  %xmm6,(%rax)            # save last round key
+       movdqa  %xmm7,($out)
+       ret
+.size  bsaes_dec_key_convert,.-bsaes_dec_key_convert
+
+.globl bsaes_decrypt_128
+.type  bsaes_decrypt_128,\@function,4
+.align 16
+bsaes_decrypt_128:
+.Ldec128_loop:
+       movdqu  0x00($inp), @XMM[0]     # load input
+       movdqu  0x10($inp), @XMM[1]
+       movdqu  0x20($inp), @XMM[2]
+       movdqu  0x30($inp), @XMM[3]
+       movdqu  0x40($inp), @XMM[4]
+       movdqu  0x50($inp), @XMM[5]
+       movdqu  0x60($inp), @XMM[6]
+       movdqu  0x70($inp), @XMM[7]
+       mov     $key, %rax              # pass the $key
+       lea     0x80($inp), $inp
+       mov     \$10,%r10d              # AES-128: 10 rounds
+
+       call    _bsaes_decrypt8
+
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)     # (decrypt permutation differs from
+       movdqu  @XMM[6], 0x20($out)     # the encrypt one above)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       sub     \$0x80,$len
+       ja      .Ldec128_loop
+       ret
+.size  bsaes_decrypt_128,.-bsaes_decrypt_128
+___
+}
+{
+######################################################################
+#
+# OpenSSL interface
+#
+# Map the platform calling convention onto common names.  On Win64 only
+# four arguments arrive in registers; args 5/6 live on the stack and are
+# pulled into %r10/%r11d by the individual prologues below.  The worker
+# registers (inp/out/len/key) are callee-saved GPRs so they survive the
+# calls out to the asm_AES_* fallback routines.
+my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6)=$win64        ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
+                                               : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
+my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
+if ($ecb) {
+$code.=<<___;
+.globl bsaes_ecb_encrypt_blocks
+.type  bsaes_ecb_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_encrypt_blocks:
+       mov     %rsp, %rax
+.Lecb_enc_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp),%rsp        # 0x48 bytes of local scratch
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp), %rsp       # Win64: xmm6-15 are callee-saved
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lecb_enc_body:
+___
+$code.=<<___;
+       mov     %rsp,%rbp               # backup %rsp
+       mov     240($arg4),%eax         # rounds
+       mov     $arg1,$inp              # backup arguments
+       mov     $arg2,$out
+       mov     $arg3,$len
+       mov     $arg4,$key
+       cmp     \$8,$arg3
+       jb      .Lecb_enc_short         # <8 blocks: fall back to table AES
+
+       mov     %eax,%ebx               # backup rounds
+       shl     \$7,%rax                # 128 bytes per inner round key
+       sub     \$`128-32`,%rax         # size of bit-sliced key schedule
+       sub     %rax,%rsp
+       mov     %rsp,%rax               # pass key schedule
+       mov     $key,%rcx               # pass key
+       mov     %ebx,%r10d              # pass rounds
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7             # fix up last round key
+       movdqa  %xmm7,(%rax)            # save last round key
+
+       sub     \$8,$len
+.Lecb_enc_loop:
+       movdqu  0x00($inp), @XMM[0]     # load input
+       movdqu  0x10($inp), @XMM[1]
+       movdqu  0x20($inp), @XMM[2]
+       movdqu  0x30($inp), @XMM[3]
+       movdqu  0x40($inp), @XMM[4]
+       movdqu  0x50($inp), @XMM[5]
+       mov     %rsp, %rax              # pass key schedule
+       movdqu  0x60($inp), @XMM[6]
+       mov     %ebx,%r10d              # pass rounds
+       movdqu  0x70($inp), @XMM[7]
+       lea     0x80($inp), $inp
+
+       call    _bsaes_encrypt8
+
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[2], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       sub     \$8,$len
+       jnc     .Lecb_enc_loop
+
+       add     \$8,$len
+       jz      .Lecb_enc_done
+
+       movdqu  0x00($inp), @XMM[0]     # load input
+       mov     %rsp, %rax              # pass key schedule
+       mov     %ebx,%r10d              # pass rounds
+       cmp     \$2,$len
+       jb      .Lecb_enc_one
+       movdqu  0x10($inp), @XMM[1]
+       je      .Lecb_enc_two
+       movdqu  0x20($inp), @XMM[2]
+       cmp     \$4,$len
+       jb      .Lecb_enc_three
+       movdqu  0x30($inp), @XMM[3]
+       je      .Lecb_enc_four
+       movdqu  0x40($inp), @XMM[4]
+       cmp     \$6,$len
+       jb      .Lecb_enc_five
+       movdqu  0x50($inp), @XMM[5]
+       je      .Lecb_enc_six
+       movdqu  0x60($inp), @XMM[6]
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[2], 0x60($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_six:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_five:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_four:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_three:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_two:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_one:
+       call    _bsaes_encrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       jmp     .Lecb_enc_done
+.align 16
+.Lecb_enc_short:
+       lea     ($inp), $arg1
+       lea     ($out), $arg2
+       lea     ($key), $arg3
+       call    asm_AES_encrypt
+       lea     16($inp), $inp
+       lea     16($out), $out
+       dec     $len
+       jnz     .Lecb_enc_short
+
+.Lecb_enc_done:
+       lea     (%rsp),%rax
+       pxor    %xmm0, %xmm0
+.Lecb_enc_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lecb_enc_bzero         # fix: was jb, which stopped after 32
+                                       # bytes on the bulk path and looped
+                                       # forever on the short path; wipe
+                                       # until the cursor reaches the frame
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6       # restore Win64 callee-saved xmm regs
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15        # pop callee-saved GPRs (pushed in
+       mov     0x50(%rsp), %r14        # prologue; 0x48 skips local scratch)
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax        # original %rbp
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lecb_enc_epilogue:
+       ret
+.size  bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
+
+.globl bsaes_ecb_decrypt_blocks
+.type  bsaes_ecb_decrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ecb_decrypt_blocks:
+       mov     %rsp, %rax
+.Lecb_dec_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp),%rsp        # 0x48 bytes of local scratch
+___
+$code.=<<___ if ($win64);
+       lea     -0xa0(%rsp), %rsp       # Win64: xmm6-15 are callee-saved
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lecb_dec_body:
+___
+$code.=<<___;
+       mov     %rsp,%rbp               # backup %rsp
+       mov     240($arg4),%eax         # rounds
+       mov     $arg1,$inp              # backup arguments
+       mov     $arg2,$out
+       mov     $arg3,$len
+       mov     $arg4,$key
+       cmp     \$8,$arg3
+       jb      .Lecb_dec_short         # <8 blocks: fall back to table AES
+
+       mov     %eax,%ebx               # backup rounds
+       shl     \$7,%rax                # 128 bytes per inner round key
+       sub     \$`128-32`,%rax         # size of bit-sliced key schedule
+       sub     %rax,%rsp
+       mov     %rsp,%rax               # pass key schedule
+       mov     $key,%rcx               # pass key
+       mov     %ebx,%r10d              # pass rounds
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7            # fix up 0 round key
+       movdqa  %xmm6,(%rax)            # save last round key
+       movdqa  %xmm7,(%rsp)
+
+       sub     \$8,$len
+.Lecb_dec_loop:
+       movdqu  0x00($inp), @XMM[0]     # load input
+       movdqu  0x10($inp), @XMM[1]
+       movdqu  0x20($inp), @XMM[2]
+       movdqu  0x30($inp), @XMM[3]
+       movdqu  0x40($inp), @XMM[4]
+       movdqu  0x50($inp), @XMM[5]
+       mov     %rsp, %rax              # pass key schedule
+       movdqu  0x60($inp), @XMM[6]
+       mov     %ebx,%r10d              # pass rounds
+       movdqu  0x70($inp), @XMM[7]
+       lea     0x80($inp), $inp
+
+       call    _bsaes_decrypt8
+
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       sub     \$8,$len
+       jnc     .Lecb_dec_loop
+
+       add     \$8,$len
+       jz      .Lecb_dec_done
+
+       movdqu  0x00($inp), @XMM[0]     # load input
+       mov     %rsp, %rax              # pass key schedule
+       mov     %ebx,%r10d              # pass rounds
+       cmp     \$2,$len
+       jb      .Lecb_dec_one
+       movdqu  0x10($inp), @XMM[1]
+       je      .Lecb_dec_two
+       movdqu  0x20($inp), @XMM[2]
+       cmp     \$4,$len
+       jb      .Lecb_dec_three
+       movdqu  0x30($inp), @XMM[3]
+       je      .Lecb_dec_four
+       movdqu  0x40($inp), @XMM[4]
+       cmp     \$6,$len
+       jb      .Lecb_dec_five
+       movdqu  0x50($inp), @XMM[5]
+       je      .Lecb_dec_six
+       movdqu  0x60($inp), @XMM[6]
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_six:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_five:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_four:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_three:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_two:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_one:
+       call    _bsaes_decrypt8
+       movdqu  @XMM[0], 0x00($out)     # write output
+       jmp     .Lecb_dec_done
+.align 16
+.Lecb_dec_short:
+       lea     ($inp), $arg1
+       lea     ($out), $arg2
+       lea     ($key), $arg3
+       call    asm_AES_decrypt
+       lea     16($inp), $inp
+       lea     16($out), $out
+       dec     $len
+       jnz     .Lecb_dec_short
+
+.Lecb_dec_done:
+       lea     (%rsp),%rax
+       pxor    %xmm0, %xmm0
+.Lecb_dec_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lecb_dec_bzero         # fix: was jb, which stopped after 32
+                                       # bytes on the bulk path and looped
+                                       # forever on the short path; wipe
+                                       # until the cursor reaches the frame
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6       # restore Win64 callee-saved xmm regs
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15        # pop callee-saved GPRs (pushed in
+       mov     0x50(%rsp), %r14        # prologue; 0x48 skips local scratch)
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax        # original %rbp
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lecb_dec_epilogue:
+       ret
+.size  bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
+___
+}
+$code.=<<___;
+.extern        asm_AES_cbc_encrypt
+.globl bsaes_cbc_encrypt
+.type  bsaes_cbc_encrypt,\@abi-omnipotent
+.align 16
+bsaes_cbc_encrypt:
+___
+$code.=<<___ if ($win64);
+       mov     48(%rsp),$arg6          # pull direction flag
+___
+$code.=<<___;
+       cmp     \$0,$arg6               # encryption and inputs shorter than
+       jne     asm_AES_cbc_encrypt     # 128 bytes are delegated (tail-jump)
+       cmp     \$128,$arg3             # to the non-bitsliced implementation
+       jb      asm_AES_cbc_encrypt
+
+       mov     %rsp, %rax
+.Lcbc_dec_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp), %rsp       # 0x48 bytes of local scratch
+___
+$code.=<<___ if ($win64);
+       mov     0xa0(%rsp),$arg5        # pull ivp
+       lea     -0xa0(%rsp), %rsp       # Win64: xmm6-15 are callee-saved
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lcbc_dec_body:
+___
+$code.=<<___;
+       mov     %rsp, %rbp              # backup %rsp
+       mov     240($arg4), %eax        # rounds
+       mov     $arg1, $inp             # backup arguments
+       mov     $arg2, $out
+       mov     $arg3, $len
+       mov     $arg4, $key
+       mov     $arg5, %rbx             # ivp
+       shr     \$4, $len               # bytes to blocks
+
+       mov     %eax, %edx              # rounds
+       shl     \$7, %rax               # 128 bytes per inner round key
+       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
+       sub     %rax, %rsp
+
+       mov     %rsp, %rax              # pass key schedule
+       mov     $key, %rcx              # pass key
+       mov     %edx, %r10d             # pass rounds
+       call    _bsaes_key_convert
+       pxor    (%rsp),%xmm7            # fix up 0 round key
+       movdqa  %xmm6,(%rax)            # save last round key
+       movdqa  %xmm7,(%rsp)
+
+       movdqu  (%rbx), @XMM[15]        # load IV
+       sub     \$8,$len
+.Lcbc_dec_loop:
+       movdqu  0x00($inp), @XMM[0]     # load input
+       movdqu  0x10($inp), @XMM[1]
+       movdqu  0x20($inp), @XMM[2]
+       movdqu  0x30($inp), @XMM[3]
+       movdqu  0x40($inp), @XMM[4]
+       movdqu  0x50($inp), @XMM[5]
+       mov     %rsp, %rax              # pass key schedule
+       movdqu  0x60($inp), @XMM[6]
+       mov     %edx,%r10d              # pass rounds
+       movdqu  0x70($inp), @XMM[7]
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+
+       call    _bsaes_decrypt8
+
+       # xor each decrypted block with the previous ciphertext block;
+       # register pairing follows the bit-sliced output permutation
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[9], @XMM[6]
+       movdqu  0x30($inp), @XMM[11]
+       pxor    @XMM[10], @XMM[4]
+       movdqu  0x40($inp), @XMM[12]
+       pxor    @XMM[11], @XMM[2]
+       movdqu  0x50($inp), @XMM[13]
+       pxor    @XMM[12], @XMM[7]
+       movdqu  0x60($inp), @XMM[14]
+       pxor    @XMM[13], @XMM[3]
+       movdqu  0x70($inp), @XMM[15]    # IV
+       pxor    @XMM[14], @XMM[5]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       lea     0x80($inp), $inp
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       sub     \$8,$len
+       jnc     .Lcbc_dec_loop
+
+       add     \$8,$len
+       jz      .Lcbc_dec_done
+
+       movdqu  0x00($inp), @XMM[0]     # load input
+       mov     %rsp, %rax              # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+       cmp     \$2,$len
+       jb      .Lcbc_dec_one
+       movdqu  0x10($inp), @XMM[1]
+       je      .Lcbc_dec_two
+       movdqu  0x20($inp), @XMM[2]
+       cmp     \$4,$len
+       jb      .Lcbc_dec_three
+       movdqu  0x30($inp), @XMM[3]
+       je      .Lcbc_dec_four
+       movdqu  0x40($inp), @XMM[4]
+       cmp     \$6,$len
+       jb      .Lcbc_dec_five
+       movdqu  0x50($inp), @XMM[5]
+       je      .Lcbc_dec_six
+       movdqu  0x60($inp), @XMM[6]
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[9], @XMM[6]
+       movdqu  0x30($inp), @XMM[11]
+       pxor    @XMM[10], @XMM[4]
+       movdqu  0x40($inp), @XMM[12]
+       pxor    @XMM[11], @XMM[2]
+       movdqu  0x50($inp), @XMM[13]
+       pxor    @XMM[12], @XMM[7]
+       movdqu  0x60($inp), @XMM[15]    # IV
+       pxor    @XMM[13], @XMM[3]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_six:
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[9], @XMM[6]
+       movdqu  0x30($inp), @XMM[11]
+       pxor    @XMM[10], @XMM[4]
+       movdqu  0x40($inp), @XMM[12]
+       pxor    @XMM[11], @XMM[2]
+       movdqu  0x50($inp), @XMM[15]    # IV
+       pxor    @XMM[12], @XMM[7]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_five:
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[9], @XMM[6]
+       movdqu  0x30($inp), @XMM[11]
+       pxor    @XMM[10], @XMM[4]
+       movdqu  0x40($inp), @XMM[15]    # IV
+       pxor    @XMM[11], @XMM[2]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_four:
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[9], @XMM[6]
+       movdqu  0x30($inp), @XMM[15]    # IV
+       pxor    @XMM[10], @XMM[4]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_three:
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[8], @XMM[1]
+       movdqu  0x20($inp), @XMM[15]    # IV
+       pxor    @XMM[9], @XMM[6]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_two:
+       movdqa  @XMM[15], 0x20(%rbp)    # put aside IV
+       call    _bsaes_decrypt8
+       pxor    0x20(%rbp), @XMM[0]     # ^= IV
+       movdqu  0x00($inp), @XMM[8]     # re-load input
+       movdqu  0x10($inp), @XMM[15]    # IV
+       pxor    @XMM[8], @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       jmp     .Lcbc_dec_done
+.align 16
+.Lcbc_dec_one:
+       # single block: decrypt into the scratch buffer so the ciphertext
+       # block survives to become the next IV
+       lea     ($inp), $arg1
+       lea     0x20(%rbp), $arg2       # buffer output
+       lea     ($key), $arg3
+       call    asm_AES_decrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[15]    # ^= IV
+       movdqu  @XMM[15], ($out)        # write output
+       movdqa  @XMM[0], @XMM[15]       # IV
+
+.Lcbc_dec_done:
+       movdqu  @XMM[15], (%rbx)        # return IV
+       lea     (%rsp), %rax
+       pxor    %xmm0, %xmm0
+.Lcbc_dec_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lcbc_dec_bzero
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6       # restore Win64 callee-saved xmm regs
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15        # pop callee-saved GPRs
+       mov     0x50(%rsp), %r14
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax        # original %rbp
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lcbc_dec_epilogue:
+       ret
+.size  bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
+
+.globl bsaes_ctr32_encrypt_blocks
+.type  bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
+.align 16
+bsaes_ctr32_encrypt_blocks:
+       mov     %rsp, %rax
+.Lctr_enc_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp), %rsp       # 0x48 bytes of local scratch
+___
+$code.=<<___ if ($win64);
+       mov     0xa0(%rsp),$arg5        # pull ivp
+       lea     -0xa0(%rsp), %rsp       # Win64: xmm6-15 are callee-saved
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lctr_enc_body:
+___
+$code.=<<___;
+       mov     %rsp, %rbp              # backup %rsp
+       movdqu  ($arg5), %xmm0          # load counter
+       mov     240($arg4), %eax        # rounds
+       mov     $arg1, $inp             # backup arguments
+       mov     $arg2, $out
+       mov     $arg3, $len
+       mov     $arg4, $key
+       movdqa  %xmm0, 0x20(%rbp)       # copy counter
+       cmp     \$8, $arg3
+       jb      .Lctr_enc_short         # note: on this path no key schedule
+                                       # is allocated and rsp stays == rbp
+
+       mov     %eax, %ebx              # rounds
+       shl     \$7, %rax               # 128 bytes per inner round key
+       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
+       sub     %rax, %rsp
+
+       mov     %rsp, %rax              # pass key schedule
+       mov     $key, %rcx              # pass key
+       mov     %ebx, %r10d             # pass rounds
+       call    _bsaes_key_convert
+       pxor    %xmm6,%xmm7             # fix up last round key
+       movdqa  %xmm7,(%rax)            # save last round key
+
+       movdqa  (%rsp), @XMM[9]         # load round0 key
+       lea     .LADD1(%rip), %r11
+       movdqa  0x20(%rbp), @XMM[0]     # counter copy
+       movdqa  -0x20(%r11), @XMM[8]    # .LSWPUP
+       pshufb  @XMM[8], @XMM[9]        # byte swap upper part
+       pshufb  @XMM[8], @XMM[0]
+       movdqa  @XMM[9], (%rsp)         # save adjusted round0 key
+       jmp     .Lctr_enc_loop
+.align 16
+.Lctr_enc_loop:
+       movdqa  @XMM[0], 0x20(%rbp)     # save counter
+       movdqa  @XMM[0], @XMM[1]        # prepare 8 counter values
+       movdqa  @XMM[0], @XMM[2]
+       paddd   0x00(%r11), @XMM[1]     # .LADD1
+       movdqa  @XMM[0], @XMM[3]
+       paddd   0x10(%r11), @XMM[2]     # .LADD2
+       movdqa  @XMM[0], @XMM[4]
+       paddd   0x20(%r11), @XMM[3]     # .LADD3
+       movdqa  @XMM[0], @XMM[5]
+       paddd   0x30(%r11), @XMM[4]     # .LADD4
+       movdqa  @XMM[0], @XMM[6]
+       paddd   0x40(%r11), @XMM[5]     # .LADD5
+       movdqa  @XMM[0], @XMM[7]
+       paddd   0x50(%r11), @XMM[6]     # .LADD6
+       paddd   0x60(%r11), @XMM[7]     # .LADD7
+
+       # Borrow prologue from _bsaes_encrypt8 to use the opportunity
+       # to flip byte order in 32-bit counter
+       movdqa  (%rsp), @XMM[9]         # round 0 key
+       lea     0x10(%rsp), %rax        # pass key schedule
+       movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
+       pxor    @XMM[9], @XMM[0]        # xor with round0 key
+       pxor    @XMM[9], @XMM[1]
+        pshufb @XMM[8], @XMM[0]
+       pxor    @XMM[9], @XMM[2]
+        pshufb @XMM[8], @XMM[1]
+       pxor    @XMM[9], @XMM[3]
+        pshufb @XMM[8], @XMM[2]
+       pxor    @XMM[9], @XMM[4]
+        pshufb @XMM[8], @XMM[3]
+       pxor    @XMM[9], @XMM[5]
+        pshufb @XMM[8], @XMM[4]
+       pxor    @XMM[9], @XMM[6]
+        pshufb @XMM[8], @XMM[5]
+       pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[6]
+       lea     .LBS0(%rip), %r11       # constants table
+        pshufb @XMM[8], @XMM[7]
+       mov     %ebx,%r10d              # pass rounds
+
+       call    _bsaes_encrypt8_bitslice
+
+       sub     \$8,$len
+       jc      .Lctr_enc_loop_done
+
+       movdqu  0x00($inp), @XMM[8]     # load input
+       movdqu  0x10($inp), @XMM[9]
+       movdqu  0x20($inp), @XMM[10]
+       movdqu  0x30($inp), @XMM[11]
+       movdqu  0x40($inp), @XMM[12]
+       movdqu  0x50($inp), @XMM[13]
+       movdqu  0x60($inp), @XMM[14]
+       movdqu  0x70($inp), @XMM[15]
+       lea     0x80($inp),$inp
+       pxor    @XMM[0], @XMM[8]
+       movdqa  0x20(%rbp), @XMM[0]     # load counter
+       pxor    @XMM[9], @XMM[1]
+       movdqu  @XMM[8], 0x00($out)     # write output
+       pxor    @XMM[10], @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    @XMM[11], @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       pxor    @XMM[12], @XMM[3]
+       movdqu  @XMM[6], 0x30($out)
+       pxor    @XMM[13], @XMM[7]
+       movdqu  @XMM[3], 0x40($out)
+       pxor    @XMM[14], @XMM[2]
+       movdqu  @XMM[7], 0x50($out)
+       pxor    @XMM[15], @XMM[5]
+       movdqu  @XMM[2], 0x60($out)
+       lea     .LADD1(%rip), %r11
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+       paddd   0x70(%r11), @XMM[0]     # .LADD8
+       jnz     .Lctr_enc_loop          # ZF is still from the sub above:
+                                       # mov/lea/SSE ops leave EFLAGS alone
+
+       jmp     .Lctr_enc_done
+.align 16
+.Lctr_enc_loop_done:
+       movdqu  0x00($inp), @XMM[8]     # load input
+       pxor    @XMM[8], @XMM[0]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       cmp     \$2,$len
+       jb      .Lctr_enc_done
+       movdqu  0x10($inp), @XMM[9]
+       pxor    @XMM[9], @XMM[1]
+       movdqu  @XMM[1], 0x10($out)
+       je      .Lctr_enc_done
+       movdqu  0x20($inp), @XMM[10]
+       pxor    @XMM[10], @XMM[4]
+       movdqu  @XMM[4], 0x20($out)
+       cmp     \$4,$len
+       jb      .Lctr_enc_done
+       movdqu  0x30($inp), @XMM[11]
+       pxor    @XMM[11], @XMM[6]
+       movdqu  @XMM[6], 0x30($out)
+       je      .Lctr_enc_done
+       movdqu  0x40($inp), @XMM[12]
+       pxor    @XMM[12], @XMM[3]
+       movdqu  @XMM[3], 0x40($out)
+       cmp     \$6,$len
+       jb      .Lctr_enc_done
+       movdqu  0x50($inp), @XMM[13]
+       pxor    @XMM[13], @XMM[7]
+       movdqu  @XMM[7], 0x50($out)
+       je      .Lctr_enc_done
+       movdqu  0x60($inp), @XMM[14]
+       pxor    @XMM[14], @XMM[2]
+       movdqu  @XMM[2], 0x60($out)
+       jmp     .Lctr_enc_done
+
+.align 16
+.Lctr_enc_short:
+       lea     0x20(%rbp), $arg1
+       lea     0x30(%rbp), $arg2
+       lea     ($key), $arg3
+       call    asm_AES_encrypt
+       movdqu  ($inp), @XMM[1]
+       lea     16($inp), $inp
+       mov     0x2c(%rbp), %eax        # load 32-bit counter
+       bswap   %eax
+       pxor    0x30(%rbp), @XMM[1]
+       inc     %eax                    # increment
+       movdqu  @XMM[1], ($out)
+       bswap   %eax
+       lea     16($out), $out
+       mov     %eax, 0x2c(%rbp)        # save 32-bit counter
+                                       # (fix: was 0x2c of rsp; that worked
+                                       # only because rsp == rbp on this
+                                       # path - use the same base as the
+                                       # load above)
+       dec     $len
+       jnz     .Lctr_enc_short
+
+.Lctr_enc_done:
+       lea     (%rsp), %rax
+       pxor    %xmm0, %xmm0
+.Lctr_enc_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lctr_enc_bzero
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6       # restore Win64 callee-saved xmm regs
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15        # pop callee-saved GPRs (pushed in
+       mov     0x50(%rsp), %r14        # prologue; 0x48 skips local scratch)
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax        # original %rbp
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lctr_enc_epilogue:
+       ret
+.size  bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
+___
+######################################################################
+# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
+#      const AES_KEY *key1, const AES_KEY *key2,
+#      const unsigned char iv[16]);
+#
+my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$code.=<<___;
+.globl bsaes_xts_encrypt
+.type  bsaes_xts_encrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_encrypt:
+       mov     %rsp, %rax
+.Lxts_enc_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+       mov     0xa0(%rsp),$arg5        # pull key2
+       mov     0xa8(%rsp),$arg6        # pull ivp
+       lea     -0xa0(%rsp), %rsp
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lxts_enc_body:
+___
+$code.=<<___;
+       mov     %rsp, %rbp              # backup %rsp
+       mov     $arg1, $inp             # backup arguments
+       mov     $arg2, $out
+       mov     $arg3, $len
+       mov     $arg4, $key
+
+       lea     ($arg6), $arg1
+       lea     0x20(%rbp), $arg2
+       lea     ($arg5), $arg3
+       call    asm_AES_encrypt         # generate initial tweak
+
+       mov     240($key), %eax         # rounds
+       mov     $len, %rbx              # backup $len
+
+       mov     %eax, %edx              # rounds
+       shl     \$7, %rax               # 128 bytes per inner round key
+       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
+       sub     %rax, %rsp
+
+       mov     %rsp, %rax              # pass key schedule
+       mov     $key, %rcx              # pass key
+       mov     %edx, %r10d             # pass rounds
+       call    _bsaes_key_convert
+       pxor    %xmm6, %xmm7            # fix up last round key
+       movdqa  %xmm7, (%rax)           # save last round key
+
+       and     \$-16, $len
+       sub     \$0x80, %rsp            # place for tweak[8]
+       movdqa  0x20(%rbp), @XMM[7]     # initial tweak
+
+       pxor    $twtmp, $twtmp
+       movdqa  .Lxts_magic(%rip), $twmask
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+
+       sub     \$0x80, $len
+       jc      .Lxts_enc_short
+       jmp     .Lxts_enc_loop
+
+.align 16
+.Lxts_enc_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       movdqa  @XMM[7], @XMM[$i]
+       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+       movdqu  0x60($inp), @XMM[8+6]
+       pxor    @XMM[8+5], @XMM[5]
+       movdqu  0x70($inp), @XMM[8+7]
+       lea     0x80($inp), $inp
+       movdqa  @XMM[7], 0x70(%rsp)
+       pxor    @XMM[8+6], @XMM[6]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       pxor    @XMM[8+7], @XMM[7]
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       pxor    0x40(%rsp), @XMM[3]
+       movdqu  @XMM[6], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[3], 0x40($out)
+       pxor    0x60(%rsp), @XMM[2]
+       movdqu  @XMM[7], 0x50($out)
+       pxor    0x70(%rsp), @XMM[5]
+       movdqu  @XMM[2], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+
+       movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
+       pxor    $twtmp, $twtmp
+       movdqa  .Lxts_magic(%rip), $twmask
+       pcmpgtd @XMM[7], $twtmp
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+
+       sub     \$0x80,$len
+       jnc     .Lxts_enc_loop
+
+.Lxts_enc_short:
+       add     \$0x80, $len
+       jz      .Lxts_enc_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       movdqa  @XMM[7], @XMM[$i]
+       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
+       cmp     \$`0x10*$i`,$len
+       je      .Lxts_enc_$i
+___
+    $code.=<<___ if ($i>=2);
+       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+       movdqu  0x60($inp), @XMM[8+6]
+       pxor    @XMM[8+5], @XMM[5]
+       movdqa  @XMM[7], 0x70(%rsp)
+       lea     0x70($inp), $inp
+       pxor    @XMM[8+6], @XMM[6]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       pxor    0x40(%rsp), @XMM[3]
+       movdqu  @XMM[6], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[3], 0x40($out)
+       pxor    0x60(%rsp), @XMM[2]
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[2], 0x60($out)
+       lea     0x70($out), $out
+
+       movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_6:
+       pxor    @XMM[8+4], @XMM[4]
+       lea     0x60($inp), $inp
+       pxor    @XMM[8+5], @XMM[5]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       pxor    0x40(%rsp), @XMM[3]
+       movdqu  @XMM[6], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[3], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       lea     0x60($out), $out
+
+       movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_5:
+       pxor    @XMM[8+3], @XMM[3]
+       lea     0x50($inp), $inp
+       pxor    @XMM[8+4], @XMM[4]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       pxor    0x40(%rsp), @XMM[3]
+       movdqu  @XMM[6], 0x30($out)
+       movdqu  @XMM[3], 0x40($out)
+       lea     0x50($out), $out
+
+       movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_4:
+       pxor    @XMM[8+2], @XMM[2]
+       lea     0x40($inp), $inp
+       pxor    @XMM[8+3], @XMM[3]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[6]
+       movdqu  @XMM[4], 0x20($out)
+       movdqu  @XMM[6], 0x30($out)
+       lea     0x40($out), $out
+
+       movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_3:
+       pxor    @XMM[8+1], @XMM[1]
+       lea     0x30($inp), $inp
+       pxor    @XMM[8+2], @XMM[2]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[4]
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[4], 0x20($out)
+       lea     0x30($out), $out
+
+       movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_2:
+       pxor    @XMM[8+0], @XMM[0]
+       lea     0x20($inp), $inp
+       pxor    @XMM[8+1], @XMM[1]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_encrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       lea     0x20($out), $out
+
+       movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_enc_done
+.align 16
+.Lxts_enc_1:
+       pxor    @XMM[0], @XMM[8]
+       lea     0x10($inp), $inp
+       movdqa  @XMM[8], 0x20(%rbp)
+       lea     0x20(%rbp), $arg1
+       lea     0x20(%rbp), $arg2
+       lea     ($key), $arg3
+       call    asm_AES_encrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
+       #pxor   @XMM[8], @XMM[0]
+       #lea    0x80(%rsp), %rax        # pass key schedule
+       #mov    %edx, %r10d             # pass rounds
+       #call   _bsaes_encrypt8
+       #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       lea     0x10($out), $out
+
+       movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak
+
+.Lxts_enc_done:
+       and     \$15, %ebx
+       jz      .Lxts_enc_ret
+       mov     $out, %rdx
+
+.Lxts_enc_steal:
+       movzb   ($inp), %eax
+       movzb   -16(%rdx), %ecx
+       lea     1($inp), $inp
+       mov     %al, -16(%rdx)
+       mov     %cl, 0(%rdx)
+       lea     1(%rdx), %rdx
+       sub     \$1,%ebx
+       jnz     .Lxts_enc_steal
+
+       movdqu  -16($out), @XMM[0]
+       lea     0x20(%rbp), $arg1
+       pxor    @XMM[7], @XMM[0]
+       lea     0x20(%rbp), $arg2
+       movdqa  @XMM[0], 0x20(%rbp)
+       lea     ($key), $arg3
+       call    asm_AES_encrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[7]
+       movdqu  @XMM[7], -16($out)
+
+.Lxts_enc_ret:
+       lea     (%rsp), %rax
+       pxor    %xmm0, %xmm0
+.Lxts_enc_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lxts_enc_bzero
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15
+       mov     0x50(%rsp), %r14
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lxts_enc_epilogue:
+       ret
+.size  bsaes_xts_encrypt,.-bsaes_xts_encrypt
+
+.globl bsaes_xts_decrypt
+.type  bsaes_xts_decrypt,\@abi-omnipotent
+.align 16
+bsaes_xts_decrypt:
+       mov     %rsp, %rax
+.Lxts_dec_prologue:
+       push    %rbp
+       push    %rbx
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       lea     -0x48(%rsp), %rsp
+___
+$code.=<<___ if ($win64);
+       mov     0xa0(%rsp),$arg5        # pull key2
+       mov     0xa8(%rsp),$arg6        # pull ivp
+       lea     -0xa0(%rsp), %rsp
+       movaps  %xmm6, 0x40(%rsp)
+       movaps  %xmm7, 0x50(%rsp)
+       movaps  %xmm8, 0x60(%rsp)
+       movaps  %xmm9, 0x70(%rsp)
+       movaps  %xmm10, 0x80(%rsp)
+       movaps  %xmm11, 0x90(%rsp)
+       movaps  %xmm12, 0xa0(%rsp)
+       movaps  %xmm13, 0xb0(%rsp)
+       movaps  %xmm14, 0xc0(%rsp)
+       movaps  %xmm15, 0xd0(%rsp)
+.Lxts_dec_body:
+___
+$code.=<<___;
+       mov     %rsp, %rbp              # backup %rsp
+       mov     $arg1, $inp             # backup arguments
+       mov     $arg2, $out
+       mov     $arg3, $len
+       mov     $arg4, $key
+
+       lea     ($arg6), $arg1
+       lea     0x20(%rbp), $arg2
+       lea     ($arg5), $arg3
+       call    asm_AES_encrypt         # generate initial tweak
+
+       mov     240($key), %eax         # rounds
+       mov     $len, %rbx              # backup $len
+
+       mov     %eax, %edx              # rounds
+       shl     \$7, %rax               # 128 bytes per inner round key
+       sub     \$`128-32`, %rax        # size of bit-sliced key schedule
+       sub     %rax, %rsp
+
+       mov     %rsp, %rax              # pass key schedule
+       mov     $key, %rcx              # pass key
+       mov     %edx, %r10d             # pass rounds
+       call    _bsaes_key_convert
+       pxor    (%rsp), %xmm7           # fix up round 0 key
+       movdqa  %xmm6, (%rax)           # save last round key
+       movdqa  %xmm7, (%rsp)
+
+       xor     %eax, %eax              # if ($len%16) len-=16;
+       and     \$-16, $len
+       test    \$15, %ebx
+       setnz   %al
+       shl     \$4, %rax
+       sub     %rax, $len
+
+       sub     \$0x80, %rsp            # place for tweak[8]
+       movdqa  0x20(%rbp), @XMM[7]     # initial tweak
+
+       pxor    $twtmp, $twtmp
+       movdqa  .Lxts_magic(%rip), $twmask
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+
+       sub     \$0x80, $len
+       jc      .Lxts_dec_short
+       jmp     .Lxts_dec_loop
+
+.align 16
+.Lxts_dec_loop:
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       movdqa  @XMM[7], @XMM[$i]
+       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
+___
+    $code.=<<___ if ($i>=2);
+       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+       movdqu  0x60($inp), @XMM[8+6]
+       pxor    @XMM[8+5], @XMM[5]
+       movdqu  0x70($inp), @XMM[8+7]
+       lea     0x80($inp), $inp
+       movdqa  @XMM[7], 0x70(%rsp)
+       pxor    @XMM[8+6], @XMM[6]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       pxor    @XMM[8+7], @XMM[7]
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[4]
+       movdqu  @XMM[6], 0x20($out)
+       pxor    0x40(%rsp), @XMM[2]
+       movdqu  @XMM[4], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[2], 0x40($out)
+       pxor    0x60(%rsp), @XMM[3]
+       movdqu  @XMM[7], 0x50($out)
+       pxor    0x70(%rsp), @XMM[5]
+       movdqu  @XMM[3], 0x60($out)
+       movdqu  @XMM[5], 0x70($out)
+       lea     0x80($out), $out
+
+       movdqa  0x70(%rsp), @XMM[7]     # prepare next iteration tweak
+       pxor    $twtmp, $twtmp
+       movdqa  .Lxts_magic(%rip), $twmask
+       pcmpgtd @XMM[7], $twtmp
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+
+       sub     \$0x80,$len
+       jnc     .Lxts_dec_loop
+
+.Lxts_dec_short:
+       add     \$0x80, $len
+       jz      .Lxts_dec_done
+___
+    for ($i=0;$i<7;$i++) {
+    $code.=<<___;
+       pshufd  \$0x13, $twtmp, $twres
+       pxor    $twtmp, $twtmp
+       movdqa  @XMM[7], @XMM[$i]
+       movdqa  @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       pcmpgtd @XMM[7], $twtmp         # broadcast upper bits
+       pxor    $twres, @XMM[7]
+___
+    $code.=<<___ if ($i>=1);
+       movdqu  `0x10*($i-1)`($inp), @XMM[8+$i-1]
+       cmp     \$`0x10*$i`,$len
+       je      .Lxts_dec_$i
+___
+    $code.=<<___ if ($i>=2);
+       pxor    @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
+___
+    }
+$code.=<<___;
+       movdqu  0x60($inp), @XMM[8+6]
+       pxor    @XMM[8+5], @XMM[5]
+       movdqa  @XMM[7], 0x70(%rsp)
+       lea     0x70($inp), $inp
+       pxor    @XMM[8+6], @XMM[6]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[4]
+       movdqu  @XMM[6], 0x20($out)
+       pxor    0x40(%rsp), @XMM[2]
+       movdqu  @XMM[4], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[2], 0x40($out)
+       pxor    0x60(%rsp), @XMM[3]
+       movdqu  @XMM[7], 0x50($out)
+       movdqu  @XMM[3], 0x60($out)
+       lea     0x70($out), $out
+
+       movdqa  0x70(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_6:
+       pxor    @XMM[8+4], @XMM[4]
+       lea     0x60($inp), $inp
+       pxor    @XMM[8+5], @XMM[5]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[4]
+       movdqu  @XMM[6], 0x20($out)
+       pxor    0x40(%rsp), @XMM[2]
+       movdqu  @XMM[4], 0x30($out)
+       pxor    0x50(%rsp), @XMM[7]
+       movdqu  @XMM[2], 0x40($out)
+       movdqu  @XMM[7], 0x50($out)
+       lea     0x60($out), $out
+
+       movdqa  0x60(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_5:
+       pxor    @XMM[8+3], @XMM[3]
+       lea     0x50($inp), $inp
+       pxor    @XMM[8+4], @XMM[4]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[4]
+       movdqu  @XMM[6], 0x20($out)
+       pxor    0x40(%rsp), @XMM[2]
+       movdqu  @XMM[4], 0x30($out)
+       movdqu  @XMM[2], 0x40($out)
+       lea     0x50($out), $out
+
+       movdqa  0x50(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_4:
+       pxor    @XMM[8+2], @XMM[2]
+       lea     0x40($inp), $inp
+       pxor    @XMM[8+3], @XMM[3]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       pxor    0x30(%rsp), @XMM[4]
+       movdqu  @XMM[6], 0x20($out)
+       movdqu  @XMM[4], 0x30($out)
+       lea     0x40($out), $out
+
+       movdqa  0x40(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_3:
+       pxor    @XMM[8+1], @XMM[1]
+       lea     0x30($inp), $inp
+       pxor    @XMM[8+2], @XMM[2]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       pxor    0x20(%rsp), @XMM[6]
+       movdqu  @XMM[1], 0x10($out)
+       movdqu  @XMM[6], 0x20($out)
+       lea     0x30($out), $out
+
+       movdqa  0x30(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_2:
+       pxor    @XMM[8+0], @XMM[0]
+       lea     0x20($inp), $inp
+       pxor    @XMM[8+1], @XMM[1]
+       lea     0x80(%rsp), %rax        # pass key schedule
+       mov     %edx, %r10d             # pass rounds
+
+       call    _bsaes_decrypt8
+
+       pxor    0x00(%rsp), @XMM[0]     # ^= tweak[]
+       pxor    0x10(%rsp), @XMM[1]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       movdqu  @XMM[1], 0x10($out)
+       lea     0x20($out), $out
+
+       movdqa  0x20(%rsp), @XMM[7]     # next iteration tweak
+       jmp     .Lxts_dec_done
+.align 16
+.Lxts_dec_1:
+       pxor    @XMM[0], @XMM[8]
+       lea     0x10($inp), $inp
+       movdqa  @XMM[8], 0x20(%rbp)
+       lea     0x20(%rbp), $arg1
+       lea     0x20(%rbp), $arg2
+       lea     ($key), $arg3
+       call    asm_AES_decrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[0]     # ^= tweak[]
+       #pxor   @XMM[8], @XMM[0]
+       #lea    0x80(%rsp), %rax        # pass key schedule
+       #mov    %edx, %r10d             # pass rounds
+       #call   _bsaes_decrypt8
+       #pxor   0x00(%rsp), @XMM[0]     # ^= tweak[]
+       movdqu  @XMM[0], 0x00($out)     # write output
+       lea     0x10($out), $out
+
+       movdqa  0x10(%rsp), @XMM[7]     # next iteration tweak
+
+.Lxts_dec_done:
+       and     \$15, %ebx
+       jz      .Lxts_dec_ret
+
+       pxor    $twtmp, $twtmp
+       movdqa  .Lxts_magic(%rip), $twmask
+       pcmpgtd @XMM[7], $twtmp
+       pshufd  \$0x13, $twtmp, $twres
+       movdqa  @XMM[7], @XMM[6]
+       paddq   @XMM[7], @XMM[7]        # psllq 1,$tweak
+       pand    $twmask, $twres         # isolate carry and residue
+       movdqu  ($inp), @XMM[0]
+       pxor    $twres, @XMM[7]
+
+       lea     0x20(%rbp), $arg1
+       pxor    @XMM[7], @XMM[0]
+       lea     0x20(%rbp), $arg2
+       movdqa  @XMM[0], 0x20(%rbp)
+       lea     ($key), $arg3
+       call    asm_AES_decrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[7]
+       mov     $out, %rdx
+       movdqu  @XMM[7], ($out)
+
+.Lxts_dec_steal:
+       movzb   16($inp), %eax
+       movzb   (%rdx), %ecx
+       lea     1($inp), $inp
+       mov     %al, (%rdx)
+       mov     %cl, 16(%rdx)
+       lea     1(%rdx), %rdx
+       sub     \$1,%ebx
+       jnz     .Lxts_dec_steal
+
+       movdqu  ($out), @XMM[0]
+       lea     0x20(%rbp), $arg1
+       pxor    @XMM[6], @XMM[0]
+       lea     0x20(%rbp), $arg2
+       movdqa  @XMM[0], 0x20(%rbp)
+       lea     ($key), $arg3
+       call    asm_AES_decrypt         # doesn't touch %xmm
+       pxor    0x20(%rbp), @XMM[6]
+       movdqu  @XMM[6], ($out)
+
+.Lxts_dec_ret:
+       lea     (%rsp), %rax
+       pxor    %xmm0, %xmm0
+.Lxts_dec_bzero:                       # wipe key schedule [if any]
+       movdqa  %xmm0, 0x00(%rax)
+       movdqa  %xmm0, 0x10(%rax)
+       lea     0x20(%rax), %rax
+       cmp     %rax, %rbp
+       ja      .Lxts_dec_bzero
+
+       lea     (%rbp),%rsp             # restore %rsp
+___
+$code.=<<___ if ($win64);
+       movaps  0x40(%rbp), %xmm6
+       movaps  0x50(%rbp), %xmm7
+       movaps  0x60(%rbp), %xmm8
+       movaps  0x70(%rbp), %xmm9
+       movaps  0x80(%rbp), %xmm10
+       movaps  0x90(%rbp), %xmm11
+       movaps  0xa0(%rbp), %xmm12
+       movaps  0xb0(%rbp), %xmm13
+       movaps  0xc0(%rbp), %xmm14
+       movaps  0xd0(%rbp), %xmm15
+       lea     0xa0(%rbp), %rsp
+___
+$code.=<<___;
+       mov     0x48(%rsp), %r15
+       mov     0x50(%rsp), %r14
+       mov     0x58(%rsp), %r13
+       mov     0x60(%rsp), %r12
+       mov     0x68(%rsp), %rbx
+       mov     0x70(%rsp), %rax
+       lea     0x78(%rsp), %rsp
+       mov     %rax, %rbp
+.Lxts_dec_epilogue:
+       ret
+.size  bsaes_xts_decrypt,.-bsaes_xts_decrypt
+___
+}
+$code.=<<___;
+.type	_bsaes_const,\@object
+.align	64
+_bsaes_const:
+.LM0ISR:	# InvShiftRows constants
+	.quad	0x0a0e0206070b0f03, 0x0004080c0d010509
+.LISRM0:	# NOTE(review): presumably ISR composed with the M0 interleave — confirm against _bsaes_decrypt8
+	.quad	0x01040b0e0205080f, 0x0306090c00070a0d
+.LISR:		# inverse ShiftRows byte permutation
+	.quad	0x0504070602010003, 0x0f0e0d0c080b0a09
+.LBS0:		# bit-slice constants
+	.quad	0x5555555555555555, 0x5555555555555555
+.LBS1:		# bit-slice mask, 2-bit stride
+	.quad	0x3333333333333333, 0x3333333333333333
+.LBS2:		# bit-slice mask, 4-bit stride
+	.quad	0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
+.LSR:		# shiftrows constants
+	.quad	0x0504070600030201, 0x0f0e0d0c0a09080b
+.LSRM0:	# NOTE(review): presumably SR composed with the M0 interleave — confirm against _bsaes_encrypt8
+	.quad	0x0304090e00050a0f, 0x01060b0c0207080d
+.LM0:		# NOTE(review): looks like the bit-slice load/interleave permutation — confirm
+	.quad	0x02060a0e03070b0f, 0x0004080c0105090d
+.LM0SR:	# NOTE(review): presumably M0 composed with ShiftRows — confirm
+	.quad	0x0a0e02060f03070b, 0x0004080c05090d01
+.LNOT:		# magic constants
+	.quad	0xffffffffffffffff, 0xffffffffffffffff
+.L63:		# 0x63 in every byte (AES S-box affine constant, used for key fix-up)
+	.quad	0x6363636363636363, 0x6363636363636363
+.LSWPUP:	# byte-swap upper dword
+	.quad	0x0706050403020100, 0x0c0d0e0f0b0a0908
+.LSWPUPM0SR:	# NOTE(review): presumably .LSWPUP composed with .LM0SR — confirm
+	.quad	0x0a0d02060c03070b, 0x0004080f05090e01
+.LADD1:		# counter increment constants
+	.quad	0x0000000000000000, 0x0000000100000000
+.LADD2:		# increment value n sits in the uppermost dword of each vector
+	.quad	0x0000000000000000, 0x0000000200000000
+.LADD3:
+	.quad	0x0000000000000000, 0x0000000300000000
+.LADD4:
+	.quad	0x0000000000000000, 0x0000000400000000
+.LADD5:
+	.quad	0x0000000000000000, 0x0000000500000000
+.LADD6:
+	.quad	0x0000000000000000, 0x0000000600000000
+.LADD7:
+	.quad	0x0000000000000000, 0x0000000700000000
+.LADD8:
+	.quad	0x0000000000000000, 0x0000000800000000
+.Lxts_magic:	# XTS tweak reduction polynomial: x^128 = x^7 + x^2 + x + 1 (0x87)
+	.long	0x87,0,1,0
+.asciz	"Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
+.align	64
+.size	_bsaes_const,.-_bsaes_const
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lin_prologue
+
+       mov     160($context),%rax      # pull context->Rbp
+
+       lea     0x40(%rax),%rsi         # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0xa0(%rax),%rax         # adjust stack pointer
+
+       mov     0x70(%rax),%rbp
+       mov     0x68(%rax),%rbx
+       mov     0x60(%rax),%r12
+       mov     0x58(%rax),%r13
+       mov     0x50(%rax),%r14
+       mov     0x48(%rax),%r15
+       lea     0x78(%rax),%rax         # adjust stack pointer
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lin_prologue:
+       mov     %rax,152($context)      # restore context->Rsp
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+___
+$code.=<<___ if ($ecb);
+       .rva    .Lecb_enc_prologue
+       .rva    .Lecb_enc_epilogue
+       .rva    .Lecb_enc_info
+
+       .rva    .Lecb_dec_prologue
+       .rva    .Lecb_dec_epilogue
+       .rva    .Lecb_dec_info
+___
+$code.=<<___;
+       .rva    .Lcbc_dec_prologue
+       .rva    .Lcbc_dec_epilogue
+       .rva    .Lcbc_dec_info
+
+       .rva    .Lctr_enc_prologue
+       .rva    .Lctr_enc_epilogue
+       .rva    .Lctr_enc_info
+
+       .rva    .Lxts_enc_prologue
+       .rva    .Lxts_enc_epilogue
+       .rva    .Lxts_enc_info
+
+       .rva    .Lxts_dec_prologue
+       .rva    .Lxts_dec_epilogue
+       .rva    .Lxts_dec_info
+
+.section       .xdata
+.align 8
+___
+$code.=<<___ if ($ecb);
+.Lecb_enc_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lecb_enc_body,.Lecb_enc_epilogue       # HandlerData[]
+.Lecb_dec_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lecb_dec_body,.Lecb_dec_epilogue       # HandlerData[]
+___
+$code.=<<___;
+.Lcbc_dec_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lcbc_dec_body,.Lcbc_dec_epilogue       # HandlerData[]
+.Lctr_enc_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lctr_enc_body,.Lctr_enc_epilogue       # HandlerData[]
+.Lxts_enc_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lxts_enc_body,.Lxts_enc_epilogue       # HandlerData[]
+.Lxts_dec_info:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lxts_dec_body,.Lxts_dec_epilogue       # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl
new file mode 100644 (file)
index 0000000..0254702
--- /dev/null
@@ -0,0 +1,1204 @@
+#!/usr/bin/env perl
+
+######################################################################
+## Constant-time SSSE3 AES core implementation.
+## version 0.1
+##
+## By Mike Hamburg (Stanford University), 2009
+## Public domain.
+##
+## For details see http://shiftleft.org/papers/vector_aes/ and
+## http://crypto.stanford.edu/vpaes/.
+
+######################################################################
+# September 2011.
+#
+# Interface to OpenSSL as "almost" drop-in replacement for
+# aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
+# doesn't handle partial vectors (doesn't have to if called from
+# EVP only). "Drop-in" implies that this module doesn't share key
+# schedule structure with the original nor does it make assumption
+# about its alignment...
+#
+# Performance summary. aes-x86_64.pl column lists large-block CBC
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86_64.pl column -
+# [also large-block CBC] encrypt/decrypt.
+#
+#		aes-x86_64.pl		vpaes-x86_64.pl
+#
+# Core 2(**)	30.5/43.7/14.3		21.8/25.7(***)
+# Nehalem	30.5/42.2/14.6		 9.8/11.8
+# Atom		63.9/79.0/32.1		64.0/84.8(***)
+#
+# (*)	"Hyper-threading" in the context refers rather to cache shared
+#	among multiple cores, than to specifically Intel HTT. As vast
+#	majority of contemporary cores share cache, slower code path
+#	is common place. In other words "with-hyper-threading-off"
+#	results are presented mostly for reference purposes.
+#
+# (**)	"Core 2" refers to initial 65nm design, a.k.a. Conroe.
+#
+# (***)	Less impressive improvement on Core 2 and Atom is due to slow
+#	pshufb, yet it's respectable +40%/78% improvement on Core 2
+#	(as implied, over "hyper-threading-safe" code path).
+#
+#						<appro@openssl.org>
+
+# Command line: [flavour] output-file.  If the first argument contains
+# a dot it is the output file and the flavour is left undefined.
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+# Win64 ABI selected for masm/nasm/mingw64 flavours or .asm output.
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+# Locate the perlasm translator next to this script or in ../../perlasm.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+# All generated code is piped through the translator into the output.
+open STDOUT,"| $^X $xlate $flavour $output";
+
+$PREFIX="vpaes";	# prefix for public symbols (vpaes_encrypt etc.)
+
+$code.=<<___;
+.text
+
+##
+##  _aes_encrypt_core
+##
+##  AES-encrypt %xmm0.
+##
+##  Inputs:
+##     %xmm0 = input
+##     %xmm9-%xmm15 as in _vpaes_preheat
+##    (%rdx) = scheduled keys
+##
+##  Output in %xmm0
+##  Clobbers  %xmm1-%xmm5, %r9, %r10, %r11, %rax
+##  Preserves %xmm6 - %xmm8 so you get some local vectors
+##
+##
+.type	_vpaes_encrypt_core,\@abi-omnipotent
+.align	16
+_vpaes_encrypt_core:
+	mov	%rdx,	%r9		# r9 walks the key schedule
+	mov	\$16,	%r11		# r11 = mc_forward/backward offset
+	mov	240(%rdx),%eax		# eax = round counter
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_ipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	movdqu	(%r9),	%xmm5		# round0 key
+	psrld	\$4,	%xmm1		# high nibbles
+	pand	%xmm9,	%xmm0		# low nibbles
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_ipt+16(%rip), %xmm0	# ipthi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm5,	%xmm2		# input transform + round0 key
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	lea	.Lk_mc_backward(%rip),%r10
+	jmp	.Lenc_entry
+
+.align	16
+.Lenc_loop:
+	# middle of middle round
+	movdqa	%xmm13,	%xmm4	# 4 : sb1u
+	pshufb	%xmm2,	%xmm4	# 4 = sb1u
+	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
+	movdqa	%xmm12,	%xmm0	# 0 : sb1t
+	pshufb	%xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	movdqa	%xmm15,	%xmm5	# 4 : sb2u
+	pshufb	%xmm2,	%xmm5	# 4 = sb2u
+	movdqa	-0x40(%r11,%r10), %xmm1		# .Lk_mc_forward[]
+	movdqa	%xmm14,	%xmm2	# 2 : sb2t
+	pshufb	%xmm3,	%xmm2	# 2 = sb2t
+	pxor	%xmm5,	%xmm2	# 2 = 2A
+	movdqa	(%r11,%r10), %xmm4		# .Lk_mc_backward[]
+	movdqa	%xmm0,	%xmm3	# 3 = A
+	pshufb	%xmm1,	%xmm0	# 0 = B
+	add	\$16,	%r9	# next key
+	pxor	%xmm2,	%xmm0	# 0 = 2A+B
+	pshufb	%xmm4,	%xmm3	# 3 = D
+	add	\$16,	%r11	# next mc
+	pxor	%xmm0,	%xmm3	# 3 = 2A+B+D
+	pshufb	%xmm1,	%xmm0	# 0 = 2B+C
+	and	\$0x30,	%r11	# ... mod 4
+	pxor	%xmm3,	%xmm0	# 0 = 2A+3B+C+D
+	sub	\$1,%rax	# nr--; ZF consumed by jnz below
+
+.Lenc_entry:
+	# top of round
+	movdqa	%xmm9,	%xmm1	# 1 : i
+	pandn	%xmm0,	%xmm1	# 1 = i<<4
+	psrld	\$4,	%xmm1	# 1 = i
+	pand	%xmm9,	%xmm0	# 0 = k
+	movdqa	%xmm11,	%xmm5	# 2 : a/k
+	pshufb	%xmm0,	%xmm5	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3	# 3 : 1/i
+	pshufb	%xmm1,	%xmm3	# 3 = 1/i
+	pxor	%xmm5,	%xmm3	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4	# 4 : 1/j
+	pshufb	%xmm0,	%xmm4	# 4 = 1/j
+	pxor	%xmm5,	%xmm4	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
+	pshufb	%xmm3,	%xmm2	# 2 = 1/iak
+	pxor	%xmm0,	%xmm2	# 2 = io
+	movdqa	%xmm10,	%xmm3	# 3 : 1/jak
+	movdqu	(%r9),	%xmm5
+	pshufb	%xmm4,	%xmm3	# 3 = 1/jak
+	pxor	%xmm1,	%xmm3	# 3 = jo
+	jnz	.Lenc_loop	# flags from sub above; SSE ops leave EFLAGS intact
+
+	# middle of last round
+	movdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
+	movdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
+	pshufb	%xmm2,	%xmm4	# 4 = sbou
+	pxor	%xmm5,	%xmm4	# 4 = sb1u + k
+	pshufb	%xmm3,	%xmm0	# 0 = sb1t
+	movdqa	0x40(%r11,%r10), %xmm1		# .Lk_sr[]
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm1,	%xmm0	# final shiftrows
+	ret
+.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
+	
+##
+##  Decryption core
+##
+##  Same API as encryption core.
+##
+.type	_vpaes_decrypt_core,\@abi-omnipotent
+.align	16
+_vpaes_decrypt_core:
+	mov	%rdx,	%r9		# load key
+	mov	240(%rdx),%eax		# eax = round counter
+	movdqa	%xmm9,	%xmm1
+	movdqa	.Lk_dipt(%rip), %xmm2	# iptlo
+	pandn	%xmm0,	%xmm1
+	mov	%rax,	%r11
+	psrld	\$4,	%xmm1		# high nibbles
+	movdqu	(%r9),	%xmm5		# round0 key
+	shl	\$4,	%r11
+	pand	%xmm9,	%xmm0		# low nibbles
+	pshufb	%xmm0,	%xmm2
+	movdqa	.Lk_dipt+16(%rip), %xmm0 # ipthi
+	xor	\$0x30,	%r11
+	lea	.Lk_dsbd(%rip),%r10
+	pshufb	%xmm1,	%xmm0
+	and	\$0x30,	%r11		# r11 = round-dependent .Lk_sr offset
+	pxor	%xmm5,	%xmm2
+	movdqa	.Lk_mc_forward+48(%rip), %xmm5
+	pxor	%xmm2,	%xmm0
+	add	\$16,	%r9
+	add	%r10,	%r11
+	jmp	.Ldec_entry
+
+.align	16
+.Ldec_loop:
+##
+##  Inverse mix columns
+##
+	movdqa	-0x20(%r10),%xmm4	# 4 : sb9u
+	pshufb	%xmm2,	%xmm4		# 4 = sb9u
+	pxor	%xmm0,	%xmm4
+	movdqa	-0x10(%r10),%xmm0	# 0 : sb9t
+	pshufb	%xmm3,	%xmm0		# 0 = sb9t
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	add	\$16, %r9		# next round key
+
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa	0x00(%r10),%xmm4	# 4 : sbdu
+	pshufb	%xmm2,	%xmm4		# 4 = sbdu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa	0x10(%r10),%xmm0	# 0 : sbdt
+	pshufb	%xmm3,	%xmm0		# 0 = sbdt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	sub	\$1,%rax		# nr--; ZF consumed by jnz below
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa	0x20(%r10),%xmm4	# 4 : sbbu
+	pshufb	%xmm2,	%xmm4		# 4 = sbbu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa	0x30(%r10),%xmm0	# 0 : sbbt
+	pshufb	%xmm3,	%xmm0		# 0 = sbbt
+	pxor	%xmm4,	%xmm0		# 0 = ch
+	
+	pshufb	%xmm5,	%xmm0		# MC ch
+	movdqa	0x40(%r10),%xmm4	# 4 : sbeu
+	pshufb	%xmm2,	%xmm4		# 4 = sbeu
+	pxor	%xmm0,	%xmm4		# 4 = ch
+	movdqa	0x50(%r10),%xmm0	# 0 : sbet
+	pshufb	%xmm3,	%xmm0		# 0 = sbet
+	pxor	%xmm4,	%xmm0		# 0 = ch
+
+	palignr	\$12,	%xmm5,	%xmm5	# rotate MC constant for next round
+	
+.Ldec_entry:
+	# top of round
+	movdqa	%xmm9,	%xmm1	# 1 : i
+	pandn	%xmm0,	%xmm1	# 1 = i<<4
+	psrld	\$4,	%xmm1	# 1 = i
+	pand	%xmm9,	%xmm0	# 0 = k
+	movdqa	%xmm11,	%xmm2	# 2 : a/k
+	pshufb	%xmm0,	%xmm2	# 2 = a/k
+	pxor	%xmm1,	%xmm0	# 0 = j
+	movdqa	%xmm10,	%xmm3	# 3 : 1/i
+	pshufb	%xmm1,	%xmm3	# 3 = 1/i
+	pxor	%xmm2,	%xmm3	# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4	# 4 : 1/j
+	pshufb	%xmm0,	%xmm4	# 4 = 1/j
+	pxor	%xmm2,	%xmm4	# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2	# 2 : 1/iak
+	pshufb	%xmm3,	%xmm2	# 2 = 1/iak
+	pxor	%xmm0,	%xmm2	# 2 = io
+	movdqa	%xmm10,	%xmm3	# 3 : 1/jak
+	pshufb	%xmm4,	%xmm3	# 3 = 1/jak
+	pxor	%xmm1,	%xmm3	# 3 = jo
+	movdqu	(%r9),	%xmm0
+	jnz	.Ldec_loop	# flags from sub above; SSE ops leave EFLAGS intact
+
+	# middle of last round
+	movdqa	0x60(%r10), %xmm4	# 3 : sbou
+	pshufb	%xmm2,	%xmm4	# 4 = sbou
+	pxor	%xmm0,	%xmm4	# 4 = sb1u + k
+	movdqa	0x70(%r10), %xmm0	# 0 : sbot
+	movdqa	.Lk_sr-.Lk_dsbd(%r11), %xmm2
+	pshufb	%xmm3,	%xmm0	# 0 = sb1t
+	pxor	%xmm4,	%xmm0	# 0 = A
+	pshufb	%xmm2,	%xmm0	# final shiftrows
+	ret
+.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
+
+########################################################
+##                                                    ##
+##                  AES key schedule                  ##
+##                                                    ##
+########################################################
+.type	_vpaes_schedule_core,\@abi-omnipotent
+.align	16
+_vpaes_schedule_core:
+	# rdi = key
+	# rsi = size in bits
+	# rdx = buffer
+	# rcx = direction.  0=encrypt, 1=decrypt
+	# r8  = shiftrows offset into .Lk_sr, set by caller
+
+	call	_vpaes_preheat		# load the tables
+	movdqa	.Lk_rcon(%rip), %xmm8	# load rcon
+	movdqu	(%rdi),	%xmm0		# load key (unaligned)
+
+	# input transform
+	movdqa	%xmm0,	%xmm3
+	lea	.Lk_ipt(%rip), %r11
+	call	_vpaes_schedule_transform
+	movdqa	%xmm0,	%xmm7		# xmm7 = transformed key, carried between rounds
+
+	lea	.Lk_sr(%rip),%r10
+	test	%rcx,	%rcx
+	jnz	.Lschedule_am_decrypting
+
+	# encrypting, output zeroth round key after transform
+	movdqu	%xmm0,	(%rdx)
+	jmp	.Lschedule_go
+
+.Lschedule_am_decrypting:
+	# decrypting, output zeroth round key after shiftrows
+	movdqa	(%r8,%r10),%xmm1	# shiftrows permutation for this round
+	pshufb	%xmm1,	%xmm3
+	movdqu	%xmm3,	(%rdx)
+	xor	\$0x30,	%r8
+
+.Lschedule_go:
+	cmp	\$192,	%esi
+	ja	.Lschedule_256
+	je	.Lschedule_192
+	# 128: fall through
+
+##
+##  .schedule_128
+##
+##  128-bit specific part of key schedule.
+##
+##  This schedule is really simple, because all its parts
+##  are accomplished by the subroutines.
+##
+.Lschedule_128:
+	mov	\$10, %esi
+	
+.Loop_schedule_128:
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# write output
+	jmp	.Loop_schedule_128
+
+##
+##  .aes_schedule_192
+##
+##  192-bit specific part of key schedule.
+##
+##  The main body of this schedule is the same as the 128-bit
+##  schedule, but with more smearing.  The long, high side is
+##  stored in %xmm7 as before, and the short, low side is in
+##  the high bits of %xmm6.
+##
+##  This schedule is somewhat nastier, however, because each
+##  round produces 192 bits of key material, or 1.5 round keys.
+##  Therefore, on each cycle we do 2 rounds and produce 3 round
+##  keys.
+##
+.align	16
+.Lschedule_192:
+	movdqu	8(%rdi),%xmm0		# load key part 2 (very unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	movdqa	%xmm0,	%xmm6		# save short part
+	pxor	%xmm4,	%xmm4		# clear 4
+	movhlps	%xmm4,	%xmm6		# clobber low side with zeros
+	mov	\$4,	%esi
+
+.Loop_schedule_192:
+	call	_vpaes_schedule_round
+	palignr	\$8,%xmm6,%xmm0 
+	call	_vpaes_schedule_mangle	# save key n
+	call	_vpaes_schedule_192_smear
+	call	_vpaes_schedule_mangle	# save key n+1
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	# save key n+2
+	call	_vpaes_schedule_192_smear
+	jmp	.Loop_schedule_192
+
+##
+##  .aes_schedule_256
+##
+##  256-bit specific part of key schedule.
+##
+##  The structure here is very similar to the 128-bit
+##  schedule, but with an additional "low side" in
+##  %xmm6.  The low side's rounds are the same as the
+##  high side's, except no rcon and no rotation.
+##
+.align	16
+.Lschedule_256:
+	movdqu	16(%rdi),%xmm0		# load key part 2 (unaligned)
+	call	_vpaes_schedule_transform	# input transform
+	mov	\$7, %esi
+	
+.Loop_schedule_256:
+	call	_vpaes_schedule_mangle	# output low result
+	movdqa	%xmm0,	%xmm6		# save cur_lo in xmm6
+
+	# high round
+	call	_vpaes_schedule_round
+	dec	%rsi
+	jz	.Lschedule_mangle_last
+	call	_vpaes_schedule_mangle	
+
+	# low round. swap xmm7 and xmm6
+	pshufd	\$0xFF,	%xmm0,	%xmm0
+	movdqa	%xmm7,	%xmm5
+	movdqa	%xmm6,	%xmm7
+	call	_vpaes_schedule_low_round
+	movdqa	%xmm5,	%xmm7
+	
+	jmp	.Loop_schedule_256
+
+	
+##
+##  .aes_schedule_mangle_last
+##
+##  Mangler for last round of key schedule
+##  Mangles %xmm0
+##    when encrypting, outputs out(%xmm0) ^ 63
+##    when decrypting, outputs unskew(%xmm0)
+##
+##  Always called right before return... jumps to cleanup and exits
+##
+.align	16
+.Lschedule_mangle_last:
+	# schedule last round key from xmm0
+	lea	.Lk_deskew(%rip),%r11	# prepare to deskew
+	test	%rcx,	%rcx
+	jnz	.Lschedule_mangle_last_dec
+
+	# encrypting
+	movdqa	(%r8,%r10),%xmm1
+	pshufb	%xmm1,	%xmm0		# output permute
+	lea	.Lk_opt(%rip),	%r11	# prepare to output transform
+	add	\$32,	%rdx
+
+.Lschedule_mangle_last_dec:
+	add	\$-16,	%rdx
+	pxor	.Lk_s63(%rip),	%xmm0
+	call	_vpaes_schedule_transform # output transform
+	movdqu	%xmm0,	(%rdx)		# save last key
+
+	# cleanup: wipe key material from registers
+	pxor	%xmm0,	%xmm0
+	pxor	%xmm1,	%xmm1
+	pxor	%xmm2,	%xmm2
+	pxor	%xmm3,	%xmm3
+	pxor	%xmm4,	%xmm4
+	pxor	%xmm5,	%xmm5
+	pxor	%xmm6,	%xmm6
+	pxor	%xmm7,	%xmm7
+	ret
+.size	_vpaes_schedule_core,.-_vpaes_schedule_core
+
+##
+##  .aes_schedule_192_smear
+##
+##  Smear the short, low side in the 192-bit key schedule.
+##
+##  Inputs:
+##    %xmm7: high side, b  a  x  y
+##    %xmm6:  low side, d  c  0  0
+##    %xmm13: 0
+##
+##  Outputs:
+##    %xmm6: b+c+d  b+c  0  0
+##    %xmm0: b+c+d  b+c  b  a
+##
+.type	_vpaes_schedule_192_smear,\@abi-omnipotent
+.align	16
+_vpaes_schedule_192_smear:
+	pshufd	\$0x80,	%xmm6,	%xmm0	# d c 0 0 -> c 0 0 0
+	pxor	%xmm0,	%xmm6		# -> c+d c 0 0
+	pshufd	\$0xFE,	%xmm7,	%xmm0	# b a _ _ -> b b b a
+	pxor	%xmm0,	%xmm6		# -> b+c+d b+c b a
+	movdqa	%xmm6,	%xmm0
+	pxor	%xmm1,	%xmm1
+	movhlps	%xmm1,	%xmm6		# clobber low side with zeros
+	ret
+.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
+
+##
+##  .aes_schedule_round
+##
+##  Runs one main round of the key schedule on %xmm0, %xmm7
+##
+##  Specifically, runs subbytes on the high dword of %xmm0
+##  then rotates it by one byte and xors into the low dword of
+##  %xmm7.
+##
+##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
+##  next rcon.
+##
+##  Smears the dwords of %xmm7 by xoring the low into the
+##  second low, result into third, result into highest.
+##
+##  Returns results in %xmm7 = %xmm0.
+##  Clobbers %xmm1-%xmm4, %r11.
+##
+.type	_vpaes_schedule_round,\@abi-omnipotent
+.align	16
+_vpaes_schedule_round:
+	# extract rcon from xmm8
+	pxor	%xmm1,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm1
+	palignr	\$15,	%xmm8,	%xmm8
+	pxor	%xmm1,	%xmm7
+
+	# rotate
+	pshufd	\$0xFF,	%xmm0,	%xmm0	# broadcast high dword
+	palignr	\$1,	%xmm0,	%xmm0	# rotword
+	
+	# fall through...
+	
+	# low round: same as high round, but no rotation and no rcon.
+_vpaes_schedule_low_round:	# second entry point, used by 256-bit schedule
+	# smear xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$4,	%xmm7
+	pxor	%xmm1,	%xmm7
+	movdqa	%xmm7,	%xmm1
+	pslldq	\$8,	%xmm7
+	pxor	%xmm1,	%xmm7
+	pxor	.Lk_s63(%rip), %xmm7
+
+	# subbytes
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm0,	%xmm1
+	psrld	\$4,	%xmm1		# 1 = i
+	pand	%xmm9,	%xmm0		# 0 = k
+	movdqa	%xmm11,	%xmm2		# 2 : a/k
+	pshufb	%xmm0,	%xmm2		# 2 = a/k
+	pxor	%xmm1,	%xmm0		# 0 = j
+	movdqa	%xmm10,	%xmm3		# 3 : 1/i
+	pshufb	%xmm1,	%xmm3		# 3 = 1/i
+	pxor	%xmm2,	%xmm3		# 3 = iak = 1/i + a/k
+	movdqa	%xmm10,	%xmm4		# 4 : 1/j
+	pshufb	%xmm0,	%xmm4		# 4 = 1/j
+	pxor	%xmm2,	%xmm4		# 4 = jak = 1/j + a/k
+	movdqa	%xmm10,	%xmm2		# 2 : 1/iak
+	pshufb	%xmm3,	%xmm2		# 2 = 1/iak
+	pxor	%xmm0,	%xmm2		# 2 = io
+	movdqa	%xmm10,	%xmm3		# 3 : 1/jak
+	pshufb	%xmm4,	%xmm3		# 3 = 1/jak
+	pxor	%xmm1,	%xmm3		# 3 = jo
+	movdqa	%xmm13,	%xmm4		# 4 : sbou
+	pshufb	%xmm2,	%xmm4		# 4 = sbou
+	movdqa	%xmm12,	%xmm0		# 0 : sbot
+	pshufb	%xmm3,	%xmm0		# 0 = sb1t
+	pxor	%xmm4,	%xmm0		# 0 = sbox output
+
+	# add in smeared stuff
+	pxor	%xmm7,	%xmm0	
+	movdqa	%xmm0,	%xmm7
+	ret
+.size	_vpaes_schedule_round,.-_vpaes_schedule_round
+
+##
+##  .aes_schedule_transform
+##
+##  Linear-transform %xmm0 according to tables at (%r11)
+##
+##  Requires that %xmm9 = 0x0F0F... as in preheat
+##  Output in %xmm0
+##  Clobbers %xmm1, %xmm2
+##
+.type	_vpaes_schedule_transform,\@abi-omnipotent
+.align	16
+_vpaes_schedule_transform:
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm0,	%xmm1
+	psrld	\$4,	%xmm1		# 1 = high nibbles
+	pand	%xmm9,	%xmm0		# 0 = low nibbles
+	movdqa	(%r11), %xmm2	# lo
+	pshufb	%xmm0,	%xmm2
+	movdqa	16(%r11), %xmm0	# hi
+	pshufb	%xmm1,	%xmm0
+	pxor	%xmm2,	%xmm0		# combine table lookups
+	ret
+.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
+
+##
+##  .aes_schedule_mangle
+##
+##  Mangle xmm0 from (basis-transformed) standard version
+##  to our version.
+##
+##  On encrypt,
+##    xor with 0x63
+##    multiply by circulant 0,1,1,1
+##    apply shiftrows transform
+##
+##  On decrypt,
+##    xor with 0x63
+##    multiply by "inverse mixcolumns" circulant E,B,D,9
+##    deskew
+##    apply shiftrows transform
+##
+##
+##  Writes out to (%rdx), and increments or decrements it
+##  Keeps track of round number mod 4 in %r8
+##  Preserves xmm0
+##  Clobbers xmm1-xmm5
+##
+.type	_vpaes_schedule_mangle,\@abi-omnipotent
+.align	16
+_vpaes_schedule_mangle:
+	movdqa	%xmm0,	%xmm4	# save xmm0 for later
+	movdqa	.Lk_mc_forward(%rip),%xmm5
+	test	%rcx, 	%rcx
+	jnz	.Lschedule_mangle_dec
+
+	# encrypting
+	add	\$16,	%rdx
+	pxor	.Lk_s63(%rip),%xmm4
+	pshufb	%xmm5,	%xmm4
+	movdqa	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3
+	pshufb	%xmm5,	%xmm4
+	pxor	%xmm4,	%xmm3	# xmm3 = x + rot(x) + rot2(x): circulant 0,1,1,1
+
+	jmp	.Lschedule_mangle_both
+.align	16
+.Lschedule_mangle_dec:
+	# inverse mix columns
+	lea	.Lk_dksd(%rip),%r11
+	movdqa	%xmm9,	%xmm1
+	pandn	%xmm4,	%xmm1
+	psrld	\$4,	%xmm1	# 1 = hi
+	pand	%xmm9,	%xmm4	# 4 = lo
+
+	movdqa	0x00(%r11), %xmm2	# dksd lo
+	pshufb	%xmm4,	%xmm2
+	movdqa	0x10(%r11), %xmm3	# dksd hi
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x20(%r11), %xmm2	# dksb lo
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x30(%r11), %xmm3	# dksb hi
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x40(%r11), %xmm2	# dkse lo
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x50(%r11), %xmm3	# dkse hi
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+	pshufb	%xmm5,	%xmm3
+
+	movdqa	0x60(%r11), %xmm2	# dks9 lo
+	pshufb	%xmm4,	%xmm2
+	pxor	%xmm3,	%xmm2
+	movdqa	0x70(%r11), %xmm3	# dks9 hi
+	pshufb	%xmm1,	%xmm3
+	pxor	%xmm2,	%xmm3
+
+	add	\$-16,	%rdx		# schedule written backwards when decrypting
+
+.Lschedule_mangle_both:
+	movdqa	(%r8,%r10),%xmm1	# shiftrows permutation for this round
+	pshufb	%xmm1,%xmm3
+	add	\$-16,	%r8
+	and	\$0x30,	%r8		# keep r8 in 0,0x10,0x20,0x30
+	movdqu	%xmm3,	(%rdx)
+	ret
+.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
+
+#
+# Interface to OpenSSL
+#
+.globl	${PREFIX}_set_encrypt_key
+.type	${PREFIX}_set_encrypt_key,\@function,3
+.align	16
+${PREFIX}_set_encrypt_key:
+___
+# Win64 ABI: xmm6-xmm15 are callee-saved, so spill them around the body;
+# the .Lenc_key_body/.Lenc_key_epilogue labels bracket the guarded region
+# (presumably referenced by SEH data elsewhere in the file - confirm).
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+
+	mov	\$0,%ecx		# direction = encrypt
+	mov	\$0x30,%r8d		# initial shiftrows offset
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax		# return 0 = success
+	ret
+.size	${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
+
+.globl	${PREFIX}_set_decrypt_key
+.type	${PREFIX}_set_decrypt_key,\@function,3
+.align	16
+${PREFIX}_set_decrypt_key:
+___
+# Win64 ABI: spill callee-saved xmm6-xmm15 around the body.
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_key_body:
+___
+$code.=<<___;
+	mov	%esi,%eax
+	shr	\$5,%eax
+	add	\$5,%eax
+	mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+5;
+	shl	\$4,%eax
+	lea	16(%rdx,%rax),%rdx	# schedule is written backwards, start at the end
+
+	mov	\$1,%ecx		# direction = decrypt
+	mov	%esi,%r8d		# initial shiftrows offset from key size
+	shr	\$1,%r8d
+	and	\$32,%r8d
+	xor	\$32,%r8d	# nbits==192?0:32
+	call	_vpaes_schedule_core
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_key_epilogue:
+___
+$code.=<<___;
+	xor	%eax,%eax		# return 0 = success
+	ret
+.size	${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
+
+.globl	${PREFIX}_encrypt
+.type	${PREFIX}_encrypt,\@function,3
+.align	16
+${PREFIX}_encrypt:
+___
+# Win64 ABI: spill callee-saved xmm6-xmm15 around the body.
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lenc_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0		# load one block (unaligned ok)
+	call	_vpaes_preheat
+	call	_vpaes_encrypt_core
+	movdqu	%xmm0,(%rsi)		# store result
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lenc_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt
+
+.globl	${PREFIX}_decrypt
+.type	${PREFIX}_decrypt,\@function,3
+.align	16
+${PREFIX}_decrypt:
+___
+# Win64 ABI: spill callee-saved xmm6-xmm15 around the body.
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Ldec_body:
+___
+$code.=<<___;
+	movdqu	(%rdi),%xmm0		# load one block (unaligned ok)
+	call	_vpaes_preheat
+	call	_vpaes_decrypt_core
+	movdqu	%xmm0,(%rsi)		# store result
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Ldec_epilogue:
+___
+$code.=<<___;
+	ret
+.size	${PREFIX}_decrypt,.-${PREFIX}_decrypt
+___
+{
+my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# void vpaes_cbc_encrypt (const unsigned char *inp, unsigned char *out,
+#                         size_t length, const AES_KEY *key,
+#                         unsigned char *ivp,const int enc);
+#
+# Only whole 16-byte blocks are processed (length is effectively rounded
+# down).  Lengths shorter than 16 - including 0 - are rejected up front:
+# previously such a call would still have processed one block, over-reading
+# the input and clobbering 16 bytes of output.
+$code.=<<___;
+.globl	${PREFIX}_cbc_encrypt
+.type	${PREFIX}_cbc_encrypt,\@function,6
+.align	16
+${PREFIX}_cbc_encrypt:
+	xchg	$key,$len
+___
+($len,$key)=($key,$len);	# arguments swapped to match the enc/dec cores
+$code.=<<___;
+	sub	\$16,$len
+	jc	.Lcbc_abort		# length < 16: nothing to do
+___
+# Win64 ABI: spill callee-saved xmm6-xmm15 around the body.  The abort
+# path above branches out before the spill, so it returns directly.
+$code.=<<___ if ($win64);
+	lea	-0xb8(%rsp),%rsp
+	movaps	%xmm6,0x10(%rsp)
+	movaps	%xmm7,0x20(%rsp)
+	movaps	%xmm8,0x30(%rsp)
+	movaps	%xmm9,0x40(%rsp)
+	movaps	%xmm10,0x50(%rsp)
+	movaps	%xmm11,0x60(%rsp)
+	movaps	%xmm12,0x70(%rsp)
+	movaps	%xmm13,0x80(%rsp)
+	movaps	%xmm14,0x90(%rsp)
+	movaps	%xmm15,0xa0(%rsp)
+.Lcbc_body:
+___
+$code.=<<___;
+	movdqu	($ivp),%xmm6		# load IV
+	sub	$inp,$out		# output addressed as (out-in)+in
+	call	_vpaes_preheat
+	cmp	\$0,${enc}d
+	je	.Lcbc_dec_loop
+	jmp	.Lcbc_enc_loop
+.align	16
+.Lcbc_enc_loop:
+	movdqu	($inp),%xmm0
+	pxor	%xmm6,%xmm0		# xor in IV / previous ciphertext
+	call	_vpaes_encrypt_core
+	movdqa	%xmm0,%xmm6		# ciphertext becomes next IV
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_enc_loop
+	jmp	.Lcbc_done
+.align	16
+.Lcbc_dec_loop:
+	movdqu	($inp),%xmm0
+	movdqa	%xmm0,%xmm7		# stash ciphertext: it is the next IV
+	call	_vpaes_decrypt_core
+	pxor	%xmm6,%xmm0		# xor with IV / previous ciphertext
+	movdqa	%xmm7,%xmm6
+	movdqu	%xmm0,($out,$inp)
+	lea	16($inp),$inp
+	sub	\$16,$len
+	jnc	.Lcbc_dec_loop
+.Lcbc_done:
+	movdqu	%xmm6,($ivp)		# save IV
+___
+$code.=<<___ if ($win64);
+	movaps	0x10(%rsp),%xmm6
+	movaps	0x20(%rsp),%xmm7
+	movaps	0x30(%rsp),%xmm8
+	movaps	0x40(%rsp),%xmm9
+	movaps	0x50(%rsp),%xmm10
+	movaps	0x60(%rsp),%xmm11
+	movaps	0x70(%rsp),%xmm12
+	movaps	0x80(%rsp),%xmm13
+	movaps	0x90(%rsp),%xmm14
+	movaps	0xa0(%rsp),%xmm15
+	lea	0xb8(%rsp),%rsp
+.Lcbc_epilogue:
+___
+$code.=<<___;
+.Lcbc_abort:
+	ret
+.size	${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
+___
+}
+$code.=<<___;
+##
+##  _aes_preheat
+##
+##  Fills register %r10 -> .aes_consts (so you can -fPIC)
+##  and %xmm9-%xmm15 as specified below.
+##
+##  xmm9 = 0x0F nibble mask, xmm10/xmm11 = inverse tables,
+##  xmm12/xmm13 = sb1, xmm14/xmm15 = sb2.  Clobbers nothing else.
+##
+.type	_vpaes_preheat,\@abi-omnipotent
+.align	16
+_vpaes_preheat:
+	lea	.Lk_s0F(%rip), %r10
+	movdqa	-0x20(%r10), %xmm10	# .Lk_inv
+	movdqa	-0x10(%r10), %xmm11	# .Lk_inv+16
+	movdqa	0x00(%r10), %xmm9	# .Lk_s0F
+	movdqa	0x30(%r10), %xmm13	# .Lk_sb1
+	movdqa	0x40(%r10), %xmm12	# .Lk_sb1+16
+	movdqa	0x50(%r10), %xmm15	# .Lk_sb2
+	movdqa	0x60(%r10), %xmm14	# .Lk_sb2+16
+	ret
+.size	_vpaes_preheat,.-_vpaes_preheat
+########################################################
+##                                                    ##
+##                     Constants                      ##
+##                                                    ##
+########################################################
+.type  _vpaes_consts,\@object
+.align 64
+_vpaes_consts:
+.Lk_inv:       # inv, inva
+       .quad   0x0E05060F0D080180, 0x040703090A0B0C02
+       .quad   0x01040A060F0B0780, 0x030D0E0C02050809
+
+.Lk_s0F:       # s0F
+       .quad   0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
+
+.Lk_ipt:       # input transform (lo, hi)
+       .quad   0xC2B2E8985A2A7000, 0xCABAE09052227808
+       .quad   0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
+
+.Lk_sb1:       # sb1u, sb1t
+       .quad   0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
+       .quad   0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
+.Lk_sb2:       # sb2u, sb2t
+       .quad   0xE27A93C60B712400, 0x5EB7E955BC982FCD
+       .quad   0x69EB88400AE12900, 0xC2A163C8AB82234A
+.Lk_sbo:       # sbou, sbot
+       .quad   0xD0D26D176FBDC700, 0x15AABF7AC502A878
+       .quad   0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
+
+.Lk_mc_forward:        # mc_forward
+       .quad   0x0407060500030201, 0x0C0F0E0D080B0A09
+       .quad   0x080B0A0904070605, 0x000302010C0F0E0D
+       .quad   0x0C0F0E0D080B0A09, 0x0407060500030201
+       .quad   0x000302010C0F0E0D, 0x080B0A0904070605
+
+.Lk_mc_backward:# mc_backward
+       .quad   0x0605040702010003, 0x0E0D0C0F0A09080B
+       .quad   0x020100030E0D0C0F, 0x0A09080B06050407
+       .quad   0x0E0D0C0F0A09080B, 0x0605040702010003
+       .quad   0x0A09080B06050407, 0x020100030E0D0C0F
+
+.Lk_sr:                # sr
+       .quad   0x0706050403020100, 0x0F0E0D0C0B0A0908
+       .quad   0x030E09040F0A0500, 0x0B06010C07020D08
+       .quad   0x0F060D040B020900, 0x070E050C030A0108
+       .quad   0x0B0E0104070A0D00, 0x0306090C0F020508
+
+.Lk_rcon:      # rcon
+       .quad   0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
+
+.Lk_s63:       # s63: all equal to 0x63 transformed
+       .quad   0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
+
+.Lk_opt:       # output transform
+       .quad   0xFF9F4929D6B66000, 0xF7974121DEBE6808
+       .quad   0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
+
+.Lk_deskew:    # deskew tables: inverts the sbox's "skew"
+       .quad   0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
+       .quad   0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
+
+##
+##  Decryption stuff
+##  Key schedule constants
+##
+.Lk_dksd:      # decryption key schedule: invskew x*D
+       .quad   0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
+       .quad   0x41C277F4B5368300, 0x5FDC69EAAB289D1E
+.Lk_dksb:      # decryption key schedule: invskew x*B
+       .quad   0x9A4FCA1F8550D500, 0x03D653861CC94C99
+       .quad   0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
+.Lk_dkse:      # decryption key schedule: invskew x*E + 0x63
+       .quad   0xD5031CCA1FC9D600, 0x53859A4C994F5086
+       .quad   0xA23196054FDC7BE8, 0xCD5EF96A20B31487
+.Lk_dks9:      # decryption key schedule: invskew x*9
+       .quad   0xB6116FC87ED9A700, 0x4AED933482255BFC
+       .quad   0x4576516227143300, 0x8BB89FACE9DAFDCE
+
+##
+##  Decryption stuff
+##  Round function constants
+##
+.Lk_dipt:      # decryption input transform
+       .quad   0x0F505B040B545F00, 0x154A411E114E451A
+       .quad   0x86E383E660056500, 0x12771772F491F194
+
+.Lk_dsb9:      # decryption sbox output *9*u, *9*t
+       .quad   0x851C03539A86D600, 0xCAD51F504F994CC9
+       .quad   0xC03B1789ECD74900, 0x725E2C9EB2FBA565
+.Lk_dsbd:      # decryption sbox output *D*u, *D*t
+       .quad   0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
+       .quad   0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
+.Lk_dsbb:      # decryption sbox output *B*u, *B*t
+       .quad   0xD022649296B44200, 0x602646F6B0F2D404
+       .quad   0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
+.Lk_dsbe:      # decryption sbox output *E*u, *E*t
+       .quad   0x46F2929626D4D000, 0x2242600464B4F6B0
+       .quad   0x0C55A6CDFFAAC100, 0x9467F36B98593E32
+.Lk_dsbo:      # decryption sbox final output
+       .quad   0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
+       .quad   0x12D7560F93441D00, 0xCA4B8159D8C58E9C
+.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.align 64
+.size  _vpaes_consts,.-_vpaes_consts
+___
+
+if ($win64) {
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # prologue label
+       cmp     %r10,%rbx               # context->Rip<prologue label
+       jb      .Lin_prologue
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lin_prologue
+
+       lea     16(%rax),%rsi           # %xmm save area
+       lea     512($context),%rdi      # &context.Xmm6
+       mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
+       .long   0xa548f3fc              # cld; rep movsq
+       lea     0xb8(%rax),%rax         # adjust stack pointer
+
+.Lin_prologue:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$`1232/8`,%ecx         # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_${PREFIX}_set_encrypt_key
+       .rva    .LSEH_end_${PREFIX}_set_encrypt_key
+       .rva    .LSEH_info_${PREFIX}_set_encrypt_key
+
+       .rva    .LSEH_begin_${PREFIX}_set_decrypt_key
+       .rva    .LSEH_end_${PREFIX}_set_decrypt_key
+       .rva    .LSEH_info_${PREFIX}_set_decrypt_key
+
+       .rva    .LSEH_begin_${PREFIX}_encrypt
+       .rva    .LSEH_end_${PREFIX}_encrypt
+       .rva    .LSEH_info_${PREFIX}_encrypt
+
+       .rva    .LSEH_begin_${PREFIX}_decrypt
+       .rva    .LSEH_end_${PREFIX}_decrypt
+       .rva    .LSEH_info_${PREFIX}_decrypt
+
+       .rva    .LSEH_begin_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_end_${PREFIX}_cbc_encrypt
+       .rva    .LSEH_info_${PREFIX}_cbc_encrypt
+
+.section       .xdata
+.align 8
+.LSEH_info_${PREFIX}_set_encrypt_key:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lenc_key_body,.Lenc_key_epilogue       # HandlerData[]
+.LSEH_info_${PREFIX}_set_decrypt_key:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Ldec_key_body,.Ldec_key_epilogue       # HandlerData[]
+.LSEH_info_${PREFIX}_encrypt:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lenc_body,.Lenc_epilogue               # HandlerData[]
+.LSEH_info_${PREFIX}_decrypt:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Ldec_body,.Ldec_epilogue               # HandlerData[]
+.LSEH_info_${PREFIX}_cbc_encrypt:
+       .byte   9,0,0,0
+       .rva    se_handler
+       .rva    .Lcbc_body,.Lcbc_epilogue               # HandlerData[]
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+
+close STDOUT;
diff --git a/crypto/bn/asm/x86_64-gf2m.pl b/crypto/bn/asm/x86_64-gf2m.pl
new file mode 100644 (file)
index 0000000..1658acb
--- /dev/null
@@ -0,0 +1,389 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# May 2011
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication used
+# in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
+# the time being... Except that it has two code paths: code suitable
+# for any x86_64 CPU and PCLMULQDQ one suitable for Westmere and
+# later. Improvement varies from one benchmark and µ-arch to another.
+# Vanilla code path is at most 20% faster than compiler-generated code
+# [not very impressive], while PCLMULQDQ - whole 85%-160% better on
+# 163- and 571-bit ECDH benchmarks on Intel CPUs. Keep in mind that
+# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
+# all CPU time is burnt in it...
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+($lo,$hi)=("%rax","%rdx");     $a=$lo;
+($i0,$i1)=("%rsi","%rdi");
+($t0,$t1)=("%rbx","%rcx");
+($b,$mask)=("%rbp","%r8");
+($a1,$a2,$a4,$a8,$a12,$a48)=map("%r$_",(9..15));
+($R,$Tx)=("%xmm0","%xmm1");
+
+$code.=<<___;
+.text
+
+.type  _mul_1x1,\@abi-omnipotent
+.align 16
+_mul_1x1:
+       sub     \$128+8,%rsp
+       mov     \$-1,$a1
+       lea     ($a,$a),$i0
+       shr     \$3,$a1
+       lea     (,$a,4),$i1
+       and     $a,$a1                  # a1=a&0x1fffffffffffffff
+       lea     (,$a,8),$a8
+       sar     \$63,$a                 # broadcast 63rd bit
+       lea     ($a1,$a1),$a2
+       sar     \$63,$i0                # broadcast 62nd bit
+       lea     (,$a1,4),$a4
+       and     $b,$a
+       sar     \$63,$i1                # broadcast 61st bit
+       mov     $a,$hi                  # $a is $lo
+       shl     \$63,$lo
+       and     $b,$i0
+       shr     \$1,$hi
+       mov     $i0,$t1
+       shl     \$62,$i0
+       and     $b,$i1
+       shr     \$2,$t1
+       xor     $i0,$lo
+       mov     $i1,$t0
+       shl     \$61,$i1
+       xor     $t1,$hi
+       shr     \$3,$t0
+       xor     $i1,$lo
+       xor     $t0,$hi
+
+       mov     $a1,$a12
+       movq    \$0,0(%rsp)             # tab[0]=0
+       xor     $a2,$a12                # a1^a2
+       mov     $a1,8(%rsp)             # tab[1]=a1
+        mov    $a4,$a48
+       mov     $a2,16(%rsp)            # tab[2]=a2
+        xor    $a8,$a48                # a4^a8
+       mov     $a12,24(%rsp)           # tab[3]=a1^a2
+
+       xor     $a4,$a1
+       mov     $a4,32(%rsp)            # tab[4]=a4
+       xor     $a4,$a2
+       mov     $a1,40(%rsp)            # tab[5]=a1^a4
+       xor     $a4,$a12
+       mov     $a2,48(%rsp)            # tab[6]=a2^a4
+        xor    $a48,$a1                # a1^a4^a4^a8=a1^a8
+       mov     $a12,56(%rsp)           # tab[7]=a1^a2^a4
+        xor    $a48,$a2                # a2^a4^a4^a8=a2^a8
+
+       mov     $a8,64(%rsp)            # tab[8]=a8
+       xor     $a48,$a12               # a1^a2^a4^a4^a8=a1^a2^a8
+       mov     $a1,72(%rsp)            # tab[9]=a1^a8
+        xor    $a4,$a1                 # a1^a8^a4
+       mov     $a2,80(%rsp)            # tab[10]=a2^a8
+        xor    $a4,$a2                 # a2^a8^a4
+       mov     $a12,88(%rsp)           # tab[11]=a1^a2^a8
+
+       xor     $a4,$a12                # a1^a2^a8^a4
+       mov     $a48,96(%rsp)           # tab[12]=a4^a8
+        mov    $mask,$i0
+       mov     $a1,104(%rsp)           # tab[13]=a1^a4^a8
+        and    $b,$i0
+       mov     $a2,112(%rsp)           # tab[14]=a2^a4^a8
+        shr    \$4,$b
+       mov     $a12,120(%rsp)          # tab[15]=a1^a2^a4^a8
+        mov    $mask,$i1
+        and    $b,$i1
+        shr    \$4,$b
+
+       movq    (%rsp,$i0,8),$R         # half of calculations is done in SSE2
+       mov     $mask,$i0
+       and     $b,$i0
+       shr     \$4,$b
+___
+    for ($n=1;$n<8;$n++) {
+       $code.=<<___;
+       mov     (%rsp,$i1,8),$t1
+       mov     $mask,$i1
+       mov     $t1,$t0
+       shl     \$`8*$n-4`,$t1
+       and     $b,$i1
+        movq   (%rsp,$i0,8),$Tx
+       shr     \$`64-(8*$n-4)`,$t0
+       xor     $t1,$lo
+        pslldq \$$n,$Tx
+        mov    $mask,$i0
+       shr     \$4,$b
+       xor     $t0,$hi
+        and    $b,$i0
+        shr    \$4,$b
+        pxor   $Tx,$R
+___
+    }
+$code.=<<___;
+       mov     (%rsp,$i1,8),$t1
+       mov     $t1,$t0
+       shl     \$`8*$n-4`,$t1
+       movq    $R,$i0
+       shr     \$`64-(8*$n-4)`,$t0
+       xor     $t1,$lo
+       psrldq  \$8,$R
+       xor     $t0,$hi
+       movq    $R,$i1
+       xor     $i0,$lo
+       xor     $i1,$hi
+
+       add     \$128+8,%rsp
+       ret
+.Lend_mul_1x1:
+.size  _mul_1x1,.-_mul_1x1
+___
+
+($rp,$a1,$a0,$b1,$b0) = $win64?        ("%rcx","%rdx","%r8", "%r9","%r10") :   # Win64 order
+                               ("%rdi","%rsi","%rdx","%rcx","%r8");    # Unix order
+
+$code.=<<___;
+.extern        OPENSSL_ia32cap_P
+.globl bn_GF2m_mul_2x2
+.type  bn_GF2m_mul_2x2,\@abi-omnipotent
+.align 16
+bn_GF2m_mul_2x2:
+       mov     OPENSSL_ia32cap_P(%rip),%rax
+       bt      \$33,%rax
+       jnc     .Lvanilla_mul_2x2
+
+       movq            $a1,%xmm0
+       movq            $b1,%xmm1
+       movq            $a0,%xmm2
+___
+$code.=<<___ if ($win64);
+       movq            40(%rsp),%xmm3
+___
+$code.=<<___ if (!$win64);
+       movq            $b0,%xmm3
+___
+$code.=<<___;
+       movdqa          %xmm0,%xmm4
+       movdqa          %xmm1,%xmm5
+       pclmulqdq       \$0,%xmm1,%xmm0 # a1·b1
+       pxor            %xmm2,%xmm4
+       pxor            %xmm3,%xmm5
+       pclmulqdq       \$0,%xmm3,%xmm2 # a0·b0
+       pclmulqdq       \$0,%xmm5,%xmm4 # (a0+a1)·(b0+b1)
+       xorps           %xmm0,%xmm4
+       xorps           %xmm2,%xmm4     # (a0+a1)·(b0+b1)-a0·b0-a1·b1
+       movdqa          %xmm4,%xmm5
+       pslldq          \$8,%xmm4
+       psrldq          \$8,%xmm5
+       pxor            %xmm4,%xmm2
+       pxor            %xmm5,%xmm0
+       movdqu          %xmm2,0($rp)
+       movdqu          %xmm0,16($rp)
+       ret
+
+.align 16
+.Lvanilla_mul_2x2:
+       lea     -8*17(%rsp),%rsp
+___
+$code.=<<___ if ($win64);
+       mov     `8*17+40`(%rsp),$b0
+       mov     %rdi,8*15(%rsp)
+       mov     %rsi,8*16(%rsp)
+___
+$code.=<<___;
+       mov     %r14,8*10(%rsp)
+       mov     %r13,8*11(%rsp)
+       mov     %r12,8*12(%rsp)
+       mov     %rbp,8*13(%rsp)
+       mov     %rbx,8*14(%rsp)
+.Lbody_mul_2x2:
+       mov     $rp,32(%rsp)            # save the arguments
+       mov     $a1,40(%rsp)
+       mov     $a0,48(%rsp)
+       mov     $b1,56(%rsp)
+       mov     $b0,64(%rsp)
+
+       mov     \$0xf,$mask
+       mov     $a1,$a
+       mov     $b1,$b
+       call    _mul_1x1                # a1·b1
+       mov     $lo,16(%rsp)
+       mov     $hi,24(%rsp)
+
+       mov     48(%rsp),$a
+       mov     64(%rsp),$b
+       call    _mul_1x1                # a0·b0
+       mov     $lo,0(%rsp)
+       mov     $hi,8(%rsp)
+
+       mov     40(%rsp),$a
+       mov     56(%rsp),$b
+       xor     48(%rsp),$a
+       xor     64(%rsp),$b
+       call    _mul_1x1                # (a0+a1)·(b0+b1)
+___
+       @r=("%rbx","%rcx","%rdi","%rsi");
+$code.=<<___;
+       mov     0(%rsp),@r[0]
+       mov     8(%rsp),@r[1]
+       mov     16(%rsp),@r[2]
+       mov     24(%rsp),@r[3]
+       mov     32(%rsp),%rbp
+
+       xor     $hi,$lo
+       xor     @r[1],$hi
+       xor     @r[0],$lo
+       mov     @r[0],0(%rbp)
+       xor     @r[2],$hi
+       mov     @r[3],24(%rbp)
+       xor     @r[3],$lo
+       xor     @r[3],$hi
+       xor     $hi,$lo
+       mov     $hi,16(%rbp)
+       mov     $lo,8(%rbp)
+
+       mov     8*10(%rsp),%r14
+       mov     8*11(%rsp),%r13
+       mov     8*12(%rsp),%r12
+       mov     8*13(%rsp),%rbp
+       mov     8*14(%rsp),%rbx
+___
+$code.=<<___ if ($win64);
+       mov     8*15(%rsp),%rdi
+       mov     8*16(%rsp),%rsi
+___
+$code.=<<___;
+       lea     8*17(%rsp),%rsp
+       ret
+.Lend_mul_2x2:
+.size  bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
+.asciz "GF(2^m) Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+.align 16
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+$code.=<<___;
+.extern __imp_RtlVirtualUnwind
+
+.type  se_handler,\@abi-omnipotent
+.align 16
+se_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     152($context),%rax      # pull context->Rsp
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lbody_mul_2x2(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<"prologue" label
+       jb      .Lin_prologue
+
+       mov     8*10(%rax),%r14         # mimic epilogue
+       mov     8*11(%rax),%r13
+       mov     8*12(%rax),%r12
+       mov     8*13(%rax),%rbp
+       mov     8*14(%rax),%rbx
+       mov     8*15(%rax),%rdi
+       mov     8*16(%rax),%rsi
+
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+
+.Lin_prologue:
+       lea     8*17(%rax),%rax
+       mov     %rax,152($context)      # restore context->Rsp
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  se_handler,.-se_handler
+
+.section       .pdata
+.align 4
+       .rva    _mul_1x1
+       .rva    .Lend_mul_1x1
+       .rva    .LSEH_info_1x1
+
+       .rva    .Lvanilla_mul_2x2
+       .rva    .Lend_mul_2x2
+       .rva    .LSEH_info_2x2
+.section       .xdata
+.align 8
+.LSEH_info_1x1:
+       .byte   0x01,0x07,0x02,0x00
+       .byte   0x07,0x01,0x11,0x00     # sub rsp,128+8
+.LSEH_info_2x2:
+       .byte   9,0,0,0
+       .rva    se_handler
+___
+}
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+print $code;
+close STDOUT;
index 3b7a6f2..5d79b35 100755 (executable)
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # respectful 50%. It remains to be seen if loop unrolling and
 # dedicated squaring routine can provide further improvement...
 
+# July 2011.
+#
+# Add dedicated squaring procedure. Performance improvement varies
+# from platform to platform, but in average it's ~5%/15%/25%/33%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
+# August 2011.
+#
+# Unroll and modulo-schedule inner loops in such manner that they
+# are "fallen through" for input lengths of 8, which is critical for
+# 1024-bit RSA *sign*. Average performance improvement in comparison
+# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
+# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
+
 $flavour = shift;
 $output  = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -37,7 +51,6 @@ $n0="%r8";    # const BN_ULONG *n0,
 $num="%r9";    # int num);
 $lo0="%r10";
 $hi0="%r11";
-$bp="%r12";    # reassign $bp
 $hi1="%r13";
 $i="%r14";
 $j="%r15";
@@ -51,6 +64,16 @@ $code=<<___;
 .type  bn_mul_mont,\@function,6
 .align 16
 bn_mul_mont:
+       test    \$3,${num}d
+       jnz     .Lmul_enter
+       cmp     \$8,${num}d
+       jb      .Lmul_enter
+       cmp     $ap,$bp
+       jne     .Lmul4x_enter
+       jmp     .Lsqr4x_enter
+
+.align 16
+.Lmul_enter:
        push    %rbx
        push    %rbp
        push    %r12
@@ -66,48 +89,66 @@ bn_mul_mont:
        and     \$-1024,%rsp            # minimize TLB usage
 
        mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
-.Lprologue:
-       mov     %rdx,$bp                # $bp reassigned, remember?
-
+.Lmul_body:
+       mov     $bp,%r12                # reassign $bp
+___
+               $bp="%r12";
+$code.=<<___;
        mov     ($n0),$n0               # pull n0[0] value
+       mov     ($bp),$m0               # m0=bp[0]
+       mov     ($ap),%rax
 
        xor     $i,$i                   # i=0
        xor     $j,$j                   # j=0
 
-       mov     ($bp),$m0               # m0=bp[0]
-       mov     ($ap),%rax
+       mov     $n0,$m1
        mulq    $m0                     # ap[0]*bp[0]
        mov     %rax,$lo0
-       mov     %rdx,$hi0
+       mov     ($np),%rax
 
-       imulq   $n0,%rax                # "tp[0]"*n0
-       mov     %rax,$m1
+       imulq   $lo0,$m1                # "tp[0]"*n0
+       mov     %rdx,$hi0
 
-       mulq    ($np)                   # np[0]*m1
-       add     $lo0,%rax               # discarded
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
        adc     \$0,%rdx
        mov     %rdx,$hi1
 
        lea     1($j),$j                # j++
+       jmp     .L1st_enter
+
+.align 16
 .L1st:
+       add     %rax,$hi1
        mov     ($ap,$j,8),%rax
-       mulq    $m0                     # ap[j]*bp[0]
-       add     $hi0,%rax
        adc     \$0,%rdx
-       mov     %rax,$lo0
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       mov     $lo0,$hi0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.L1st_enter:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$hi0
        mov     ($np,$j,8),%rax
-       mov     %rdx,$hi0
+       adc     \$0,%rdx
+       lea     1($j),$j                # j++
+       mov     %rdx,$lo0
 
        mulq    $m1                     # np[j]*m1
-       add     $hi1,%rax
-       lea     1($j),$j                # j++
+       cmp     $num,$j
+       jne     .L1st
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
-       add     $lo0,%rax               # np[j]*m1+ap[j]*bp[0]
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
-       mov     %rax,-16(%rsp,$j,8)     # tp[j-1]
-       cmp     $num,$j
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
-       jl      .L1st
+       mov     $lo0,$hi0
 
        xor     %rdx,%rdx
        add     $hi0,$hi1
@@ -116,50 +157,64 @@ bn_mul_mont:
        mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
 
        lea     1($i),$i                # i++
-.align 4
+       jmp     .Louter
+.align 16
 .Louter:
-       xor     $j,$j                   # j=0
-
        mov     ($bp,$i,8),$m0          # m0=bp[i]
-       mov     ($ap),%rax              # ap[0]
+       xor     $j,$j                   # j=0
+       mov     $n0,$m1
+       mov     (%rsp),$lo0
        mulq    $m0                     # ap[0]*bp[i]
-       add     (%rsp),%rax             # ap[0]*bp[i]+tp[0]
+       add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
        adc     \$0,%rdx
-       mov     %rax,$lo0
-       mov     %rdx,$hi0
 
-       imulq   $n0,%rax                # tp[0]*n0
-       mov     %rax,$m1
+       imulq   $lo0,$m1                # tp[0]*n0
+       mov     %rdx,$hi0
 
-       mulq    ($np,$j,8)              # np[0]*m1
-       add     $lo0,%rax               # discarded
-       mov     8(%rsp),$lo0            # tp[1]
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
        adc     \$0,%rdx
+       mov     8(%rsp),$lo0            # tp[1]
        mov     %rdx,$hi1
 
        lea     1($j),$j                # j++
-.align 4
+       jmp     .Linner_enter
+
+.align 16
 .Linner:
+       add     %rax,$hi1
        mov     ($ap,$j,8),%rax
-       mulq    $m0                     # ap[j]*bp[i]
-       add     $hi0,%rax
        adc     \$0,%rdx
-       add     %rax,$lo0               # ap[j]*bp[i]+tp[j]
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.Linner_enter:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$hi0
        mov     ($np,$j,8),%rax
        adc     \$0,%rdx
+       add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
        mov     %rdx,$hi0
+       adc     \$0,$hi0
+       lea     1($j),$j                # j++
 
        mulq    $m1                     # np[j]*m1
-       add     $hi1,%rax
-       lea     1($j),$j                # j++
-       adc     \$0,%rdx
-       add     $lo0,%rax               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       cmp     $num,$j
+       jne     .Linner
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
-       cmp     $num,$j
-       mov     %rax,-16(%rsp,$j,8)     # tp[j-1]
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
        mov     %rdx,$hi1
-       jl      .Linner
 
        xor     %rdx,%rdx
        add     $hi0,$hi1
@@ -173,35 +228,449 @@ bn_mul_mont:
        cmp     $num,$i
        jl      .Louter
 
-       lea     (%rsp),$ap              # borrow ap for tp
-       lea     -1($num),$j             # j=num-1
-
-       mov     ($ap),%rax              # tp[0]
        xor     $i,$i                   # i=0 and clear CF!
+       mov     (%rsp),%rax             # tp[0]
+       lea     (%rsp),$ap              # borrow ap for tp
+       mov     $num,$j                 # j=num
        jmp     .Lsub
 .align 16
 .Lsub: sbb     ($np,$i,8),%rax
        mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
-       dec     $j                      # doesn't affect CF!
        mov     8($ap,$i,8),%rax        # tp[i+1]
        lea     1($i),$i                # i++
-       jge     .Lsub
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub
 
        sbb     \$0,%rax                # handle upmost overflow bit
+       xor     $i,$i
        and     %rax,$ap
        not     %rax
        mov     $rp,$np
        and     %rax,$np
-       lea     -1($num),$j
+       mov     $num,$j                 # j=num
        or      $np,$ap                 # ap=borrow?tp:rp
 .align 16
 .Lcopy:                                        # copy or in-place refresh
+       mov     ($ap,$i,8),%rax
+       mov     $i,(%rsp,$i,8)          # zap temporary vector
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
+       lea     1($i),$i
+       sub     \$1,$j
+       jnz     .Lcopy
+
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul_epilogue:
+       ret
+.size  bn_mul_mont,.-bn_mul_mont
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type  bn_mul4x_mont,\@function,6
+.align 16
+bn_mul4x_mont:
+.Lmul4x_enter:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     ${num}d,${num}d
+       lea     4($num),%r10
+       mov     %rsp,%r11
+       neg     %r10
+       lea     (%rsp,%r10,8),%rsp      # tp=alloca(8*(num+4))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %r11,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
+       mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
+       mov     %rdx,%r12               # reassign $bp
+___
+               $bp="%r12";
+$code.=<<___;
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($bp),$m0               # m0=bp[0]
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$A[0]
+       mov     ($np),%rax
+
+       imulq   $A[0],$m1               # "tp[0]"*n0
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       lea     4($j),$j                # j++
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)
+       mov     %rdx,$N[0]
+       jmp     .L1st4x
+.align 16
+.L1st4x:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
        mov     ($ap,$j,8),%rax
-       mov     %rax,($rp,$j,8)         # rp[i]=tp[i]
-       mov     $i,(%rsp,$j,8)          # zap temporary vector
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .L1st4x
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+.align 4
+.Louter4x:
+       mov     ($bp,$i,8),$m0          # m0=bp[i]
+       xor     $j,$j                   # j=0
+       mov     (%rsp),$A[0]
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       imulq   $A[0],$m1               # tp[0]*n0
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # "$N[0]", discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       add     8(%rsp),$A[1]           # +tp[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
+       lea     4($j),$j                # j+=2
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)            # tp[j-1]
+       mov     %rdx,$N[0]
+       jmp     .Linner4x
+.align 16
+.Linner4x:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .Linner4x
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     1($i),$i                # i++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       cmp     $num,$i
+       jl      .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+       mov     16(%rsp,$num,8),$rp     # restore $rp
+       mov     0(%rsp),@ri[0]          # tp[0]
+       pxor    %xmm0,%xmm0
+       mov     8(%rsp),@ri[1]          # tp[1]
+       shr     \$2,$num                # num/=4
+       lea     (%rsp),$ap              # borrow ap for tp
+       xor     $i,$i                   # i=0 and clear CF!
+
+       sub     0($np),@ri[0]
+       mov     16($ap),@ri[2]          # tp[2]
+       mov     24($ap),@ri[3]          # tp[3]
+       sbb     8($np),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsub4x
+.align 16
+.Lsub4x:
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     16($np,$i,8),@ri[2]
+       mov     32($ap,$i,8),@ri[0]     # tp[i+1]
+       mov     40($ap,$i,8),@ri[1]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       sbb     32($np,$i,8),@ri[0]
+       mov     48($ap,$i,8),@ri[2]
+       mov     56($ap,$i,8),@ri[3]
+       sbb     40($np,$i,8),@ri[1]
+       lea     4($i),$i                # i++
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub4x
+
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     32($ap,$i,8),@ri[0]     # load overflow bit
+       sbb     16($np,$i,8),@ri[2]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       xor     $i,$i                   # i=0
+       and     @ri[0],$ap
+       not     @ri[0]
+       mov     $rp,$np
+       and     @ri[0],$np
+       lea     -1($num),$j
+       or      $np,$ap                 # ap=borrow?tp:rp
+
+       movdqu  ($ap),%xmm1
+       movdqa  %xmm0,(%rsp)
+       movdqu  %xmm1,($rp)
+       jmp     .Lcopy4x
+.align 16
+.Lcopy4x:                                      # copy or in-place refresh
+       movdqu  16($ap,$i),%xmm2
+       movdqu  32($ap,$i),%xmm1
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+       movdqa  %xmm0,32(%rsp,$i)
+       movdqu  %xmm1,32($rp,$i)
+       lea     32($i),$i
        dec     $j
-       jge     .Lcopy
+       jnz     .Lcopy4x
 
+       shl     \$2,$num
+       movdqu  16($ap,$i),%xmm2
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
        mov     8(%rsp,$num,8),%rsi     # restore %rsp
        mov     \$1,%rax
        mov     (%rsi),%r15
@@ -211,9 +680,823 @@ bn_mul_mont:
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
-.Lepilogue:
+.Lmul4x_epilogue:
        ret
-.size  bn_mul_mont,.-bn_mul_mont
+.size  bn_mul4x_mont,.-bn_mul4x_mont
+___
+}}}
+\f{{{
+######################################################################
+# void bn_sqr4x_mont(
+my $rptr="%rdi";       # const BN_ULONG *rptr,
+my $aptr="%rsi";       # const BN_ULONG *aptr,
+my $bptr="%rdx";       # not used
+my $nptr="%rcx";       # const BN_ULONG *nptr,
+my $n0  ="%r8";                # const BN_ULONG *n0);
+my $num ="%r9";                # int num, has to be divisible by 4 and
+                       # not less than 8
+
+my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
+my @A0=("%r10","%r11");
+my @A1=("%r12","%r13");
+my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
+
+$code.=<<___;
+.type  bn_sqr4x_mont,\@function,6
+.align 16
+bn_sqr4x_mont:
+.Lsqr4x_enter:
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       shl     \$3,${num}d             # convert $num to bytes
+       xor     %r10,%r10
+       mov     %rsp,%r11               # put aside %rsp
+       sub     $num,%r10               # -$num
+       mov     ($n0),$n0               # *n0
+       lea     -72(%rsp,%r10,2),%rsp   # alloca(frame+2*$num)
+       and     \$-1024,%rsp            # minimize TLB usage
+       ##############################################################
+       # Stack layout
+       #
+       # +0    saved $num, used in reduction section
+       # +8    &t[2*$num], used in reduction section
+       # +32   saved $rptr
+       # +40   saved $nptr
+       # +48   saved *n0
+       # +56   saved %rsp
+       # +64   t[2*$num]
+       #
+       mov     $rptr,32(%rsp)          # save $rptr
+       mov     $nptr,40(%rsp)
+       mov     $n0,  48(%rsp)
+       mov     %r11, 56(%rsp)          # save original %rsp
+.Lsqr4x_body:
+       ##############################################################
+       # Squaring part:
+       #
+       # a) multiply-n-add everything but a[i]*a[i];
+       # b) shift result of a) by 1 to the left and accumulate
+       #    a[i]*a[i] products;
+       #
+       lea     32(%r10),$i             # $i=-($num-32)
+       lea     ($aptr,$num),$aptr      # end of a[] buffer, ($aptr,$i)=&ap[2]
+
+       mov     $num,$j                 # $j=$num
+
+                                       # comments apply to $num==8 case
+       mov     -32($aptr,$i),$a0       # a[0]
+       lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
+       mov     -24($aptr,$i),%rax      # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr,$i),$ai       # a[2]
+       mov     %rax,$a1
+
+       mul     $a0                     # a[1]*a[0]
+       mov     %rax,$A0[0]             # a[1]*a[0]
+        mov    $ai,%rax                # a[2]
+       mov     %rdx,$A0[1]
+       mov     $A0[0],-24($tptr,$i)    # t[1]
+
+       xor     $A0[0],$A0[0]
+       mul     $a0                     # a[2]*a[0]
+       add     %rax,$A0[1]
+        mov    $ai,%rax
+       adc     %rdx,$A0[0]
+       mov     $A0[1],-16($tptr,$i)    # t[2]
+
+       lea     -16($i),$j              # j=-16
+
+
+        mov    8($aptr,$j),$ai         # a[3]
+       mul     $a1                     # a[2]*a[1]
+       mov     %rax,$A1[0]             # a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       mov     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        lea    16($j),$j
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[3]*a[0]
+       add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[3]
+       jmp     .Lsqr4x_1st
+
+.align 16
+.Lsqr4x_1st:
+        mov    ($aptr,$j),$ai          # a[4]
+       xor     $A1[0],$A1[0]
+       mul     $a1                     # a[3]*a[1]
+       add     %rax,$A1[1]             # a[3]*a[1]+t[4]
+        mov    $ai,%rax
+       adc     %rdx,$A1[0]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[4]*a[0]
+       add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
+        mov    $ai,%rax                # a[3]
+       adc     %rdx,$A0[0]
+       mov     $A0[1],($tptr,$j)       # t[4]
+
+
+        mov    8($aptr,$j),$ai         # a[5]
+       xor     $A1[1],$A1[1]
+       mul     $a1                     # a[4]*a[3]
+       add     %rax,$A1[0]             # a[4]*a[3]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[5]*a[2]
+       add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],8($tptr,$j)      # t[5]
+
+        mov    16($aptr,$j),$ai        # a[6]
+       xor     $A1[0],$A1[0]
+       mul     $a1                     # a[5]*a[3]
+       add     %rax,$A1[1]             # a[5]*a[3]+t[6]
+        mov    $ai,%rax
+       adc     %rdx,$A1[0]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[6]*a[2]
+       add     %rax,$A0[1]             # a[6]*a[2]+a[5]*a[3]+t[6]
+        mov    $ai,%rax                # a[3]
+       adc     %rdx,$A0[0]
+       mov     $A0[1],16($tptr,$j)     # t[6]
+
+
+        mov    24($aptr,$j),$ai        # a[7]
+       xor     $A1[1],$A1[1]
+       mul     $a1                     # a[6]*a[5]
+       add     %rax,$A1[0]             # a[6]*a[5]+t[7]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        lea    32($j),$j
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[7]*a[4]
+       add     %rax,$A0[0]             # a[7]*a[4]+a[6]*a[5]+t[6]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[7]
+
+       cmp     \$0,$j
+       jne     .Lsqr4x_1st
+
+       xor     $A1[0],$A1[0]
+       add     $A0[1],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[7]*a[5]
+       add     %rax,$A1[1]
+       adc     %rdx,$A1[0]
+
+       mov     $A1[1],($tptr)          # t[8]
+       lea     16($i),$i
+       mov     $A1[0],8($tptr)         # t[9]
+       jmp     .Lsqr4x_outer
+
+.align 16
+.Lsqr4x_outer:                         # comments apply to $num==6 case
+       mov     -32($aptr,$i),$a0       # a[0]
+       lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
+       mov     -24($aptr,$i),%rax      # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr,$i),$ai       # a[2]
+       mov     %rax,$a1
+
+       mov     -24($tptr,$i),$A0[0]    # t[1]
+       xor     $A0[1],$A0[1]
+       mul     $a0                     # a[1]*a[0]
+       add     %rax,$A0[0]             # a[1]*a[0]+t[1]
+        mov    $ai,%rax                # a[2]
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-24($tptr,$i)    # t[1]
+
+       xor     $A0[0],$A0[0]
+       add     -16($tptr,$i),$A0[1]    # a[2]*a[0]+t[2]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[2]*a[0]
+       add     %rax,$A0[1]
+        mov    $ai,%rax
+       adc     %rdx,$A0[0]
+       mov     $A0[1],-16($tptr,$i)    # t[2]
+
+       lea     -16($i),$j              # j=-16
+       xor     $A1[0],$A1[0]
+
+
+        mov    8($aptr,$j),$ai         # a[3]
+       xor     $A1[1],$A1[1]
+       add     8($tptr,$j),$A1[0]
+       adc     \$0,$A1[1]
+       mul     $a1                     # a[2]*a[1]
+       add     %rax,$A1[0]             # a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[3]*a[0]
+       add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],8($tptr,$j)      # t[3]
+
+       lea     16($j),$j
+       jmp     .Lsqr4x_inner
+
+.align 16
+.Lsqr4x_inner:
+        mov    ($aptr,$j),$ai          # a[4]
+       xor     $A1[0],$A1[0]
+       add     ($tptr,$j),$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[3]*a[1]
+       add     %rax,$A1[1]             # a[3]*a[1]+t[4]
+        mov    $ai,%rax
+       adc     %rdx,$A1[0]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[4]*a[0]
+       add     %rax,$A0[1]             # a[4]*a[0]+a[3]*a[1]+t[4]
+        mov    $ai,%rax                # a[3]
+       adc     %rdx,$A0[0]
+       mov     $A0[1],($tptr,$j)       # t[4]
+
+        mov    8($aptr,$j),$ai         # a[5]
+       xor     $A1[1],$A1[1]
+       add     8($tptr,$j),$A1[0]
+       adc     \$0,$A1[1]
+       mul     $a1                     # a[4]*a[3]
+       add     %rax,$A1[0]             # a[4]*a[3]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A1[1]
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+       lea     16($j),$j               # j++
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[5]*a[2]
+       add     %rax,$A0[0]             # a[5]*a[2]+a[4]*a[3]+t[5]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr,$j)     # t[5], "preloaded t[1]" below
+
+       cmp     \$0,$j
+       jne     .Lsqr4x_inner
+
+       xor     $A1[0],$A1[0]
+       add     $A0[1],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[5]*a[3]
+       add     %rax,$A1[1]
+       adc     %rdx,$A1[0]
+
+       mov     $A1[1],($tptr)          # t[6], "preloaded t[2]" below
+       mov     $A1[0],8($tptr)         # t[7], "preloaded t[3]" below
+
+       add     \$16,$i
+       jnz     .Lsqr4x_outer
+
+                                       # comments apply to $num==4 case
+       mov     -32($aptr),$a0          # a[0]
+       lea     64(%rsp,$num,2),$tptr   # end of tp[] buffer, &tp[2*$num]
+       mov     -24($aptr),%rax         # a[1]
+       lea     -32($tptr,$i),$tptr     # end of tp[] window, &tp[2*$num-"$i"]
+       mov     -16($aptr),$ai          # a[2]
+       mov     %rax,$a1
+
+       xor     $A0[1],$A0[1]
+       mul     $a0                     # a[1]*a[0]
+       add     %rax,$A0[0]             # a[1]*a[0]+t[1], preloaded t[1]
+        mov    $ai,%rax                # a[2]
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-24($tptr)       # t[1]
+
+       xor     $A0[0],$A0[0]
+       add     $A1[1],$A0[1]           # a[2]*a[0]+t[2], preloaded t[2]
+       adc     \$0,$A0[0]
+       mul     $a0                     # a[2]*a[0]
+       add     %rax,$A0[1]
+        mov    $ai,%rax
+       adc     %rdx,$A0[0]
+       mov     $A0[1],-16($tptr)       # t[2]
+
+        mov    -8($aptr),$ai           # a[3]
+       mul     $a1                     # a[2]*a[1]
+       add     %rax,$A1[0]             # a[2]*a[1]+t[3], preloaded t[3]
+        mov    $ai,%rax
+       adc     \$0,%rdx
+
+       xor     $A0[1],$A0[1]
+       add     $A1[0],$A0[0]
+        mov    %rdx,$A1[1]
+       adc     \$0,$A0[1]
+       mul     $a0                     # a[3]*a[0]
+       add     %rax,$A0[0]             # a[3]*a[0]+a[2]*a[1]+t[3]
+        mov    $ai,%rax
+       adc     %rdx,$A0[1]
+       mov     $A0[0],-8($tptr)        # t[3]
+
+       xor     $A1[0],$A1[0]
+       add     $A0[1],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $a1                     # a[3]*a[1]
+       add     %rax,$A1[1]
+        mov    -16($aptr),%rax         # a[2]
+       adc     %rdx,$A1[0]
+
+       mov     $A1[1],($tptr)          # t[4]
+       mov     $A1[0],8($tptr)         # t[5]
+
+       mul     $ai                     # a[2]*a[3]
+___
+{
+my ($shift,$carry)=($a0,$a1);
+my @S=(@A1,$ai,$n0);
+$code.=<<___;
+        add    \$16,$i
+        xor    $shift,$shift
+        sub    $num,$i                 # $i=16-$num
+        xor    $carry,$carry
+
+       add     $A1[0],%rax             # t[5]
+       adc     \$0,%rdx
+       mov     %rax,8($tptr)           # t[5]
+       mov     %rdx,16($tptr)          # t[6]
+       mov     $carry,24($tptr)        # t[7]
+
+        mov    -16($aptr,$i),%rax      # a[0]
+       lea     64(%rsp,$num,2),$tptr
+        xor    $A0[0],$A0[0]           # t[0]
+        mov    -24($tptr,$i,2),$A0[1]  # t[1]
+
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],-24($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+        mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
+        mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
+       mov     $S[2],-16($tptr,$i,2)
+       adc     %rdx,$S[3]
+       lea     16($i),$i
+       mov     $S[3],-40($tptr,$i,2)
+       sbb     $carry,$carry           # mov cf,$carry
+       jmp     .Lsqr4x_shift_n_add
+
+.align 16
+.Lsqr4x_shift_n_add:
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr,$i,2),$A0[0]  # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    -8($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],-24($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+        mov    0($tptr,$i,2),$A0[0]    # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    8($tptr,$i,2),$A0[1]    # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
+        mov    0($aptr,$i),%rax        # a[i+1]        # prefetch
+       mov     $S[2],-16($tptr,$i,2)
+       adc     %rdx,$S[3]
+
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+        mov    $S[3],-8($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    16($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    24($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    8($aptr,$i),%rax        # a[i+1]        # prefetch
+       mov     $S[0],0($tptr,$i,2)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
+        mov    $S[1],8($tptr,$i,2)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+        mov    32($tptr,$i,2),$A0[0]   # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    40($tptr,$i,2),$A0[1]   # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[2]
+        mov    16($aptr,$i),%rax       # a[i+1]        # prefetch
+       mov     $S[2],16($tptr,$i,2)
+       adc     %rdx,$S[3]
+       mov     $S[3],24($tptr,$i,2)
+       sbb     $carry,$carry           # mov cf,$carry
+       add     \$32,$i
+       jnz     .Lsqr4x_shift_n_add
+
+       lea     ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[1]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[1]            # | t[2*i]>>63
+        mov    -16($tptr),$A0[0]       # t[2*i+2]      # prefetch
+       mov     $A0[1],$shift           # shift=t[2*i+1]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+        mov    -8($tptr),$A0[1]        # t[2*i+2+1]    # prefetch
+       adc     %rax,$S[0]
+        mov    -8($aptr),%rax          # a[i+1]        # prefetch
+       mov     $S[0],-32($tptr)
+       adc     %rdx,$S[1]
+
+       lea     ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
+        mov    $S[1],-24($tptr)
+        sbb    $carry,$carry           # mov cf,$carry
+       shr     \$63,$A0[0]
+       lea     ($j,$A0[1],2),$S[3]     # t[2*i+1]<<1 |
+       shr     \$63,$A0[1]
+       or      $A0[0],$S[3]            # | t[2*i]>>63
+       mul     %rax                    # a[i]*a[i]
+       neg     $carry                  # mov $carry,cf
+       adc     %rax,$S[2]
+       adc     %rdx,$S[3]
+       mov     $S[2],-16($tptr)
+       mov     $S[3],-8($tptr)
+___
+}\f
+##############################################################
+# Montgomery reduction part, "word-by-word" algorithm.
+#
+{
+my ($topbit,$nptr)=("%rbp",$aptr);
+my ($m0,$m1)=($a0,$a1);
+my @Ni=("%rbx","%r9");
+$code.=<<___;
+       mov     40(%rsp),$nptr          # restore $nptr
+       mov     48(%rsp),$n0            # restore *n0
+       xor     $j,$j
+       mov     $num,0(%rsp)            # save $num
+       sub     $num,$j                 # $j=-$num
+        mov    64(%rsp),$A0[0]         # t[0]          # modsched #
+        mov    $n0,$m0                 #               # modsched #
+       lea     64(%rsp,$num,2),%rax    # end of t[] buffer
+       lea     64(%rsp,$num),$tptr     # end of t[] window
+       mov     %rax,8(%rsp)            # save end of t[] buffer
+       lea     ($nptr,$num),$nptr      # end of n[] buffer
+       xor     $topbit,$topbit         # $topbit=0
+
+       mov     0($nptr,$j),%rax        # n[0]          # modsched #
+       mov     8($nptr,$j),$Ni[1]      # n[1]          # modsched #
+        imulq  $A0[0],$m0              # m0=t[0]*n0    # modsched #
+        mov    %rax,$Ni[0]             #               # modsched #
+       jmp     .Lsqr4x_mont_outer
+
+.align 16
+.Lsqr4x_mont_outer:
+       xor     $A0[1],$A0[1]
+       mul     $m0                     # n[0]*m0
+       add     %rax,$A0[0]             # n[0]*m0+t[0]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+       mov     $n0,$m1
+
+       xor     $A0[0],$A0[0]
+       add     8($tptr,$j),$A0[1]
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[1]*m0
+       add     %rax,$A0[1]             # n[1]*m0+t[1]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
+
+       imulq   $A0[1],$m1
+
+       mov     16($nptr,$j),$Ni[0]     # n[2]
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[0]*m1
+       add     %rax,$A1[0]             # n[0]*m1+"t[1]"
+        mov    $Ni[0],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],8($tptr,$j)      # "t[1]"
+
+       xor     $A0[1],$A0[1]
+       add     16($tptr,$j),$A0[0]
+       adc     \$0,$A0[1]
+       mul     $m0                     # n[2]*m0
+       add     %rax,$A0[0]             # n[2]*m0+t[2]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+
+       mov     24($nptr,$j),$Ni[1]     # n[3]
+       xor     $A1[0],$A1[0]
+       add     $A0[0],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[1]*m1
+       add     %rax,$A1[1]             # n[1]*m1+"t[2]"
+        mov    $Ni[1],%rax
+       adc     %rdx,$A1[0]
+       mov     $A1[1],16($tptr,$j)     # "t[2]"
+
+       xor     $A0[0],$A0[0]
+       add     24($tptr,$j),$A0[1]
+       lea     32($j),$j
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[3]*m0
+       add     %rax,$A0[1]             # n[3]*m0+t[3]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
+       jmp     .Lsqr4x_mont_inner
+
+.align 16
+.Lsqr4x_mont_inner:
+       mov     ($nptr,$j),$Ni[0]       # n[4]
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[2]*m1
+       add     %rax,$A1[0]             # n[2]*m1+"t[3]"
+        mov    $Ni[0],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],-8($tptr,$j)     # "t[3]"
+
+       xor     $A0[1],$A0[1]
+       add     ($tptr,$j),$A0[0]
+       adc     \$0,$A0[1]
+       mul     $m0                     # n[4]*m0
+       add     %rax,$A0[0]             # n[4]*m0+t[4]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+
+       mov     8($nptr,$j),$Ni[1]      # n[5]
+       xor     $A1[0],$A1[0]
+       add     $A0[0],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[3]*m1
+       add     %rax,$A1[1]             # n[3]*m1+"t[4]"
+        mov    $Ni[1],%rax
+       adc     %rdx,$A1[0]
+       mov     $A1[1],($tptr,$j)       # "t[4]"
+
+       xor     $A0[0],$A0[0]
+       add     8($tptr,$j),$A0[1]
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[5]*m0
+       add     %rax,$A0[1]             # n[5]*m0+t[5]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
+
+
+       mov     16($nptr,$j),$Ni[0]     # n[6]
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[4]*m1
+       add     %rax,$A1[0]             # n[4]*m1+"t[5]"
+        mov    $Ni[0],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],8($tptr,$j)      # "t[5]"
+
+       xor     $A0[1],$A0[1]
+       add     16($tptr,$j),$A0[0]
+       adc     \$0,$A0[1]
+       mul     $m0                     # n[6]*m0
+       add     %rax,$A0[0]             # n[6]*m0+t[6]
+        mov    $Ni[1],%rax
+       adc     %rdx,$A0[1]
+
+       mov     24($nptr,$j),$Ni[1]     # n[7]
+       xor     $A1[0],$A1[0]
+       add     $A0[0],$A1[1]
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[5]*m1
+       add     %rax,$A1[1]             # n[5]*m1+"t[6]"
+        mov    $Ni[1],%rax
+       adc     %rdx,$A1[0]
+       mov     $A1[1],16($tptr,$j)     # "t[6]"
+
+       xor     $A0[0],$A0[0]
+       add     24($tptr,$j),$A0[1]
+       lea     32($j),$j
+       adc     \$0,$A0[0]
+       mul     $m0                     # n[7]*m0
+       add     %rax,$A0[1]             # n[7]*m0+t[7]
+        mov    $Ni[0],%rax
+       adc     %rdx,$A0[0]
+       cmp     \$0,$j
+       jne     .Lsqr4x_mont_inner
+
+        sub    0(%rsp),$j              # $j=-$num      # modsched #
+        mov    $n0,$m0                 #               # modsched #
+
+       xor     $A1[1],$A1[1]
+       add     $A0[1],$A1[0]
+       adc     \$0,$A1[1]
+       mul     $m1                     # n[6]*m1
+       add     %rax,$A1[0]             # n[6]*m1+"t[7]"
+       mov     $Ni[1],%rax
+       adc     %rdx,$A1[1]
+       mov     $A1[0],-8($tptr)        # "t[7]"
+
+       xor     $A0[1],$A0[1]
+       add     ($tptr),$A0[0]          # +t[8]
+       adc     \$0,$A0[1]
+        mov    0($nptr,$j),$Ni[0]      # n[0]          # modsched #
+       add     $topbit,$A0[0]
+       adc     \$0,$A0[1]
+
+        imulq  16($tptr,$j),$m0        # m0=t[0]*n0    # modsched #
+       xor     $A1[0],$A1[0]
+        mov    8($nptr,$j),$Ni[1]      # n[1]          # modsched #
+       add     $A0[0],$A1[1]
+        mov    16($tptr,$j),$A0[0]     # t[0]          # modsched #
+       adc     \$0,$A1[0]
+       mul     $m1                     # n[7]*m1
+       add     %rax,$A1[1]             # n[7]*m1+"t[8]"
+        mov    $Ni[0],%rax             #               # modsched #
+       adc     %rdx,$A1[0]
+       mov     $A1[1],($tptr)          # "t[8]"
+
+       xor     $topbit,$topbit
+       add     8($tptr),$A1[0]         # +t[9]
+       adc     $topbit,$topbit
+       add     $A0[1],$A1[0]
+       lea     16($tptr),$tptr         # "t[$num]>>128"
+       adc     \$0,$topbit
+       mov     $A1[0],-8($tptr)        # "t[9]"
+       cmp     8(%rsp),$tptr           # are we done?
+       jb      .Lsqr4x_mont_outer
+
+       mov     0(%rsp),$num            # restore $num
+       mov     $topbit,($tptr)         # save $topbit
+___
+}\f
+##############################################################
+# Post-condition, 4x unrolled copy from bn_mul_mont
+#
+{
+my ($tptr,$nptr)=("%rbx",$aptr);
+my @ri=("%rax","%rdx","%r10","%r11");
+$code.=<<___;
+       mov     64(%rsp,$num),@ri[0]    # tp[0]
+       lea     64(%rsp,$num),$tptr     # upper half of t[2*$num] holds result
+       mov     40(%rsp),$nptr          # restore $nptr
+       shr     \$5,$num                # num/4
+       mov     8($tptr),@ri[1]         # t[1]
+       xor     $i,$i                   # i=0 and clear CF!
+
+       mov     32(%rsp),$rptr          # restore $rptr
+       sub     0($nptr),@ri[0]
+       mov     16($tptr),@ri[2]        # t[2]
+       mov     24($tptr),@ri[3]        # t[3]
+       sbb     8($nptr),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsqr4x_sub
+.align 16
+.Lsqr4x_sub:
+       mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       sbb     16($nptr,$i,8),@ri[2]
+       mov     32($tptr,$i,8),@ri[0]   # tp[i+1]
+       mov     40($tptr,$i,8),@ri[1]
+       sbb     24($nptr,$i,8),@ri[3]
+       mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+       sbb     32($nptr,$i,8),@ri[0]
+       mov     48($tptr,$i,8),@ri[2]
+       mov     56($tptr,$i,8),@ri[3]
+       sbb     40($nptr,$i,8),@ri[1]
+       lea     4($i),$i                # i++
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsqr4x_sub
+
+       mov     @ri[0],0($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       mov     32($tptr,$i,8),@ri[0]   # load overflow bit
+       sbb     16($nptr,$i,8),@ri[2]
+       mov     @ri[1],8($rptr,$i,8)    # rp[i]=tp[i]-np[i]
+       sbb     24($nptr,$i,8),@ri[3]
+       mov     @ri[2],16($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rptr,$i,8)   # rp[i]=tp[i]-np[i]
+       xor     $i,$i                   # i=0
+       and     @ri[0],$tptr
+       not     @ri[0]
+       mov     $rptr,$nptr
+       and     @ri[0],$nptr
+       lea     -1($num),$j
+       or      $nptr,$tptr             # tp=borrow?tp:rp
+
+       pxor    %xmm0,%xmm0
+       lea     64(%rsp,$num,8),$nptr
+       movdqu  ($tptr),%xmm1
+       lea     ($nptr,$num,8),$nptr
+       movdqa  %xmm0,64(%rsp)          # zap lower half of temporary vector
+       movdqa  %xmm0,($nptr)           # zap upper half of temporary vector
+       movdqu  %xmm1,($rptr)
+       jmp     .Lsqr4x_copy
+.align 16
+.Lsqr4x_copy:                          # copy or in-place refresh
+       movdqu  16($tptr,$i),%xmm2
+       movdqu  32($tptr,$i),%xmm1
+       movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,96(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
+       movdqa  %xmm0,32($nptr,$i)      # zap upper half of temporary vector
+       movdqu  %xmm2,16($rptr,$i)
+       movdqu  %xmm1,32($rptr,$i)
+       lea     32($i),$i
+       dec     $j
+       jnz     .Lsqr4x_copy
+
+       movdqu  16($tptr,$i),%xmm2
+       movdqa  %xmm0,80(%rsp,$i)       # zap lower half of temporary vector
+       movdqa  %xmm0,16($nptr,$i)      # zap upper half of temporary vector
+       movdqu  %xmm2,16($rptr,$i)
+___
+}
+$code.=<<___;
+       mov     56(%rsp),%rsi           # restore %rsp
+       mov     \$1,%rax
+       mov     0(%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lsqr4x_epilogue:
+       ret
+.size  bn_sqr4x_mont,.-bn_sqr4x_mont
+___
+}}}
+$code.=<<___;
 .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 .align 16
 ___
@@ -228,9 +1511,9 @@ $disp="%r9";
 
 $code.=<<___;
 .extern        __imp_RtlVirtualUnwind
-.type  se_handler,\@abi-omnipotent
+.type  mul_handler,\@abi-omnipotent
 .align 16
-se_handler:
+mul_handler:
        push    %rsi
        push    %rdi
        push    %rbx
@@ -245,15 +1528,20 @@ se_handler:
        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip
 
-       lea     .Lprologue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip<.Lprologue
-       jb      .Lin_prologue
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # end of prologue label
+       cmp     %r10,%rbx               # context->Rip<end of prologue label
+       jb      .Lcommon_seh_tail
 
        mov     152($context),%rax      # pull context->Rsp
 
-       lea     .Lepilogue(%rip),%r10
-       cmp     %r10,%rbx               # context->Rip>=.Lepilogue
-       jae     .Lin_prologue
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
 
        mov     192($context),%r10      # pull $num
        mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
@@ -272,7 +1560,53 @@ se_handler:
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15
 
-.Lin_prologue:
+       jmp     .Lcommon_seh_tail
+.size  mul_handler,.-mul_handler
+
+.type  sqr_handler,\@abi-omnipotent
+.align 16
+sqr_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       lea     .Lsqr4x_body(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip<.Lsqr4x_body
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       lea     .Lsqr4x_epilogue(%rip),%r10
+       cmp     %r10,%rbx               # context->Rip>=.Lsqr4x_epilogue
+       jae     .Lcommon_seh_tail
+
+       mov     56(%rax),%rax           # pull saved stack pointer
+       lea     48(%rax),%rax
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     -48(%rax),%r15
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+
+.Lcommon_seh_tail:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
@@ -310,7 +1644,7 @@ se_handler:
        pop     %rdi
        pop     %rsi
        ret
-.size  se_handler,.-se_handler
+.size  sqr_handler,.-sqr_handler
 
 .section       .pdata
 .align 4
@@ -318,11 +1652,27 @@ se_handler:
        .rva    .LSEH_end_bn_mul_mont
        .rva    .LSEH_info_bn_mul_mont
 
+       .rva    .LSEH_begin_bn_mul4x_mont
+       .rva    .LSEH_end_bn_mul4x_mont
+       .rva    .LSEH_info_bn_mul4x_mont
+
+       .rva    .LSEH_begin_bn_sqr4x_mont
+       .rva    .LSEH_end_bn_sqr4x_mont
+       .rva    .LSEH_info_bn_sqr4x_mont
+
 .section       .xdata
 .align 8
 .LSEH_info_bn_mul_mont:
        .byte   9,0,0,0
-       .rva    se_handler
+       .rva    mul_handler
+       .rva    .Lmul_body,.Lmul_epilogue       # HandlerData[]
+.LSEH_info_bn_mul4x_mont:
+       .byte   9,0,0,0
+       .rva    mul_handler
+       .rva    .Lmul4x_body,.Lmul4x_epilogue   # HandlerData[]
+.LSEH_info_bn_sqr4x_mont:
+       .byte   9,0,0,0
+       .rva    sqr_handler
 ___
 }
 
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755 (executable)
index 0000000..057cda2
--- /dev/null
@@ -0,0 +1,1070 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont_gather5(
+$rp="%rdi";    # BN_ULONG *rp,
+$ap="%rsi";    # const BN_ULONG *ap,
+$bp="%rdx";    # const BN_ULONG *bp,
+$np="%rcx";    # const BN_ULONG *np,
+$n0="%r8";     # const BN_ULONG *n0,
+$num="%r9";    # int num,
+               # int idx);     # 0 to 2^5-1, "index" in $bp holding
+                               # pre-computed powers of a', interlaced
+                               # in such manner that b[0] is $bp[idx],
+                               # b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type  bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+       test    \$3,${num}d
+       jnz     .Lmul_enter
+       cmp     \$8,${num}d
+       jb      .Lmul_enter
+       jmp     .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+       mov     ${num}d,${num}d
+       mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+___
+$code.=<<___ if ($win64);
+       lea     -0x28(%rsp),%rsp
+       movaps  %xmm6,(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+.Lmul_alloca:
+___
+$code.=<<___;
+       mov     %rsp,%rax
+       lea     2($num),%r11
+       neg     %r11
+       lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+2))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
+       mov     $bp,%r12                # reassign $bp
+___
+               $bp="%r12";
+               $STRIDE=2**5*8;         # 5 is "window size"
+               $N=$STRIDE/4;           # should match cache line size
+$code.=<<___;
+       mov     %r10,%r11
+       shr     \$`log($N/8)/log(2)`,%r10
+       and     \$`$N/8-1`,%r11
+       not     %r10
+       lea     .Lmagic_masks(%rip),%rax
+       and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
+       lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
+       movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
+       movq    8(%rax,%r10,8),%xmm5    # cache line contains element
+       movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
+       movq    24(%rax,%r10,8),%xmm7
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       movq    %xmm0,$m0               # m0=bp[0]
+
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$lo0
+       mov     ($np),%rax
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $lo0,$m1                # "tp[0]"*n0
+       mov     %rdx,$hi0
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$hi1
+
+       lea     1($j),$j                # j++
+       jmp     .L1st_enter
+
+.align 16
+.L1st:
+       add     %rax,$hi1
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       mov     $lo0,$hi0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.L1st_enter:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$hi0
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     1($j),$j                # j++
+       mov     %rdx,$lo0
+
+       mulq    $m1                     # np[j]*m1
+       cmp     $num,$j
+       jne     .L1st
+
+       movq    %xmm0,$m0               # bp[1]
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+       mov     $lo0,$hi0
+
+       xor     %rdx,%rdx
+       add     $hi0,$hi1
+       adc     \$0,%rdx
+       mov     $hi1,-8(%rsp,$num,8)
+       mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+       jmp     .Louter
+.align 16
+.Louter:
+       xor     $j,$j                   # j=0
+       mov     $n0,$m1
+       mov     (%rsp),$lo0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $lo0,$m1                # tp[0]*n0
+       mov     %rdx,$hi0
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     8(%rsp),$lo0            # tp[1]
+       mov     %rdx,$hi1
+
+       lea     1($j),$j                # j++
+       jmp     .Linner_enter
+
+.align 16
+.Linner:
+       add     %rax,$hi1
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.Linner_enter:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$hi0
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
+       mov     %rdx,$hi0
+       adc     \$0,$hi0
+       lea     1($j),$j                # j++
+
+       mulq    $m1                     # np[j]*m1
+       cmp     $num,$j
+       jne     .Linner
+
+       movq    %xmm0,$m0               # bp[i+1]
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+       xor     %rdx,%rdx
+       add     $hi0,$hi1
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # pull upmost overflow bit
+       adc     \$0,%rdx
+       mov     $hi1,-8(%rsp,$num,8)
+       mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+       cmp     $num,$i
+       jl      .Louter
+
+       xor     $i,$i                   # i=0 and clear CF!
+       mov     (%rsp),%rax             # tp[0]
+       lea     (%rsp),$ap              # borrow ap for tp
+       mov     $num,$j                 # j=num
+       jmp     .Lsub
+.align 16
+.Lsub: sbb     ($np,$i,8),%rax
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
+       mov     8($ap,$i,8),%rax        # tp[i+1]
+       lea     1($i),$i                # i++
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub
+
+       sbb     \$0,%rax                # handle upmost overflow bit
+       xor     $i,$i
+       and     %rax,$ap
+       not     %rax
+       mov     $rp,$np
+       and     %rax,$np
+       mov     $num,$j                 # j=num
+       or      $np,$ap                 # ap=borrow?tp:rp
+.align 16
+.Lcopy:                                        # copy or in-place refresh
+       mov     ($ap,$i,8),%rax
+       mov     $i,(%rsp,$i,8)          # zap temporary vector
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
+       lea     1($i),$i
+       sub     \$1,$j
+       jnz     .Lcopy
+
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsi),%xmm6
+       movaps  0x10(%rsi),%xmm7
+       lea     0x28(%rsi),%rsi
+___
+$code.=<<___;
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul_epilogue:
+       ret
+.size  bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type  bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+       mov     ${num}d,${num}d
+       mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+___
+$code.=<<___ if ($win64);
+       lea     -0x28(%rsp),%rsp
+       movaps  %xmm6,(%rsp)
+       movaps  %xmm7,0x10(%rsp)
+.Lmul4x_alloca:
+___
+$code.=<<___;
+       mov     %rsp,%rax
+       lea     4($num),%r11
+       neg     %r11
+       lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+4))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
+       mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
+       mov     %rdx,%r12               # reassign $bp
+___
+               $bp="%r12";
+               $STRIDE=2**5*8;         # 5 is "window size"
+               $N=$STRIDE/4;           # should match cache line size
+$code.=<<___;
+       mov     %r10,%r11
+       shr     \$`log($N/8)/log(2)`,%r10
+       and     \$`$N/8-1`,%r11
+       not     %r10
+       lea     .Lmagic_masks(%rip),%rax
+       and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
+       lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
+       movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
+       movq    8(%rax,%r10,8),%xmm5    # cache line contains element
+       movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
+       movq    24(%rax,%r10,8),%xmm7
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       movq    %xmm0,$m0               # m0=bp[0]
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$A[0]
+       mov     ($np),%rax
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $A[0],$m1               # "tp[0]"*n0
+       mov     %rdx,$A[1]
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       lea     4($j),$j                # j++
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)
+       mov     %rdx,$N[0]
+       jmp     .L1st4x
+.align 16
+.L1st4x:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .L1st4x
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       movq    %xmm0,$m0               # bp[1]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+.align 4
+.Louter4x:
+       xor     $j,$j                   # j=0
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     (%rsp),$A[0]
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $A[0],$m1               # tp[0]*n0
+       mov     %rdx,$A[1]
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # "$N[0]", discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       add     8(%rsp),$A[1]           # +tp[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
+       lea     4($j),$j                # j+=2
+       adc     \$0,%rdx
+       mov     %rdx,$N[0]
+       jmp     .Linner4x
+.align 16
+.Linner4x:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[0],-40(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .Linner4x
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     1($i),$i                # i++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       movq    %xmm0,$m0               # bp[i+1]
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       cmp     $num,$i
+       jl      .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+# Tail of bn_mul4x_mont_gather5: conditionally subtract the modulus np from
+# the accumulated result tp (on stack), then copy either tp or tp-np out to
+# rp while simultaneously wiping the stack copy with zeros (%xmm0).
+$code.=<<___;
+       mov     16(%rsp,$num,8),$rp     # restore $rp
+       mov     0(%rsp),@ri[0]          # tp[0]
+       pxor    %xmm0,%xmm0
+       mov     8(%rsp),@ri[1]          # tp[1]
+       shr     \$2,$num                # num/=4
+       lea     (%rsp),$ap              # borrow ap for tp
+       xor     $i,$i                   # i=0 and clear CF!
+
+       sub     0($np),@ri[0]
+       mov     16($ap),@ri[2]          # tp[2]
+       mov     24($ap),@ri[3]          # tp[3]
+       sbb     8($np),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsub4x
+.align 16
+.Lsub4x:
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     16($np,$i,8),@ri[2]
+       mov     32($ap,$i,8),@ri[0]     # tp[i+1]
+       mov     40($ap,$i,8),@ri[1]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       sbb     32($np,$i,8),@ri[0]
+       mov     48($ap,$i,8),@ri[2]
+       mov     56($ap,$i,8),@ri[3]
+       sbb     40($np,$i,8),@ri[1]
+       lea     4($i),$i                # i++
+       dec     $j                      # doesn't affect CF!
+       jnz     .Lsub4x
+
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     32($ap,$i,8),@ri[0]     # load overflow bit
+       sbb     16($np,$i,8),@ri[2]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       xor     $i,$i                   # i=0
+       and     @ri[0],$ap
+       not     @ri[0]
+       mov     $rp,$np
+       and     @ri[0],$np
+       lea     -1($num),$j
+       or      $np,$ap                 # ap=borrow?tp:rp
+
+       movdqu  ($ap),%xmm1
+       movdqa  %xmm0,(%rsp)
+       movdqu  %xmm1,($rp)
+       jmp     .Lcopy4x
+.align 16
+.Lcopy4x:                                      # copy or in-place refresh
+       movdqu  16($ap,$i),%xmm2
+       movdqu  32($ap,$i),%xmm1
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+       movdqa  %xmm0,32(%rsp,$i)
+       movdqu  %xmm1,32($rp,$i)
+       lea     32($i),$i
+       dec     $j
+       jnz     .Lcopy4x
+
+       shl     \$2,$num
+       movdqu  16($ap,$i),%xmm2
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsi),%xmm6
+       movaps  0x10(%rsi),%xmm7
+       lea     0x28(%rsi),%rsi
+___
+$code.=<<___;
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul4x_epilogue:
+       ret
+.size  bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+# bn_scatter5(inp,num,tbl,idx) / bn_gather5(out,num,tbl,idx):
+# scatter/gather helpers for the power-of-5 window.  Table entries are
+# interleaved at $STRIDE (= 32*8) byte intervals so that a gather touches
+# the same set of cache lines for every $idx (cache-timing defence).
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+                               ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+my $out=$inp;                  # bn_gather5 writes the result over arg 1
+my $STRIDE=2**5*8;             # 5 is "window size"
+my $N=$STRIDE/4;
+
+$code.=<<___;
+.globl bn_scatter5
+.type  bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+       cmp     \$0, $num
+       jz      .Lscatter_epilogue
+       lea     ($tbl,$idx,8),$tbl
+.Lscatter:
+       mov     ($inp),%rax
+       lea     8($inp),$inp
+       mov     %rax,($tbl)
+       lea     32*8($tbl),$tbl
+       sub     \$1,$num
+       jnz     .Lscatter
+.Lscatter_epilogue:
+       ret
+.size  bn_scatter5,.-bn_scatter5
+
+.globl bn_gather5
+.type  bn_gather5,\@abi-omnipotent
+.align 16
+bn_gather5:
+___
+$code.=<<___ if ($win64);
+.LSEH_begin_bn_gather5:
+       # I can't trust assembler to use specific encoding:-(
+       .byte   0x48,0x83,0xec,0x28             #sub    \$0x28,%rsp
+       .byte   0x0f,0x29,0x34,0x24             #movaps %xmm6,(%rsp)
+       .byte   0x0f,0x29,0x7c,0x24,0x10        #movaps %xmm7,0x10(%rsp)
+___
+$code.=<<___;
+       mov     $idx,%r11
+       shr     \$`log($N/8)/log(2)`,$idx
+       and     \$`$N/8-1`,%r11
+       not     $idx
+       lea     .Lmagic_masks(%rip),%rax
+       and     \$`2**5/($N/8)-1`,$idx  # 5 is "window size"
+       lea     96($tbl,%r11,8),$tbl    # pointer within 1st cache line
+       movq    0(%rax,$idx,8),%xmm4    # set of masks denoting which
+       movq    8(%rax,$idx,8),%xmm5    # cache line contains element
+       movq    16(%rax,$idx,8),%xmm6   # denoted by 7th argument
+       movq    24(%rax,$idx,8),%xmm7
+       jmp     .Lgather
+.align 16
+.Lgather:
+       movq    `0*$STRIDE/4-96`($tbl),%xmm0
+       movq    `1*$STRIDE/4-96`($tbl),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($tbl),%xmm2
+       pand    %xmm5,%xmm1
+       movq    `3*$STRIDE/4-96`($tbl),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+       por     %xmm2,%xmm0
+       lea     $STRIDE($tbl),$tbl
+       por     %xmm3,%xmm0
+
+       movq    %xmm0,($out)            # m0=bp[0]
+       lea     8($out),$out
+       sub     \$1,$num
+       jnz     .Lgather
+___
+$code.=<<___ if ($win64);
+       movaps  (%rsp),%xmm6            # restore xmm6/xmm7 saved in prologue
+       movaps  0x10(%rsp),%xmm7        # (Win64 ABI: xmm6-xmm15 callee-saved)
+       lea     0x28(%rsp),%rsp
+___
+$code.=<<___;
+       ret
+.LSEH_end_bn_gather5:
+.size  bn_gather5,.-bn_gather5
+___
+}
+# .Lmagic_masks: four qword masks per index; exactly one of the four reads
+# in .Lgather gets an all-ones mask, selecting which interleaved cache line
+# holds the wanted element.
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+       .long   0,0, 0,0, 0,0, -1,-1
+       .long   0,0, 0,0, 0,0,  0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
+#              CONTEXT *context,DISPATCHER_CONTEXT *disp)
+if ($win64) {
+$rec="%rcx";
+$frame="%rdx";
+$context="%r8";
+$disp="%r9";
+
+# Win64 SEH language-specific handler shared by bn_mul_mont_gather5 and
+# bn_mul4x_mont_gather5.  HandlerData[] carries three labels (see the
+# .xdata entries below): end-of-prologue, end-of-alloca, epilogue.  The
+# handler classifies context->Rip against them and recovers the caller's
+# registers/stack pointer accordingly, then forwards to RtlVirtualUnwind.
+$code.=<<___;
+.extern        __imp_RtlVirtualUnwind
+.type  mul_handler,\@abi-omnipotent
+.align 16
+mul_handler:
+       push    %rsi
+       push    %rdi
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+       pushfq
+       sub     \$64,%rsp
+
+       mov     120($context),%rax      # pull context->Rax
+       mov     248($context),%rbx      # pull context->Rip
+
+       mov     8($disp),%rsi           # disp->ImageBase
+       mov     56($disp),%r11          # disp->HandlerData
+
+       mov     0(%r11),%r10d           # HandlerData[0]
+       lea     (%rsi,%r10),%r10        # end of prologue label
+       cmp     %r10,%rbx               # context->Rip<end of prologue label
+       jb      .Lcommon_seh_tail
+
+       lea     `40+48`(%rax),%rax
+
+       mov     4(%r11),%r10d           # HandlerData[1]
+       lea     (%rsi,%r10),%r10        # end of alloca label
+       cmp     %r10,%rbx               # context->Rip<end of alloca label
+       jb      .Lcommon_seh_tail
+
+       mov     152($context),%rax      # pull context->Rsp
+
+       mov     8(%r11),%r10d           # HandlerData[2]
+       lea     (%rsi,%r10),%r10        # epilogue label
+       cmp     %r10,%rbx               # context->Rip>=epilogue label
+       jae     .Lcommon_seh_tail
+
+       mov     192($context),%r10      # pull $num
+       mov     8(%rax,%r10,8),%rax     # pull saved stack pointer
+
+       movaps  (%rax),%xmm0
+       movaps  16(%rax),%xmm1
+       lea     `40+48`(%rax),%rax
+
+       mov     -8(%rax),%rbx
+       mov     -16(%rax),%rbp
+       mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     -48(%rax),%r15
+       mov     %rbx,144($context)      # restore context->Rbx
+       mov     %rbp,160($context)      # restore context->Rbp
+       mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
+       movups  %xmm0,512($context)     # restore context->Xmm6
+       movups  %xmm1,528($context)     # restore context->Xmm7
+
+.Lcommon_seh_tail:
+       mov     8(%rax),%rdi
+       mov     16(%rax),%rsi
+       mov     %rax,152($context)      # restore context->Rsp
+       mov     %rsi,168($context)      # restore context->Rsi
+       mov     %rdi,176($context)      # restore context->Rdi
+
+       mov     40($disp),%rdi          # disp->ContextRecord
+       mov     $context,%rsi           # context
+       mov     \$154,%ecx              # sizeof(CONTEXT)
+       .long   0xa548f3fc              # cld; rep movsq
+
+       mov     $disp,%rsi
+       xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
+       mov     8(%rsi),%rdx            # arg2, disp->ImageBase
+       mov     0(%rsi),%r8             # arg3, disp->ControlPc
+       mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
+       mov     40(%rsi),%r10           # disp->ContextRecord
+       lea     56(%rsi),%r11           # &disp->HandlerData
+       lea     24(%rsi),%r12           # &disp->EstablisherFrame
+       mov     %r10,32(%rsp)           # arg5
+       mov     %r11,40(%rsp)           # arg6
+       mov     %r12,48(%rsp)           # arg7
+       mov     %rcx,56(%rsp)           # arg8, (NULL)
+       call    *__imp_RtlVirtualUnwind(%rip)
+
+       mov     \$1,%eax                # ExceptionContinueSearch
+       add     \$64,%rsp
+       popfq
+       pop     %r15
+       pop     %r14
+       pop     %r13
+       pop     %r12
+       pop     %rbp
+       pop     %rbx
+       pop     %rdi
+       pop     %rsi
+       ret
+.size  mul_handler,.-mul_handler
+
+# .pdata/.xdata: register both mont_gather5 entry points and bn_gather5
+# with the unwinder.  The bn_gather5 entry uses raw UNWIND_CODE bytes
+# matching the hand-encoded prologue (sub rsp,0x28; save xmm6/xmm7) —
+# NOTE(review): these byte sequences must stay in sync with the
+# .LSEH_begin_bn_gather5 prologue above; verify if either changes.
+$code.=<<___;
+.section       .pdata
+.align 4
+       .rva    .LSEH_begin_bn_mul_mont_gather5
+       .rva    .LSEH_end_bn_mul_mont_gather5
+       .rva    .LSEH_info_bn_mul_mont_gather5
+
+       .rva    .LSEH_begin_bn_mul4x_mont_gather5
+       .rva    .LSEH_end_bn_mul4x_mont_gather5
+       .rva    .LSEH_info_bn_mul4x_mont_gather5
+
+       .rva    .LSEH_begin_bn_gather5
+       .rva    .LSEH_end_bn_gather5
+       .rva    .LSEH_info_bn_gather5
+
+.section       .xdata
+.align 8
+.LSEH_info_bn_mul_mont_gather5:
+       .byte   9,0,0,0
+       .rva    mul_handler
+       .rva    .Lmul_alloca,.Lmul_body,.Lmul_epilogue          # HandlerData[]
+.align 8
+.LSEH_info_bn_mul4x_mont_gather5:
+       .byte   9,0,0,0
+       .rva    mul_handler
+       .rva    .Lmul4x_alloca,.Lmul4x_body,.Lmul4x_epilogue    # HandlerData[]
+.align 8
+.LSEH_info_bn_gather5:
+        .byte   0x01,0x0d,0x05,0x00
+        .byte   0x0d,0x78,0x01,0x00    #movaps 0x10(rsp),xmm7
+        .byte   0x08,0x68,0x00,0x00    #movaps (rsp),xmm6
+        .byte   0x04,0x42,0x00,0x00    #sub    rsp,0x28
+.align 8
+___
+}
+
+# Expand every `...` construct by eval()-ing its contents in Perl (constant
+# folding for expressions like `3*$STRIDE/4-96`), then emit the finished
+# assembly on stdout for the assembler stage of the build.
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
index e6643f8..f611a2d 100755 (executable)
@@ -95,50 +95,44 @@ sub ROUND_00_15()
 { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
 
 $code.=<<___;
-       mov     $e,$a0
-       mov     $e,$a1
+       ror     \$`$Sigma1[2]-$Sigma1[1]`,$a0
        mov     $f,$a2
+       mov     $T1,`$SZ*($i&0xf)`(%rsp)
 
-       ror     \$$Sigma1[0],$a0
-       ror     \$$Sigma1[1],$a1
+       ror     \$`$Sigma0[2]-$Sigma0[1]`,$a1
+       xor     $e,$a0
        xor     $g,$a2                  # f^g
 
-       xor     $a1,$a0
-       ror     \$`$Sigma1[2]-$Sigma1[1]`,$a1
+       ror     \$`$Sigma1[1]-$Sigma1[0]`,$a0
+       add     $h,$T1                  # T1+=h
+       xor     $a,$a1
+
+       add     ($Tbl,$round,$SZ),$T1   # T1+=K[round]
        and     $e,$a2                  # (f^g)&e
-       mov     $T1,`$SZ*($i&0xf)`(%rsp)
+       mov     $b,$h
 
-       xor     $a1,$a0                 # Sigma1(e)
+       ror     \$`$Sigma0[1]-$Sigma0[0]`,$a1
+       xor     $e,$a0
        xor     $g,$a2                  # Ch(e,f,g)=((f^g)&e)^g
-       add     $h,$T1                  # T1+=h
-
-       mov     $a,$h
-       add     $a0,$T1                 # T1+=Sigma1(e)
 
+       xor     $c,$h                   # b^c
+       xor     $a,$a1
        add     $a2,$T1                 # T1+=Ch(e,f,g)
-       mov     $a,$a0
-       mov     $a,$a1
+       mov     $b,$a2
 
-       ror     \$$Sigma0[0],$h
-       ror     \$$Sigma0[1],$a0
-       mov     $a,$a2
-       add     ($Tbl,$round,$SZ),$T1   # T1+=K[round]
+       ror     \$$Sigma1[0],$a0        # Sigma1(e)
+       and     $a,$h                   # h=(b^c)&a
+       and     $c,$a2                  # b&c
 
-       xor     $a0,$h
-       ror     \$`$Sigma0[2]-$Sigma0[1]`,$a0
-       or      $c,$a1                  # a|c
+       ror     \$$Sigma0[0],$a1        # Sigma0(a)
+       add     $a0,$T1                 # T1+=Sigma1(e)
+       add     $a2,$h                  # h+=b&c (completes +=Maj(a,b,c))
 
-       xor     $a0,$h                  # h=Sigma0(a)
-       and     $c,$a2                  # a&c
        add     $T1,$d                  # d+=T1
-
-       and     $b,$a1                  # (a|c)&b
        add     $T1,$h                  # h+=T1
-
-       or      $a2,$a1                 # Maj(a,b,c)=((a|c)&b)|(a&c)
        lea     1($round),$round        # round++
+       add     $a1,$h                  # h+=Sigma0(a)
 
-       add     $a1,$h                  # h+=Maj(a,b,c)
 ___
 }
 
@@ -147,32 +141,30 @@ sub ROUND_16_XX()
 
 $code.=<<___;
        mov     `$SZ*(($i+1)&0xf)`(%rsp),$a0
-       mov     `$SZ*(($i+14)&0xf)`(%rsp),$T1
-
-       mov     $a0,$a2
+       mov     `$SZ*(($i+14)&0xf)`(%rsp),$a1
+       mov     $a0,$T1
+       mov     $a1,$a2
 
+       ror     \$`$sigma0[1]-$sigma0[0]`,$T1
+       xor     $a0,$T1
        shr     \$$sigma0[2],$a0
-       ror     \$$sigma0[0],$a2
-
-       xor     $a2,$a0
-       ror     \$`$sigma0[1]-$sigma0[0]`,$a2
 
-       xor     $a2,$a0                 # sigma0(X[(i+1)&0xf])
-       mov     $T1,$a1
+       ror     \$$sigma0[0],$T1
+       xor     $T1,$a0                 # sigma0(X[(i+1)&0xf])
+       mov     `$SZ*(($i+9)&0xf)`(%rsp),$T1
 
-       shr     \$$sigma1[2],$T1
-       ror     \$$sigma1[0],$a1
-
-       xor     $a1,$T1
-       ror     \$`$sigma1[1]-$sigma1[0]`,$a1
-
-       xor     $a1,$T1                 # sigma1(X[(i+14)&0xf])
+       ror     \$`$sigma1[1]-$sigma1[0]`,$a2
+       xor     $a1,$a2
+       shr     \$$sigma1[2],$a1
 
+       ror     \$$sigma1[0],$a2
        add     $a0,$T1
-
-       add     `$SZ*(($i+9)&0xf)`(%rsp),$T1
+       xor     $a2,$a1                 # sigma1(X[(i+14)&0xf])
 
        add     `$SZ*($i&0xf)`(%rsp),$T1
+       mov     $e,$a0
+       add     $a1,$T1
+       mov     $a,$a1
 ___
        &ROUND_00_15(@_);
 }
@@ -219,6 +211,8 @@ $func:
 ___
        for($i=0;$i<16;$i++) {
                $code.="        mov     $SZ*$i($inp),$T1\n";
+               $code.="        mov     @ROT[4],$a0\n";
+               $code.="        mov     @ROT[0],$a1\n";
                $code.="        bswap   $T1\n";
                &ROUND_00_15($i,@ROT);
                unshift(@ROT,pop(@ROT));
index 9782dd6..7b7b93b 100644 (file)
@@ -23,7 +23,7 @@ print<<___;
        call    OPENSSL_cpuid_setup
 
 .hidden        OPENSSL_ia32cap_P
-.comm  OPENSSL_ia32cap_P,8
+.comm  OPENSSL_ia32cap_P,8,4
 
 .text