This commit completes recent modular exponentiation optimizations on
authorAndy Polyakov <appro@openssl.org>
Fri, 12 Aug 2011 16:44:32 +0000 (16:44 +0000)
committerAndy Polyakov <appro@openssl.org>
Fri, 12 Aug 2011 16:44:32 +0000 (16:44 +0000)
x86_64 platform. It targets specifically RSA1024 sign (using ideas
from http://eprint.iacr.org/2011/239) and adds more than 10% on most
platforms. Overall performance improvement relative to 1.0.0 is ~40%
in average, with best result of 54% on Westmere. Incidentally ~40%
is average improvement even for longer key lengths.

Configure
TABLE
crypto/bn/Makefile
crypto/bn/asm/x86_64-mont.pl
crypto/bn/asm/x86_64-mont5.pl [new file with mode: 0755]
crypto/bn/bn_exp.c

index 8b5efd6b1be154e1c84e4d7aa3a8f9313eed9e30..9d9bd72b8bb85223f1868b3116132141190702d6 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -127,7 +127,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o x86-gf2m.o:des-586.o crypt5
 
 my $x86_elf_asm="$x86_asm:elf";
 
-my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
+my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o";
 my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::ghash-ia64.o:void";
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::ghash-sparcv9.o:void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
@@ -513,9 +513,9 @@ my %table=(
 #
 # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
 "VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
-"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
+"VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
 "debug-VC-WIN64I","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ghash-ia64.o:ias:win32",
-"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:ghash-x86_64.o:auto:win32",
+"debug-VC-WIN64A","cl:-W3 -Gs0 -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:".eval{my $asm=$x86_64_asm;$asm=~s/x86_64-gcc\.o/bn_asm.o/;$asm}.":auto:win32",
 # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
 # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
 "VC-WIN32","cl:-W3 -Gs0 -GF -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE:::WIN32::BN_LLONG RC4_INDEX EXPORT_VAR_AS_FN ${x86_gcc_opts}:${x86_asm}:win32n:win32",
@@ -1510,6 +1510,7 @@ $cflags.=" -DOPENSSL_BN_ASM_PART_WORDS" if ($bn_obj =~ /bn-586/);
 $cflags.=" -DOPENSSL_IA32_SSE2" if (!$no_sse2 && $bn_obj =~ /86/);
 
 $cflags.=" -DOPENSSL_BN_ASM_MONT" if ($bn_obj =~ /-mont/);
+$cflags.=" -DOPENSSL_BN_ASM_MONT5" if ($bn_obj =~ /-mont5/);
 $cflags.=" -DOPENSSL_BN_ASM_GF2m" if ($bn_obj =~ /-gf2m/);
 
 if ($fips)
diff --git a/TABLE b/TABLE
index f520d7116b13223548d4139be9c411e79d64cdde..b5aea36d973ec9cd001a8da6a07dbb4709422a19 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -297,7 +297,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -777,7 +777,7 @@ $sys_id       = WIN64A
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = bn_asm.o x86_64-mont.o
+$bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -1385,7 +1385,7 @@ $sys_id       = MACOSX
 $lflags       = -Wl,-search_paths_first%
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -1545,9 +1545,9 @@ $sys_id       = WIN64A
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = bn_asm.o x86_64-mont.o
+$bn_obj       = bn_asm.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
-$aes_obj      = aes-x86_64.o
+$aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
 $md5_obj      = md5-x86_64.o
 $sha1_obj     = sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o
@@ -2313,7 +2313,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -2505,7 +2505,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -2569,7 +2569,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -4073,7 +4073,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -4233,7 +4233,7 @@ $sys_id       = MINGW64
 $lflags       = -lws2_32 -lgdi32 -lcrypt32
 $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -5193,7 +5193,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
@@ -5225,7 +5225,7 @@ $sys_id       =
 $lflags       = -lsocket -lnsl -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL
 $cpuid_obj    = x86_64cpuid.o
-$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-gf2m.o modexp512-x86_64.o
+$bn_obj       = x86_64-gcc.o x86_64-mont.o x86_64-mont5.o x86_64-gf2m.o modexp512-x86_64.o
 $des_obj      = 
 $aes_obj      = aes-x86_64.o aesni-x86_64.o
 $bf_obj       = 
index 4ae6af5060f6ac1b80c73caa9b5b4e975c399792..1fd8e33371efe50aac6c5d8ef5453d689583d386 100644 (file)
@@ -98,6 +98,8 @@ x86_64-gcc.o: asm/x86_64-gcc.c
        $(CC) $(CFLAGS) -c -o $@ asm/x86_64-gcc.c
 x86_64-mont.s: asm/x86_64-mont.pl
        $(PERL) asm/x86_64-mont.pl $(PERLASM_SCHEME) > $@
+x86_64-mont5.s:        asm/x86_64-mont5.pl
+       $(PERL) asm/x86_64-mont5.pl $(PERLASM_SCHEME) > $@
 x86_64-gf2m.s:  asm/x86_64-gf2m.pl
        $(PERL) asm/x86_64-gf2m.pl $(PERLASM_SCHEME) > $@
 modexp512-x86_64.s:    asm/modexp512-x86_64.pl
index 4f330b81f382c7dedc816336925f0743367491f3..c2a308ddfa853e546414e7d210406d58bb846a3d 100755 (executable)
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -142,6 +142,7 @@ $code.=<<___;
        jne     .L1st
 
        add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
        adc     \$0,%rdx
@@ -161,7 +162,6 @@ $code.=<<___;
 .Louter:
        mov     ($bp,$i,8),$m0          # m0=bp[i]
        xor     $j,$j                   # j=0
-       mov     ($ap),%rax              # ap[0]
        mov     $n0,$m1
        mov     (%rsp),$lo0
        mulq    $m0                     # ap[0]*bp[i]
@@ -208,6 +208,7 @@ $code.=<<___;
        jne     .Linner
 
        add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
        adc     \$0,%rdx
        add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
        mov     (%rsp,$j,8),$lo0
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl
new file mode 100755 (executable)
index 0000000..b7b1ef6
--- /dev/null
@@ -0,0 +1,834 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# August 2011.
+#
+# Companion to x86_64-mont.pl that optimizes cache-timing attack
+# countermeasures. The subroutines are produced by replacing bp[i]
+# references in their x86_64-mont.pl counterparts with cache-neutral
+# references to powers table computed in BN_mod_exp_mont_consttime.
+# In addition subroutine that scatters elements of the powers table
+# is implemented, so that scatter-/gathering can be tuned without
+# bn_exp.c modifications.
+
+$flavour = shift;
+$output  = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour $output";
+
+# int bn_mul_mont_gather5(
+$rp="%rdi";    # BN_ULONG *rp,
+$ap="%rsi";    # const BN_ULONG *ap,
+$bp="%rdx";    # const BN_ULONG *bp,
+$np="%rcx";    # const BN_ULONG *np,
+$n0="%r8";     # const BN_ULONG *n0,
+$num="%r9";    # int num,
+               # int idx);     # 0 to 2^5-1, "index" in $bp holding
+                               # pre-computed powers of a', interlaced
+                               # in such manner that b[0] is $bp[idx],
+                               # b[1] is [2^5+idx], etc.
+$lo0="%r10";
+$hi0="%r11";
+$hi1="%r13";
+$i="%r14";
+$j="%r15";
+$m0="%rbx";
+$m1="%rbp";
+
+$code=<<___;
+.text
+
+.globl bn_mul_mont_gather5
+.type  bn_mul_mont_gather5,\@function,6
+.align 64
+bn_mul_mont_gather5:
+       test    \$3,${num}d
+       jnz     .Lmul_enter
+       cmp     \$8,${num}d
+       jb      .Lmul_enter
+       jmp     .Lmul4x_enter
+
+.align 16
+.Lmul_enter:
+       mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     ${num}d,${num}d
+       lea     2($num),%r11
+       mov     %rsp,%rax
+       neg     %r11
+       lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+2))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul_body:
+       mov     $bp,%r12                # reassign $bp
+___
+               $bp="%r12";
+               $STRIDE=2**5*8;         # 5 is "window size"
+               $N=$STRIDE/4;           # should match cache line size
+$code.=<<___;
+       mov     %r10,%r11
+       shr     \$`log($N/8)/log(2)`,%r10
+       and     \$`$N/8-1`,%r11
+       not     %r10
+       lea     .Lmagic_masks(%rip),%rax
+       and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
+       lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
+       movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
+       movq    8(%rax,%r10,8),%xmm5    # cache line contains element
+       movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
+       movq    24(%rax,%r10,8),%xmm7
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       movq    %xmm0,$m0               # m0=bp[0]
+
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$lo0
+       mov     ($np),%rax
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $lo0,$m1                # "tp[0]"*n0
+       mov     %rdx,$hi0
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$hi1
+
+       lea     1($j),$j                # j++
+       jmp     .L1st_enter
+
+.align 16
+.L1st:
+       add     %rax,$hi1
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       mov     $lo0,$hi0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.L1st_enter:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$hi0
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     1($j),$j                # j++
+       mov     %rdx,$lo0
+
+       mulq    $m1                     # np[j]*m1
+       cmp     $num,$j
+       jne     .L1st
+
+       movq    %xmm0,$m0               # bp[1]
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $hi0,$hi1               # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+       mov     $lo0,$hi0
+
+       xor     %rdx,%rdx
+       add     $hi0,$hi1
+       adc     \$0,%rdx
+       mov     $hi1,-8(%rsp,$num,8)
+       mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+       jmp     .Louter
+.align 16
+.Louter:
+       xor     $j,$j                   # j=0
+       mov     $n0,$m1
+       mov     (%rsp),$lo0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$lo0               # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $lo0,$m1                # tp[0]*n0
+       mov     %rdx,$hi0
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$lo0               # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     8(%rsp),$lo0            # tp[1]
+       mov     %rdx,$hi1
+
+       lea     1($j),$j                # j++
+       jmp     .Linner_enter
+
+.align 16
+.Linner:
+       add     %rax,$hi1
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+.Linner_enter:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$hi0
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     $hi0,$lo0               # ap[j]*bp[i]+tp[j]
+       mov     %rdx,$hi0
+       adc     \$0,$hi0
+       lea     1($j),$j                # j++
+
+       mulq    $m1                     # np[j]*m1
+       cmp     $num,$j
+       jne     .Linner
+
+       movq    %xmm0,$m0               # bp[i+1]
+
+       add     %rax,$hi1
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # np[j]*m1+ap[j]*bp[i]+tp[j]
+       mov     (%rsp,$j,8),$lo0
+       adc     \$0,%rdx
+       mov     $hi1,-16(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$hi1
+
+       xor     %rdx,%rdx
+       add     $hi0,$hi1
+       adc     \$0,%rdx
+       add     $lo0,$hi1               # pull upmost overflow bit
+       adc     \$0,%rdx
+       mov     $hi1,-8(%rsp,$num,8)
+       mov     %rdx,(%rsp,$num,8)      # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+       cmp     $num,$i
+       jl      .Louter
+
+       xor     $i,$i                   # i=0 and clear CF!
+       mov     (%rsp),%rax             # tp[0]
+       lea     (%rsp),$ap              # borrow ap for tp
+       mov     $num,$j                 # j=num
+       jmp     .Lsub
+.align 16
+.Lsub: sbb     ($np,$i,8),%rax
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
+       mov     8($ap,$i,8),%rax        # tp[i+1]
+       lea     1($i),$i                # i++
+       dec     $j                      # doesnn't affect CF!
+       jnz     .Lsub
+
+       sbb     \$0,%rax                # handle upmost overflow bit
+       xor     $i,$i
+       and     %rax,$ap
+       not     %rax
+       mov     $rp,$np
+       and     %rax,$np
+       mov     $num,$j                 # j=num
+       or      $np,$ap                 # ap=borrow?tp:rp
+.align 16
+.Lcopy:                                        # copy or in-place refresh
+       mov     ($ap,$i,8),%rax
+       mov     $i,(%rsp,$i,8)          # zap temporary vector
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]
+       lea     1($i),$i
+       sub     \$1,$j
+       jnz     .Lcopy
+
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul_epilogue:
+       ret
+.size  bn_mul_mont_gather5,.-bn_mul_mont_gather5
+___
+{{{
+my @A=("%r10","%r11");
+my @N=("%r13","%rdi");
+$code.=<<___;
+.type  bn_mul4x_mont_gather5,\@function,6
+.align 16
+bn_mul4x_mont_gather5:
+.Lmul4x_enter:
+       mov     `($win64?56:8)`(%rsp),%r10d     # load 7th argument
+       push    %rbx
+       push    %rbp
+       push    %r12
+       push    %r13
+       push    %r14
+       push    %r15
+
+       mov     ${num}d,${num}d
+       lea     4($num),%r11
+       mov     %rsp,%rax               # !!!!
+       neg     %r11
+       lea     (%rsp,%r11,8),%rsp      # tp=alloca(8*(num+4))
+       and     \$-1024,%rsp            # minimize TLB usage
+
+       mov     %rax,8(%rsp,$num,8)     # tp[num+1]=%rsp
+.Lmul4x_body:
+       mov     $rp,16(%rsp,$num,8)     # tp[num+2]=$rp
+       mov     %rdx,%r12               # reassign $bp
+___
+               $bp="%r12";
+               $STRIDE=2**5*8;         # 5 is "window size"
+               $N=$STRIDE/4;           # should match cache line size
+$code.=<<___;
+       mov     %r10,%r11
+       shr     \$`log($N/8)/log(2)`,%r10
+       and     \$`$N/8-1`,%r11
+       not     %r10
+       lea     .Lmagic_masks(%rip),%rax
+       and     \$`2**5/($N/8)-1`,%r10  # 5 is "window size"
+       lea     96($bp,%r11,8),$bp      # pointer within 1st cache line
+       movq    0(%rax,%r10,8),%xmm4    # set of masks denoting which
+       movq    8(%rax,%r10,8),%xmm5    # cache line contains element
+       movq    16(%rax,%r10,8),%xmm6   # denoted by 7th argument
+       movq    24(%rax,%r10,8),%xmm7
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       movq    %xmm0,$m0               # m0=bp[0]
+       mov     ($n0),$n0               # pull n0[0] value
+       mov     ($ap),%rax
+
+       xor     $i,$i                   # i=0
+       xor     $j,$j                   # j=0
+
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[0]
+       mov     %rax,$A[0]
+       mov     ($np),%rax
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $A[0],$m1               # "tp[0]"*n0
+       mov     %rdx,$A[1]
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       lea     4($j),$j                # j++
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)
+       mov     %rdx,$N[0]
+       jmp     .L1st4x
+.align 16
+.L1st4x:
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .L1st4x
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[0]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[0]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       movq    %xmm0,$m0               # bp[1]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       lea     1($i),$i                # i++
+.align 4
+.Louter4x:
+       xor     $j,$j                   # j=0
+       movq    `0*$STRIDE/4-96`($bp),%xmm0
+       movq    `1*$STRIDE/4-96`($bp),%xmm1
+       pand    %xmm4,%xmm0
+       movq    `2*$STRIDE/4-96`($bp),%xmm2
+       pand    %xmm5,%xmm1
+
+       mov     (%rsp),$A[0]
+       mov     $n0,$m1
+       mulq    $m0                     # ap[0]*bp[i]
+       add     %rax,$A[0]              # ap[0]*bp[i]+tp[0]
+       mov     ($np),%rax
+       adc     \$0,%rdx
+
+       movq    `3*$STRIDE/4-96`($bp),%xmm3
+       pand    %xmm6,%xmm2
+       por     %xmm1,%xmm0
+       pand    %xmm7,%xmm3
+
+       imulq   $A[0],$m1               # tp[0]*n0
+       mov     %rdx,$A[1]
+
+       por     %xmm2,%xmm0
+       lea     $STRIDE($bp),$bp
+       por     %xmm3,%xmm0
+
+       mulq    $m1                     # np[0]*m1
+       add     %rax,$A[0]              # "$N[0]", discarded
+       mov     8($ap),%rax
+       adc     \$0,%rdx
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np),%rax
+       adc     \$0,%rdx
+       add     8(%rsp),$A[1]           # +tp[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     16($ap),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]             # np[j]*m1+ap[j]*bp[i]+tp[j]
+       lea     4($j),$j                # j+=2
+       adc     \$0,%rdx
+       mov     $N[1],(%rsp)            # tp[j-1]
+       mov     %rdx,$N[0]
+       jmp     .Linner4x
+.align 16
+.Linner4x:
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     ($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     (%rsp,$j,8),$A[0]       # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-8(%rsp,$j,8)     # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     4($j),$j                # j++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     -16($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-32(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+       cmp     $num,$j
+       jl      .Linner4x
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[0]
+       mov     -16($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -16(%rsp,$j,8),$A[0]    # ap[j]*bp[i]+tp[j]
+       adc     \$0,%rdx
+       mov     %rdx,$A[1]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[0]
+       mov     -8($ap,$j,8),%rax
+       adc     \$0,%rdx
+       add     $A[0],$N[0]
+       adc     \$0,%rdx
+       mov     $N[0],-24(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[1]
+
+       mulq    $m0                     # ap[j]*bp[i]
+       add     %rax,$A[1]
+       mov     -8($np,$j,8),%rax
+       adc     \$0,%rdx
+       add     -8(%rsp,$j,8),$A[1]
+       adc     \$0,%rdx
+       lea     1($i),$i                # i++
+       mov     %rdx,$A[0]
+
+       mulq    $m1                     # np[j]*m1
+       add     %rax,$N[1]
+       mov     ($ap),%rax              # ap[0]
+       adc     \$0,%rdx
+       add     $A[1],$N[1]
+       adc     \$0,%rdx
+       mov     $N[1],-16(%rsp,$j,8)    # tp[j-1]
+       mov     %rdx,$N[0]
+
+       movq    %xmm0,$m0               # bp[i+1]
+
+       xor     $N[1],$N[1]
+       add     $A[0],$N[0]
+       adc     \$0,$N[1]
+       add     (%rsp,$num,8),$N[0]     # pull upmost overflow bit
+       adc     \$0,$N[1]
+       mov     $N[0],-8(%rsp,$j,8)
+       mov     $N[1],(%rsp,$j,8)       # store upmost overflow bit
+
+       cmp     $num,$i
+       jl      .Louter4x
+___
+{
+my @ri=("%rax","%rdx",$m0,$m1);
+$code.=<<___;
+       mov     16(%rsp,$num,8),$rp     # restore $rp
+       mov     0(%rsp),@ri[0]          # tp[0]
+       pxor    %xmm0,%xmm0
+       mov     8(%rsp),@ri[1]          # tp[1]
+       shr     \$2,$num                # num/=4
+       lea     (%rsp),$ap              # borrow ap for tp
+       xor     $i,$i                   # i=0 and clear CF!
+
+       sub     0($np),@ri[0]
+       mov     16($ap),@ri[2]          # tp[2]
+       mov     24($ap),@ri[3]          # tp[3]
+       sbb     8($np),@ri[1]
+       lea     -1($num),$j             # j=num/4-1
+       jmp     .Lsub4x
+.align 16
+.Lsub4x:
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     16($np,$i,8),@ri[2]
+       mov     32($ap,$i,8),@ri[0]     # tp[i+1]
+       mov     40($ap,$i,8),@ri[1]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       sbb     32($np,$i,8),@ri[0]
+       mov     48($ap,$i,8),@ri[2]
+       mov     56($ap,$i,8),@ri[3]
+       sbb     40($np,$i,8),@ri[1]
+       lea     4($i),$i                # i++
+       dec     $j                      # doesnn't affect CF!
+       jnz     .Lsub4x
+
+       mov     @ri[0],0($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       mov     32($ap,$i,8),@ri[0]     # load overflow bit
+       sbb     16($np,$i,8),@ri[2]
+       mov     @ri[1],8($rp,$i,8)      # rp[i]=tp[i]-np[i]
+       sbb     24($np,$i,8),@ri[3]
+       mov     @ri[2],16($rp,$i,8)     # rp[i]=tp[i]-np[i]
+
+       sbb     \$0,@ri[0]              # handle upmost overflow bit
+       mov     @ri[3],24($rp,$i,8)     # rp[i]=tp[i]-np[i]
+       xor     $i,$i                   # i=0
+       and     @ri[0],$ap
+       not     @ri[0]
+       mov     $rp,$np
+       and     @ri[0],$np
+       lea     -1($num),$j
+       or      $np,$ap                 # ap=borrow?tp:rp
+
+       movdqu  ($ap),%xmm1
+       movdqa  %xmm0,(%rsp)
+       movdqu  %xmm1,($rp)
+       jmp     .Lcopy4x
+.align 16
+.Lcopy4x:                                      # copy or in-place refresh
+       movdqu  16($ap,$i),%xmm2
+       movdqu  32($ap,$i),%xmm1
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+       movdqa  %xmm0,32(%rsp,$i)
+       movdqu  %xmm1,32($rp,$i)
+       lea     32($i),$i
+       dec     $j
+       jnz     .Lcopy4x
+
+       shl     \$2,$num
+       movdqu  16($ap,$i),%xmm2
+       movdqa  %xmm0,16(%rsp,$i)
+       movdqu  %xmm2,16($rp,$i)
+___
+}
+$code.=<<___;
+       mov     8(%rsp,$num,8),%rsi     # restore %rsp
+       mov     \$1,%rax
+       mov     (%rsi),%r15
+       mov     8(%rsi),%r14
+       mov     16(%rsi),%r13
+       mov     24(%rsi),%r12
+       mov     32(%rsi),%rbp
+       mov     40(%rsi),%rbx
+       lea     48(%rsi),%rsp
+.Lmul4x_epilogue:
+       ret
+.size  bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5
+___
+}}}
+
+{
+my ($inp,$num,$tbl,$idx)=$win64?("%rcx","%rdx","%r8", "%r9") : # Win64 order
+                               ("%rdi","%rsi","%rdx","%rcx"); # Unix order
+$code.=<<___;
+.globl bn_scatter5
+.type  bn_scatter5,\@abi-omnipotent
+.align 16
+bn_scatter5:
+       lea     ($tbl,$idx,8),$tbl
+.Lscatter:
+       mov     ($inp),%rax
+       lea     8($inp),$inp
+       mov     %rax,($tbl)
+       lea     32*8($tbl),$tbl
+       sub     \$1,$num
+       jnz     .Lscatter
+       ret
+.size  bn_scatter5,.-bn_scatter5
+___
+}
+$code.=<<___;
+.align 64
+.Lmagic_masks:
+       .long   0,0, 0,0, 0,0, -1,-1
+       .long   0,0, 0,0, 0,0,  0,0
+.asciz "Montgomery Multiplication with scatter/gather for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+$code =~ s/\`([^\`]*)\`/eval($1)/gem;
+
+print $code;
+close STDOUT;
index ce31ad0a58a61f74f33fbe12118fedb188ecea3c..5b00aa1e3c5b2c1503a776e424e0a3cc0a14a155 100644 (file)
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
+#include <stdlib.h>
+#ifdef _WIN32
+# include <malloc.h>
+# ifndef alloca
+#  define alloca _alloca
+# endif
+#elif defined(__GNUC__)
+# ifndef alloca
+#  define alloca(s) __builtin_alloca((s))
+# endif
+#endif
+
 /* maximum precomputation table size for *variable* sliding windows */
 #define TABLE_SIZE     32
 
@@ -562,7 +574,7 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
 
 /* Given a pointer value, compute the next address that is a cache line multiple. */
 #define MOD_EXP_CTIME_ALIGN(x_) \
-       ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((BN_ULONG)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
+       ((unsigned char*)(x_) + (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK))))
 
 /* This variant of BN_mod_exp_mont() uses fixed windows and the special
  * precomputation memory layout to limit data-dependency to a minimum
@@ -573,17 +585,16 @@ static int MOD_EXP_CTIME_COPY_FROM_PREBUF(BIGNUM *b, int top, unsigned char *buf
 int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                    const BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont)
        {
-       int i,bits,ret=0,idx,window,wvalue;
+       int i,bits,ret=0,window,wvalue;
        int top;
        BIGNUM *r;
-       const BIGNUM *aa;
        BN_MONT_CTX *mont=NULL;
 
        int numPowers;
        unsigned char *powerbufFree=NULL;
        int powerbufLen = 0;
        unsigned char *powerbuf=NULL;
-       BIGNUM *computeTemp=NULL, *am=NULL;
+       BIGNUM computeTemp, *am=NULL;
 
        bn_check_top(a);
        bn_check_top(p);
@@ -621,39 +632,149 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
 
        /* Get the window size to use with size of p. */
        window = BN_window_bits_for_ctime_exponent_size(bits);
+#if defined(OPENSSL_BN_ASM_MONT5)
+       if (window==6 && bits<=1024) window=5;  /* ~5% improvement of 2048-bit RSA sign */
+#endif
+       /* Adjust the number of bits up to a multiple of the window size.
+        * If the exponent length is not a multiple of the window size, then
+        * this pads the most significant bits with zeros to normalize the
+        * scanning loop to there's no special cases.
+        *
+        * * NOTE: Making the window size a power of two less than the native
+        * * word size ensures that the padded bits won't go past the last
+        * * word in the internal BIGNUM structure. Going past the end will
+        * * still produce the correct result, but causes a different branch
+        * * to be taken in the BN_is_bit_set function.
+        */
+       bits = ((bits+window-1)/window)*window;
 
        /* Allocate a buffer large enough to hold all of the pre-computed
-        * powers of a.
+        * powers of a, plus computeTemp.
         */
        numPowers = 1 << window;
-       powerbufLen = sizeof(m->d[0])*top*numPowers;
-       if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
+       powerbufLen = sizeof(m->d[0])*(top*numPowers +
+                               (top>numPowers?top:numPowers));
+       if (powerbufLen < 3072)
+               powerbufFree = alloca(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH);
+       else if ((powerbufFree=(unsigned char*)OPENSSL_malloc(powerbufLen+MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH)) == NULL)
                goto err;
                
        powerbuf = MOD_EXP_CTIME_ALIGN(powerbufFree);
        memset(powerbuf, 0, powerbufLen);
 
+       if (powerbufLen < 3072)
+               powerbufFree = NULL;
+
+       computeTemp.d = (BN_ULONG *)(powerbuf + sizeof(m->d[0])*top*numPowers);
+       computeTemp.top = computeTemp.dmax = top;
+       computeTemp.neg = 0;
+       computeTemp.flags = BN_FLG_STATIC_DATA;
+
        /* Initialize the intermediate result. Do this early to save double conversion,
         * once each for a^0 and intermediate result.
         */
        if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err;
-       if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
 
        /* Initialize computeTemp as a^1 with montgomery precalcs */
-       computeTemp = BN_CTX_get(ctx);
        am = BN_CTX_get(ctx);
-       if (computeTemp==NULL || am==NULL) goto err;
+       if (am==NULL) goto err;
 
        if (a->neg || BN_ucmp(a,m) >= 0)
                {
-               if (!BN_mod(am,a,m,ctx))
-                       goto err;
-               aa= am;
+               if (!BN_mod(am,a,m,ctx))                goto err;
+               if (!BN_to_montgomery(am,am,mont,ctx))  goto err;
                }
-       else
-               aa=a;
-       if (!BN_to_montgomery(am,aa,mont,ctx)) goto err;
-       if (!BN_copy(computeTemp, am)) goto err;
+       else    if (!BN_to_montgomery(am,a,mont,ctx))   goto err;
+
+       if (!BN_copy(&computeTemp, am)) goto err;
+
+       if (bn_wexpand(am,top)==NULL || bn_wexpand(r,top)==NULL)
+               goto err;
+
+#if defined(OPENSSL_BN_ASM_MONT5)
+    /* This optimization uses ideas from http://eprint.iacr.org/2011/239,
+     * specifically optimization of cache-timing attack countermeasures
+     * and pre-computation optimization. */
+
+    /* Dedicated window==4 case improves 512-bit RSA sign by ~15%, but as
+     * 512-bit RSA is hardly relevant, we omit it to spare size... */ 
+    if (window==5)
+       {
+       void bn_mul_mont_gather5(BN_ULONG *rp,const BN_ULONG *ap,
+                       const void *table,const BN_ULONG *np,
+                       const BN_ULONG *n0,int num,int power);
+       void bn_scatter5(const BN_ULONG *inp,size_t num,
+                       void *table,size_t power);
+
+       BN_ULONG *acc, *np=mont->N.d, *n0=mont->n0;
+
+       bn_scatter5(r->d,r->top,powerbuf,0);
+       bn_scatter5(am->d,am->top,powerbuf,1);
+
+       acc = computeTemp.d;
+#if 0
+       for (i=2; i<32; i++)
+               {
+               bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
+               bn_scatter5(acc,top,powerbuf,i);
+               }
+#else
+       /* same as above, but uses squaring for 1/2 of operations */
+       for (i=2; i<32; i*=2)
+               {
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_scatter5(acc,top,powerbuf,i);
+               }
+       for (i=3; i<8; i+=2)
+               {
+               int j;
+               bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
+               bn_scatter5(acc,top,powerbuf,i);
+               for (j=2*i; j<32; j*=2)
+                       {
+                       bn_mul_mont(acc,acc,acc,np,n0,top);
+                       bn_scatter5(acc,top,powerbuf,j);
+                       }
+               }
+       for (; i<16; i+=2)
+               {
+               bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
+               bn_scatter5(acc,top,powerbuf,i);
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_scatter5(acc,top,powerbuf,2*i);
+               }
+       for (; i<32; i+=2)
+               {
+               bn_mul_mont_gather5(acc,am->d,powerbuf,np,n0,top,i-1);
+               bn_scatter5(acc,top,powerbuf,i);
+               }
+#endif
+       acc = r->d;
+
+       /* Scan the exponent one window at a time starting from the most
+        * significant bits.
+        */
+       bits--;
+       while (bits >= 0)
+               {
+               for (wvalue=0, i=0; i<5; i++,bits--)
+                       wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
+
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_mul_mont(acc,acc,acc,np,n0,top);
+               bn_mul_mont_gather5(acc,acc,powerbuf,np,n0,top,wvalue);
+               }
+
+       r->top=top;
+       bn_correct_top(r);
+       }
+    else
+#endif
+       {
+       if (!MOD_EXP_CTIME_COPY_TO_PREBUF(r, top, powerbuf, 0, numPowers)) goto err;
        if (!MOD_EXP_CTIME_COPY_TO_PREBUF(am, top, powerbuf, 1, numPowers)) goto err;
 
        /* If the window size is greater than 1, then calculate
@@ -666,46 +787,34 @@ int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p,
                for (i=2; i<numPowers; i++)
                        {
                        /* Calculate a^i = a^(i-1) * a */
-                       if (!BN_mod_mul_montgomery(computeTemp,am,computeTemp,mont,ctx))
+                       if (!BN_mod_mul_montgomery(&computeTemp,am,&computeTemp,mont,ctx))
                                goto err;
-                       if (!MOD_EXP_CTIME_COPY_TO_PREBUF(computeTemp, top, powerbuf, i, numPowers)) goto err;
+                       if (!MOD_EXP_CTIME_COPY_TO_PREBUF(&computeTemp, top, powerbuf, i, numPowers)) goto err;
                        }
                }
 
-       /* Adjust the number of bits up to a multiple of the window size.
-        * If the exponent length is not a multiple of the window size, then
-        * this pads the most significant bits with zeros to normalize the
-        * scanning loop to there's no special cases.
-        *
-        * * NOTE: Making the window size a power of two less than the native
-        * * word size ensures that the padded bits won't go past the last
-        * * word in the internal BIGNUM structure. Going past the end will
-        * * still produce the correct result, but causes a different branch
-        * * to be taken in the BN_is_bit_set function.
-        */
-       bits = ((bits+window-1)/window)*window;
-       idx=bits-1;     /* The top bit of the window */
-
-       /* Scan the exponent one window at a time starting from the most
+       /* Scan the exponent one window at a time starting from the most
         * significant bits.
         */
-       while (idx >= 0)
+       bits--;
+       while (bits >= 0)
                {
                wvalue=0; /* The 'value' of the window */
                
                /* Scan the window, squaring the result as we go */
-               for (i=0; i<window; i++,idx--)
+               for (i=0; i<window; i++,bits--)
                        {
                        if (!BN_mod_mul_montgomery(r,r,r,mont,ctx))     goto err;
-                       wvalue = (wvalue<<1)+BN_is_bit_set(p,idx);
+                       wvalue = (wvalue<<1)+BN_is_bit_set(p,bits);
                        }
                
                /* Fetch the appropriate pre-computed value from the pre-buf */
-               if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
+               if (!MOD_EXP_CTIME_COPY_FROM_PREBUF(&computeTemp, top, powerbuf, wvalue, numPowers)) goto err;
 
                /* Multiply the result into the intermediate result */
-               if (!BN_mod_mul_montgomery(r,r,computeTemp,mont,ctx)) goto err;
+               if (!BN_mod_mul_montgomery(r,r,&computeTemp,mont,ctx)) goto err;
                }
+       }
 
        /* Convert the final result from montgomery to standard format */
        if (!BN_from_montgomery(rr,r,mont,ctx)) goto err;
@@ -715,10 +824,9 @@ err:
        if (powerbuf!=NULL)
                {
                OPENSSL_cleanse(powerbuf,powerbufLen);
-               OPENSSL_free(powerbufFree);
+               if (powerbufFree) OPENSSL_free(powerbufFree);
                }
        if (am!=NULL) BN_clear(am);
-       if (computeTemp!=NULL) BN_clear(computeTemp);
        BN_CTX_end(ctx);
        return(ret);
        }