Unobtrusive backport of 32-bit x86 Montgomery improvements from 0.9.9-dev:
author: Bodo Möller <bodo@openssl.org>
Thu, 1 May 2008 23:11:34 +0000 (23:11 +0000)
committer: Bodo Möller <bodo@openssl.org>
Thu, 1 May 2008 23:11:34 +0000 (23:11 +0000)
you need to use "enable-montasm" to see a difference.  (Huge speed
advantage, but BN_MONT_CTX is not binary compatible, so this can't be
enabled by default in the 0.9.8 branch.)

The CHANGES entry also covers the 64-bit x86 backport in November 2007
by appro.

CHANGES
Configure
crypto/bn/.cvsignore
crypto/bn/Makefile
crypto/bn/asm/mo-586.pl [new file with mode: 0644]
crypto/bn/bn.h
crypto/bn/bn_mont.c
crypto/perlasm/x86ms.pl
crypto/perlasm/x86nasm.pl
crypto/perlasm/x86unix.pl

diff --git a/CHANGES b/CHANGES
index 0af9ffe98319cca89786d47e5bc216dc8deba880..27c74e2ef649451f3512aad573f4b9d953930cfd 100644 (file)
--- a/CHANGES
+++ b/CHANGES
@@ -4,6 +4,28 @@
 
  Changes between 0.9.8g and 0.9.8h  [xx XXX xxxx]
 
+  *) Partial backport from 0.9.9-dev:
+
+     New candidate for BIGNUM assembler implementation, bn_mul_mont,
+     dedicated Montgomery multiplication procedure, is introduced.
+     While 0.9.9-dev has assembler for various architectures, here
+     in the 0.9.8 branch, only x86_64 is available by default.
+
+     With Configure option "enable-montasm" (which exists only for
+     this backport), the 32-bit x86 assembler implementation can be
+     activated at compile-time.  In 0.9.9-dev, BN_MONT_CTX is modified
+     to allow bn_mul_mont to reach for higher "64-bit" performance on
+     certain 32-bit targets.  With "enable-montasm", this BN_MONT_CTX
+     change is activated in the 0.9.8 branch.
+
+     Warning: Using "enable-montasm" thus means losing binary
+     compatibility between patchlevels!  (I.e., applications will
+     have to be recompiled to match the particular library.)
+     So you may want to avoid this setting for shared libraries.
+     Use at your own risk.
+
+     [Andy Polyakov (32-bit x86 backport: Bodo Moeller)]
+
   *) Add TLS session ticket callback. This allows an application to set
      TLS ticket cipher and HMAC keys rather than relying on hardcoded fixed
      values. This is useful for key rollover for example where several key
index c7dd1194eff987bff038529231bbbe3aebdd38bb..1a9a59f969f36c9186a4fce2665e400fc13d8b45 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -10,7 +10,7 @@ use strict;
 
 # see INSTALL for instructions.
 
-my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";
+my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [no-hw-xxx|no-hw] [[no-]threads] [[no-]shared] [[no-]zlib|zlib-dynamic] [enable-montasm] [no-asm] [no-dso] [no-krb5] [386] [--prefix=DIR] [--openssldir=OPENSSLDIR] [--with-xxx[=vvv]] [--test-sanity] os/compiler[:flags]\n";
 
 # Options:
 #
@@ -54,6 +54,8 @@ my $usage="Usage: Configure [no-<cipher> ...] [enable-<cipher> ...] [-Dxxx] [-lx
 # [no-]zlib     [don't] compile support for zlib compression.
 # zlib-dynamic Like "zlib", but the zlib library is expected to be a shared
 #              library and will be loaded in run-time by the OpenSSL library.
+# enable-montasm 0.9.8 branch only: enable Montgomery x86 assembler backport
+#               from 0.9.9
 # 386           generate 80386 code
 # no-sse2      disables IA-32 SSE2 code, above option implies no-sse2
 # no-<cipher>   build without specified algorithm (rsa, idea, rc5, ...)
@@ -114,9 +116,9 @@ my $tlib="-lnsl -lsocket";
 my $bits1="THIRTY_TWO_BIT ";
 my $bits2="SIXTY_FOUR_BIT ";
 
-my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
-my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
-my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";
+my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o MAYBE-MO86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o";
+my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o MAYBE-MO86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o";
+my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o MAYBE-MO86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::";
 my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o::";
@@ -580,6 +582,7 @@ my $no_shared=0; # but "no-shared" is default
 my $zlib=1;      # but "no-zlib" is default
 my $no_krb5=0;   # but "no-krb5" is implied unless "--with-krb5-..." is used
 my $no_rfc3779=1; # but "no-rfc3779" is default
+my $montasm=1;   # but "no-montasm" is default
 my $no_asm=0;
 my $no_dso=0;
 my $no_gmp=0;
@@ -616,6 +619,7 @@ my %disabled = ( # "what"         => "comment"
                  "cms"            => "default",
                  "gmp"            => "default",
                  "mdc2"           => "default",
+                 "montasm"        => "default", # explicit option in 0.9.8 only (implicitly enabled in 0.9.9)
                  "rc5"            => "default",
                  "rfc3779"        => "default",
                  "seed"           => "default",
@@ -895,6 +899,8 @@ foreach (sort (keys %disabled))
                { $no_shared = 1; }
        elsif (/^zlib$/)
                { $zlib = 0; }
+       elsif (/^montasm$/)
+               { $montasm = 0; }
        elsif (/^static-engine$/)
                { }
        elsif (/^zlib-dynamic$/)
@@ -1121,6 +1127,14 @@ if ($no_asm)
        $cpuid_obj=$bn_obj=$des_obj=$aes_obj=$bf_obj=$cast_obj=$rc4_obj=$rc5_obj="";
        $sha1_obj=$md5_obj=$rmd160_obj="";
        }
+if ($montasm)
+       {
+       $bn_obj =~ s/MAYBE-MO86-/mo86-/;
+       }
+else
+       {
+       $bn_obj =~ s/MAYBE-MO86-[a-z.]*//;
+       }
 
 if (!$no_shared)
        {
index 57df22cf657df97170435e0294bb9c4fd08703fa..c2f3bc0856aba5e114b743803b535822eb3cb96b 100644 (file)
@@ -4,3 +4,4 @@ Makefile.save
 semantic.cache
 co86-elf.s
 bn86-elf.s
+mo86-elf.s
index 6dfd528d52dd94229b697f8a5d079e8ee781b684..e97c751390151e32261a37dbca23d0cd251252fc 100644 (file)
@@ -67,16 +67,22 @@ bn86-elf.s: asm/bn-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) bn-586.pl elf $(CFLAGS) > ../$@)
 co86-elf.s:    asm/co-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) co-586.pl elf $(CFLAGS) > ../$@)
+mo86-elf.s:    asm/mo-586.pl ../perlasm/x86asm.pl
+       (cd asm; $(PERL) mo-586.pl elf $(CFLAGS) > ../$@)
 # COFF
 bn86-cof.s: asm/bn-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) bn-586.pl coff $(CFLAGS) > ../$@)
 co86-cof.s: asm/co-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) co-586.pl coff $(CFLAGS) > ../$@)
+mo86-cof.s: asm/mo-586.pl ../perlasm/x86asm.pl
+       (cd asm; $(PERL) mo-586.pl coff $(CFLAGS) > ../$@)
 # a.out
 bn86-out.s: asm/bn-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) bn-586.pl a.out $(CFLAGS) > ../$@)
 co86-out.s: asm/co-586.pl ../perlasm/x86asm.pl
        (cd asm; $(PERL) co-586.pl a.out $(CFLAGS) > ../$@)
+mo86-out.s: asm/mo-586.pl ../perlasm/x86asm.pl
+       (cd asm; $(PERL) mo-586.pl a.out $(CFLAGS) > ../$@)
 
 sparcv8.o:     asm/sparcv8.S
        $(CC) $(CFLAGS) -c asm/sparcv8.S
diff --git a/crypto/bn/asm/mo-586.pl b/crypto/bn/asm/mo-586.pl
new file mode 100644 (file)
index 0000000..0982293
--- /dev/null
@@ -0,0 +1,603 @@
+#!/usr/bin/env perl
+
+# This is crypto/bn/asm/x86-mont.pl (with asciz from crypto/perlasm/x86asm.pl)
+# from OpenSSL 0.9.9-dev 
+
+sub ::asciz                           # emit a string plus a trailing 0 byte through &data_byte
+{ my @str=unpack("C*",shift);         # first argument -> list of unsigned byte values
+    push @str,0;                      # append the terminating zero byte
+    while ($#str>15) {                # more than 16 values pending: emit one full chunk
+       &data_byte(@str[0..15]);       # pass exactly 16 values per call
+       foreach (0..15) { shift @str; } # drop the 16 values just emitted
+    }
+    &data_byte(@str) if (@str);       # emit whatever is left (at most 16 values)
+}
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# October 2005
+#
+# This is a "teaser" code, as it can be improved in several ways...
+# First of all non-SSE2 path should be implemented (yes, for now it
+# performs Montgomery multiplication/convolution only on SSE2-capable
+# CPUs such as P4, others fall down to original code). Then inner loop
+# can be unrolled and modulo-scheduled to improve ILP and possibly
+# moved to 128-bit XMM register bank (though it would require input
+# rearrangement and/or increase bus bandwidth utilization). Dedicated
+# squaring procedure should give further performance improvement...
+# Yet, for being draft, the code improves rsa512 *sign* benchmark by
+# 110%(!), rsa1024 one - by 70% and rsa4096 - by 20%:-)
+
+# December 2006
+#
+# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
+# Integer-only code [being equipped with dedicated squaring procedure]
+# gives ~40% on rsa512 sign benchmark...
+
+push(@INC,"perlasm","../../perlasm");
+require "x86asm.pl";
+
+&asm_init($ARGV[0],$0);
+
+$sse2=0;
+for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
+
+&external_label("OPENSSL_ia32cap_P") if ($sse2);
+
+&function_begin("bn_mul_mont");
+
+$i="edx";
+$j="ecx";
+$ap="esi";     $tp="esi";              # overlapping variables!!!
+$rp="edi";     $bp="edi";              # overlapping variables!!!
+$np="ebp";
+$num="ebx";
+
+$_num=&DWP(4*0,"esp");                 # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp");  $_n0q=&QWP(4*5,"esp");
+$_sp=&DWP(4*6,"esp");
+$_bpend=&DWP(4*7,"esp");
+$frame=32;                             # size of above frame rounded up to 16n
+
+       &xor    ("eax","eax");
+       &mov    ("edi",&wparam(5));     # int num
+       &cmp    ("edi",4);
+       &jl     (&label("just_leave"));
+
+       &lea    ("esi",&wparam(0));     # put aside pointer to argument block
+       &lea    ("edx",&wparam(1));     # load ap
+       &mov    ("ebp","esp");          # saved stack pointer!
+       &add    ("edi",2);              # extra two words on top of tp
+       &neg    ("edi");
+       &lea    ("esp",&DWP(-$frame,"esp","edi",4));    # alloca($frame+4*(num+2))
+       &neg    ("edi");
+
+       # minimize cache contention by arranging 2K window between stack
+       # pointer and ap argument [np is also position sensitive vector,
+       # but it's assumed to be near ap, as it's allocated at ~same
+       # time].
+       &mov    ("eax","esp");
+       &sub    ("eax","edx");
+       &and    ("eax",2047);
+       &sub    ("esp","eax");          # this aligns sp and ap modulo 2048
+
+       &xor    ("edx","esp");
+       &and    ("edx",2048);
+       &xor    ("edx",2048);
+       &sub    ("esp","edx");          # this splits them apart modulo 4096
+
+       &and    ("esp",-64);            # align to cache line
+
+       ################################# load argument block...
+       &mov    ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+       &mov    ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+       &mov    ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+       &mov    ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+       &mov    ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+       #&mov   ("edi",&DWP(5*4,"esi"));# int num
+
+       &mov    ("esi",&DWP(0,"esi"));  # pull n0[0]
+       &mov    ($_rp,"eax");           # ... save a copy of argument block
+       &mov    ($_ap,"ebx");
+       &mov    ($_bp,"ecx");
+       &mov    ($_np,"edx");
+       &mov    ($_n0,"esi");
+       &lea    ($num,&DWP(-3,"edi"));  # num=num-1 to assist modulo-scheduling
+       #&mov   ($_num,$num);           # redundant as $num is not reused
+       &mov    ($_sp,"ebp");           # saved stack pointer!
+\f
+if($sse2) {
+$acc0="mm0";   # mmx register bank layout
+$acc1="mm1";
+$car0="mm2";
+$car1="mm3";
+$mul0="mm4";
+$mul1="mm5";
+$temp="mm6";
+$mask="mm7";
+
+       &picmeup("eax","OPENSSL_ia32cap_P");
+       &bt     (&DWP(0,"eax"),26);
+       &jnc    (&label("non_sse2"));
+
+       &mov    ("eax",-1);
+       &movd   ($mask,"eax");          # mask 32 lower bits
+
+       &mov    ($ap,$_ap);             # load input pointers
+       &mov    ($bp,$_bp);
+       &mov    ($np,$_np);
+
+       &xor    ($i,$i);                # i=0
+       &xor    ($j,$j);                # j=0
+
+       &movd   ($mul0,&DWP(0,$bp));            # bp[0]
+       &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+       &movd   ($car1,&DWP(0,$np));            # np[0]
+
+       &pmuludq($mul1,$mul0);                  # ap[0]*bp[0]
+       &movq   ($car0,$mul1);
+       &movq   ($acc0,$mul1);                  # I wish movd worked for
+       &pand   ($acc0,$mask);                  # inter-register transfers
+
+       &pmuludq($mul1,$_n0q);                  # *=n0
+
+       &pmuludq($car1,$mul1);                  # "t[0]"*np[0]*n0
+       &paddq  ($car1,$acc0);
+
+       &movd   ($acc1,&DWP(4,$np));            # np[1]
+       &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &inc    ($j);                           # j++
+&set_label("1st",16);
+       &pmuludq($acc0,$mul0);                  # ap[j]*bp[0]
+       &pmuludq($acc1,$mul1);                  # np[j]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+       &paddq  ($car1,$acc0);                  # +=ap[j]*bp[0];
+       &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+       &psrlq  ($car0,32);
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[j-1]=
+       &psrlq  ($car1,32);
+
+       &lea    ($j,&DWP(1,$j));
+       &cmp    ($j,$num);
+       &jl     (&label("1st"));
+
+       &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[0]
+       &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[0];
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &paddq  ($car1,$car0);
+       &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+\f
+       &inc    ($i);                           # i++
+&set_label("outer");
+       &xor    ($j,$j);                        # j=0
+
+       &movd   ($mul0,&DWP(0,$bp,$i,4));       # bp[i]
+       &movd   ($mul1,&DWP(0,$ap));            # ap[0]
+       &movd   ($temp,&DWP($frame,"esp"));     # tp[0]
+       &movd   ($car1,&DWP(0,$np));            # np[0]
+       &pmuludq($mul1,$mul0);                  # ap[0]*bp[i]
+
+       &paddq  ($mul1,$temp);                  # +=tp[0]
+       &movq   ($acc0,$mul1);
+       &movq   ($car0,$mul1);
+       &pand   ($acc0,$mask);
+
+       &pmuludq($mul1,$_n0q);                  # *=n0
+
+       &pmuludq($car1,$mul1);
+       &paddq  ($car1,$acc0);
+
+       &movd   ($temp,&DWP($frame+4,"esp"));   # tp[1]
+       &movd   ($acc1,&DWP(4,$np));            # np[1]
+       &movd   ($acc0,&DWP(4,$ap));            # ap[1]
+
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+       &paddq  ($car0,$temp);                  # +=tp[1]
+
+       &inc    ($j);                           # j++
+       &dec    ($num);
+&set_label("inner");
+       &pmuludq($acc0,$mul0);                  # ap[j]*bp[i]
+       &pmuludq($acc1,$mul1);                  # np[j]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &movd   ($temp,&DWP($frame+4,"esp",$j,4));# tp[j+1]
+       &pand   ($acc0,$mask);
+       &movd   ($acc1,&DWP(4,$np,$j,4));       # np[j+1]
+       &paddq  ($car1,$acc0);                  # +=ap[j]*bp[i]+tp[j]
+       &movd   ($acc0,&DWP(4,$ap,$j,4));       # ap[j+1]
+       &psrlq  ($car0,32);
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);# tp[j-1]=
+       &psrlq  ($car1,32);
+       &paddq  ($car0,$temp);                  # +=tp[j+1]
+
+       &dec    ($num);
+       &lea    ($j,&DWP(1,$j));                # j++
+       &jnz    (&label("inner"));
+
+       &mov    ($num,$j);
+       &pmuludq($acc0,$mul0);                  # ap[num-1]*bp[i]
+       &pmuludq($acc1,$mul1);                  # np[num-1]*m1
+       &paddq  ($car0,$acc0);                  # +=c0
+       &paddq  ($car1,$acc1);                  # +=c1
+
+       &movq   ($acc0,$car0);
+       &pand   ($acc0,$mask);
+       &paddq  ($car1,$acc0);                  # +=ap[num-1]*bp[i]+tp[num-1]
+       &movd   (&DWP($frame-4,"esp",$j,4),$car1);      # tp[num-2]=
+       &psrlq  ($car0,32);
+       &psrlq  ($car1,32);
+
+       &movd   ($temp,&DWP($frame+4,"esp",$num,4));    # += tp[num]
+       &paddq  ($car1,$car0);
+       &paddq  ($car1,$temp);
+       &movq   (&QWP($frame,"esp",$num,4),$car1);      # tp[num].tp[num-1]
+
+       &lea    ($i,&DWP(1,$i));                # i++
+       &cmp    ($i,$num);
+       &jle    (&label("outer"));
+
+       &emms   ();                             # done with mmx bank
+       &jmp    (&label("common_tail"));
+
+&set_label("non_sse2",16);
+}
+\f
+if (0) {
+       &mov    ("esp",$_sp);
+       &xor    ("eax","eax");  # signal "not fast enough [yet]"
+       &jmp    (&label("just_leave"));
+       # While the below code provides competitive performance for
+       # all key lengths on modern Intel cores, it's still more
+       # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+       # means compared to the original integer-only assembler.
+       # 512-bit RSA sign is better by ~40%, but that's about all
+       # one can say about all CPUs...
+} else {
+$inp="esi";    # integer path uses these registers differently
+$word="edi";
+$carry="ebp";
+
+       &mov    ($inp,$_ap);
+       &lea    ($carry,&DWP(1,$num));
+       &mov    ($word,$_bp);
+       &xor    ($j,$j);                                # j=0
+       &mov    ("edx",$inp);
+       &and    ($carry,1);                             # see if num is even
+       &sub    ("edx",$word);                          # see if ap==bp
+       &lea    ("eax",&DWP(4,$word,$num,4));           # &bp[num]
+       &or     ($carry,"edx");
+       &mov    ($word,&DWP(0,$word));                  # bp[0]
+       &jz     (&label("bn_sqr_mont"));
+       &mov    ($_bpend,"eax");
+       &mov    ("eax",&DWP(0,$inp));
+       &xor    ("edx","edx");
+
+&set_label("mull",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[0]
+       &add    ($carry,"eax");
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("mull"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[0]
+        &mov   ($word,$_n0);
+       &add    ("eax",$carry);
+        &mov   ($inp,$_np);
+       &adc    ("edx",0);
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &mov    (&DWP($frame,"esp",$num,4),"eax");      # tp[num-1]=
+       &xor    ($j,$j);
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &adc    ("edx",0);
+       &inc    ($j);
+
+       &jmp    (&label("2ndmadd"));
+\f\f
+&set_label("1stmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*bp[i]
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j+1]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("1stmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*bp[i]
+       &add    ("eax",&DWP($frame,"esp",$num,4));      # +=tp[num-1]
+        &mov   ($word,$_n0);
+       &adc    ("edx",0);
+        &mov   ($inp,$_np);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &xor    ($j,$j);
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &mov    (&DWP($frame,"esp",$num,4),$carry);     # tp[num-1]=
+       &adc    ($j,0);
+        &mov   ("eax",&DWP(0,$inp));                   # np[0]
+       &mov    (&DWP($frame+4,"esp",$num,4),"edx");    # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$num,4),$j);       # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &adc    ("edx",0);
+       &mov    ($j,1);
+\f
+&set_label("2ndmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+1]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j-1]=
+       &jl     (&label("2ndmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+
+       &xor    ("eax","eax");
+        &mov   ($j,$_bp);                              # &bp[i]
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+        &lea   ($j,&DWP(4,$j));
+       &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+        &cmp   ($j,$_bpend);
+       &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+       &je     (&label("common_tail"));
+
+       &mov    ($word,&DWP(0,$j));                     # bp[i+1]
+       &mov    ($inp,$_ap);
+       &mov    ($_bp,$j);                              # &bp[++i]
+       &xor    ($j,$j);
+       &xor    ("edx","edx");
+       &mov    ("eax",&DWP(0,$inp));
+       &jmp    (&label("1stmadd"));
+\f
+&set_label("bn_sqr_mont",16);
+$sbit=$num;
+       &mov    ($_num,$num);
+       &mov    ($_bp,$j);                              # i=0
+
+       &mov    ("eax",$word);                          # ap[0]
+       &mul    ($word);                                # ap[0]*ap[0]
+       &mov    (&DWP($frame,"esp"),"eax");             # tp[0]=
+       &mov    ($sbit,"edx");
+       &shr    ("edx",1);
+       &and    ($sbit,1);
+       &inc    ($j);
+&set_label("sqr",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*ap[0]
+       &add    ("eax",$carry);
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("edx",0);
+       &lea    ($carry,&DWP(0,$sbit,"eax",2));
+       &shr    ("eax",31);
+       &cmp    ($j,$_num);
+       &mov    ($sbit,"eax");
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("sqr"));
+
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[num-1]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[num-1]*ap[0]
+       &add    ("eax",$carry);
+        &mov   ($word,$_n0);
+       &adc    ("edx",0);
+        &mov   ($inp,$_np);
+       &lea    ($carry,&DWP(0,$sbit,"eax",2));
+        &imul  ($word,&DWP($frame,"esp"));             # n0*tp[0]
+       &shr    ("eax",31);
+       &mov    (&DWP($frame,"esp",$j,4),$carry);       # tp[num-1]=
+
+       &lea    ($carry,&DWP(0,"eax","edx",2));
+        &mov   ("eax",&DWP(0,$inp));                   # np[0]
+       &shr    ("edx",31);
+       &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num]=
+       &mov    (&DWP($frame+8,"esp",$j,4),"edx");      # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &mov    ($num,$j);
+       &adc    ("edx",0);
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+       &mov    ($j,1);
+\f\f
+&set_label("3rdmadd",16);
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(4,$inp,$j,4));              # np[j+1]
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j-1]=
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j+1]*m
+       &add    ($carry,&DWP($frame+4,"esp",$j,4));     # +=tp[j+1]
+       &lea    ($j,&DWP(2,$j));
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # np[j+2]
+       &adc    ("edx",0);
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame-8,"esp",$j,4),$carry);     # tp[j]=
+       &jl     (&label("3rdmadd"));
+
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # np[j]*m
+       &add    ($carry,&DWP($frame,"esp",$num,4));     # +=tp[num-1]
+       &adc    ("edx",0);
+       &add    ($carry,"eax");
+       &adc    ("edx",0);
+       &mov    (&DWP($frame-4,"esp",$num,4),$carry);   # tp[num-2]=
+
+       &mov    ($j,$_bp);                              # i
+       &xor    ("eax","eax");
+       &mov    ($inp,$_ap);
+       &add    ("edx",&DWP($frame+4,"esp",$num,4));    # carry+=tp[num]
+       &adc    ("eax",&DWP($frame+8,"esp",$num,4));    # +=tp[num+1]
+       &mov    (&DWP($frame,"esp",$num,4),"edx");      # tp[num-1]=
+       &cmp    ($j,$num);
+       &mov    (&DWP($frame+4,"esp",$num,4),"eax");    # tp[num]=
+       &je     (&label("common_tail"));
+\f
+       &mov    ($word,&DWP(4,$inp,$j,4));              # ap[i]
+       &lea    ($j,&DWP(1,$j));
+       &mov    ("eax",$word);
+       &mov    ($_bp,$j);                              # ++i
+       &mul    ($word);                                # ap[i]*ap[i]
+       &add    ("eax",&DWP($frame,"esp",$j,4));        # +=tp[i]
+       &adc    ("edx",0);
+       &mov    (&DWP($frame,"esp",$j,4),"eax");        # tp[i]=
+       &xor    ($carry,$carry);
+       &cmp    ($j,$num);
+       &lea    ($j,&DWP(1,$j));
+       &je     (&label("sqrlast"));
+
+       &mov    ($sbit,"edx");                          # zaps $num
+       &shr    ("edx",1);
+       &and    ($sbit,1);
+&set_label("sqradd",16);
+       &mov    ("eax",&DWP(0,$inp,$j,4));              # ap[j]
+       &mov    ($carry,"edx");
+       &mul    ($word);                                # ap[j]*ap[i]
+       &add    ("eax",$carry);
+       &lea    ($carry,&DWP(0,"eax","eax"));
+       &adc    ("edx",0);
+       &shr    ("eax",31);
+       &add    ($carry,&DWP($frame,"esp",$j,4));       # +=tp[j]
+       &lea    ($j,&DWP(1,$j));
+       &adc    ("eax",0);
+       &add    ($carry,$sbit);
+       &adc    ("eax",0);
+       &cmp    ($j,$_num);
+       &mov    (&DWP($frame-4,"esp",$j,4),$carry);     # tp[j]=
+       &mov    ($sbit,"eax");
+       &jle    (&label("sqradd"));
+
+       &mov    ($carry,"edx");
+       &lea    ("edx",&DWP(0,$sbit,"edx",2));
+       &shr    ($carry,31);
+&set_label("sqrlast");
+       &mov    ($word,$_n0);
+       &mov    ($inp,$_np);
+       &imul   ($word,&DWP($frame,"esp"));             # n0*tp[0]
+
+       &add    ("edx",&DWP($frame,"esp",$j,4));        # +=tp[num]
+       &mov    ("eax",&DWP(0,$inp));                   # np[0]
+       &adc    ($carry,0);
+       &mov    (&DWP($frame,"esp",$j,4),"edx");        # tp[num]=
+       &mov    (&DWP($frame+4,"esp",$j,4),$carry);     # tp[num+1]=
+
+       &mul    ($word);                                # np[0]*m
+       &add    ("eax",&DWP($frame,"esp"));             # +=tp[0]
+       &lea    ($num,&DWP(-1,$j));
+       &adc    ("edx",0);
+       &mov    ($j,1);
+       &mov    ("eax",&DWP(4,$inp));                   # np[1]
+
+       &jmp    (&label("3rdmadd"));
+}
+\f
+&set_label("common_tail",16);
+       &mov    ($np,$_np);                     # load modulus pointer
+       &mov    ($rp,$_rp);                     # load result pointer
+       &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
+
+       &mov    ("eax",&DWP(0,$tp));            # tp[0]
+       &mov    ($j,$num);                      # j=num-1
+       &xor    ($i,$i);                        # i=0 and clear CF!
+
+&set_label("sub",16);
+       &sbb    ("eax",&DWP(0,$np,$i,4));
+       &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
+       &dec    ($j);                           # doesn't affect CF!
+       &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
+       &lea    ($i,&DWP(1,$i));                # i++
+       &jge    (&label("sub"));
+
+       &sbb    ("eax",0);                      # handle upmost overflow bit
+       &and    ($tp,"eax");
+       &not    ("eax");
+       &mov    ($np,$rp);
+       &and    ($np,"eax");
+       &or     ($tp,$np);                      # tp=carry?tp:rp
+
+&set_label("copy",16);                         # copy or in-place refresh
+       &mov    ("eax",&DWP(0,$tp,$num,4));
+       &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
+       &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+       &dec    ($num);
+       &jge    (&label("copy"));
+
+       &mov    ("esp",$_sp);           # pull saved stack pointer
+       &mov    ("eax",1);
+&set_label("just_leave");
+&function_end("bn_mul_mont");
+
+&asciz("Montgomery Multiplication for x86, CRYPTOGAMS by <appro\@openssl.org>");
+
+&asm_finish();
index df6eea29a71b7f60867b880ca00972dd6ab4132c..de39a720749cc4c0f25910cf30bfe4a837251e41 100644 (file)
@@ -303,7 +303,14 @@ struct bn_mont_ctx_st
        BIGNUM N;      /* The modulus */
        BIGNUM Ni;     /* R*(1/R mod N) - N*Ni = 1
                        * (Ni is only stored for bignum algorithm) */
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+       /* Non-default compile (can only happen with "enable-montasm"),
+        * uses the new type from 0.9.9 to accommodate two words: */
+       BN_ULONG n0[2];/* least significant word(s) of Ni */
+#else
+       /* By default, use old type: */
        BN_ULONG n0;   /* least significant word of Ni */
+#endif
        int flags;
        };
 
index 23e4ba5140255ed2c7c3bde883283a24d84ad189..e17c697e392538e365c279fb0acb75ceb814aaf8 100644 (file)
 
 #define MONT_WORD /* use the faster word-based algorithm */
 
+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont);
+#endif
+
+
+
 int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                          BN_MONT_CTX *mont, BN_CTX *ctx)
        {
@@ -133,7 +139,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
        if (num>1 && a->top==num && b->top==num)
                {
                if (bn_wexpand(r,num) == NULL) return(0);
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+               if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num))
+#else
                if (bn_mul_mont(r->d,a->d,b->d,mont->N.d,&mont->n0,num))
+#endif
                        {
                        r->neg = a->neg^b->neg;
                        r->top = num;
@@ -157,7 +167,11 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
                if (!BN_mul(tmp,a,b,ctx)) goto err;
                }
        /* reduce from aRR to aR */
+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+       if (!BN_from_montgomery_word(r,tmp,mont)) goto err;
+#else
        if (!BN_from_montgomery(r,tmp,mont,ctx)) goto err;
+#endif
        bn_check_top(r);
        ret=1;
 err:
@@ -165,6 +179,145 @@ err:
        return(ret);
        }
 
+#if defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
+       {       /* word-wise Montgomery reduction of r into ret; r is overwritten as scratch; returns 1, or 0 if an expand fails */
+       BIGNUM *n;                      /* the modulus, mont->N */
+       BN_ULONG *ap,*np,*rp,n0,v,*nrp;
+       int al,nl,max,i,x,ri;
+
+       n= &(mont->N);
+       /* mont->ri is the size of mont->N in bits (rounded up
+          to the word size), so al and ri start out as that size in words */
+       al=ri=mont->ri/BN_BITS2;
+
+       nl=n->top;
+       if ((al == 0) || (nl == 0)) { ret->top=0; return(1); } /* nothing to reduce: succeed with ret->top cleared */
+
+       max=(nl+al+1); /* allow for overflow (no?) XXX */
+       if (bn_wexpand(r,max) == NULL) return(0);       /* could not grow r to max words */
+
+       r->neg^=n->neg;
+       np=n->d;
+       rp=r->d;
+       nrp= &(r->d[nl]);       /* word just above the nl words touched by the first pass below */
+
+       /* clear the top words of T (T being the input r) */
+       for (i=r->top; i<max; i++) /* memset? XXX */
+               r->d[i]=0;
+
+       r->top=max;
+       n0=mont->n0[0];         /* only the first word of n0[] is used by this routine */
+
+#ifdef BN_COUNT
+       fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
+#endif
+       for (i=0; i<nl; i++)    /* one pass per word of the modulus; rp and nrp slide up by one word each pass */
+               {
+#ifdef __TANDEM
+                {      /* NOTE(review): appears to form (rp[0]*n0)&BN_MASK2 piecewise via 16-bit masks -- confirm */
+                   long long t1;
+                   long long t2;
+                   long long t3;
+                   t1 = rp[0] * (n0 & 0177777);
+                   t2 = 037777600000l;
+                   t2 = n0 & t2;
+                   t3 = rp[0] & 0177777;
+                   t2 = (t3 * t2) & BN_MASK2;
+                   t1 = t1 + t2;
+                   v=bn_mul_add_words(rp,np,nl,(BN_ULONG) t1);
+                }
+#else
+               v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
+#endif
+               nrp++;
+               rp++;
+               if (((nrp[-1]+=v)&BN_MASK2) >= v)       /* v added above this pass without wrapping */
+                       continue;
+               else
+                       {       /* the addition wrapped: ripple the carry into higher words */
+                       if (((++nrp[0])&BN_MASK2) != 0) continue;
+                       if (((++nrp[1])&BN_MASK2) != 0) continue;
+                       for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
+                       }
+               }
+       bn_correct_top(r);
+
+       /* mont->ri will be a multiple of the word size and below code
+        * is kind of BN_rshift(ret,r,mont->ri) equivalent */
+       if (r->top <= ri)
+               {
+               ret->top=0;
+               return(1);
+               }
+       al=r->top-ri;           /* number of words of r above the low ri words */
+
+       if (bn_wexpand(ret,ri) == NULL) return(0);
+       x=0-(((al-ri)>>(sizeof(al)*8-1))&1);    /* x = all ones if al<ri, else 0 */
+       ret->top=x=(ri&~x)|(al&x);      /* min(ri,al) */
+       ret->neg=r->neg;
+
+       rp=ret->d;
+       ap=&(r->d[ri]);
+
+       {
+       size_t m1,m2;
+
+       v=bn_sub_words(rp,ap,np,ri);
+       /* this ----------------^^ works even in al<ri case
+        * thanks to zealous zeroing of top of the vector in the
+        * beginning. */
+
+       /* if ((al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
+       /* in other words if subtraction result is real, then
+        * trick unconditional memcpy below to perform in-place
+        * "refresh" instead of actual copy. */
+       m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);   /* al<ri */
+       m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);   /* al>ri */
+       m1|=m2;                 /* (al!=ri) */
+       m1|=(0-(size_t)v);      /* (al!=ri || v) */
+       m1&=~m2;                /* (al!=ri || v) && !(al>ri) */
+       nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));     /* branch-free select: m1 ? ap : rp */
+       }
+
+       /* 'i<ri' is chosen to eliminate dependency on input data, even
+        * though it results in redundant copy in al<ri case. */
+       for (i=0,ri-=4; i<ri; i+=4)     /* four words per iteration; ap[] is zeroed along the way */
+               {
+               BN_ULONG t1,t2,t3,t4;
+               
+               t1=nrp[i+0];
+               t2=nrp[i+1];
+               t3=nrp[i+2];    ap[i+0]=0;
+               t4=nrp[i+3];    ap[i+1]=0;
+               rp[i+0]=t1;     ap[i+2]=0;
+               rp[i+1]=t2;     ap[i+3]=0;
+               rp[i+2]=t3;
+               rp[i+3]=t4;
+               }
+       for (ri+=4; i<ri; i++)  /* restore ri and handle the leftover words one at a time */
+               rp[i]=nrp[i], ap[i]=0;
+       bn_correct_top(r);
+       bn_correct_top(ret);
+       bn_check_top(ret);
+
+       return(1);
+       }
+
+int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
+            BN_CTX *ctx)
+       {       /* wrapper for this build variant: reduces a copy of a via BN_from_montgomery_word */
+       int retn=0;     /* stays 0 if the temporary cannot be obtained or the copy fails */
+       BIGNUM *t;      /* scratch copy of a (a itself is const) */
+
+       BN_CTX_start(ctx);
+       if ((t = BN_CTX_get(ctx)) && BN_copy(t,a))
+               retn = BN_from_montgomery_word(ret,t,mont);
+       BN_CTX_end(ctx);        /* paired with BN_CTX_start above on every path */
+       return retn;
+       }
+#else
+
 int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
             BN_CTX *ctx)
        {
@@ -357,6 +510,7 @@ int BN_from_montgomery(BIGNUM *ret, const BIGNUM *a, BN_MONT_CTX *mont,
        BN_CTX_end(ctx);
        return(retn);
        }
+#endif /* defined(MONT_WORD) && defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) */
 
 BN_MONT_CTX *BN_MONT_CTX_new(void)
        {
@@ -376,6 +530,11 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
        BN_init(&(ctx->RR));
        BN_init(&(ctx->N));
        BN_init(&(ctx->Ni));
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+       ctx->n0[0] = ctx->n0[1] = 0;
+#else
+       ctx->n0 = 0;
+#endif
        ctx->flags=0;
        }
 
@@ -409,7 +568,11 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
 
                mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
                BN_zero(R);
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+               if (!(BN_set_bit(R,2*BN_BITS2))) goto err;      /* R */
+#else
                if (!(BN_set_bit(R,BN_BITS2))) goto err;        /* R */
+#endif
 
                buf[0]=mod->d[0]; /* tmod = N mod word size */
                buf[1]=0;
@@ -419,6 +582,35 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                tmod.top = buf[0] != 0 ? 1 : 0;
                tmod.dmax=2;
                tmod.neg=0;
+
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+                                                               tmod.top=0;
+               if ((buf[0] = mod->d[0]))                       tmod.top=1;
+               if ((buf[1] = mod->top>1 ? mod->d[1] : 0))      tmod.top=2;
+
+               if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
+                       goto err;
+               if (!BN_lshift(Ri,Ri,2*BN_BITS2)) goto err; /* R*Ri */
+               if (!BN_is_zero(Ri))
+                       {
+                       if (!BN_sub_word(Ri,1)) goto err;
+                       }
+               else /* if N mod word size == 1 */
+                       {
+                       if (bn_expand(Ri,(int)sizeof(BN_ULONG)*2) == NULL)
+                               goto err;
+                       /* Ri-- (mod double word size) */
+                       Ri->neg=0;
+                       Ri->d[0]=BN_MASK2;
+                       Ri->d[1]=BN_MASK2;
+                       Ri->top=2;
+                       }
+               if (!BN_div(Ri,NULL,Ri,&tmod,ctx)) goto err;
+               /* Ni = (R*Ri-1)/N,
+                * keep only the two least significant words: */
+               mont->n0[0] = (Ri->top > 0) ? Ri->d[0] : 0;
+               mont->n0[1] = (Ri->top > 1) ? Ri->d[1] : 0;
+#else
                                                        /* Ri = R^-1 mod N*/
                if ((BN_mod_inverse(Ri,R,&tmod,ctx)) == NULL)
                        goto err;
@@ -435,6 +627,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                /* Ni = (R*Ri-1)/N,
                 * keep only least significant word: */
                mont->n0 = (Ri->top > 0) ? Ri->d[0] : 0;
+#endif
                }
 #else /* !MONT_WORD */
                { /* bignum version */
@@ -470,7 +663,12 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
        if (!BN_copy(&(to->N),&(from->N))) return NULL;
        if (!BN_copy(&(to->Ni),&(from->Ni))) return NULL;
        to->ri=from->ri;
+#if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32) /* non-default compile */
+       to->n0[0]=from->n0[0];
+       to->n0[1]=from->n0[1];
+#else
        to->n0=from->n0;
+#endif
        return(to);
        }
 
index ebf38adb1a565197fe63d0898cc733b16c2be5f0..a0be2934c20762503d6dd5cd56639469c3d0c3eb 100644 (file)
@@ -146,6 +146,7 @@ sub main'exch       { &out2("xchg",@_); }
 sub main'cmp   { &out2("cmp",@_); }
 sub main'lea   { &out2("lea",@_); }
 sub main'mul   { &out1("mul",@_); }
+sub main'imul  { &out2("imul",@_); }   # goes through out2 like cmp/lea; mul/div use out1
 sub main'div   { &out1("div",@_); }
 sub main'dec   { &out1("dec",@_); }
 sub main'inc   { &out1("inc",@_); }
index 863c7e9d6e669114c11e63e3effab24b28379c82..fa38f89c09fc7bef2fc0eedc99aa4a91f36c9310 100644 (file)
@@ -154,6 +154,7 @@ sub main'exch       { &out2("xchg",@_); }
 sub main'cmp   { &out2("cmp",@_); }
 sub main'lea   { &out2("lea",@_); }
 sub main'mul   { &out1("mul",@_); }
+sub main'imul  { &out2("imul",@_); }   # goes through out2 like cmp/lea; mul/div use out1
 sub main'div   { &out1("div",@_); }
 sub main'dec   { &out1("dec",@_); }
 sub main'inc   { &out1("inc",@_); }
index 53507b6b8432f214b5bfea523b6dea30991ca4af..a4c947165e535033e315cf7079defbc8664b7322 100644 (file)
@@ -171,6 +171,7 @@ sub main'exch       { &out2($_[0]=~/%[a-d][lh]/?"xchgb":"xchgl",@_); }
 sub main'cmp   { &out2("cmpl",@_); }
 sub main'lea   { &out2("leal",@_); }
 sub main'mul   { &out1("mull",@_); }
+sub main'imul  { &out2("imull",@_); }  # goes through out2 like cmpl/leal; mull/divl use out1
 sub main'div   { &out1("divl",@_); }
 sub main'jmp   { &out1("jmp",@_); }
 sub main'jmp_ptr { &out1p("jmp",@_); }