#
# Modulo-scheduling SSE2 loops results in further 15-20% improvement.
# Integer-only code [being equipped with dedicated squaring procedure]
-# gives >=30% on rsa512 sign benchmark...
+# gives ~40% on rsa512 sign benchmark...
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
$i="edx";
$j="ecx";
-$ap="esi";
+$ap="esi"; $tp="esi"; # overlapping variables!!!
$rp="edi"; $bp="edi"; # overlapping variables!!!
$np="ebp";
$num="ebx";
-$_rp=&DWP(4*0,"esp"); # stack top layout
-$_ap=&DWP(4*1,"esp");
-$_bp=&DWP(4*2,"esp");
-$_np=&DWP(4*3,"esp");
-$_n0=&DWP(4*4,"esp");
-$_num=&DWP(4*5,"esp");
+$_num=&DWP(4*0,"esp"); # stack top layout
+$_rp=&DWP(4*1,"esp");
+$_ap=&DWP(4*2,"esp");
+$_bp=&DWP(4*3,"esp");
+$_np=&DWP(4*4,"esp");
+$_n0=&DWP(4*5,"esp"); $_n0q=&QWP(4*5,"esp");
$_sp=&DWP(4*6,"esp");
$_bpend=&DWP(4*7,"esp");
$frame=32; # size of above frame rounded up to 16n
&cmp ("edi",4);
&jl (&label("just_leave"));
- ################################# load argument block...
- &mov ("eax",&wparam(0)); # BN_ULONG *rp
- &mov ("ebx",&wparam(1)); # const BN_ULONG *ap
- &mov ("ecx",&wparam(2)); # const BN_ULONG *bp
- &mov ("edx",&wparam(3)); # const BN_ULONG *np
- &mov ("esi",&wparam(4)); # const BN_ULONG *n0
- #&mov ("edi",&wparam(5)); # int num
-
+ &lea ("esi",&wparam(0)); # put aside pointer to argument block
+ &lea ("edx",&wparam(1)); # load ap
&mov ("ebp","esp"); # saved stack pointer!
&add ("edi",2); # extra two words on top of tp
&neg ("edi");
&lea ("esp",&DWP(-$frame,"esp","edi",4)); # alloca($frame+4*(num+2))
&neg ("edi");
- &and ("esp",-4096); # minimize TLB utilization
+
+	# minimize cache contention by arranging a 2K window between stack
+ # pointer and ap argument [np is also position sensitive vector,
+ # but it's assumed to be near ap, as it's allocated at ~same
+ # time].
+ &mov ("eax","esp");
+ &sub ("eax","edx");
+ &and ("eax",2047);
+ &sub ("esp","eax"); # this aligns sp and ap modulo 2048
+
+ &xor ("edx","esp");
+ &and ("edx",2048);
+ &xor ("edx",2048);
+ &sub ("esp","edx"); # this splits them apart modulo 4096
+
+ &and ("esp",-64); # align to cache line
+
+ ################################# load argument block...
+ &mov ("eax",&DWP(0*4,"esi"));# BN_ULONG *rp
+ &mov ("ebx",&DWP(1*4,"esi"));# const BN_ULONG *ap
+ &mov ("ecx",&DWP(2*4,"esi"));# const BN_ULONG *bp
+ &mov ("edx",&DWP(3*4,"esi"));# const BN_ULONG *np
+ &mov ("esi",&DWP(4*4,"esi"));# const BN_ULONG *n0
+ #&mov ("edi",&DWP(5*4,"esi"));# int num
&mov ("esi",&DWP(0,"esi")); # pull n0[0]
&mov ($_rp,"eax"); # ... save a copy of argument block
&movq ($acc0,$mul1); # I wish movd worked for
&pand ($acc0,$mask); # inter-register transfers
- &pmuludq($mul1,$_n0); # *=n0
+ &pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1); # "t[0]"*np[0]*n0
&paddq ($car1,$acc0);
&psrlq ($car1,32);
&inc ($j); # j++
-&set_label("1st");
+&set_label("1st",16);
&pmuludq($acc0,$mul0); # ap[j]*bp[0]
&pmuludq($acc1,$mul1); # np[j]*m1
&paddq ($car0,$acc0); # +=c0
&psrlq ($car1,32);
&paddq ($car1,$car0);
- &movq (&DWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
\f
&inc ($i); # i++
&set_label("outer");
&movq ($car0,$mul1);
&pand ($acc0,$mask);
- &pmuludq($mul1,$_n0); # *=n0
+ &pmuludq($mul1,$_n0q); # *=n0
&pmuludq($car1,$mul1);
&paddq ($car1,$acc0);
&movd ($temp,&DWP($frame+4,"esp",$num,4)); # += tp[num]
&paddq ($car1,$car0);
&paddq ($car1,$temp);
- &movq (&DWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
+ &movq (&QWP($frame,"esp",$num,4),$car1); # tp[num].tp[num-1]
&lea ($i,&DWP(1,$i)); # i++
&cmp ($i,$num);
&xor ("eax","eax"); # signal "not fast enough [yet]"
&jmp (&label("just_leave"));
# While the below code provides competitive performance for
- # all key lengthes on modern cores, it's still a tad slower
- # for >=2048-bits keys on *elder* CPUs:-( "Competitive" means
- # compared to the original integer-only assembler. 512-bit
- # RSA sign is better by >=30%, but that's about all one can
- # say about all CPUs...
+	# all key lengths on modern Intel cores, it's still more
+ # than 10% slower for 4096-bit key elsewhere:-( "Competitive"
+ # means compared to the original integer-only assembler.
+ # 512-bit RSA sign is better by ~40%, but that's about all
+ # one can say about all CPUs...
} else {
$inp="esi"; # integer path uses these registers differently
$word="edi";
&mov ($carry,"edx");
&mul ($word); # np[j]*m
- &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &add ($carry,&DWP($frame,"esp",$num,4)); # +=tp[num-1]
&adc ("edx",0);
&add ($carry,"eax");
&adc ("edx",0);
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$_bpend);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("x86done"));
+ &je (&label("common_tail"));
- &mov ($word,&DWP(0,$j)); # bp[i]
+ &mov ($word,&DWP(0,$j)); # bp[i+1]
&mov ($inp,$_ap);
&mov ($_bp,$j); # &bp[++i]
&xor ($j,$j);
&mov (&DWP($frame,"esp",$num,4),"edx"); # tp[num-1]=
&cmp ($j,$num);
&mov (&DWP($frame+4,"esp",$num,4),"eax"); # tp[num]=
- &je (&label("x86done"));
+ &je (&label("common_tail"));
\f
&mov ($word,&DWP(4,$inp,$j,4)); # ap[i]
&lea ($j,&DWP(1,$j));
&mov ($carry,"edx");
&mul ($word); # ap[j]*ap[i]
&add ("eax",$carry);
- &lea ($j,&DWP(1,$j));
+ &lea ($carry,&DWP(0,"eax","eax"));
&adc ("edx",0);
- &lea ($carry,&DWP(0,$sbit,"eax",2));
&shr ("eax",31);
- &add ($carry,&DWP($frame-4,"esp",$j,4)); # +=tp[j]
+ &add ($carry,&DWP($frame,"esp",$j,4)); # +=tp[j]
+ &lea ($j,&DWP(1,$j));
+ &adc ("eax",0);
+ &add ($carry,$sbit);
&adc ("eax",0);
&cmp ($j,$_num);
&mov (&DWP($frame-4,"esp",$j,4),$carry); # tp[j]=
&mov ("eax",&DWP(4,$inp)); # np[1]
&jmp (&label("3rdmadd"));
-\f
-&set_label("x86done",4);
- &mov ($np,$_np); # make adjustments for tail processing
}
-
+\f
&set_label("common_tail",16);
- &mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
+ &mov ($np,$_np); # load modulus pointer
&mov ($rp,$_rp); # load result pointer
- # [$ap and $bp are zapped]
- &xor ($i,$i); # i=0
+ &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped]
+
+ &mov ("eax",&DWP(0,$tp)); # tp[0]
&mov ($j,$num); # j=num-1
- &cmp ("esi",0); # clears CF unconditionally
- &jnz (&label("sub"));
- &mov ("eax",&DWP($frame,"esp",$j,4));
- &cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]?
- &jae (&label("sub")); # if taken CF is cleared
-&set_label("copy",16);
- &mov ("eax",&DWP($frame,"esp",$j,4));
- &mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i]
- &mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector
- &dec ($j);
- &jge (&label("copy"));
- &jmp (&label("exit"));
+ &xor ($i,$i); # i=0 and clear CF!
&set_label("sub",16);
- &mov ("eax",&DWP($frame,"esp",$i,4));
&sbb ("eax",&DWP(0,$np,$i,4));
&mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i]
- &lea ($i,&DWP(1,$i)); # i++
&dec ($j); # doesn't affect CF!
+ &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1]
+ &lea ($i,&DWP(1,$i)); # i++
&jge (&label("sub"));
- &mov ($j,$num); # j=num-1
- &sbb ("esi",0); # esi holds upmost overflow bit
- &jc (&label("copy"));
-&set_label("zap",16);
- &mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector
- &dec ($j);
- &jge (&label("zap"));
-
-&set_label("exit",4);
+
+ &sbb ("eax",0); # handle upmost overflow bit
+ &and ($tp,"eax");
+ ¬ ("eax");
+ &mov ($np,$rp);
+ &and ($np,"eax");
+ &or ($tp,$np); # tp=carry?tp:rp
+
+&set_label("copy",16); # copy or in-place refresh
+ &mov ("eax",&DWP(0,$tp,$num,4));
+ &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i]
+ &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+ &dec ($num);
+ &jge (&label("copy"));
+
&mov ("esp",$_sp); # pull saved stack pointer
&mov ("eax",1);
&set_label("just_leave");