Add Montgomery multiplication module for IA-64.
author Andy Polyakov <appro@openssl.org>
Wed, 6 Jan 2010 10:57:55 +0000 (10:57 +0000)
committer Andy Polyakov <appro@openssl.org>
Wed, 6 Jan 2010 10:57:55 +0000 (10:57 +0000)
Configure
TABLE
crypto/bn/Makefile
crypto/bn/asm/ia64-mont.pl [new file with mode: 0644]

diff --git a/Configure b/Configure
index a715fa7..e00b923 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -128,7 +128,7 @@ my $x86_asm="x86cpuid.o:bn-586.o co-586.o x86-mont.o:des-586.o crypt586.o:aes-58
 my $x86_elf_asm="$x86_asm:elf";
 
 my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o";
-my $ia64_asm="ia64cpuid.o:bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
+my $ia64_asm="ia64cpuid.o:bn-ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o:::::void";
 my $sparcv9_asm="sparcv9cap.o sparccpuid.o:bn-sparcv9.o sparcv9-mont.o sparcv9a-mont.o:des_enc-sparc.o fcrypt_b.o:aes_core.o aes_cbc.o aes-sparcv9.o:::sha1-sparcv9.o sha256-sparcv9.o sha512-sparcv9.o:::::::void";
 my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o:::::::::::void";
 my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o::::::::::::void";
@@ -490,7 +490,7 @@ my %table=(
 # Visual C targets
 #
 # Win64 targets, WIN64I denotes IA-64 and WIN64A - AMD64
-"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32",
+"VC-WIN64I","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64I::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:ia64cpuid.o:ia64.o ia64-mont.o::aes_core.o aes_cbc.o aes-ia64.o::md5-ia64.o:sha1-ia64.o sha256-ia64.o sha512-ia64.o:::::::ias:win32",
 "VC-WIN64A","cl:-W3 -Gs0 -Gy -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -DUNICODE -D_UNICODE -D_CRT_SECURE_NO_DEPRECATE:::WIN64A::SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN:x86_64cpuid.o:bn_asm.o x86_64-mont.o::aes-x86_64.o aesni-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o:::wp-x86_64.o:cmll-x86_64.o cmll_misc.o:auto:win32",
 # x86 Win32 target defaults to ANSI API, if you want UNICODE, complement
 # 'perl Configure VC-WIN32' with '-DUNICODE -D_UNICODE'
diff --git a/TABLE b/TABLE
index c3613cf..6b7f415 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -133,7 +133,7 @@ $sys_id       =
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -784,7 +784,7 @@ $sys_id       = WIN64I
 $lflags       = 
 $bn_ops       = SIXTY_FOUR_BIT RC4_CHUNK_LL DES_INT EXPORT_VAR_AS_FN
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = ia64.o
+$bn_obj       = ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -2675,7 +2675,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -2706,7 +2706,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -2923,7 +2923,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -2954,7 +2954,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX DES_UNROLL DES_RISC1 DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -3574,7 +3574,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -3605,7 +3605,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
@@ -3636,7 +3636,7 @@ $sys_id       =
 $lflags       = -ldl
 $bn_ops       = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT
 $cpuid_obj    = ia64cpuid.o
-$bn_obj       = bn-ia64.o
+$bn_obj       = bn-ia64.o ia64-mont.o
 $des_obj      = 
 $aes_obj      = aes_core.o aes_cbc.o aes-ia64.o
 $bf_obj       = 
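diff --git a/crypto/bn/Makefile b/crypto/bn/Makefile
index c42ef3e..134b4a9 100644 (file)
--- a/crypto/bn/Makefile
+++ b/crypto/bn/Makefile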
@@ -92,6 +92,8 @@ x86_64-mont.s:        asm/x86_64-mont.pl
 
 bn-ia64.s:     asm/ia64.S
        $(CC) $(CFLAGS) -E asm/ia64.S > $@
+ia64-mont.s:   asm/ia64-mont.pl
+       $(PERL) asm/ia64-mont.pl $@ $(CFLAGS)
 
 # GNU assembler fails to compile PA-RISC2 modules, insist on calling
 # vendor assembler...
diff --git a/crypto/bn/asm/ia64-mont.pl b/crypto/bn/asm/ia64-mont.pl
new file mode 100644 (file)
index 0000000..0084213
--- /dev/null
+++ b/crypto/bn/asm/ia64-mont.pl
@@ -0,0 +1,356 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# January 2010
+#
+# "Teaser" Montgomery multiplication module for IA-64. There are
+# several possibilities for improvement:
+#
+# - modulo-scheduling outer loop would eliminate quite a number of
+#   stalls after ldf8, xma and getf.sig outside inner loop and
+#   improve shorter key performance;
+# - shorter vector support [with input vectors being fetched only
+#   once] should be added;
+# - 2x unroll with help of n0[1] would make the code scalable on
+#   "wider" IA-64, "wider" than Itanium 2 that is, which is not of
+#   acute interest, because upcoming Tukwila's individual cores are
+#   reportedly based on Itanium 2 design;
+# - dedicated squaring procedure(?);
+#
+# So far 'openssl speed rsa dsa' output on 900MHz Itanium 2 *with*
+# this module is:
+#                   sign    verify    sign/s verify/s
+# rsa  512 bits 0.000634s 0.000030s   1577.6  32877.3
+# rsa 1024 bits 0.001246s 0.000058s    802.8  17181.5
+# rsa 2048 bits 0.005908s 0.000148s    169.3   6754.0
+# rsa 4096 bits 0.033456s 0.000469s     29.9   2133.6
+# dsa  512 bits 0.000322s 0.000286s   3106.0   3499.0
+# dsa 1024 bits 0.000585s 0.000607s   1708.4   1647.4
+# dsa 2048 bits 0.001453s 0.001703s    688.1    587.4
+#
+# ... and *without*:
+#
+# rsa  512 bits 0.000670s 0.000041s   1491.8  24145.5
+# rsa 1024 bits 0.001988s 0.000080s    502.9  12499.3
+# rsa 2048 bits 0.008702s 0.000189s    114.9   5293.9
+# rsa 4096 bits 0.043860s 0.000533s     22.8   1875.9
+# dsa  512 bits 0.000441s 0.000427s   2265.3   2340.6
+# dsa 1024 bits 0.000823s 0.000867s   1215.6   1153.2
+# dsa 2048 bits 0.001894s 0.002179s    528.1    458.9
+#
+# 512-bit RSA sign performance does not improve, because this module
+# doesn't handle short enough vectors (yet). Otherwise RSA sign
+# improves by 60-30%, less for longer keys, while verify - by 35-13%.
+# DSA performance improves by 40-30%.
+
+# Choose the instruction used to form pointers from the incoming
+# arguments.  On HP-UX the default is "addp4"; a plain "add" is used
+# when the command line carries a 64-bit data model flag (+DD64 or
+# -mlp64), and on every other platform.
+if ($^O eq "hpux") {
+    $ADDP="addp4";
+    # Alternation group, not a character class: [\+DD|\-mlp]64 would match
+    # any single one of the listed characters followed by "64".
+    for (@ARGV) { $ADDP="add" if (/(?:\+DD|-mlp)64/); }
+} else { $ADDP="add"; }
+
+$code=<<___;
+.explicit
+.text
+\f
+// int bn_mul_mont (BN_ULONG *rp,const BN_ULONG *ap,
+//                 const BN_ULONG *bp,const BN_ULONG *np,
+//                 const BN_ULONG *n0p,int num);                       
+//
+// Returns 0 ("unhandled") before storing anything when num is below 5,
+// otherwise 1 ("handled") after the result has been written through rp.
+.global        bn_mul_mont#
+.proc  bn_mul_mont#
+// Copies of caller state taken in the prologue: sp, ar.pfs, ar.lc, pr.
+prevsp=r2;
+prevfs=r3;
+prevlc=r10;
+prevpr=r11;
+
+// Working pointers, lengths, the top carry bit and the loop count.
+rptr=r14;
+aptr=r15;
+bptr=r16;
+nptr=r17;
+tptr=r18;      // &tp[0]
+tp_1=r19;      // &tp[-1]
+num=r20;
+len=r21;
+topbit=r22;
+lc=r23;
+
+// Floating-point registers used as operands of the integer multiplies.
+bi=f6;
+n0=f7;
+m0=f8;
+
+.align 64
+bn_mul_mont:
+       .prologue
+{ .mmi;        .save   ar.pfs,prevfs
+       alloc   prevfs=ar.pfs,6,2,0,8
+       $ADDP   aptr=0,in1
+       .save   ar.lc,prevlc
+       mov     prevlc=ar.lc            }
+{ .mmi;        .vframe prevsp
+       mov     prevsp=sp
+       $ADDP   bptr=0,in2
+       cmp4.gt p6,p0=5,in5             };;     // is num large enough?
+{ .mfi;        nop.m   0                               // align loop bodies
+       nop.f   0
+       nop.i   0                       }
+{ .mib;        mov     ret0=r0                         // signal "unhandled"
+       .save   pr,prevpr
+       mov     prevpr=pr
+(p6)   br.ret.dpnt.many        b0      };;             // num<5: bail out
+
+       .body
+       .rotf           alo[6],nlo[4],ahi[8],nhi[6]
+       .rotr           a[3],n[3],t[2]
+
+// First pass setup: preload bp[0], ap[0..3], np[0..1] and n0, reserve
+// the temporary vector tp on the stack, start the ap[j]*bp[0] multiply
+// chain and derive m0, then arm the rotating predicates and the loop
+// and epilogue counters for the first pass loop.
+{ .mmi;        ldf8            bi=[bptr],8             // (*bp++)
+       ldf8            alo[4]=[aptr],16        // ap[0]
+       $ADDP           r30=8,in1       };;
+{ .mmi;        ldf8            alo[3]=[r30],16         // ap[1]
+       ldf8            alo[2]=[aptr],16        // ap[2]
+       $ADDP           in4=0,in4       };;
+{ .mmi;        ldf8            alo[1]=[r30]            // ap[3]
+       ldf8            n0=[in4]                // n0
+       $ADDP           rptr=0,in0              }
+{ .mmi;        $ADDP           nptr=0,in3
+       mov             r31=16
+       zxt4            num=in5         };;
+{ .mmi;        ldf8            nlo[2]=[nptr],8         // np[0]
+       shladd          len=num,3,r0            // len=8*num, vector size in bytes
+       shladd          r31=num,3,r31   };;     // r31=8*num+16
+{ .mmi;        ldf8            nlo[1]=[nptr],8         // np[1]
+       add             lc=-5,num
+       sub             r31=sp,r31      };;
+{ .mfb;        and             sp=-16,r31              // alloca
+       xmpy.hu         ahi[2]=alo[4],bi        // ap[0]*bp[0]
+       nop.b           0               }
+{ .mfb;        nop.m           0
+       xmpy.lu         alo[4]=alo[4],bi
+       brp.loop.imp    .L1st_ctop,.L1st_cend-16
+                                       };;
+{ .mfi;        nop.m           0
+       xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[0]
+       $ADDP           tp_1=8,sp       }
+{ .mfi;        nop.m           0
+       xma.lu          alo[3]=alo[3],bi,ahi[2]
+       mov             pr.rot=0x20001f<<16
+                       // ------^----- (p40) at first (p23)
+                       // ----------^^ p[16:20]=1
+                                       };;
+{ .mfi;        nop.m           0
+       xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[0])*n0
+       mov             ar.lc=lc        }
+{ .mfi;        nop.m           0
+       fcvt.fxu.s1     nhi[1]=f0               // nhi[1]=0, addend of first np[j]*m0
+       mov             ar.ec=8         };;
+
+// First pass loop (bp[0]), software-pipelined with br.ctop.  Each
+// rotation issues one ap[j]*bp[0] and one np[j]*m0 multiply-add, moves
+// their low words to general registers, adds them with carry and
+// stores the sum through tp_1.  p40/p42 select the add without and
+// with carry-in; the compares write the carry-out to p41/p39.
+.align 32
+.L1st_ctop:
+.pred.rel      "mutex",p40,p42
+{ .mfi;        (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
+       (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
+       (p40)   add             n[2]=n[2],a[2]          }   // (p23)
+{ .mfi;        (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)(p16)
+       (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
+       (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
+{ .mfi;        (p21)   getf.sig        a[0]=alo[5]
+       (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
+       (p42)   cmp.leu         p41,p39=n[2],a[2]       }   // (p23)
+{ .mfi;        (p23)   st8             [tp_1]=n[2],8
+       (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
+       (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
+{ .mmb;        (p21)   getf.sig        n[0]=nlo[3]
+       (p16)   nop.m           0
+       br.ctop.sptk    .L1st_ctop                      };;
+.L1st_cend:
+
+// Drain the first pass: add the two final high words with the pending
+// carry, store the top word of tp, record the carry-out in topbit, and
+// rewind aptr/nptr and reset tptr/tp_1 for the outer loop.
+{ .mmi;        getf.sig        a[0]=ahi[6]             // (p24)
+       getf.sig        n[0]=nhi[4]
+       add             num=-1,num      };;     // num--
+{ .mmi;        .pred.rel       "mutex",p40,p42
+(p40)  add             n[0]=n[0],a[0]
+(p42)  add             n[0]=n[0],a[0],1
+       sub             aptr=aptr,len   };;     // rewind
+{ .mmi;        .pred.rel       "mutex",p40,p42
+(p40)  cmp.ltu         p41,p39=n[0],a[0]
+(p42)  cmp.leu         p41,p39=n[0],a[0]
+       sub             nptr=nptr,len   };;
+{ .mmi;        .pred.rel       "mutex",p39,p41
+(p39)  add             topbit=r0,r0            // no carry: topbit=0
+(p41)  add             topbit=r0,r0,1          // carry:    topbit=1
+       nop.i           0               }       
+{ .mmi;        st8             [tp_1]=n[0]
+       $ADDP           tptr=16,sp
+       $ADDP           tp_1=8,sp       };;
+___
+\f\f
+$code.=<<___;
+// Outer loop, entered once per remaining word of bp: reload bp[i],
+// ap[0..3] and np[0..1], fold tp[0] into the first product, derive m0,
+// then clear the predicate rotation base and re-arm the rotating
+// predicates and the loop and epilogue counters for the inner loop.
+.Louter:
+{ .mmi;        ldf8            bi=[bptr],8             // (*bp++)
+       ldf8            ahi[3]=[tptr]           // tp[0]
+       add             r30=8,aptr      };;
+{ .mmi;        ldf8            alo[4]=[aptr],16        // ap[0]
+       ldf8            alo[3]=[r30],16         // ap[1]
+       add             r31=8,nptr      };;
+{ .mfb;        ldf8            alo[2]=[aptr],16        // ap[2]
+       xma.hu          ahi[2]=alo[4],bi,ahi[3] // ap[0]*bp[i]+tp[0]
+       brp.loop.imp    .Linner_ctop,.Linner_cend-16
+                                       }
+{ .mfb;        ldf8            alo[1]=[r30]            // ap[3]
+       xma.lu          alo[4]=alo[4],bi,ahi[3]
+       clrrrb.pr                       };;
+{ .mfi;        ldf8            nlo[2]=[nptr],16        // np[0]
+       xma.hu          ahi[1]=alo[3],bi,ahi[2] // ap[1]*bp[i]
+       nop.i           0               }
+{ .mfi;        ldf8            nlo[1]=[r31]            // np[1]
+       xma.lu          alo[3]=alo[3],bi,ahi[2]
+       mov             pr.rot=0x20101f<<16
+                       // ------^----- (p40) at first (p23)
+                       // --------^--- (p30) at first (p22)
+                       // ----------^^ p[16:20]=1
+                                       };;
+{ .mfi;        st8             [tptr]=r0               // tp[0] is already accounted
+       xmpy.lu         m0=alo[4],n0            // (ap[0]*bp[i]+tp[0])*n0
+       mov             ar.lc=lc        }
+{ .mfi;
+       fcvt.fxu.s1     nhi[1]=f0               // nhi[1]=0, addend of first np[j]*m0
+       mov             ar.ec=8         };;
+
+// This loop spins in 4*(n+7) ticks on Itanium 2 and should spin in
+// 7*(n+7) ticks on Itanium (the one codenamed Merced). Factor of 7
+// in latter case accounts for two-tick pipeline stall, which means
+// that its performance would be ~20% lower than optimal one. No
+// attempt was made to address this, because original Itanium is
+// hardly represented out in the wild...
+//
+// Per rotation the body loads a word of tp and adds it to the low word
+// of ap[j]*bp[i] (carry kept in p30/p32), then adds that sum to the
+// low word of np[j]*m0 (carry kept in p40/p42) and stores the result
+// through tp_1, which runs one word behind tptr.
+.align 32
+.Linner_ctop:
+.pred.rel      "mutex",p40,p42
+.pred.rel      "mutex",p30,p32
+{ .mfi;        (p16)   ldf8            alo[0]=[aptr],8             // *(aptr++)
+       (p18)   xma.hu          ahi[0]=alo[2],bi,ahi[1]
+       (p40)   add             n[2]=n[2],a[2]          }   // (p23)
+{ .mfi;        (p16)   nop.m           0
+       (p18)   xma.lu          alo[2]=alo[2],bi,ahi[1]
+       (p42)   add             n[2]=n[2],a[2],1        };; // (p23)
+{ .mfi;        (p21)   getf.sig        a[0]=alo[5]
+       (p16)   nop.f           0
+       (p40)   cmp.ltu         p41,p39=n[2],a[2]       }   // (p23)
+{ .mfi;        (p21)   ld8             t[0]=[tptr],8
+       (p16)   nop.f           0
+       (p42)   cmp.leu         p41,p39=n[2],a[2]       };; // (p23)
+{ .mfi;        (p18)   ldf8            nlo[0]=[nptr],8             // *(nptr++)
+       (p20)   xma.hu          nhi[0]=nlo[2],m0,nhi[1]
+       (p30)   add             a[1]=a[1],t[1]          }   // (p22)
+{ .mfi;        (p16)   nop.m           0
+       (p20)   xma.lu          nlo[2]=nlo[2],m0,nhi[1]
+       (p32)   add             a[1]=a[1],t[1],1        };; // (p22)
+{ .mmi;        (p21)   getf.sig        n[0]=nlo[3]
+       (p16)   nop.m           0
+       (p30)   cmp.ltu         p31,p29=a[1],t[1]       }   // (p22)
+{ .mmb;        (p23)   st8             [tp_1]=n[2],8
+       (p32)   cmp.leu         p31,p29=a[1],t[1]           // (p22)
+       br.ctop.sptk    .Linner_ctop                    };;
+.Linner_cend:
+
+// Drain the inner loop: add topbit and the pending tp carry (p31/p33)
+// to the final ap high word, add the final np high word with its carry
+// (p40/p42), store the top word of tp and leave the new carry-out in
+// topbit.  Then rewind the pointers and branch back to .Louter while
+// num, tested before it is decremented, is not 1.
+{ .mmi;        getf.sig        a[0]=ahi[6]             // (p24)
+       getf.sig        n[0]=nhi[4]
+       nop.i           0               };;
+
+{ .mmi;        .pred.rel       "mutex",p31,p33
+(p31)  add             a[0]=a[0],topbit
+(p33)  add             a[0]=a[0],topbit,1
+       mov             topbit=r0       };;
+// NOTE(review): topbit was cleared in the previous instruction group,
+// so the two compares below test a[0] against zero - confirm that this
+// is the intended carry test.
+{ .mfi; .pred.rel      "mutex",p31,p33
+(p31)  cmp.ltu         p32,p30=a[0],topbit
+(p33)  cmp.leu         p32,p30=a[0],topbit
+                                       }
+{ .mfi;        .pred.rel       "mutex",p40,p42
+(p40)  add             n[0]=n[0],a[0]
+(p42)  add             n[0]=n[0],a[0],1
+                                       };;
+// The mutex annotation names p40/p42, the predicates that guard the
+// two compares writing p41/p39 in this bundle.
+{ .mmi;        .pred.rel       "mutex",p40,p42
+(p40)  cmp.ltu         p41,p39=n[0],a[0]
+(p42)  cmp.leu         p41,p39=n[0],a[0]
+(p32)  add             topbit=r0,r0,1  }
+
+{ .mmi;        st8             [tp_1]=n[0],8
+       cmp4.ne         p6,p0=1,num
+       sub             aptr=aptr,len   };;     // rewind
+{ .mmi;        sub             nptr=nptr,len
+(p41)  add             topbit=r0,r0,1
+       $ADDP           tptr=16,sp      }
+{ .mmb;        $ADDP           tp_1=8,sp
+       add             num=-1,num              // num--
+(p6)   br.cond.sptk.many       .Louter };;
+\f
+// Final reduction, step 1: compute tp-np word by word with borrow and
+// store the difference to rp.  lc is raised by 4 for this loop and is
+// reused unchanged by the copy loop.  p33/p35 select the subtract
+// without and with borrow-in; the compares write the borrow-out to
+// p34/p32.
+{ .mbb;        add             lc=4,lc
+       brp.loop.imp    .Lsub_ctop,.Lsub_cend-16
+       clrrrb.pr                       };;
+{ .mii;        nop.m           0
+       mov             pr.rot=0x10001<<16
+                       // ------^---- (p33) at first (p17)
+       mov             ar.lc=lc        }
+{ .mii;        nop.m           0
+       mov             ar.ec=3
+       nop.i           0               };;
+
+.Lsub_ctop:
+.pred.rel      "mutex",p33,p35
+{ .mfi;        (p16)   ld8             t[0]=[tptr],8               // t=*(tp++)
+       (p16)   nop.f           0
+       (p33)   sub             n[1]=t[1],n[1]          }   // (p17)
+{ .mfi;        (p16)   ld8             n[0]=[nptr],8               // n=*(np++)
+       (p16)   nop.f           0
+       (p35)   sub             n[1]=t[1],n[1],1        };; // (p17)
+{ .mib;        (p18)   st8             [rptr]=n[2],8               // *(rp++)=r
+       (p33)   cmp.gtu         p34,p32=n[1],t[1]           // (p17)
+       (p18)   nop.b           0                       }
+{ .mib;        (p18)   nop.m           0
+       (p35)   cmp.geu         p34,p32=n[1],t[1]           // (p17)
+       br.ctop.sptk    .Lsub_ctop                      };;
+.Lsub_cend:
+
+// Final reduction, step 2: subtract the last borrow from topbit and
+// use the result as a mask to pick the copy source, tp when the mask
+// is all ones or rp (the difference stored above) when it is zero.
+// The loop then copies that source to rp and overwrites tp with zeros.
+{ .mmb;        .pred.rel       "mutex",p34,p36
+(p34)  sub     topbit=topbit,r0        // (p19)
+(p36)  sub     topbit=topbit,r0,1
+       brp.loop.imp    .Lcopy_ctop,.Lcopy_cend-16
+                                       }
+{ .mmb;        sub     rptr=rptr,len           // rewind
+       sub     tptr=tptr,len
+       clrrrb.pr                       };;
+{ .mmi;        and     aptr=tptr,topbit        // tptr if mask set, else 0
+       andcm   bptr=rptr,topbit        // rptr with the mask bits cleared
+       mov     pr.rot=1<<16            };;
+{ .mii;        or      nptr=aptr,bptr          // selected source pointer
+       mov     ar.lc=lc
+       mov     ar.ec=3                 };;
+
+.Lcopy_ctop:
+{ .mmb;        (p16)   ld8     n[0]=[nptr],8
+       (p18)   st8     [tptr]=r0,8             // zero the word of tp
+       (p16)   nop.b   0               }
+{ .mmb;        (p16)   nop.m   0
+       (p18)   st8     [rptr]=n[2],8
+       br.ctop.sptk    .Lcopy_ctop     };;
+.Lcopy_cend:
+
+// Return 1 after restoring ar.lc, sp and the predicate registers from
+// the copies taken in the prologue.
+{ .mmi;        mov             ret0=1                  // signal "handled"
+       rum             1<<5                    // clear um.mfh
+       mov             ar.lc=prevlc    }
+{ .mib;        .restore        sp
+       mov             sp=prevsp
+       mov             pr=prevpr,-2
+       br.ret.sptk.many        b0      };;
+.endp  bn_mul_mont
+// Identification string emitted into the object file.
+.type  copyright#,\@object
+copyright:
+stringz        "Montgomery multiplication for IA-64, CRYPTOGAMS by <appro\@openssl.org>"
+___
+
+# Emit the generated assembly.  The first command-line argument, if any,
+# names the output file; without one the code goes to the inherited STDOUT.
+$output=shift;
+if ($output) {
+    # Three-argument open keeps the file name from being parsed as a mode,
+    # and a failed open must not silently fall back to the old STDOUT.
+    open STDOUT,'>',$output or die "can't open $output: $!";
+}
+print $code;
+# Buffered write errors only surface at close, so check it.
+close STDOUT or die "error closing STDOUT: $!";