Eliminate conditional final subtraction in Montgomery assembler modules.
author Andy Polyakov <appro@openssl.org>
Sun, 17 Jun 2007 17:10:03 +0000 (17:10 +0000)
committer Andy Polyakov <appro@openssl.org>
Sun, 17 Jun 2007 17:10:03 +0000 (17:10 +0000)
crypto/bn/asm/alpha-mont.pl
crypto/bn/asm/armv4-mont.pl
crypto/bn/asm/mips3-mont.pl
crypto/bn/asm/ppc-mont.pl
crypto/bn/asm/s390x-mont.pl
crypto/bn/asm/sparcv9-mont.pl
crypto/bn/asm/sparcv9a-mont.pl
crypto/bn/asm/via-mont.pl
crypto/bn/asm/x86-mont.pl
crypto/bn/asm/x86_64-mont.pl

index d840937..09f53a3 100644 (file)
@@ -258,56 +258,48 @@ bn_mul_mont:
        stq     $hi1,16($tp)
        bne     $tj,.Louter
 \f
-       s8addq  $num,sp,$ap
-       mov     $rp,$bp
+       s8addq  $num,sp,$tj     # &tp[num]
+       mov     $rp,$bp         # put rp aside
        mov     sp,$tp
-       mov     0,$hi0
-
-       bne     $hi1,.Lsub
-       cmpult  $nj,$lo1,AT
-       bne     AT,.Lsub
-
-.align 4
-.Lcopy:        ldq     AT,($tp)
-       lda     $tp,8($tp)
-       stq     AT,($rp)
-       cmpult  $tp,$ap,AT
-       stq     zero,-8($tp)
-       nop
-       lda     $rp,8($rp)
-       bne     AT,.Lcopy
-       mov     1,v0
-       br      .Lexit
+       mov     sp,$ap
+       srl     $nj,62,AT       # boundary condition...
+       beq     AT,.Lcopy       # ... is met
+       mov     0,$hi0          # clear borrow bit
 
 .align 4
 .Lsub: ldq     $lo0,($tp)
        ldq     $lo1,($np)
-       subq    $lo0,$lo1,$lo1
+       lda     $tp,8($tp)
+       lda     $np,8($np)
+       subq    $lo0,$lo1,$lo1  # tp[i]-np[i]
        cmpult  $lo0,$lo1,AT
        subq    $lo1,$hi0,$lo0
        cmpult  $lo1,$lo0,$hi0
-       lda     $tp,8($tp)
        or      $hi0,AT,$hi0
-       lda     $np,8($np)
        stq     $lo0,($rp)
-       cmpult  $tp,$ap,v0
+       cmpult  $tp,$tj,v0
        lda     $rp,8($rp)
        bne     v0,.Lsub
 
-       subq    $hi1,$hi0,$hi0
+       subq    $hi1,$hi0,$hi0  # handle upmost overflow bit
        mov     sp,$tp
-       cmpule  $hi1,$hi0,AT
-       mov     $bp,$rp
-       bne     AT,.Lcopy
+       mov     $bp,$rp         # restore rp
+
+       and     sp,$hi0,$ap
+       bic     $bp,$hi0,$bp
+       bis     $bp,$ap,$ap     # ap=borrow?tp:rp
 
 .align 4
-.Lzap: stq     zero,($tp)
-       cmpult  $tp,$ap,AT
+.Lcopy:        ldq     $aj,($ap)       # copy or in-place refresh
        lda     $tp,8($tp)
-       bne     AT,.Lzap
+       lda     $rp,8($rp)
+       lda     $ap,8($ap)
+       stq     zero,-8($tp)    # zap tp
+       cmpult  $tp,$tj,AT
+       stq     $aj,-8($rp)
+       bne     AT,.Lcopy
        mov     1,v0
 
-.align 4
 .Lexit:
        .set    noreorder
        mov     fp,sp
index 26eca61..3561ea2 100644 (file)
@@ -61,7 +61,7 @@ bn_mul_mont:
        cmp     $num,#2
        movlt   r0,#0
        addlt   sp,sp,#2*4
-       blt     .Labort
+       blt     .Labrt
 
        stmdb   sp!,{r4-r12,lr}         @ save 10 registers
 
@@ -160,27 +160,13 @@ bn_mul_mont:
        add     $num,$num,#4            @ $num to point at &tp[num]
        sub     $aj,$num,sp             @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
+       mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]
 
-       cmp     $nhi,#0                 @ upmost carry
-       bne     .Lsub
-       cmp     $nlo,$nj                @ tp[num-1]-np[num-1]
-       bhs     .Lsub
-
-.Lcopy:        ldr     $tj,[$tp]
-       str     sp,[$tp],#4             @ zap tp
-       str     $tj,[$rp],#4
-       cmp     $tp,$num
-       bne     .Lcopy
-
-.Lexit:        add     sp,$num,#4              @ skip over tp[num+1]
-       ldmia   sp!,{r4-r12,lr}         @ restore registers
-       add     sp,sp,#2*4              @ skip over {r0,r2}
-       mov     r0,#1
-.Labort:tst    lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
+       movs    $tj,$nj,lsr#30          @ boundary condition...
+       beq     .Lcopy                  @ ... is met
 
+       subs    $tj,$tj,$tj             @ "clear" carry flag
 .Lsub: ldr     $tj,[$tp],#4
        ldr     $nj,[$np],#4
        sbcs    $tj,$tj,$nj             @ tp[j]-np[j]
@@ -190,12 +176,24 @@ bn_mul_mont:
        sbcs    $nhi,$nhi,#0            @ upmost carry
        mov     $tp,sp                  @ "rewind" $tp
        sub     $rp,$rp,$aj             @ "rewind" $rp
-       blo     .Lcopy                  @ tp was less after all
 
-.Lzap: str     sp,[$tp],#4
+       and     $ap,$tp,$nhi
+       bic     $np,$rp,$nhi
+       orr     $ap,$ap,$np             @ ap=borrow?tp:rp
+
+.Lcopy:        ldr     $tj,[$ap],#4            @ copy or in-place refresh
+       str     sp,[$tp],#4             @ zap tp
+       str     $tj,[$rp],#4
        cmp     $tp,$num
-       bne     .Lzap
-       bal     .Lexit
+       bne     .Lcopy
+
+       add     sp,$num,#4              @ skip over tp[num+1]
+       ldmia   sp!,{r4-r12,lr}         @ restore registers
+       add     sp,sp,#2*4              @ skip over {r0,r2}
+       mov     r0,#1
+.Labrt:        tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
 .size  bn_mul_mont,.-bn_mul_mont
 .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 ___
index 99ebef3..e3c05ac 100644 (file)
@@ -265,27 +265,50 @@ bn_mul_mont:
        addu    $i,8
        sltu    s7,$i,$num
        bnez    s7,.Louter
-
+\f
        .set    noreorder
-       PTR_ADD $ap,sp,$num
+       PTR_ADD $tj,sp,$num     # &tp[num]
        move    $tp,sp
+       move    $ap,sp
 
-       bnez    $hi1,.Lsub
-       li      $hi0,0
-       sgeu    AT,$lo1,$nj
-       beqz    AT,.Lsub
-       nop
+       dsrl    AT,$nj,62       # boundary condition...
+       beqz    AT,.Lcopy       # ... is met
+       li      $hi0,0          # clear borrow bit
 
 .align 4
-.Lcopy:        ld      AT,($tp)
+.Lsub: ld      $lo0,($tp)
+       ld      $lo1,($np)
+       PTR_ADD $tp,8
+       PTR_ADD $np,8
+       dsubu   $lo1,$lo0,$lo1  # tp[i]-np[i]
+       sgtu    AT,$lo1,$lo0
+       dsubu   $lo0,$lo1,$hi0
+       sgtu    $hi0,$lo0,$lo1
+       sd      $lo0,($rp)
+       or      $hi0,AT
+       sltu    AT,$tp,$tj
+       bnez    AT,.Lsub
+       PTR_ADD $rp,8
+
+       dsubu   $hi0,$hi1,$hi0  # handle upmost overflow bit
+       move    $tp,sp
+       PTR_SUB $rp,$num        # restore rp
+       not     $hi1,$hi0
+
+       and     $ap,$hi0,sp
+       and     $bp,$hi1,$rp
+       or      $ap,$ap,$bp     # ap=borrow?tp:rp
+
+.align 4
+.Lcopy:        ld      $aj,($ap)
+       PTR_ADD $ap,8
        PTR_ADD $tp,8
-       sd      AT,($rp)
-       sltu    AT,$tp,$ap
        sd      zero,-8($tp)
+       sltu    AT,$tp,$tj
+       sd      $aj,($rp)
        bnez    AT,.Lcopy
        PTR_ADD $rp,8
 
-.Lexit:
        ld      s0,0($fp)
        ld      s1,8($fp)
        ld      s2,16($fp)
@@ -297,34 +320,6 @@ bn_mul_mont:
        li      v0,1
        jr      ra
        PTR_ADD sp,$fp,64
-
-.align 4
-.Lsub: ld      $lo0,($tp)
-       ld      $lo1,($np)
-       dsubu   $lo1,$lo0,$lo1
-       sgtu    AT,$lo1,$lo0
-       dsubu   $lo0,$lo1,$hi0
-       sgtu    $hi0,$lo0,$lo1
-       PTR_ADD $tp,8
-       or      $hi0,AT
-       PTR_ADD $np,8
-       sd      $lo0,($rp)
-       sltu    AT,$tp,$ap
-       bnez    AT,.Lsub
-       PTR_ADD $rp,8
-
-       dsubu   $hi0,$hi1,$hi0
-       move    $tp,sp
-       sgtu    AT,$hi0,$hi1
-       bnez    AT,.Lcopy
-       PTR_SUB $rp,$num
-.align 4
-.Lzap: sd      zero,($tp)
-       sltu    AT,$tp,$ap
-       bnez    AT,.Lzap
-       PTR_ADD $tp,8
-       b       .Lexit
-       nop
        .set    reorder
 END(bn_mul_mont)
 .rdata
index 280d312..b69809a 100644 (file)
@@ -2,8 +2,9 @@
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
 # April 2006
@@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) {
        $UMULL= "mullw";        # unsigned multiply low
        $UMULH= "mulhwu";       # unsigned multiply high
        $UCMP=  "cmplw";        # unsigned compare
+       $SHRI=  "srwi";         # unsigned shift right by immediate     
        $PUSH=  $ST;
        $POP=   $LD;
 } elsif ($output =~ /64\-mont\.s/) {
@@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) {
        $UMULL= "mulld";        # unsigned multiply low
        $UMULH= "mulhdu";       # unsigned multiply high
        $UCMP=  "cmpld";        # unsigned compare
+       $SHRI=  "srdi";         # unsigned shift right by immediate     
        $PUSH=  $ST;
        $POP=   $LD;
 } else { die "nonsense $output"; }
@@ -264,24 +267,37 @@ Linner:
        addi    $i,$i,$BNSZ
        ble-    Louter
 \f
+       $SHRI.  $nj,$nj,$BITS-2 ; check boundary condition
        addi    $num,$num,2     ; restore $num
+       subfc   $j,$j,$j        ; j=0 and "clear" XER[CA]
        addi    $tp,$sp,$FRAME
+       addi    $ap,$sp,$FRAME
        mtctr   $num
+       beq     Lcopy           ; boundary condition is met
+
+.align 4
+Lsub:  $LDX    $tj,$tp,$j
+       $LDX    $nj,$np,$j
+       subfe   $aj,$nj,$tj     ; tp[j]-np[j]
+       $STX    $aj,$rp,$j
+       addi    $j,$j,$BNSZ
+       bdnz-   Lsub
+
        li      $j,0
+       mtctr   $num
+       subfe   $ovf,$j,$ovf    ; handle upmost overflow bit
+       and     $ap,$tp,$ovf
+       andc    $np,$rp,$ovf
+       or      $ap,$ap,$np     ; ap=borrow?tp:rp
 
-       subfc.  $ovf,$j,$ovf    ; sets XER[CA]
-       bne     Lsub
-       $UCMP   $hi1,$nj
-       bge     Lsub
 .align 4
-Lcopy:
-       $LDX    $tj,$tp,$j
+Lcopy:                         ; copy or in-place refresh
+       $LDX    $tj,$ap,$j
        $STX    $tj,$rp,$j
        $STX    $j,$tp,$j       ; zap at once
        addi    $j,$j,$BNSZ
        bdnz-   Lcopy
 
-Lexit:
        $POP    r14,`4*$SIZE_T`($sp)
        $POP    r15,`5*$SIZE_T`($sp)
        $POP    r16,`6*$SIZE_T`($sp)
@@ -298,22 +314,7 @@ Lexit:
        li      r3,1
        blr
        .long   0
-.align 4
-Lsub:  $LDX    $tj,$tp,$j
-       $LDX    $nj,$np,$j
-       subfe   $tj,$nj,$tj     ; tp[j]-np[j]
-       $STX    $tj,$rp,$j
-       addi    $j,$j,$BNSZ
-       bdnz-   Lsub
-       li      $j,0
-       subfe.  $ovf,$j,$ovf
-       mtctr   $num
-       bne     Lcopy
-.align 4
-Lzap:  $STX    $j,$tp,$j
-       addi    $j,$j,$BNSZ
-       bdnz-   Lzap
-       b       Lexit
+.asciz  "Montgomery Multiplication for PPC, CRYPTOGAMS by <appro\@fy.chalmers.se>"
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
index 5d1b9fd..224d5ba 100644 (file)
@@ -176,45 +176,45 @@ bn_mul_mont:
 ___
 
 undef $bi;
-$count=$ap; undef $ap;
+$count=$bp; undef $bp;
 
 $code.=<<___;
        lg      $rp,16+16($fp)  # reincarnate rp
+       la      $ap,8($fp)
        lgr     $j,$num
-       ltgr    $AHI,$AHI
-       jnz     .Lsub           # upmost overflow bit is not zero
-       #slg    $NHI,-8($np)    # tp[num-1]-np[num-1]
-       lghi    $count,-8               # buggy assembler
-       slg     $NHI,0($count,$np)      # buggy assembler
-       jnle    .Lsub           # branch if not borrow 
 
-.Lcopy:        lg      $alo,8($j,$fp)
-       stg     $j,8($j,$fp)
-       stg     $alo,0($j,$rp)
-       aghi    $j,8
-       jnz     .Lcopy
-.Lexit:
-       lmg     %r6,%r15,16+48($fp)
-       lghi    %r2,1           # signal "processed"
-       br      %r14
+       #lg     $nhi,-8($np)            # buggy assembler
+       lghi    $count,-8               # buggy assembler
+       lg      $nhi,0($count,$np)      # buggy assembler
+       srag    $nhi,$nhi,62    # boundary condition...
+       jz      .Lcopy          # ... is met
 
-.Lsub: lcgr    $count,$num
+       lcgr    $count,$num
        sra     $count,3        # incidentally clears "borrow"
-.Lsubloop:
-       lg      $alo,8($j,$fp)
+.Lsub: lg      $alo,0($j,$ap)
        slbg    $alo,0($j,$np)
        stg     $alo,0($j,$rp)
        la      $j,8($j)
-       brct    $count,.Lsubloop
+       brct    $count,.Lsub
        lghi    $ahi,0
-       slbgr   $AHI,$ahi
+       slbgr   $AHI,$ahi       # handle upmost carry
+
+       ngr     $ap,$AHI
+       lghi    $np,-1
+       xgr     $np,$AHI
+       ngr     $np,$rp
+       ogr     $ap,$np         # ap=borrow?tp:rp
        lgr     $j,$num
-       jle     .Lcopy          # branch if borrow
 
-.Lzap: stg     $j,8($j,$fp)
+.Lcopy:        lg      $alo,0($j,$ap)  # copy or in-place refresh
+       stg     $j,8($j,$fp)    # zap tp
+       stg     $alo,0($j,$rp)
        aghi    $j,8
-       jnz     .Lzap
-       j       .Lexit
+       jnz     .Lcopy
+
+       lmg     %r6,%r15,16+48($fp)
+       lghi    %r2,1           # signal "processed"
+       br      %r14
 .size  bn_mul_mont,.-bn_mul_mont
 .string        "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
 ___
index acdf692..d78b432 100644 (file)
@@ -2,8 +2,9 @@
 
 # ====================================================================
 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
-# project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
 # December 2005
@@ -254,44 +255,36 @@ $fname:
 .Ltail:
        add     $np,$num,$np
        add     $rp,$num,$rp
-
-       cmp     $car2,0                 ! clears %icc.c
-       bne,pn  %icc,.Lsub
+       mov     $tp,$ap
        sub     %g0,$num,%o7            ! k=-num
 
-       cmp     $car1,$npj              ! compare top-most $tp and $np words
-       bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
-       nop
+       srl     $npj,30,%o0             ! boundary condition...
+       brz,pn  %o0,.Lcopy              ! ... is met
+       subcc   %g0,%g0,%g0             ! clear %icc.c
 
 .align 16,0x1000000
 .Lsub:
        ld      [$tp+%o7],%o0
        ld      [$np+%o7],%o1
-       subccc  %o0,%o1,%o1
+       subccc  %o0,%o1,%o1             ! tp[j]-np[j]
        st      %o1,[$rp+%o7]
        add     %o7,4,%o7
        brnz    %o7,.Lsub
        nop
-       subccc  $car2,0,$car2
-       bcc     %icc,.Lzap
+       subc    $car2,0,$car2           ! handle upmost overflow bit
+       and     $tp,$car2,$ap
+       andn    $rp,$car2,$np
+       or      $ap,$np,$ap
        sub     %g0,$num,%o7
 
 .align 16,0x1000000
 .Lcopy:
-       ld      [$tp+%o7],%o0
+       ld      [$ap+%o7],%o0           ! copy or in-place refresh
+       st      %g0,[$tp+%o7]           ! zap tp
        st      %o0,[$rp+%o7]
        add     %o7,4,%o7
        brnz    %o7,.Lcopy
        nop
-       ba      .Lzap
-       sub     %g0,$num,%o7
-
-.align 32
-.Lzap:
-       st      %g0,[$tp+%o7]
-       add     %o7,4,%o7
-       brnz    %o7,.Lzap
-       nop
        mov     1,%i0
        ret
        restore
@@ -609,6 +602,7 @@ $code.=<<___;
        add     $tp,8,$tp
 .type  $fname,#function
 .size  $fname,(.-$fname)
+.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 ___
 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
 print $code;
index cecf07c..02847fd 100755 (executable)
@@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62";
 $ASI_FL16_P=0xD2;      # magic ASI value to engage 16-bit FP load
 
 $code=<<___;
-.ident         "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
 .section       ".text",#alloc,#execinstr
 
 .global $fname
@@ -799,17 +798,14 @@ $fname:
        bnz     %icc,.Louter
        nop
 \f
-       sub     %g0,$num,%o7            ! n=-num
-       cmp     $carry,0                ! clears %icc.c
-       bne,pn  %icc,.Lsub
-       add     $tp,8,$tp               ! adjust tp to point at the end
-
-       ld      [$tp-8],%o0
        ld      [$np-4],%o1
-       cmp     %o0,%o1                 ! compare topmost words
-       bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
-       nop
-
+       subcc   %g0,%g0,%g0             ! clear %icc.c
+       add     $tp,8,$tp               ! adjust tp to point at the end
+       srl     %o1,30,%o1              ! boundary condition...
+       orn     %g0,%g0,%g4
+       brz,pn  %o1,.Lcopy              ! ... is met
+       sub     %g0,$num,%o7            ! n=-num
+       
 .align 32,0x1000000
 .Lsub:
        ldx     [$tp+%o7],%o0
@@ -824,24 +820,30 @@ $fname:
        add     %o7,8,%o7
        brnz,pt %o7,.Lsub
        st      %o3,[%g1+4]
-       subccc  $carry,0,$carry
-       bcc,pt  %icc,.Lzap
+       subc    $carry,0,%g4
        sub     %g0,$num,%o7            ! n=-num
 
-.align 16,0x1000000
+.align 32,0x1000000
 .Lcopy:
        ldx     [$tp+%o7],%o0
-       srlx    %o0,32,%o1
        add     $rp,%o7,%g1
+       ld      [%g1+0],%o2
+       ld      [%g1+4],%o3
+       stx     %g0,[$tp+%o7]
+       and     %o0,%g4,%o0
+       srlx    %o0,32,%o1
+       andn    %o2,%g4,%o2
+       andn    %o3,%g4,%o3
+       or      %o2,%o0,%o0
+       or      %o3,%o1,%o1
        st      %o0,[%g1+0]
        add     %o7,8,%o7
        brnz,pt %o7,.Lcopy
        st      %o1,[%g1+4]
        sub     %g0,$num,%o7            ! n=-num
 
-.align 32
+.align 32,0x1000000
 .Lzap:
-       stx     %g0,[$tp+%o7]
        stx     %g0,[$ap_l+%o7]
        stx     %g0,[$ap_h+%o7]
        stx     %g0,[$np_l+%o7]
index e149941..ce3cd61 100644 (file)
@@ -77,7 +77,8 @@
 # - in terms of absolute performance it delivers approximately as much
 #   as modern out-of-order 32-bit cores [again, for longer keys].
 
-push(@INC,".","../../perlasm");
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
 
 &asm_init($ARGV[0],"via-mont.pl");
@@ -100,7 +101,7 @@ $sp=&DWP(28,"esp");
 # &DWP(64+(4*$num+$pad)*0,"esp")       # padded tp[num]
 # &DWP(64+(4*$num+$pad)*1,"esp")       # padded copy of ap[num]
 # &DWP(64+(4*$num+$pad)*2,"esp")       # padded copy of bp[num]
-# &DWP(64+(4*$num+$pad)*2,"esp")       # padded copy of np[num]
+# &DWP(64+(4*$num+$pad)*3,"esp")       # padded copy of np[num]
 # Note that SDK suggests to unconditionally allocate 2K per vector. This
 # has quite an impact on performance. It naturally depends on key length,
 # but to give an example 1024 bit private RSA key operations suffer >30%
@@ -115,7 +116,7 @@ $sp=&DWP(28,"esp");
        &jnz    (&label("leave"));      # num % 4 != 0
        &cmp    ("ecx",8);
        &jb     (&label("leave"));      # num < 8
-       &cmp    ("ecx",256);
+       &cmp    ("ecx",1024);
        &ja     (&label("leave"));      # num > 1024
 
        &pushf  ();
@@ -148,74 +149,91 @@ $sp=&DWP(28,"esp");
        &lea    ("ebp",&DWP(-$pad,"ecx"));
        &shr    ("ebp",2);              # restore original num value in ebp
 
-       &add    ("ecx",32/4);           # (4 vectors + 32 byte scratch)/4
        &xor    ("eax","eax");
+
+       &mov    ("ecx","ebp");
+       &lea    ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch
        &data_byte(0xf3,0xab);          # rep stosl, bzero
 
        &mov    ("ecx","ebp");
        &lea    ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy
        &mov    ($A,"edi");
        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+       &mov    ("ecx",$pad/4);
+       &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+       # edi points at the end of padded ap copy...
 
-       # edi points at the end of ap copy...
        &mov    ("ecx","ebp");
-       &add    ("edi",$pad);           # skip padding to point at bp copy
        &mov    ("esi","ebx");
        &mov    ($B,"edi");
        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+       &mov    ("ecx",$pad/4);
+       &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+       # edi points at the end of padded bp copy...
 
-       # edi points at the end of bp copy...
        &mov    ("ecx","ebp");
-       &add    ("edi",$pad);           # skip padding to point at np copy
        &mov    ("esi","edx");
        &mov    ($M,"edi");
        &data_byte(0xf3,0xa5);          # rep movsl, memcpy
+       &mov    ("ecx",$pad/4);
+       &data_byte(0xf3,0xab);          # rep stosl, bzero pad
+       # edi points at the end of padded np copy...
 
        # let magic happen...
        &mov    ("ecx","ebp");
        &mov    ("esi","esp");
-       &xor    ("eax","eax");
        &shl    ("ecx",5);              # convert word counter to bit counter
        &align  (4);
        &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul
 
        &mov    ("ecx","ebp");
-       &xor    ("edx","edx");          # i=0
-       &lea    ("esi",&DWP(64,"esp")); # tp
-       # edi still points at the end of np copy...
+       &xor    ("edx","edx");                  # i=0
+       &lea    ("esi",&DWP(64,"esp"));         # tp
+       # edi still points at the end of padded np copy...
+       &mov    ("eax",&DWP(-4-$pad,"edi"));    # np[num-1]
        &neg    ("ebp");
-       &lea    ("ebp",&DWP(0,"edi","ebp",4));  # so just "rewind"
-       &mov    ("edi",$rp);            # restore rp
-
-       &mov    ("ebx",&DWP(0,"esi","ecx",4));  # upmost overflow bit
-       &cmp    ("ebx",0);                      # clears CF unconfitionally
-       &jnz    (&label("sub"));
-       &mov    ("eax",&DWP(-4,"esi","ecx",4));
-       &cmp    ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]?
-       &jae    (&label("sub"));                # if taken CF is cleared
-
-&set_label("copy",4);
-       &mov    ("ebx","ecx");
-       &data_byte(0xf3,0xa5);                  # rep movsl
-       &mov    ("ecx","ebx");
-       &jmp    (&label("zap"));
-
-&set_label("sub",16);
+       &lea    ("ebp",&DWP(-$pad,"edi","ebp",4));      # so just "rewind"
+       &mov    ("edi",$rp);                    # restore rp
+
+       &shr    ("eax",30);                     # boundary condition...
+       &jz     (&label("copy"));               # ... is met
+       &xor    ("edx","edx");                  # clear CF
+
+&set_label("sub",8);
        &mov    ("eax",&DWP(0,"esi","edx",4));
        &sbb    ("eax",&DWP(0,"ebp","edx",4));
        &mov    (&DWP(0,"edi","edx",4),"eax");  # rp[i]=tp[i]-np[i]
        &lea    ("edx",&DWP(1,"edx"));          # i++
-       &dec    ("ecx");                        # doesn't affect CF!
-       &jg     (&label("sub"));
-       &sbb    ("ebx",0);                      # upmost overflow is still there
-       &mov    ("ecx","edx");
-       &jc     (&label("copy"));
+       &loop   (&label("sub"));                # doesn't affect CF!
+
+       &mov    ("eax",&DWP(0,"esi","edx",4));  # upmost overflow bit
+       &sbb    ("eax",0);
+       &and    ("esi","eax");
+       &not    ("eax");
+       &mov    ("ebp","edi");
+       &and    ("ebp","eax");
+       &or     ("esi","ebp");                  # tp=carry?tp:rp
+
+       &mov    ("ecx","edx");                  # num
+       &xor    ("edx","edx");                  # i=0
+
+&set_label("copy",8);
+       &mov    ("eax",&DWP(0,"esi","edx",4));
+       &mov    (&DWP(64,"esp","edx",4),"ecx"); # zap tp
+       &mov    (&DWP(0,"edi","edx",4),"eax");
+       &lea    ("edx",&DWP(1,"edx"));          # i++
+       &loop   (&label("copy"));
 
-&set_label("zap",4);
        &mov    ("ebp",$sp);
        &xor    ("eax","eax");
-       &lea    ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4
-       &mov    ("edi","esp");
+
+       &mov    ("ecx",64/4);
+       &mov    ("edi","esp");          # zap frame including scratch area
+       &data_byte(0xf3,0xab);          # rep stosl, bzero
+
+       # zap copies of ap, bp and np
+       &lea    ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap
+       &lea    ("ecx",&DWP(3*$pad/4,"edx","edx",2));
        &data_byte(0xf3,0xab);          # rep stosl, bzero
 
        &mov    ("esp","ebp");
@@ -224,4 +242,6 @@ $sp=&DWP(28,"esp");
 &set_label("leave");
 &function_end($func);
 
+&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by <appro\@openssl.org>");
+
 &asm_finish();
index 319c17d..2ed76aa 100755 (executable)
@@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); }
 
 $i="edx";
 $j="ecx";
-$ap="esi";
+$ap="esi";     $tp="esi";              # overlapping variables!!!
 $rp="edi";     $bp="edi";              # overlapping variables!!!
 $np="ebp";
 $num="ebx";
@@ -551,41 +551,39 @@ $sbit=$num;
 }
 \f
 &set_label("common_tail",16);
-       &mov    ($np,$_np);
-       &mov    ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit
+       &mov    ($np,$_np);                     # load modulus pointer
        &mov    ($rp,$_rp);                     # load result pointer
-                                               # [$ap and $bp are zapped]
-       &xor    ($i,$i);                        # i=0
+       &lea    ($tp,&DWP($frame,"esp"));       # [$ap and $bp are zapped]
+       &mov    ("eax",&DWP(0,$np,$num,4));     # np[num-1]
+       &shr    ("eax",30);                     # check for boundary condition
+       &jz     (&label("copy"));
+
+       &mov    ("eax",&DWP(0,$tp));            # tp[0]
        &mov    ($j,$num);                      # j=num-1
-       &cmp    ("esi",0);                      # clears CF unconditionally
-       &jnz    (&label("sub"));
-       &mov    ("eax",&DWP($frame,"esp",$j,4));
-       &cmp    ("eax",&DWP(0,$np,$j,4));       # tp[num-1]-np[num-1]?
-       &jae    (&label("sub"));                # if taken CF is cleared
-&set_label("copy",16);
-       &mov    ("eax",&DWP($frame,"esp",$j,4));
-       &mov    (&DWP(0,$rp,$j,4),"eax");       # rp[i]=tp[i]
-       &mov    (&DWP($frame,"esp",$j,4),$j);   # zap temporary vector
-       &dec    ($j);
-       &jge    (&label("copy"));
-       &jmp    (&label("exit"));
+       &xor    ($i,$i);                        # i=0 and clear CF!
 
 &set_label("sub",16);
-       &mov    ("eax",&DWP($frame,"esp",$i,4));
        &sbb    ("eax",&DWP(0,$np,$i,4));
        &mov    (&DWP(0,$rp,$i,4),"eax");       # rp[i]=tp[i]-np[i]
-       &lea    ($i,&DWP(1,$i));                # i++
        &dec    ($j);                           # doesn't affect CF!
+       &mov    ("eax",&DWP(4,$tp,$i,4));       # tp[i+1]
+       &lea    ($i,&DWP(1,$i));                # i++
        &jge    (&label("sub"));
-       &mov    ($j,$num);                      # j=num-1
-       &sbb    ("esi",0);                      # esi holds upmost overflow bit
-       &jc     (&label("copy"));
-&set_label("zap",8);
-       &mov    (&DWP($frame,"esp",$j,4),$i);   # zap temporary vector
-       &dec    ($j);
-       &jge    (&label("zap"));
-
-&set_label("exit",8);
+
+       &sbb    ("eax",0);                      # handle upmost overflow bit
+       &and    ($tp,"eax");
+       &not    ("eax");
+       &mov    ($np,$rp);
+       &and    ($np,"eax");
+       &or     ($tp,$np);                      # tp=carry?tp:rp
+
+&set_label("copy",16);                         # copy or in-place refresh
+       &mov    ("eax",&DWP(0,$tp,$num,4));
+       &mov    (&DWP(0,$rp,$num,4),"eax");     # rp[i]=tp[i]
+       &mov    (&DWP($frame,"esp",$num,4),$j); # zap temporary vector
+       &dec    ($num);
+       &jge    (&label("copy"));
+
        &mov    ("esp",$_sp);           # pull saved stack pointer
        &mov    ("eax",1);
 &set_label("just_leave");
index bc3fa83..6701bf2 100755 (executable)
@@ -59,6 +59,7 @@ bn_mul_mont:
        neg     %rax
        lea     (%rsp,%rax,8),%rsp      # tp=alloca(8*(num+2))
        and     \$-1024,%rsp            # minimize TLB usage
+
        mov     %rbp,8(%rsp,$num,8)     # tp[num+1]=%rsp
        mov     %rdx,$bp                # $bp reassigned, remember?
 
@@ -166,22 +167,38 @@ bn_mul_mont:
        cmp     $num,$i
        jl      .Louter
 
-       xor     $i,$i                   # i=0
+       mov     -8($np,$num,8),%rax     # np[num-1]
+       lea     (%rsp),$ap              # borrow ap for tp
+       shr     \$62,%rax               # check for boundary condition
+       jz      .Lcopy
+
+       mov     ($ap),%rax              # tp[0]
        lea     -1($num),$j             # j=num-1
-       cmp     \$0,%rdx                # %rdx still holds upmost overflow bit
-       jnz     .Lsub                   # CF is cleared by compare with 0
-       mov     (%rsp,$j,8),%rax
-       cmp     ($np,$j,8),%rax         # tp[num-1]-np[num-1]
-       jae     .Lsub                   # if taken CF was cleared by above cmp
-.align 4
-.Lcopy:
-       mov     (%rsp,$j,8),%rax
+       xor     $i,$i                   # i=0 and clear CF!
+       jmp     .Lsub
+.align 16
+.Lsub: sbb     ($np,$i,8),%rax
+       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[i]
+       dec     $j                      # doesn't affect CF!
+       mov     8($ap,$i,8),%rax        # tp[i+1]
+       lea     1($i),$i                # i++
+       jge     .Lsub
+
+       sbb     \$0,%rax                # handle upmost overflow bit
+       and     %rax,$ap
+       not     %rax
+       mov     $rp,$np
+       and     %rax,$np
+       lea     -1($num),$j
+       or      $np,$ap                 # ap=borrow?tp:rp
+.align 16
+.Lcopy:                                        # copy or in-place refresh
+       mov     ($ap,$j,8),%rax
        mov     %rax,($rp,$j,8)         # rp[i]=tp[i]
        mov     $i,(%rsp,$j,8)          # zap temporary vector
        dec     $j
        jge     .Lcopy
-.align 4
-.Lexit:
+       
        mov     8(%rsp,$num,8),%rsp     # restore %rsp
        mov     \$1,%rax
        pop     %r15
@@ -191,22 +208,6 @@ bn_mul_mont:
        pop     %rbp
        pop     %rbx
        ret
-
-.align 16
-.Lsub: mov     (%rsp,$i,8),%rax
-       sbb     ($np,$i,8),%rax
-       mov     %rax,($rp,$i,8)         # rp[i]=tp[i]-np[j]
-       lea     1($i),$i                # i++
-       dec     $j                      # doesn't affect CF!
-       jge     .Lsub
-       lea     -1($num),$j             # j=num-1
-       sbb     \$0,%rdx
-       jc      .Lcopy                  # tp was less than np
-.align 4
-.Lzap: mov     $i,(%rsp,$j,8)          # zap temporary vector
-       dec     $j
-       jge     .Lzap
-       jmp     .Lexit
 .size  bn_mul_mont,.-bn_mul_mont
 .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
 ___