From 7d9cf7c0bbc17a2c00339e660c83ebf1a4f9061a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 17 Jun 2007 17:10:03 +0000 Subject: [PATCH] Eliminate conditional final subtraction in Montgomery assembler modules. --- crypto/bn/asm/alpha-mont.pl | 54 +++++++++---------- crypto/bn/asm/armv4-mont.pl | 44 ++++++++-------- crypto/bn/asm/mips3-mont.pl | 73 ++++++++++++-------------- crypto/bn/asm/ppc-mont.pl | 51 +++++++++--------- crypto/bn/asm/s390x-mont.pl | 50 +++++++++--------- crypto/bn/asm/sparcv9-mont.pl | 36 ++++++------- crypto/bn/asm/sparcv9a-mont.pl | 36 +++++++------ crypto/bn/asm/via-mont.pl | 94 +++++++++++++++++++++------------- crypto/bn/asm/x86-mont.pl | 54 ++++++++++--------- crypto/bn/asm/x86_64-mont.pl | 55 ++++++++++---------- 10 files changed, 274 insertions(+), 273 deletions(-) diff --git a/crypto/bn/asm/alpha-mont.pl b/crypto/bn/asm/alpha-mont.pl index d840937548..09f53a3622 100644 --- a/crypto/bn/asm/alpha-mont.pl +++ b/crypto/bn/asm/alpha-mont.pl @@ -258,56 +258,48 @@ bn_mul_mont: stq $hi1,16($tp) bne $tj,.Louter - s8addq $num,sp,$ap - mov $rp,$bp + s8addq $num,sp,$tj # &tp[num] + mov $rp,$bp # put rp aside mov sp,$tp - mov 0,$hi0 - - bne $hi1,.Lsub - cmpult $nj,$lo1,AT - bne AT,.Lsub - -.align 4 -.Lcopy: ldq AT,($tp) - lda $tp,8($tp) - stq AT,($rp) - cmpult $tp,$ap,AT - stq zero,-8($tp) - nop - lda $rp,8($rp) - bne AT,.Lcopy - mov 1,v0 - br .Lexit + mov sp,$ap + srl $nj,62,AT # boundary condition... + beq AT,.Lcopy # ... is met + mov 0,$hi0 # clear borrow bit .align 4 .Lsub: ldq $lo0,($tp) ldq $lo1,($np) - subq $lo0,$lo1,$lo1 + lda $tp,8($tp) + lda $np,8($np) + subq $lo0,$lo1,$lo1 # tp[i]-np[i] cmpult $lo0,$lo1,AT subq $lo1,$hi0,$lo0 cmpult $lo1,$lo0,$hi0 - lda $tp,8($tp) or $hi0,AT,$hi0 - lda $np,8($np) stq $lo0,($rp) - cmpult $tp,$ap,v0 + cmpult $tp,$tj,v0 lda $rp,8($rp) bne v0,.Lsub - subq $hi1,$hi0,$hi0 + subq $hi1,$hi0,$hi0 # handle upmost overflow bit mov sp,$tp - cmpule $hi1,$hi0,AT - mov $bp,$rp - bne AT,.Lcopy + mov $bp,$rp # restore rp + + and sp,$hi0,$ap + bic $bp,$hi0,$bp + bis $bp,$ap,$ap # ap=borrow?tp:rp .align 4 -.Lzap: stq zero,($tp) - cmpult $tp,$ap,AT +.Lcopy: ldq $aj,($ap) # copy or in-place refresh lda $tp,8($tp) - bne AT,.Lzap + lda $rp,8($rp) + lda $ap,8($ap) + stq zero,-8($tp) # zap tp + cmpult $tp,$tj,AT + stq $aj,-8($rp) + bne AT,.Lcopy mov 1,v0 -.align 4 .Lexit: .set noreorder mov fp,sp diff --git a/crypto/bn/asm/armv4-mont.pl b/crypto/bn/asm/armv4-mont.pl index 26eca61e7e..3561ea2d61 100644 --- a/crypto/bn/asm/armv4-mont.pl +++ b/crypto/bn/asm/armv4-mont.pl @@ -61,7 +61,7 @@ bn_mul_mont: cmp $num,#2 movlt r0,#0 addlt sp,sp,#2*4 - blt .Labort + blt .Labrt stmdb sp!,{r4-r12,lr} @ save 10 registers @@ -160,27 +160,13 @@ bn_mul_mont: add $num,$num,#4 @ $num to point at &tp[num] sub $aj,$num,sp @ "original" num value mov $tp,sp @ "rewind" $tp + mov $ap,$tp @ "borrow" $ap sub $np,$np,$aj @ "rewind" $np to &np[0] - cmp $nhi,#0 @ upmost carry - bne .Lsub - cmp $nlo,$nj @ tp[num-1]-np[num-1] - bhs .Lsub - -.Lcopy: ldr $tj,[$tp] - str sp,[$tp],#4 @ zap tp - str $tj,[$rp],#4 - cmp $tp,$num - bne .Lcopy - -.Lexit: add sp,$num,#4 @ skip over tp[num+1] - ldmia sp!,{r4-r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labort:tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet - bx lr @ interoperable with Thumb ISA:-) + movs $tj,$nj,lsr#30 @ boundary condition... + beq .Lcopy @ ... 
is met + subs $tj,$tj,$tj @ "clear" carry flag .Lsub: ldr $tj,[$tp],#4 ldr $nj,[$np],#4 sbcs $tj,$tj,$nj @ tp[j]-np[j] @@ -190,12 +176,24 @@ bn_mul_mont: sbcs $nhi,$nhi,#0 @ upmost carry mov $tp,sp @ "rewind" $tp sub $rp,$rp,$aj @ "rewind" $rp - blo .Lcopy @ tp was less after all -.Lzap: str sp,[$tp],#4 + and $ap,$tp,$nhi + bic $np,$rp,$nhi + orr $ap,$ap,$np @ ap=borrow?tp:rp + +.Lcopy: ldr $tj,[$ap],#4 @ copy or in-place refresh + str sp,[$tp],#4 @ zap tp + str $tj,[$rp],#4 cmp $tp,$num - bne .Lzap - bal .Lexit + bne .Lcopy + + add sp,$num,#4 @ skip over tp[num+1] + ldmia sp!,{r4-r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + mov r0,#1 +.Labrt: tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet + bx lr @ interoperable with Thumb ISA:-) .size bn_mul_mont,.-bn_mul_mont .asciz "Montgomery multiplication for ARMv4, CRYPTOGAMS by " ___ diff --git a/crypto/bn/asm/mips3-mont.pl b/crypto/bn/asm/mips3-mont.pl index 99ebef33b9..e3c05acb03 100644 --- a/crypto/bn/asm/mips3-mont.pl +++ b/crypto/bn/asm/mips3-mont.pl @@ -265,27 +265,50 @@ bn_mul_mont: addu $i,8 sltu s7,$i,$num bnez s7,.Louter - + .set noreorder - PTR_ADD $ap,sp,$num + PTR_ADD $tj,sp,$num # &tp[num] move $tp,sp + move $ap,sp - bnez $hi1,.Lsub - li $hi0,0 - sgeu AT,$lo1,$nj - beqz AT,.Lsub - nop + dsrl AT,$nj,62 # boundary condition... + beqz AT,.Lcopy # ... is met + li $hi0,0 # clear borrow bit .align 4 -.Lcopy: ld AT,($tp) +.Lsub: ld $lo0,($tp) + ld $lo1,($np) + PTR_ADD $tp,8 + PTR_ADD $np,8 + dsubu $lo1,$lo0,$lo1 # tp[i]-np[i] + sgtu AT,$lo1,$lo0 + dsubu $lo0,$lo1,$hi0 + sgtu $hi0,$lo0,$lo1 + sd $lo0,($rp) + or $hi0,AT + sltu AT,$tp,$tj + bnez AT,.Lsub + PTR_ADD $rp,8 + + dsubu $hi0,$hi1,$hi0 # handle upmost overflow bit + move $tp,sp + PTR_SUB $rp,$num # restore rp + not $hi1,$hi0 + + and $ap,$hi0,sp + and $bp,$hi1,$rp + or $ap,$ap,$bp # ap=borrow?tp:rp + +.align 4 +.Lcopy: ld $aj,($ap) + PTR_ADD $ap,8 PTR_ADD $tp,8 - sd AT,($rp) - sltu AT,$tp,$ap sd zero,-8($tp) + sltu AT,$tp,$tj + sd $aj,($rp) bnez AT,.Lcopy PTR_ADD $rp,8 -.Lexit: ld s0,0($fp) ld s1,8($fp) ld s2,16($fp) @@ -297,34 +320,6 @@ bn_mul_mont: li v0,1 jr ra PTR_ADD sp,$fp,64 - -.align 4 -.Lsub: ld $lo0,($tp) - ld $lo1,($np) - dsubu $lo1,$lo0,$lo1 - sgtu AT,$lo1,$lo0 - dsubu $lo0,$lo1,$hi0 - sgtu $hi0,$lo0,$lo1 - PTR_ADD $tp,8 - or $hi0,AT - PTR_ADD $np,8 - sd $lo0,($rp) - sltu AT,$tp,$ap - bnez AT,.Lsub - PTR_ADD $rp,8 - - dsubu $hi0,$hi1,$hi0 - move $tp,sp - sgtu AT,$hi0,$hi1 - bnez AT,.Lcopy - PTR_SUB $rp,$num -.align 4 -.Lzap: sd zero,($tp) - sltu AT,$tp,$ap - bnez AT,.Lzap - PTR_ADD $tp,8 - b .Lexit - nop .set reorder END(bn_mul_mont) .rdata diff --git a/crypto/bn/asm/ppc-mont.pl b/crypto/bn/asm/ppc-mont.pl index 280d31244b..b69809a97e 100644 --- a/crypto/bn/asm/ppc-mont.pl +++ b/crypto/bn/asm/ppc-mont.pl @@ -2,8 +2,9 @@ # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. 
# ==================================================================== # April 2006 @@ -42,6 +43,7 @@ if ($output =~ /32\-mont\.s/) { $UMULL= "mullw"; # unsigned multiply low $UMULH= "mulhwu"; # unsigned multiply high $UCMP= "cmplw"; # unsigned compare + $SHRI= "srwi"; # unsigned shift right by immediate $PUSH= $ST; $POP= $LD; } elsif ($output =~ /64\-mont\.s/) { @@ -62,6 +64,7 @@ if ($output =~ /32\-mont\.s/) { $UMULL= "mulld"; # unsigned multiply low $UMULH= "mulhdu"; # unsigned multiply high $UCMP= "cmpld"; # unsigned compare + $SHRI= "srdi"; # unsigned shift right by immediate $PUSH= $ST; $POP= $LD; } else { die "nonsense $output"; } @@ -264,24 +267,37 @@ Linner: addi $i,$i,$BNSZ ble- Louter + $SHRI. $nj,$nj,$BITS-2 ; check boundary condition addi $num,$num,2 ; restore $num + subfc $j,$j,$j ; j=0 and "clear" XER[CA] addi $tp,$sp,$FRAME + addi $ap,$sp,$FRAME mtctr $num + beq Lcopy ; boundary condition is met + +.align 4 +Lsub: $LDX $tj,$tp,$j + $LDX $nj,$np,$j + subfe $aj,$nj,$tj ; tp[j]-np[j] + $STX $aj,$rp,$j + addi $j,$j,$BNSZ + bdnz- Lsub + li $j,0 + mtctr $num + subfe $ovf,$j,$ovf ; handle upmost overflow bit + and $ap,$tp,$ovf + andc $np,$rp,$ovf + or $ap,$ap,$np ; ap=borrow?tp:rp - subfc. $ovf,$j,$ovf ; sets XER[CA] - bne Lsub - $UCMP $hi1,$nj - bge Lsub .align 4 -Lcopy: - $LDX $tj,$tp,$j +Lcopy: ; copy or in-place refresh + $LDX $tj,$ap,$j $STX $tj,$rp,$j $STX $j,$tp,$j ; zap at once addi $j,$j,$BNSZ bdnz- Lcopy -Lexit: $POP r14,`4*$SIZE_T`($sp) $POP r15,`5*$SIZE_T`($sp) $POP r16,`6*$SIZE_T`($sp) @@ -298,22 +314,7 @@ Lexit: li r3,1 blr .long 0 -.align 4 -Lsub: $LDX $tj,$tp,$j - $LDX $nj,$np,$j - subfe $tj,$nj,$tj ; tp[j]-np[j] - $STX $tj,$rp,$j - addi $j,$j,$BNSZ - bdnz- Lsub - li $j,0 - subfe. $ovf,$j,$ovf - mtctr $num - bne Lcopy -.align 4 -Lzap: $STX $j,$tp,$j - addi $j,$j,$BNSZ - bdnz- Lzap - b Lexit +.asciz "Montgomery Multiplication for PPC, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval $1/gem; diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl index 5d1b9fdb41..224d5ba668 100644 --- a/crypto/bn/asm/s390x-mont.pl +++ b/crypto/bn/asm/s390x-mont.pl @@ -176,45 +176,45 @@ bn_mul_mont: ___ undef $bi; -$count=$ap; undef $ap; +$count=$bp; undef $bp; $code.=<<___; lg $rp,16+16($fp) # reincarnate rp + la $ap,8($fp) lgr $j,$num - ltgr $AHI,$AHI - jnz .Lsub # upmost overflow bit is not zero - #slg $NHI,-8($np) # tp[num-1]-np[num-1] - lghi $count,-8 # buggy assembler - slg $NHI,0($count,$np) # buggy assembler - jnle .Lsub # branch if not borrow -.Lcopy: lg $alo,8($j,$fp) - stg $j,8($j,$fp) - stg $alo,0($j,$rp) - aghi $j,8 - jnz .Lcopy -.Lexit: - lmg %r6,%r15,16+48($fp) - lghi %r2,1 # signal "processed" - br %r14 + #lg $nhi,-8($np) # buggy assembler + lghi $count,-8 # buggy assembler + lg $nhi,0($count,$np) # buggy assembler + srag $nhi,$nhi,62 # boundary condition... + jz .Lcopy # ... 
is met -.Lsub: lcgr $count,$num + lcgr $count,$num sra $count,3 # incidentally clears "borrow" -.Lsubloop: - lg $alo,8($j,$fp) +.Lsub: lg $alo,0($j,$ap) slbg $alo,0($j,$np) stg $alo,0($j,$rp) la $j,8($j) - brct $count,.Lsubloop + brct $count,.Lsub lghi $ahi,0 - slbgr $AHI,$ahi + slbgr $AHI,$ahi # handle upmost carry + + ngr $ap,$AHI + lghi $np,-1 + xgr $np,$AHI + ngr $np,$rp + ogr $ap,$np # ap=borrow?tp:rp lgr $j,$num - jle .Lcopy # branch if borrow -.Lzap: stg $j,8($j,$fp) +.Lcopy: lg $alo,0($j,$ap) # copy or in-place refresh + stg $j,8($j,$fp) # zap tp + stg $alo,0($j,$rp) aghi $j,8 - jnz .Lzap - j .Lexit + jnz .Lcopy + + lmg %r6,%r15,16+48($fp) + lghi %r2,1 # signal "processed" + br %r14 .size bn_mul_mont,.-bn_mul_mont .string "Montgomery Multiplication for s390x, CRYPTOGAMS by " ___ diff --git a/crypto/bn/asm/sparcv9-mont.pl b/crypto/bn/asm/sparcv9-mont.pl index acdf6928ca..d78b43209a 100644 --- a/crypto/bn/asm/sparcv9-mont.pl +++ b/crypto/bn/asm/sparcv9-mont.pl @@ -2,8 +2,9 @@ # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. # ==================================================================== # December 2005 @@ -254,44 +255,36 @@ $fname: .Ltail: add $np,$num,$np add $rp,$num,$rp - - cmp $car2,0 ! clears %icc.c - bne,pn %icc,.Lsub + mov $tp,$ap sub %g0,$num,%o7 ! k=-num - cmp $car1,$npj ! compare top-most $tp and $np words - bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken - nop + srl $npj,30,%o0 ! boundary condition... + brz,pn %o0,.Lcopy ! ... is met + subcc %g0,%g0,%g0 ! clear %icc.c .align 16,0x1000000 .Lsub: ld [$tp+%o7],%o0 ld [$np+%o7],%o1 - subccc %o0,%o1,%o1 + subccc %o0,%o1,%o1 ! tp[j]-np[j] st %o1,[$rp+%o7] add %o7,4,%o7 brnz %o7,.Lsub nop - subccc $car2,0,$car2 - bcc %icc,.Lzap + subc $car2,0,$car2 ! handle upmost overflow bit + and $tp,$car2,$ap + andn $rp,$car2,$np + or $ap,$np,$ap sub %g0,$num,%o7 .align 16,0x1000000 .Lcopy: - ld [$tp+%o7],%o0 + ld [$ap+%o7],%o0 ! copy or in-place refresh + st %g0,[$tp+%o7] ! zap tp st %o0,[$rp+%o7] add %o7,4,%o7 brnz %o7,.Lcopy nop - ba .Lzap - sub %g0,$num,%o7 - -.align 32 -.Lzap: - st %g0,[$tp+%o7] - add %o7,4,%o7 - brnz %o7,.Lzap - nop mov 1,%i0 ret restore @@ -609,6 +602,7 @@ $code.=<<___; add $tp,8,$tp .type $fname,#function .size $fname,(.-$fname) +.asciz "Montgomery Multipltication for SPARCv9, CRYPTOGAMS by " ___ $code =~ s/\`([^\`]*)\`/eval($1)/gem; print $code; diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl index cecf07c6f4..02847fdc08 100755 --- a/crypto/bn/asm/sparcv9a-mont.pl +++ b/crypto/bn/asm/sparcv9a-mont.pl @@ -121,7 +121,6 @@ $nhia="%f56"; $nhib="%f58"; $nhic="%f60"; $nhid="%f62"; $ASI_FL16_P=0xD2; # magic ASI value to engage 16-bit FP load $code=<<___; -.ident "UltraSPARC Montgomery multiply by " .section ".text",#alloc,#execinstr .global $fname @@ -799,17 +798,14 @@ $fname: bnz %icc,.Louter nop - sub %g0,$num,%o7 ! n=-num - cmp $carry,0 ! clears %icc.c - bne,pn %icc,.Lsub - add $tp,8,$tp ! adjust tp to point at the end - - ld [$tp-8],%o0 ld [$np-4],%o1 - cmp %o0,%o1 ! compare topmost words - bcs,pt %icc,.Lcopy ! %icc.c is clean if not taken - nop - + subcc %g0,%g0,%g0 ! clear %icc.c + add $tp,8,$tp ! 
adjust tp to point at the end + srl %o1,30,%o1 ! boundary condition... + orn %g0,%g0,%g4 + brz,pn %o1,.Lcopy ! ... is met + sub %g0,$num,%o7 ! n=-num + .align 32,0x1000000 .Lsub: ldx [$tp+%o7],%o0 @@ -824,24 +820,30 @@ $fname: add %o7,8,%o7 brnz,pt %o7,.Lsub st %o3,[%g1+4] - subccc $carry,0,$carry - bcc,pt %icc,.Lzap + subc $carry,0,%g4 sub %g0,$num,%o7 ! n=-num -.align 16,0x1000000 +.align 32,0x1000000 .Lcopy: ldx [$tp+%o7],%o0 - srlx %o0,32,%o1 add $rp,%o7,%g1 + ld [%g1+0],%o2 + ld [%g1+4],%o3 + stx %g0,[$tp+%o7] + and %o0,%g4,%o0 + srlx %o0,32,%o1 + andn %o2,%g4,%o2 + andn %o3,%g4,%o3 + or %o2,%o0,%o0 + or %o3,%o1,%o1 st %o0,[%g1+0] add %o7,8,%o7 brnz,pt %o7,.Lcopy st %o1,[%g1+4] sub %g0,$num,%o7 ! n=-num -.align 32 +.align 32,0x1000000 .Lzap: - stx %g0,[$tp+%o7] stx %g0,[$ap_l+%o7] stx %g0,[$ap_h+%o7] stx %g0,[$np_l+%o7] diff --git a/crypto/bn/asm/via-mont.pl b/crypto/bn/asm/via-mont.pl index e149941987..ce3cd61eb3 100644 --- a/crypto/bn/asm/via-mont.pl +++ b/crypto/bn/asm/via-mont.pl @@ -77,7 +77,8 @@ # - in terms of absolute performance it delivers approximately as much # as modern out-of-order 32-bit cores [again, for longer keys]. -push(@INC,".","../../perlasm"); +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; &asm_init($ARGV[0],"via-mont.pl"); @@ -100,7 +101,7 @@ $sp=&DWP(28,"esp"); # &DWP(64+(4*$num+$pad)*0,"esp") # padded tp[num] # &DWP(64+(4*$num+$pad)*1,"esp") # padded copy of ap[num] # &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of bp[num] -# &DWP(64+(4*$num+$pad)*2,"esp") # padded copy of np[num] +# &DWP(64+(4*$num+$pad)*3,"esp") # padded copy of np[num] # Note that SDK suggests to unconditionally allocate 2K per vector. This # has quite an impact on performance. It naturally depends on key length, # but to give an example 1024 bit private RSA key operations suffer >30% @@ -115,7 +116,7 @@ $sp=&DWP(28,"esp"); &jnz (&label("leave")); # num % 4 != 0 &cmp ("ecx",8); &jb (&label("leave")); # num < 8 - &cmp ("ecx",256); + &cmp ("ecx",1024); &ja (&label("leave")); # num > 1024 &pushf (); @@ -148,74 +149,91 @@ $sp=&DWP(28,"esp"); &lea ("ebp",&DWP(-$pad,"ecx")); &shr ("ebp",2); # restore original num value in ebp - &add ("ecx",32/4); # (4 vectors + 32 byte scratch)/4 &xor ("eax","eax"); + + &mov ("ecx","ebp"); + &lea ("ecx",&DWP((32+$pad)/4,"ecx"));# padded tp + scratch &data_byte(0xf3,0xab); # rep stosl, bzero &mov ("ecx","ebp"); &lea ("edi",&DWP(64+$pad,"esp","ecx",4));# pointer to ap copy &mov ($A,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded ap copy... - # edi points at the end of ap copy... &mov ("ecx","ebp"); - &add ("edi",$pad); # skip padding to point at bp copy &mov ("esi","ebx"); &mov ($B,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded bp copy... - # edi points at the end of bp copy... &mov ("ecx","ebp"); - &add ("edi",$pad); # skip padding to point at np copy &mov ("esi","edx"); &mov ($M,"edi"); &data_byte(0xf3,0xa5); # rep movsl, memcpy + &mov ("ecx",$pad/4); + &data_byte(0xf3,0xab); # rep stosl, bzero pad + # edi points at the end of padded np copy... # let magic happen... 
&mov ("ecx","ebp"); &mov ("esi","esp"); - &xor ("eax","eax"); &shl ("ecx",5); # convert word counter to bit counter &align (4); &data_byte(0xf3,0x0f,0xa6,0xc0);# rep montmul &mov ("ecx","ebp"); - &xor ("edx","edx"); # i=0 - &lea ("esi",&DWP(64,"esp")); # tp - # edi still points at the end of np copy... + &xor ("edx","edx"); # i=0 + &lea ("esi",&DWP(64,"esp")); # tp + # edi still points at the end of padded np copy... + &mov ("eax",&DWP(-4-$pad,"edi")); # np[num-1] &neg ("ebp"); - &lea ("ebp",&DWP(0,"edi","ebp",4)); # so just "rewind" - &mov ("edi",$rp); # restore rp - - &mov ("ebx",&DWP(0,"esi","ecx",4)); # upmost overflow bit - &cmp ("ebx",0); # clears CF unconfitionally - &jnz (&label("sub")); - &mov ("eax",&DWP(-4,"esi","ecx",4)); - &cmp ("eax",&DWP(-4,"ebp","ecx",4)); # tp[num-1]-np[num-1]? - &jae (&label("sub")); # if taken CF is cleared - -&set_label("copy",4); - &mov ("ebx","ecx"); - &data_byte(0xf3,0xa5); # rep movsl - &mov ("ecx","ebx"); - &jmp (&label("zap")); - -&set_label("sub",16); + &lea ("ebp",&DWP(-$pad,"edi","ebp",4)); # so just "rewind" + &mov ("edi",$rp); # restore rp + + &shr ("eax",30); # boundary condition... + &jz (&label("copy")); # ... is met + &xor ("edx","edx"); # clear CF + +&set_label("sub",8); &mov ("eax",&DWP(0,"esi","edx",4)); &sbb ("eax",&DWP(0,"ebp","edx",4)); &mov (&DWP(0,"edi","edx",4),"eax"); # rp[i]=tp[i]-np[i] &lea ("edx",&DWP(1,"edx")); # i++ - &dec ("ecx"); # doesn't affect CF! - &jg (&label("sub")); - &sbb ("ebx",0); # upmost overflow is still there - &mov ("ecx","edx"); - &jc (&label("copy")); + &loop (&label("sub")); # doesn't affect CF! + + &mov ("eax",&DWP(0,"esi","edx",4)); # upmost overflow bit + &sbb ("eax",0); + &and ("esi","eax"); + ¬ ("eax"); + &mov ("ebp","edi"); + &and ("ebp","eax"); + &or ("esi","ebp"); # tp=carry?tp:rp + + &mov ("ecx","edx"); # num + &xor ("edx","edx"); # i=0 + +&set_label("copy",8); + &mov ("eax",&DWP(0,"esi","edx",4)); + &mov (&DWP(64,"esp","edx",4),"ecx"); # zap tp + &mov (&DWP(0,"edi","edx",4),"eax"); + &lea ("edx",&DWP(1,"edx")); # i++ + &loop (&label("copy")); -&set_label("zap",4); &mov ("ebp",$sp); &xor ("eax","eax"); - &lea ("ecx",&DWP(64/4+$pad,"","ecx",4));# size of frame divided by 4 - &mov ("edi","esp"); + + &mov ("ecx",64/4); + &mov ("edi","esp"); # zap frame including scratch area + &data_byte(0xf3,0xab); # rep stosl, bzero + + # zap copies of ap, bp and np + &lea ("edi",&DWP(64+$pad,"esp","edx",4));# pointer to ap + &lea ("ecx",&DWP(3*$pad/4,"edx","edx",2)); &data_byte(0xf3,0xab); # rep stosl, bzero &mov ("esp","ebp"); @@ -224,4 +242,6 @@ $sp=&DWP(28,"esp"); &set_label("leave"); &function_end($func); +&asciz("Padlock Montgomery Multiplication, CRYPTOGAMS by "); + &asm_finish(); diff --git a/crypto/bn/asm/x86-mont.pl b/crypto/bn/asm/x86-mont.pl index 319c17de65..2ed76aac62 100755 --- a/crypto/bn/asm/x86-mont.pl +++ b/crypto/bn/asm/x86-mont.pl @@ -41,7 +41,7 @@ for (@ARGV) { $sse2=1 if (/-DOPENSSL_IA32_SSE2/); } $i="edx"; $j="ecx"; -$ap="esi"; +$ap="esi"; $tp="esi"; # overlapping variables!!! $rp="edi"; $bp="edi"; # overlapping variables!!! 
$np="ebp"; $num="ebx"; @@ -551,41 +551,39 @@ $sbit=$num; } &set_label("common_tail",16); - &mov ($np,$_np); - &mov ("esi",&DWP($frame+4,"esp",$num,4));# load upmost overflow bit + &mov ($np,$_np); # load modulus pointer &mov ($rp,$_rp); # load result pointer - # [$ap and $bp are zapped] - &xor ($i,$i); # i=0 + &lea ($tp,&DWP($frame,"esp")); # [$ap and $bp are zapped] + &mov ("eax",&DWP(0,$np,$num,4)); # np[num-1] + &shr ("eax",30); # check for boundary condition + &jz (&label("copy")); + + &mov ("eax",&DWP(0,$tp)); # tp[0] &mov ($j,$num); # j=num-1 - &cmp ("esi",0); # clears CF unconditionally - &jnz (&label("sub")); - &mov ("eax",&DWP($frame,"esp",$j,4)); - &cmp ("eax",&DWP(0,$np,$j,4)); # tp[num-1]-np[num-1]? - &jae (&label("sub")); # if taken CF is cleared -&set_label("copy",16); - &mov ("eax",&DWP($frame,"esp",$j,4)); - &mov (&DWP(0,$rp,$j,4),"eax"); # rp[i]=tp[i] - &mov (&DWP($frame,"esp",$j,4),$j); # zap temporary vector - &dec ($j); - &jge (&label("copy")); - &jmp (&label("exit")); + &xor ($i,$i); # i=0 and clear CF! &set_label("sub",16); - &mov ("eax",&DWP($frame,"esp",$i,4)); &sbb ("eax",&DWP(0,$np,$i,4)); &mov (&DWP(0,$rp,$i,4),"eax"); # rp[i]=tp[i]-np[i] - &lea ($i,&DWP(1,$i)); # i++ &dec ($j); # doesn't affect CF! + &mov ("eax",&DWP(4,$tp,$i,4)); # tp[i+1] + &lea ($i,&DWP(1,$i)); # i++ &jge (&label("sub")); - &mov ($j,$num); # j=num-1 - &sbb ("esi",0); # esi holds upmost overflow bit - &jc (&label("copy")); -&set_label("zap",8); - &mov (&DWP($frame,"esp",$j,4),$i); # zap temporary vector - &dec ($j); - &jge (&label("zap")); - -&set_label("exit",8); + + &sbb ("eax",0); # handle upmost overflow bit + &and ($tp,"eax"); + ¬ ("eax"); + &mov ($np,$rp); + &and ($np,"eax"); + &or ($tp,$np); # tp=carry?tp:rp + +&set_label("copy",16); # copy or in-place refresh + &mov ("eax",&DWP(0,$tp,$num,4)); + &mov (&DWP(0,$rp,$num,4),"eax"); # rp[i]=tp[i] + &mov (&DWP($frame,"esp",$num,4),$j); # zap temporary vector + &dec ($num); + &jge (&label("copy")); + &mov ("esp",$_sp); # pull saved stack pointer &mov ("eax",1); &set_label("just_leave"); diff --git a/crypto/bn/asm/x86_64-mont.pl b/crypto/bn/asm/x86_64-mont.pl index bc3fa83cf7..6701bf2755 100755 --- a/crypto/bn/asm/x86_64-mont.pl +++ b/crypto/bn/asm/x86_64-mont.pl @@ -59,6 +59,7 @@ bn_mul_mont: neg %rax lea (%rsp,%rax,8),%rsp # tp=alloca(8*(num+2)) and \$-1024,%rsp # minimize TLB usage + mov %rbp,8(%rsp,$num,8) # tp[num+1]=%rsp mov %rdx,$bp # $bp reassigned, remember? @@ -166,22 +167,38 @@ bn_mul_mont: cmp $num,$i jl .Louter - xor $i,$i # i=0 + mov -8($np,$num,8),%rax # np[num-1] + lea (%rsp),$ap # borrow ap for tp + shr \$62,%rax # check for boundary condition + jz .Lcopy + + mov ($ap),%rax # tp[0] lea -1($num),$j # j=num-1 - cmp \$0,%rdx # %rdx still holds upmost overflow bit - jnz .Lsub # CF is cleared by compare with 0 - mov (%rsp,$j,8),%rax - cmp ($np,$j,8),%rax # tp[num-1]-np[num-1] - jae .Lsub # if taken CF was cleared by above cmp -.align 4 -.Lcopy: - mov (%rsp,$j,8),%rax + xor $i,$i # i=0 and clear CF! + jmp .Lsub +.align 16 +.Lsub: sbb ($np,$i,8),%rax + mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i] + dec $j # doesn't affect CF! 
+ mov 8($ap,$i,8),%rax # tp[i+1] + lea 1($i),$i # i++ + jge .Lsub + + sbb \$0,%rax # handle upmost overflow bit + and %rax,$ap + not %rax + mov $rp,$np + and %rax,$np + lea -1($num),$j + or $np,$ap # ap=borrow?tp:rp +.align 16 +.Lcopy: # copy or in-place refresh + mov ($ap,$j,8),%rax mov %rax,($rp,$j,8) # rp[i]=tp[i] mov $i,(%rsp,$j,8) # zap temporary vector dec $j jge .Lcopy -.align 4 -.Lexit: + mov 8(%rsp,$num,8),%rsp # restore %rsp mov \$1,%rax pop %r15 @@ -191,22 +208,6 @@ bn_mul_mont: pop %rbp pop %rbx ret - -.align 16 -.Lsub: mov (%rsp,$i,8),%rax - sbb ($np,$i,8),%rax - mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[j] - lea 1($i),$i # i++ - dec $j # doesn't affect CF! - jge .Lsub - lea -1($num),$j # j=num-1 - sbb \$0,%rdx - jc .Lcopy # tp was less than np -.align 4 -.Lzap: mov $i,(%rsp,$j,8) # zap temporary vector - dec $j - jge .Lzap - jmp .Lexit .size bn_mul_mont,.-bn_mul_mont .asciz "Montgomery Multiplication for x86_64, CRYPTOGAMS by " ___ -- 2.34.1
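
Note on the recurring idiom: every hunk above installs the same final-reduction pattern -- always subtract the modulus, then select the copy source with a bit mask ("ap=borrow?tp:rp" in the patch comments) instead of branching on the comparison. Below is a minimal C sketch of that pattern, written against the 64-bit modules. It is illustrative only: the function name mont_final_reduce, the top_carry parameter and the fused select-and-copy loop are assumptions of this sketch, not OpenSSL code.

#include <stddef.h>
#include <stdint.h>

typedef uint64_t BN_ULONG;   /* one machine word, as in the 64-bit modules */

/*
 * Hypothetical helper: tp[] holds the num-limb Montgomery intermediate,
 * top_carry is the bit that overflowed out of the top limb during the
 * reduction (the "upmost overflow bit" in the patch comments), and the
 * final result lands in rp[].
 */
static void mont_final_reduce(BN_ULONG *rp, BN_ULONG *tp,
                              const BN_ULONG *np, size_t num,
                              BN_ULONG top_carry)
{
    BN_ULONG borrow = 0, mask;
    size_t i;

    /*
     * "Boundary condition": when the two top bits of the modulus are
     * clear, the modules skip the subtraction entirely and copy tp as
     * is.  This branch depends only on the public modulus, never on
     * secret data.
     */
    if (np[num - 1] >> 62 == 0) {
        for (i = 0; i < num; i++) {
            rp[i] = tp[i];
            tp[i] = 0;                    /* zap tp */
        }
        return;
    }

    /* Always compute rp = tp - np, propagating the borrow by hand. */
    for (i = 0; i < num; i++) {
        BN_ULONG d = tp[i] - np[i];
        BN_ULONG b = d > tp[i];           /* borrow out of tp[i]-np[i] */
        rp[i] = d - borrow;
        borrow = b | (rp[i] > d);         /* borrow out of this limb   */
    }

    /* "Handle upmost overflow bit": a net borrow means tp was < np. */
    mask = 0 - (BN_ULONG)(borrow > top_carry);  /* all-ones iff borrow */

    /* "Copy or in-place refresh": rp = borrow ? tp : rp, zapping tp. */
    for (i = 0; i < num; i++) {
        rp[i] = (tp[i] & mask) | (rp[i] & ~mask);
        tp[i] = 0;
    }
}

The masked select is the piece each hunk spells in its target's logic ops: and/bic/bis on Alpha, and/bic/orr on ARM, and/andc/or on PPC, ngr/xgr/ogr on s390x, and/andn/or on SPARC, and/not/and/or on x86 and x86_64. Because the subtraction and the copy loop now read and write the same locations in the same order whether or not the intermediate exceeded the modulus, the final reduction no longer reveals that condition through a taken or untaken branch, which appears to be the point of eliminating the conditional subtraction.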