# runs in even less cycles, ~30, improvement is measurable only on
# longer keys. One has to optimize code elsewhere to get NEON glow...
-$a="r1";
-$b="r0";
-
-($a0,$a1,$a2,$a12,$a4,$a14)=
-($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
-
-$mask="r12";
+while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
bx lr
.size mul_1x1_neon,.-mul_1x1_neon
#endif
+___
+################
+# private interface to mul_1x1_ialu
+#
+$a="r1";
+$b="r0";
-.align 5
+($a0,$a1,$a2,$a12,$a4,$a14)=
+($hi,$lo,$t0,$t1, $i0,$i1 )=map("r$_",(4..9),12);
+
+$mask="r12";
+
+$code.=<<___;
.type mul_1x1_ialu,%function
+.align 5
mul_1x1_ialu:
mov $a0,#0
bic $a1,$a,#3<<30 @ a1=a&0x3fffffff
mov pc,lr
.size mul_1x1_ialu,.-mul_1x1_ialu
+___
+################
+# void bn_GF2m_mul_2x2(BN_ULONG *r,
+# BN_ULONG a1,BN_ULONG a0,
+# BN_ULONG b1,BN_ULONG b0); # r[3..0]=a1a0·b1b0
+
+($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+$code.=<<___;
.global bn_GF2m_mul_2x2
.type bn_GF2m_mul_2x2,%function
.align 5
.Lpic: ldr r12,[pc,r12]
tst r12,#1
beq .Lialu
-___
-($A1,$B1,$A0,$B0,$A0B0,$A1B1)=map("d$_",(18..23));
-$code.=<<___;
+
veor $A1,$A1
vmov.32 $B1,r3,r3 @ two copies of b1
vmov.32 ${A1}[0],r1 @ a1
mov $b,r3 @ $b=b1
ldr r3,[sp,#32] @ load b0
mov $mask,#7<<2
- sub sp,#32 @ allocate tab[8]
+ sub sp,sp,#32 @ allocate tab[8]
bl mul_1x1_ialu @ a1·b1
str $lo,[$ret,#8]
str $hi,[$ret,#12]
- eor $b,r3 @ flip b0 and b1
- eor $a,r2 @ flip a0 and a1
- eor r3,$b
- eor r2,$a
- eor $b,r3
- eor $a,r2
+ eor $b,$b,r3 @ flip b0 and b1
+ eor $a,$a,r2 @ flip a0 and a1
+ eor r3,r3,$b
+ eor r2,r2,$a
+ eor $b,$b,r3
+ eor $a,$a,r2
bl mul_1x1_ialu @ a0·b0
str $lo,[$ret]
str $hi,[$ret,#4]
- eor $a,r2
- eor $b,r3
+ eor $a,$a,r2
+ eor $b,$b,r3
bl mul_1x1_ialu @ (a1+a0)·(b1+b0)
___
@r=map("r$_",(6..9));
$code.=<<___;
ldmia $ret,{@r[0]-@r[3]}
- eor $lo,$hi
- eor $hi,@r[1]
- eor $lo,@r[0]
- eor $hi,@r[2]
- eor $lo,@r[3]
- eor $hi,@r[3]
+ eor $lo,$lo,$hi
+ eor $hi,$hi,@r[1]
+ eor $lo,$lo,@r[0]
+ eor $hi,$hi,@r[2]
+ eor $lo,$lo,@r[3]
+ eor $hi,$hi,@r[3]
str $hi,[$ret,#8]
- eor $lo,$hi
- add sp,#32 @ destroy tab[8]
+ eor $lo,$lo,$hi
+ add sp,sp,#32 @ destroy tab[8]
str $lo,[$ret,#4]
#if __ARM_ARCH__>=5
#if __ARM_ARCH__>=7
.align 5
.LOPENSSL_armcap:
-.word OPENSSL_armcap-(.Lpic+8)
+.word OPENSSL_armcap_P-(.Lpic+8)
#endif
-.asciz "GF2m Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
+.asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 5
-.comm OPENSSL_armcap,4,4
+.comm OPENSSL_armcap_P,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;