RT4320/GH705: Fix PEM parsing bug.
[openssl.git] / crypto / bn / asm / armv4-gf2m.pl
index b781afbf89bebfad9a1d1c29c5491effffc66672..22aa4830f8edcee997589b1528e52412ec6b3105 100644 (file)
 # referred below, which improves ECDH and ECDSA verify benchmarks
 # by 18-40%.
 #
-# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
 # Polynomial Multiplication on ARM Processors using the NEON Engine.
 # 
 # http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
+$flavour = shift;
+if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
+else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
+
+if ($flavour && $flavour ne "void") {
+    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+    die "can't locate arm-xlate.pl";
+
+    open STDOUT,"| \"$^X\" $xlate $flavour $output";
+} else {
+    open STDOUT,">$output";
+}
 
 $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__)
+.syntax        unified
+.thumb
+#else
 .code  32
-
-#if __ARM_ARCH__>=7
-.fpu   neon
 #endif
 ___
 ################
@@ -124,11 +137,17 @@ mul_1x1_ialu:
        eor     $hi,$hi,$t0,lsr#8
        ldr     $t0,[sp,$i0]            @ tab[b >> 30      ]
 
+#ifdef __thumb2__
+       itt     ne
+#endif
        eorne   $lo,$lo,$b,lsl#30
        eorne   $hi,$hi,$b,lsr#2
        tst     $a,#1<<31
        eor     $lo,$lo,$t1,lsl#27
        eor     $hi,$hi,$t1,lsr#5
+#ifdef __thumb2__
+       itt     ne
+#endif
        eorne   $lo,$lo,$b,lsl#31
        eorne   $hi,$hi,$b,lsr#1
        eor     $lo,$lo,$t0,lsl#30
@@ -140,25 +159,98 @@ ___
 ################
 # void bn_GF2m_mul_2x2(BN_ULONG *r,
 #      BN_ULONG a1,BN_ULONG a0,
-#      BN_ULONG b1,BN_ULONG b0);       # r[3..0]=a1a0·b1b0
+#      BN_ULONG b1,BN_ULONG b0);       # r[3..0]=a1a0·b1b0
 {
-my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
-my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
-
 $code.=<<___;
 .global        bn_GF2m_mul_2x2
 .type  bn_GF2m_mul_2x2,%function
 .align 5
 bn_GF2m_mul_2x2:
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
+       stmdb   sp!,{r10,lr}
        ldr     r12,.LOPENSSL_armcap
-.Lpic: ldr     r12,[pc,r12]
-       tst     r12,#1
-       beq     .Lialu
+       adr     r10,.LOPENSSL_armcap
+       ldr     r12,[r12,r10]
+#ifdef __APPLE__
+       ldr     r12,[r12]
+#endif
+       tst     r12,#ARMV7_NEON
+       itt     ne
+       ldrne   r10,[sp],#8
+       bne     .LNEON
+       stmdb   sp!,{r4-r9}
+#else
+       stmdb   sp!,{r4-r10,lr}
+#endif
+___
+$ret="r10";    # reassigned 1st argument
+$code.=<<___;
+       mov     $ret,r0                 @ reassign 1st argument
+       mov     $b,r3                   @ $b=b1
+       sub     r7,sp,#36
+       mov     r8,sp
+       and     r7,r7,#-32
+       ldr     r3,[sp,#32]             @ load b0
+       mov     $mask,#7<<2
+       mov     sp,r7                   @ allocate tab[8]
+       str     r8,[r7,#32]
+
+       bl      mul_1x1_ialu            @ a1·b1
+       str     $lo,[$ret,#8]
+       str     $hi,[$ret,#12]
+
+       eor     $b,$b,r3                @ flip b0 and b1
+        eor    $a,$a,r2                @ flip a0 and a1
+       eor     r3,r3,$b
+        eor    r2,r2,$a
+       eor     $b,$b,r3
+        eor    $a,$a,r2
+       bl      mul_1x1_ialu            @ a0·b0
+       str     $lo,[$ret]
+       str     $hi,[$ret,#4]
+
+       eor     $a,$a,r2
+       eor     $b,$b,r3
+       bl      mul_1x1_ialu            @ (a1+a0)·(b1+b0)
+___
+@r=map("r$_",(6..9));
+$code.=<<___;
+       ldmia   $ret,{@r[0]-@r[3]}
+       eor     $lo,$lo,$hi
+       ldr     sp,[sp,#32]             @ destroy tab[8]
+       eor     $hi,$hi,@r[1]
+       eor     $lo,$lo,@r[0]
+       eor     $hi,$hi,@r[2]
+       eor     $lo,$lo,@r[3]
+       eor     $hi,$hi,@r[3]
+       str     $hi,[$ret,#8]
+       eor     $lo,$lo,$hi
+       str     $lo,[$ret,#4]
+
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r10,pc}
+#else
+       ldmia   sp!,{r4-r10,lr}
+       tst     lr,#1
+       moveq   pc,lr                   @ be binary compatible with V4, yet
+       bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
+___
+}
+{
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
+
+$code.=<<___;
+#if __ARM_MAX_ARCH__>=7
+.arch  armv7-a
+.fpu   neon
 
+.align 5
+.LNEON:
        ldr             r12, [sp]               @ 5th argument
-       vmov.32         $a, r2, r1
-       vmov.32         $b, r12, r3
+       vmov            $a, r2, r1
+       vmov            $b, r12, r3
        vmov.i64        $k48, #0x0000ffffffffffff
        vmov.i64        $k32, #0x00000000ffffffff
        vmov.i64        $k16, #0x000000000000ffff
@@ -203,70 +295,22 @@ bn_GF2m_mul_2x2:
 
        vst1.32         {$r}, [r0]
        ret             @ bx lr
-.align 4
-.Lialu:
 #endif
 ___
 }
-$ret="r10";    # reassigned 1st argument
-$code.=<<___;
-       stmdb   sp!,{r4-r10,lr}
-       mov     $ret,r0                 @ reassign 1st argument
-       mov     $b,r3                   @ $b=b1
-       ldr     r3,[sp,#32]             @ load b0
-       mov     $mask,#7<<2
-       sub     sp,sp,#32               @ allocate tab[8]
-
-       bl      mul_1x1_ialu            @ a1·b1
-       str     $lo,[$ret,#8]
-       str     $hi,[$ret,#12]
-
-       eor     $b,$b,r3                @ flip b0 and b1
-        eor    $a,$a,r2                @ flip a0 and a1
-       eor     r3,r3,$b
-        eor    r2,r2,$a
-       eor     $b,$b,r3
-        eor    $a,$a,r2
-       bl      mul_1x1_ialu            @ a0·b0
-       str     $lo,[$ret]
-       str     $hi,[$ret,#4]
-
-       eor     $a,$a,r2
-       eor     $b,$b,r3
-       bl      mul_1x1_ialu            @ (a1+a0)·(b1+b0)
-___
-@r=map("r$_",(6..9));
 $code.=<<___;
-       ldmia   $ret,{@r[0]-@r[3]}
-       eor     $lo,$lo,$hi
-       eor     $hi,$hi,@r[1]
-       eor     $lo,$lo,@r[0]
-       eor     $hi,$hi,@r[2]
-       eor     $lo,$lo,@r[3]
-       eor     $hi,$hi,@r[3]
-       str     $hi,[$ret,#8]
-       eor     $lo,$lo,$hi
-       add     sp,sp,#32               @ destroy tab[8]
-       str     $lo,[$ret,#4]
-
-#if __ARM_ARCH__>=5
-       ldmia   sp!,{r4-r10,pc}
-#else
-       ldmia   sp!,{r4-r10,lr}
-       tst     lr,#1
-       moveq   pc,lr                   @ be binary compatible with V4, yet
-       bx      lr                      @ interoperable with Thumb ISA:-)
-#endif
 .size  bn_GF2m_mul_2x2,.-bn_GF2m_mul_2x2
-#if __ARM_ARCH__>=7
+#if __ARM_MAX_ARCH__>=7
 .align 5
 .LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-(.Lpic+8)
+.word  OPENSSL_armcap_P-.
 #endif
 .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align 5
 
+#if __ARM_MAX_ARCH__>=7
 .comm  OPENSSL_armcap_P,4,4
+#endif
 ___
 
 foreach (split("\n",$code)) {