ARMv4 assembly pack: implement support for Thumb2.
authorAndy Polyakov <appro@openssl.org>
Wed, 23 Sep 2015 16:41:27 +0000 (18:41 +0200)
committerAndy Polyakov <appro@openssl.org>
Fri, 25 Sep 2015 11:34:02 +0000 (13:34 +0200)
As some of ARM processors, more specifically Cortex-Mx series, are
Thumb2-only, we need to support Thumb2-only builds even in assembly.

Reviewed-by: Tim Hudson <tjh@openssl.org>
crypto/aes/asm/aes-armv4.pl
crypto/armv4cpuid.pl
crypto/bn/asm/armv4-gf2m.pl
crypto/bn/asm/armv4-mont.pl
crypto/ec/asm/ecp_nistz256-armv4.pl
crypto/modes/asm/ghash-armv4.pl
crypto/sha/asm/sha1-armv4-large.pl
crypto/sha/asm/sha256-armv4.pl
crypto/sha/asm/sha512-armv4.pl

index 0f7ec39..c3d166f 100644 (file)
@@ -70,15 +70,11 @@ $code=<<___;
 #endif
 
 .text
-#if __ARM_ARCH__<7
-.code  32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
 .syntax        unified
-# if defined(__thumb2__) && !defined(__APPLE__)
 .thumb
-# else
+#else
 .code  32
-# endif
 #endif
 
 .type  AES_Te,%object
@@ -193,7 +189,7 @@ AES_Te:
 .type   AES_encrypt,%function
 .align 5
 AES_encrypt:
-#if __ARM_ARCH__<7
+#ifndef        __thumb2__
        sub     r3,pc,#8                @ AES_encrypt
 #else
        adr     r3,AES_encrypt
@@ -443,19 +439,19 @@ _armv4_AES_encrypt:
 .align 5
 AES_set_encrypt_key:
 _armv4_AES_set_encrypt_key:
-#if __ARM_ARCH__<7
+#ifndef        __thumb2__
        sub     r3,pc,#8                @ AES_set_encrypt_key
 #else
        adr     r3,AES_set_encrypt_key
 #endif
        teq     r0,#0
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
        itt     eq                      @ Thumb2 thing, sanity check in ARM
 #endif
        moveq   r0,#-1
        beq     .Labrt
        teq     r2,#0
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
        itt     eq                      @ Thumb2 thing, sanity check in ARM
 #endif
        moveq   r0,#-1
@@ -466,7 +462,7 @@ _armv4_AES_set_encrypt_key:
        teq     r1,#192
        beq     .Lok
        teq     r1,#256
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
        itt     ne                      @ Thumb2 thing, sanity check in ARM
 #endif
        movne   r0,#-1
@@ -627,7 +623,7 @@ _armv4_AES_set_encrypt_key:
        str     $s2,[$key,#-16]
        subs    $rounds,$rounds,#1
        str     $s3,[$key,#-12]
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
        itt     eq                              @ Thumb2 thing, sanity check in ARM
 #endif
        subeq   r2,$key,#216
@@ -699,7 +695,7 @@ _armv4_AES_set_encrypt_key:
        str     $s2,[$key,#-24]
        subs    $rounds,$rounds,#1
        str     $s3,[$key,#-20]
-#if __ARM_ARCH__>=7
+#ifdef __thumb2__
        itt     eq                              @ Thumb2 thing, sanity check in ARM
 #endif
        subeq   r2,$key,#256
@@ -969,7 +965,7 @@ AES_Td:
 .type   AES_decrypt,%function
 .align 5
 AES_decrypt:
-#if __ARM_ARCH__<7
+#ifndef        __thumb2__
        sub     r3,pc,#8                @ AES_decrypt
 #else
        adr     r3,AES_decrypt
index 1c44718..c669623 100644 (file)
@@ -15,7 +15,12 @@ $code.=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 
 .align 5
 .global        OPENSSL_atomic_add
@@ -59,6 +64,9 @@ OPENSSL_atomic_add:
 OPENSSL_cleanse:
        eor     ip,ip,ip
        cmp     r1,#7
+#ifdef __thumb2__
+       itt     hs
+#endif
        subhs   r1,r1,#4
        bhs     .Lot
        cmp     r1,#0
@@ -116,27 +124,43 @@ _armv7_tick:
 .global        _armv8_aes_probe
 .type  _armv8_aes_probe,%function
 _armv8_aes_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+       .byte   0xb0,0xff,0x00,0x03     @ aese.8        q0,q0
+#else
        .byte   0x00,0x03,0xb0,0xf3     @ aese.8        q0,q0
+#endif
        bx      lr
 .size  _armv8_aes_probe,.-_armv8_aes_probe
 
 .global        _armv8_sha1_probe
 .type  _armv8_sha1_probe,%function
 _armv8_sha1_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+       .byte   0x00,0xef,0x40,0x0c     @ sha1c.32      q0,q0,q0
+#else
        .byte   0x40,0x0c,0x00,0xf2     @ sha1c.32      q0,q0,q0
+#endif
        bx      lr
 .size  _armv8_sha1_probe,.-_armv8_sha1_probe
 
 .global        _armv8_sha256_probe
 .type  _armv8_sha256_probe,%function
 _armv8_sha256_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+       .byte   0x00,0xff,0x40,0x0c     @ sha256h.32    q0,q0,q0
+#else
        .byte   0x40,0x0c,0x00,0xf3     @ sha256h.32    q0,q0,q0
+#endif
        bx      lr
 .size  _armv8_sha256_probe,.-_armv8_sha256_probe
 .global        _armv8_pmull_probe
 .type  _armv8_pmull_probe,%function
 _armv8_pmull_probe:
+#if defined(__thumb2__) && !defined(__APPLE__)
+       .byte   0xa0,0xef,0x00,0x0e     @ vmull.p64     q0,d0,d0
+#else
        .byte   0x00,0x0e,0xa0,0xf2     @ vmull.p64     q0,d0,d0
+#endif
        bx      lr
 .size  _armv8_pmull_probe,.-_armv8_pmull_probe
 #endif
index a0b018c..227581e 100644 (file)
@@ -51,7 +51,12 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 ___
 ################
 # private interface to mul_1x1_ialu
@@ -132,11 +137,17 @@ mul_1x1_ialu:
        eor     $hi,$hi,$t0,lsr#8
        ldr     $t0,[sp,$i0]            @ tab[b >> 30      ]
 
+#ifdef __thumb2__
+       itt     ne
+#endif
        eorne   $lo,$lo,$b,lsl#30
        eorne   $hi,$hi,$b,lsr#2
        tst     $a,#1<<31
        eor     $lo,$lo,$t1,lsl#27
        eor     $hi,$hi,$t1,lsr#5
+#ifdef __thumb2__
+       itt     ne
+#endif
        eorne   $lo,$lo,$b,lsl#31
        eorne   $hi,$hi,$b,lsr#1
        eor     $lo,$lo,$t0,lsl#30
@@ -156,20 +167,33 @@ $code.=<<___;
 .align 5
 bn_GF2m_mul_2x2:
 #if __ARM_MAX_ARCH__>=7
+       stmdb   sp!,{r10,lr}
        ldr     r12,.LOPENSSL_armcap
-.Lpic: ldr     r12,[pc,r12]
-       tst     r12,#1
+       adr     r10,.LOPENSSL_armcap
+       ldr     r12,[r12,r10]
+#ifdef __APPLE__
+       ldr     r12,[r12]
+#endif
+       tst     r12,#ARMV7_NEON
+       itt     ne
+       ldrne   r10,[sp],#8
        bne     .LNEON
+       stmdb   sp!,{r4-r9}
+#else
+       stmdb   sp!,{r4-r10,lr}
 #endif
 ___
 $ret="r10";    # reassigned 1st argument
 $code.=<<___;
-       stmdb   sp!,{r4-r10,lr}
        mov     $ret,r0                 @ reassign 1st argument
        mov     $b,r3                   @ $b=b1
+       sub     r7,sp,#36
+       mov     r8,sp
+       and     r7,r7,#-32
        ldr     r3,[sp,#32]             @ load b0
        mov     $mask,#7<<2
-       sub     sp,sp,#32               @ allocate tab[8]
+       mov     sp,r7                   @ allocate tab[8]
+       str     r8,[r7,#32]
 
        bl      mul_1x1_ialu            @ a1·b1
        str     $lo,[$ret,#8]
@@ -193,6 +217,7 @@ ___
 $code.=<<___;
        ldmia   $ret,{@r[0]-@r[3]}
        eor     $lo,$lo,$hi
+       ldr     sp,[sp,#32]             @ destroy tab[8]
        eor     $hi,$hi,@r[1]
        eor     $lo,$lo,@r[0]
        eor     $hi,$hi,@r[2]
@@ -200,7 +225,6 @@ $code.=<<___;
        eor     $hi,$hi,@r[3]
        str     $hi,[$ret,#8]
        eor     $lo,$lo,$hi
-       add     sp,sp,#32               @ destroy tab[8]
        str     $lo,[$ret,#4]
 
 #if __ARM_ARCH__>=5
@@ -279,7 +303,7 @@ $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
 .align 5
 .LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-(.Lpic+8)
+.word  OPENSSL_armcap_P-.
 #endif
 .asciz "GF(2^m) Multiplication for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align 5
index 59f218b..bd56f98 100644 (file)
@@ -82,7 +82,12 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 
 #if __ARM_MAX_ARCH__>=7
 .align 5
@@ -101,7 +106,7 @@ bn_mul_mont:
 #if __ARM_MAX_ARCH__>=7
        tst     ip,#7
        bne     .Lialu
-       adr     r0,bn_mul_mont
+       adr     r0,.Lbn_mul_mont
        ldr     r2,.LOPENSSL_armcap
        ldr     r0,[r0,r2]
 #ifdef __APPLE__
@@ -117,6 +122,9 @@ bn_mul_mont:
 #endif
        cmp     ip,#2
        mov     $num,ip                 @ load num
+#ifdef __thumb2__
+       ittt    lt
+#endif
        movlt   r0,#0
        addlt   sp,sp,#2*4
        blt     .Labrt
@@ -164,10 +172,11 @@ bn_mul_mont:
        ldr     $n0,[$_n0]              @ restore n0
        adc     $nhi,$nhi,#0
        str     $nlo,[$num]             @ tp[num-1]=
+       mov     $tj,sp
        str     $nhi,[$num,#4]          @ tp[num]=
 \f
 .Louter:
-       sub     $tj,$num,sp             @ "original" $num-1 value
+       sub     $tj,$num,$tj            @ "original" $num-1 value
        sub     $ap,$ap,$tj             @ "rewind" ap to &ap[1]
        ldr     $bi,[$tp,#4]!           @ *(++bp)
        sub     $np,$np,$tj             @ "rewind" np to &np[1]
@@ -212,11 +221,16 @@ bn_mul_mont:
        str     $nhi,[$num,#4]          @ tp[num]=
 
        cmp     $tp,$tj
+#ifdef __thumb2__
+       itt     ne
+#endif
+       movne   $tj,sp
        bne     .Louter
 \f
        ldr     $rp,[$_rp]              @ pull rp
+       mov     $aj,sp
        add     $num,$num,#4            @ $num to point at &tp[num]
-       sub     $aj,$num,sp             @ "original" num value
+       sub     $aj,$num,$aj            @ "original" num value
        mov     $tp,sp                  @ "rewind" $tp
        mov     $ap,$tp                 @ "borrow" $ap
        sub     $np,$np,$aj             @ "rewind" $np to &np[0]
@@ -242,7 +256,8 @@ bn_mul_mont:
        cmp     $tp,$num
        bne     .Lcopy
 
-       add     sp,$num,#4              @ skip over tp[num+1]
+       mov     sp,$num
+       add     sp,sp,#4                @ skip over tp[num+1]
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
@@ -283,6 +298,7 @@ bn_mul8x_mont_neon:
        stmdb   sp!,{r4-r11}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load rest of parameter block
+       mov     ip,sp
 
        sub             $toutptr,sp,#16
        vld1.32         {${Bi}[0]}, [$bptr,:32]!
@@ -638,8 +654,9 @@ bn_mul8x_mont_neon:
        bne     .LNEON_sub
 
        ldr     r10, [$aptr]                            @ load top-most bit
+       mov     r11,sp
        veor    q0,q0,q0
-       sub     r11,$bptr,sp                            @ this is num*4
+       sub     r11,$bptr,r11                           @ this is num*4
        veor    q1,q1,q1
        mov     $aptr,sp
        sub     $rptr,$rptr,r11                         @ rewind $rptr
@@ -649,27 +666,33 @@ bn_mul8x_mont_neon:
 .LNEON_copy_n_zap:
        ldmia   $aptr!, {r4-r7}
        ldmia   $rptr,  {r8-r11}
+       it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
+       itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
+       it      cc
        movcc   r11,r7
        ldmia   $aptr, {r4-r7}
        stmia   $rptr!, {r8-r11}
        sub     $aptr,$aptr,#16
        ldmia   $rptr, {r8-r11}
+       it      cc
        movcc   r8, r4
        vst1.64 {q0-q1}, [$aptr,:256]!                  @ wipe
+       itt     cc
        movcc   r9, r5
        movcc   r10,r6
        vst1.64 {q0-q1}, [$nptr,:256]!                  @ wipe
+       it      cc
        movcc   r11,r7
        teq     $aptr,$bptr                             @ preserves carry
        stmia   $rptr!, {r8-r11}
        bne     .LNEON_copy_n_zap
 
-       sub     sp,ip,#96
+       mov     sp,ip
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
        ret                                             @ bx lr
index b49b77e..aeeb190 100755 (executable)
@@ -45,7 +45,12 @@ $code.=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 ___
 ########################################################################
 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
@@ -162,6 +167,9 @@ __ecp_nistz256_mul_by_2:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
+#ifdef __thumb2__
+       it      cs
+#endif
        movcs   $ff,#-1                 @ $ff = carry ? -1 : 0
 
        b       .Lreduce_by_sub
@@ -213,6 +221,9 @@ __ecp_nistz256_add:
        adcs    $a6,$a6,$t2
        mov     $ff,#0
        adcs    $a7,$a7,$t3
+#ifdef __thumb2__
+       it      cs
+#endif
        movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
        ldr     lr,[sp],#4              @ pop lr
 
@@ -286,6 +297,9 @@ __ecp_nistz256_mul_by_3:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
+#ifdef __thumb2__
+       it      cs
+#endif
        movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
 
        subs    $a0,$a0,$ff             @ subtract synthesized modulus, see
@@ -318,6 +332,9 @@ __ecp_nistz256_mul_by_3:
        adcs    $a6,$a6,$t2
        mov     $ff,#0
        adcs    $a7,$a7,$t3
+#ifdef __thumb2__
+       it      cs
+#endif
        movcs   $ff,#-1                 @ $ff = carry ? -1 : 0, "broadcast" carry
        ldr     lr,[sp],#4              @ pop lr
 
@@ -781,6 +798,9 @@ ecp_nistz256_gather_w5:
 
        cmp     $index,#0
        mov     $mask,#0
+#ifdef __thumb2__
+       itt     ne
+#endif
        subne   $index,$index,#1
        movne   $mask,#-1
        add     $inp,$inp,$index,lsl#2
@@ -887,6 +907,9 @@ ecp_nistz256_gather_w7:
 
        cmp     $index,#0
        mov     $mask,#0
+#ifdef __thumb2__
+       itt     ne
+#endif
        subne   $index,$index,#1
        movne   $mask,#-1
        add     $inp,$inp,$index
@@ -1180,6 +1203,9 @@ __ecp_nistz256_add_self:
        adcs    $a6,$a6,$a6
        mov     $ff,#0
        adcs    $a7,$a7,$a7
+#ifdef __thumb2__
+       it      cs
+#endif
        movcs   $ff,#-1                 @ $ff = carry ? -1 : 0
 
        subs    $a0,$a0,$ff             @ subtract synthesized modulus
@@ -1369,6 +1395,9 @@ ecp_nistz256_point_add:
        stmia   r3!,{r4-r11}
        ldmia   $b_ptr,{r4-r11}
        cmp     r12,#0
+#ifdef __thumb2__
+       it      ne
+#endif
        movne   r12,#-1
        stmia   r3,{r4-r11}
        str     r12,[sp,#32*18+8]       @ !in2infty
@@ -1395,6 +1424,9 @@ ecp_nistz256_point_add:
        stmia   r3!,{r4-r11}
        ldmia   $a_ptr,{r4-r11}
        cmp     r12,#0
+#ifdef __thumb2__
+       it      ne
+#endif
        movne   r12,#-1
        stmia   r3,{r4-r11}
        str     r12,[sp,#32*18+4]       @ !in1infty
@@ -1636,6 +1668,9 @@ ecp_nistz256_point_add_affine:
        stmia   r3!,{r4-r11}
        ldmia   $a_ptr,{r4-r11}
        cmp     r12,#0
+#ifdef __thumb2__
+       it      ne
+#endif
        movne   r12,#-1
        stmia   r3,{r4-r11}
        str     r12,[sp,#32*15+4]       @ !in1infty
@@ -1661,6 +1696,9 @@ ecp_nistz256_point_add_affine:
        orr     r12,r12,r11
        stmia   r3!,{r4-r11}
        cmp     r12,#0
+#ifdef __thumb2__
+       it      ne
+#endif
        movne   r12,#-1
        str     r12,[sp,#32*15+8]       @ !in2infty
 
index 2d225cf..1506e5b 100644 (file)
@@ -136,7 +136,12 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 
 #ifdef  __APPLE__
 #define ldrplb  ldrbpl
@@ -154,19 +159,27 @@ rem_4bit:
 
 .type  rem_4bit_get,%function
 rem_4bit_get:
-       sub     $rem_4bit,pc,#8
-       sub     $rem_4bit,$rem_4bit,#32 @ &rem_4bit
+#if defined(__thumb2__)
+       adr     $rem_4bit,rem_4bit
+#else
+       sub     $rem_4bit,pc,#8+32      @ &rem_4bit
+#endif
        b       .Lrem_4bit_got
        nop
+       nop
 .size  rem_4bit_get,.-rem_4bit_get
 
 .global        gcm_ghash_4bit
 .type  gcm_ghash_4bit,%function
+.align 4
 gcm_ghash_4bit:
-       sub     r12,pc,#8
+#if defined(__thumb2__)
+       adr     r12,rem_4bit
+#else
+       sub     r12,pc,#8+48            @ &rem_4bit
+#endif
        add     $len,$inp,$len          @ $len to point at the end
        stmdb   sp!,{r3-r11,lr}         @ save $len/end too
-       sub     r12,r12,#48             @ &rem_4bit
 
        ldmia   r12,{r4-r11}            @ copy rem_4bit ...
        stmdb   sp!,{r4-r11}            @ ... to stack
@@ -213,6 +226,9 @@ gcm_ghash_4bit:
        eor     $Zlh,$Zlh,$Zhl,lsl#28
        ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
        eor     $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+       it      pl
+#endif
        ldrplb  $nlo,[$inp,$cnt]
        eor     $Zhl,$Zhl,$Zhh,lsl#28
        eor     $Zhh,$Thh,$Zhh,lsr#4
@@ -223,6 +239,9 @@ gcm_ghash_4bit:
        add     $nhi,$nhi,$nhi
        ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
        eor     $Zll,$Tll,$Zll,lsr#4
+#ifdef __thumb2__
+       it      pl
+#endif
        ldrplb  $Tll,[$Xi,$cnt]
        eor     $Zll,$Zll,$Zlh,lsl#28
        eor     $Zlh,$Tlh,$Zlh,lsr#4
@@ -230,8 +249,14 @@ gcm_ghash_4bit:
        eor     $Zlh,$Zlh,$Zhl,lsl#28
        eor     $Zhl,$Thl,$Zhl,lsr#4
        eor     $Zhl,$Zhl,$Zhh,lsl#28
+#ifdef __thumb2__
+       it      pl
+#endif
        eorpl   $nlo,$nlo,$Tll
        eor     $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+       itt     pl
+#endif
        andpl   $nhi,$nlo,#0xf0
        andpl   $nlo,$nlo,#0x0f
        eor     $Zhh,$Zhh,$Tlh,lsl#16   @ ^= rem_4bit[rem]
@@ -241,7 +266,11 @@ gcm_ghash_4bit:
        add     $inp,$inp,#16
        mov     $nhi,$Zll
 ___
-       &Zsmash("cmp\t$inp,$len","ldrneb\t$nlo,[$inp,#15]");
+       &Zsmash("cmp\t$inp,$len","\n".
+                                "#ifdef __thumb2__\n".
+                                "      it      ne\n".
+                                "#endif\n".
+                                "      ldrneb  $nlo,[$inp,#15]");
 $code.=<<___;
        bne     .Louter
 
@@ -299,6 +328,9 @@ gcm_gmult_4bit:
        eor     $Zlh,$Zlh,$Zhl,lsl#28
        ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
        eor     $Zhl,$Thl,$Zhl,lsr#4
+#ifdef __thumb2__
+       it      pl
+#endif
        ldrplb  $nlo,[$Xi,$cnt]
        eor     $Zhl,$Zhl,$Zhh,lsl#28
        eor     $Zhh,$Thh,$Zhh,lsr#4
@@ -316,6 +348,9 @@ gcm_gmult_4bit:
        eor     $Zhl,$Thl,$Zhl,lsr#4
        eor     $Zhl,$Zhl,$Zhh,lsl#28
        eor     $Zhh,$Thh,$Zhh,lsr#4
+#ifdef __thumb2__
+       itt     pl
+#endif
        andpl   $nhi,$nlo,#0xf0
        andpl   $nlo,$nlo,#0x0f
        eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
index 356b52f..9d34e04 100644 (file)
@@ -181,7 +181,12 @@ $code=<<___;
 #include "arm_arch.h"
 
 .text
+#if defined(__thumb2__) && !defined(__APPLE__)
+.syntax        unified
+.thumb
+#else
 .code  32
+#endif
 
 .global        sha1_block_data_order
 .type  sha1_block_data_order,%function
@@ -189,7 +194,8 @@ $code=<<___;
 .align 5
 sha1_block_data_order:
 #if __ARM_MAX_ARCH__>=7
-       sub     r3,pc,#8                @ sha1_block_data_order
+.Lsha1_block:
+       adr     r3,.Lsha1_block
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
 #ifdef __APPLE__
@@ -216,7 +222,12 @@ for($i=0;$i<5;$i++) {
        &BODY_00_15(@V);        unshift(@V,pop(@V));
 }
 $code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
        teq     $Xi,sp
+#endif
        bne     .L_00_15                @ [((11+4)*5+2)*3]
        sub     sp,sp,#25*4
 ___
@@ -235,7 +246,12 @@ for($i=0;$i<5;$i++) {
        &BODY_20_39(@V);        unshift(@V,pop(@V));
 }
 $code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
        teq     $Xi,sp                  @ preserve carry
+#endif
        bne     .L_20_39_or_60_79       @ [+((12+3)*5+2)*4]
        bcs     .L_done                 @ [+((12+3)*5+2)*4], spare 300 bytes
 
@@ -247,7 +263,12 @@ for($i=0;$i<5;$i++) {
        &BODY_40_59(@V);        unshift(@V,pop(@V));
 }
 $code.=<<___;
+#if defined(__thumb2__) && !defined(__APPLE__)
+       mov     $t3,sp
+       teq     $Xi,$t3
+#else
        teq     $Xi,sp
+#endif
        bne     .L_40_59                @ [+((12+5)*5+2)*4]
 
        ldr     $K,.LK_60_79
@@ -283,7 +304,7 @@ $code.=<<___;
 .LK_60_79:     .word   0xca62c1d6
 #if __ARM_MAX_ARCH__>=7
 .LOPENSSL_armcap:
-.word  OPENSSL_armcap_P-sha1_block_data_order
+.word  OPENSSL_armcap_P-.Lsha1_block
 #endif
 .asciz "SHA1 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align 5
@@ -458,6 +479,7 @@ sub Xuplast_80 ()
 
        &teq            ($inp,$len);
        &sub            ($K_XX_XX,$K_XX_XX,16); # rewind $K_XX_XX
+       &it             ("eq");
        &subeq          ($inp,$inp,64);         # reload last block to avoid SEGV
        &vld1_8         ("{@X[-4&7]-@X[-3&7]}","[$inp]!");
         eval(shift(@insns));
@@ -508,12 +530,12 @@ sha1_block_data_order_neon:
        @ dmb                           @ errata #451034 on early Cortex A8
        @ vstmdb        sp!,{d8-d15}    @ ABI specification says so
        mov     $saved_sp,sp
-       sub     sp,sp,#64               @ alloca
+       sub     $Xfer,sp,#64
        adr     $K_XX_XX,.LK_00_19
-       bic     sp,sp,#15               @ align for 128-bit stores
+       bic     $Xfer,$Xfer,#15         @ align for 128-bit stores
 
        ldmia   $ctx,{$a,$b,$c,$d,$e}   @ load context
-       mov     $Xfer,sp
+       mov     sp,$Xfer                @ alloca
 
        vld1.8          {@X[-4&7]-@X[-3&7]},[$inp]!     @ handles unaligned
        veor            $zero,$zero,$zero
@@ -560,10 +582,13 @@ $code.=<<___;
        add     $b,$b,$t0
        add     $c,$c,$t1
        add     $d,$d,$Xfer
+       it      eq
        moveq   sp,$saved_sp
        add     $e,$e,$Ki
+       it      ne
        ldrne   $Ki,[sp]
        stmia   $ctx,{$a,$b,$c,$d,$e}
+       itt     ne
        addne   $Xfer,sp,#3*16
        bne     .Loop_neon
 
@@ -584,6 +609,13 @@ my ($W0,$W1,$ABCD_SAVE)=map("q$_",(12..14));
 
 $code.=<<___;
 #if __ARM_MAX_ARCH__>=7
+
+# if defined(__thumb2__) && !defined(__APPLE__)
+#  define INST(a,b,c,d)        .byte   c,d|0xf,a,b
+# else
+#  define INST(a,b,c,d)        .byte   a,b,c,d|0x10
+# endif
+
 .type  sha1_block_data_order_armv8,%function
 .align 5
 sha1_block_data_order_armv8:
@@ -677,7 +709,10 @@ ___
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
-           sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+
+           # this fix-up provides Thumb encoding in conjunction with INST
+           $word &= ~0x10000000 if (($word & 0x0f000000) == 0x02000000);
+           sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
index efee1fb..c65073b 100644 (file)
@@ -175,16 +175,12 @@ $code=<<___;
 #endif
 
 .text
-#if __ARM_ARCH__<7
-.code  32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
 .syntax unified
-# if defined(__thumb2__) && !defined(__APPLE__)
-#  define adrl adr
 .thumb
-# else
+# define adrl adr
+#else
 .code   32
-# endif
 #endif
 
 .type  K256,%object
@@ -218,10 +214,10 @@ K256:
 .type  sha256_block_data_order,%function
 sha256_block_data_order:
 .Lsha256_block_data_order:
-#if __ARM_ARCH__<7
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r3,pc,#8                @ sha256_block_data_order
 #else
-       adr     r3,sha256_block_data_order
+       adr     r3,.Lsha256_block_data_order
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
index 77d6c5e..a83d677 100644 (file)
@@ -212,16 +212,12 @@ $code=<<___;
 #endif
 
 .text
-#if __ARM_ARCH__<7 || defined(__APPLE__)
-.code  32
-#else
+#if defined(__thumb2__) && !defined(__APPLE__)
 .syntax unified
-# ifdef __thumb2__
-#  define adrl adr
 .thumb
-# else
-.code   32
-# endif
+# define adrl adr
+#else
+.code  32
 #endif
 
 .type  K512,%object
@@ -280,10 +276,10 @@ WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
 .type  sha512_block_data_order,%function
 sha512_block_data_order:
 .Lsha512_block_data_order:
-#if __ARM_ARCH__<7
+#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r3,pc,#8                @ sha512_block_data_order
 #else
-       adr     r3,sha512_block_data_order
+       adr     r3,.Lsha512_block_data_order
 #endif
 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap