ARM assembly pack: get ARMv7 instruction endianness right.
author Andy Polyakov <appro@openssl.org>
Fri, 6 Jun 2014 19:27:18 +0000 (21:27 +0200)
committer Andy Polyakov <appro@openssl.org>
Fri, 6 Jun 2014 19:27:18 +0000 (21:27 +0200)
Pointed out and suggested by: Ard Biesheuvel.

crypto/aes/asm/aes-armv4.pl
crypto/armv4cpuid.S
crypto/bn/asm/armv4-gf2m.pl
crypto/bn/asm/armv4-mont.pl
crypto/modes/asm/ghash-armv4.pl
crypto/sha/asm/sha1-armv4-large.pl
crypto/sha/asm/sha256-armv4.pl
crypto/sha/asm/sha512-armv4.pl

index 57996f68b7e401574809fc13c07b358cf7d20864..a620a7cddb4ece1aadaf66a80ec2b323fe0b3a6a 100644 (file)
@@ -715,8 +715,8 @@ _armv4_AES_set_encrypt_key:
 .Ldone:        mov     r0,#0
        ldmia   sp!,{r4-r12,lr}
 .Labrt:
 .Ldone:        mov     r0,#0
        ldmia   sp!,{r4-r12,lr}
 .Labrt:
-#if defined(__thumb2__) && __ARM_ARCH__>=7
-       .short  0x4770                  @ bx lr in Thumb2 encoding
+#if __ARM_ARCH__>=5
+       ret                             @ bx lr
 #else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
 #else
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
@@ -1203,6 +1203,7 @@ _armv4_AES_decrypt:
 ___
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
 ___
 
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx\tlr/gm;
 
 open SELF,$0;
 while(<SELF>) {
 
 open SELF,$0;
 while(<SELF>) {
index 4f6ae17232fe825db6d7a37ce5d854982a42ff7c..add11d405ef994497dc49b4e90f76f54a6e5cb9c 100644 (file)
@@ -7,42 +7,46 @@
 .global        _armv7_neon_probe
 .type  _armv7_neon_probe,%function
 _armv7_neon_probe:
 .global        _armv7_neon_probe
 .type  _armv7_neon_probe,%function
 _armv7_neon_probe:
-       .word   0xf26ee1fe      @ vorr  q15,q15,q15
-       .word   0xe12fff1e      @ bx    lr
+       .byte   0xf0,0x01,0x60,0xf2     @ vorr  q8,q8,q8
+       .byte   0x1e,0xff,0x2f,0xe1     @ bx    lr
 .size  _armv7_neon_probe,.-_armv7_neon_probe
 
 .global        _armv7_tick
 .type  _armv7_tick,%function
 _armv7_tick:
 .size  _armv7_neon_probe,.-_armv7_neon_probe
 
 .global        _armv7_tick
 .type  _armv7_tick,%function
 _armv7_tick:
-       mrrc    p15,1,r0,r1,c14 @ CNTVCT
-       .word   0xe12fff1e      @ bx    lr
+       mrrc    p15,1,r0,r1,c14         @ CNTVCT
+#if __ARM_ARCH__>=5
+       bx      lr
+#else
+       .word   0xe12fff1e              @ bx    lr
+#endif
 .size  _armv7_tick,.-_armv7_tick
 
 .global        _armv8_aes_probe
 .type  _armv8_aes_probe,%function
 _armv8_aes_probe:
 .size  _armv7_tick,.-_armv7_tick
 
 .global        _armv8_aes_probe
 .type  _armv8_aes_probe,%function
 _armv8_aes_probe:
-       .word   0xf3b00300      @ aese.8        q0,q0
-       .word   0xe12fff1e      @ bx    lr
+       .byte   0x00,0x03,0xb0,0xf3     @ aese.8        q0,q0
+       .byte   0x1e,0xff,0x2f,0xe1     @ bx    lr
 .size  _armv8_aes_probe,.-_armv8_aes_probe
 
 .global        _armv8_sha1_probe
 .type  _armv8_sha1_probe,%function
 _armv8_sha1_probe:
 .size  _armv8_aes_probe,.-_armv8_aes_probe
 
 .global        _armv8_sha1_probe
 .type  _armv8_sha1_probe,%function
 _armv8_sha1_probe:
-       .word   0xf2000c40      @ sha1c.32      q0,q0,q0
-       .word   0xe12fff1e      @ bx    lr
+       .byte   0x40,0x0c,0x00,0xf2     @ sha1c.32      q0,q0,q0
+       .byte   0x1e,0xff,0x2f,0xe1     @ bx    lr
 .size  _armv8_sha1_probe,.-_armv8_sha1_probe
 
 .global        _armv8_sha256_probe
 .type  _armv8_sha256_probe,%function
 _armv8_sha256_probe:
 .size  _armv8_sha1_probe,.-_armv8_sha1_probe
 
 .global        _armv8_sha256_probe
 .type  _armv8_sha256_probe,%function
 _armv8_sha256_probe:
-       .word   0xf3000c40      @ sha256h.32    q0,q0,q0
-       .word   0xe12fff1e      @ bx    lr
+       .byte   0x40,0x0c,0x00,0xf3     @ sha256h.32    q0,q0,q0
+       .byte   0x1e,0xff,0x2f,0xe1     @ bx lr
 .size  _armv8_sha256_probe,.-_armv8_sha256_probe
 .global        _armv8_pmull_probe
 .type  _armv8_pmull_probe,%function
 _armv8_pmull_probe:
 .size  _armv8_sha256_probe,.-_armv8_sha256_probe
 .global        _armv8_pmull_probe
 .type  _armv8_pmull_probe,%function
 _armv8_pmull_probe:
-       .word   0xf2a00e00      @ vmull.p64     q0,d0,d0
-       .word   0xe12fff1e      @ bx    lr
+       .byte   0x00,0x0e,0xa0,0xf2     @ vmull.p64     q0,d0,d0
+       .byte   0x1e,0xff,0x2f,0xe1     @ bx    lr
 .size  _armv8_pmull_probe,.-_armv8_pmull_probe
 
 .align 5
 .size  _armv8_pmull_probe,.-_armv8_pmull_probe
 
 .align 5
@@ -56,7 +60,7 @@ OPENSSL_atomic_add:
        cmp     r2,#0
        bne     .Ladd
        mov     r0,r3
        cmp     r2,#0
        bne     .Ladd
        mov     r0,r3
-       .word   0xe12fff1e      @ bx    lr
+       bx      lr
 #else
        stmdb   sp!,{r4-r6,lr}
        ldr     r2,.Lspinlock
 #else
        stmdb   sp!,{r4-r6,lr}
        ldr     r2,.Lspinlock
@@ -109,9 +113,13 @@ OPENSSL_cleanse:
        adds    r1,r1,#4
        bne     .Little
 .Lcleanse_done:
        adds    r1,r1,#4
        bne     .Little
 .Lcleanse_done:
+#if __ARM_ARCH__>=5
+       bx      lr
+#else
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
+#endif
 .size  OPENSSL_cleanse,.-OPENSSL_cleanse
 
 .global        OPENSSL_wipe_cpu
 .size  OPENSSL_cleanse,.-OPENSSL_cleanse
 
 .global        OPENSSL_wipe_cpu
@@ -125,41 +133,53 @@ OPENSSL_wipe_cpu:
        eor     ip,ip,ip
        tst     r0,#1
        beq     .Lwipe_done
        eor     ip,ip,ip
        tst     r0,#1
        beq     .Lwipe_done
-       .word   0xf3000150      @ veor    q0, q0, q0
-       .word   0xf3022152      @ veor    q1, q1, q1
-       .word   0xf3044154      @ veor    q2, q2, q2
-       .word   0xf3066156      @ veor    q3, q3, q3
-       .word   0xf34001f0      @ veor    q8, q8, q8
-       .word   0xf34221f2      @ veor    q9, q9, q9
-       .word   0xf34441f4      @ veor    q10, q10, q10
-       .word   0xf34661f6      @ veor    q11, q11, q11
-       .word   0xf34881f8      @ veor    q12, q12, q12
-       .word   0xf34aa1fa      @ veor    q13, q13, q13
-       .word   0xf34cc1fc      @ veor    q14, q14, q14
-       .word   0xf34ee1fe      @ veor    q15, q15, q15
+       .byte   0x50,0x01,0x00,0xf3     @ veor  q0, q0, q0
+       .byte   0x52,0x21,0x02,0xf3     @ veor  q1, q1, q1
+       .byte   0x54,0x41,0x04,0xf3     @ veor  q2, q2, q2
+       .byte   0x56,0x61,0x06,0xf3     @ veor  q3, q3, q3
+       .byte   0xf0,0x01,0x40,0xf3     @ veor  q8, q8, q8
+       .byte   0xf2,0x21,0x42,0xf3     @ veor  q9, q9, q9
+       .byte   0xf4,0x41,0x44,0xf3     @ veor  q10, q10, q10
+       .byte   0xf6,0x61,0x46,0xf3     @ veor  q11, q11, q11
+       .byte   0xf8,0x81,0x48,0xf3     @ veor  q12, q12, q12
+       .byte   0xfa,0xa1,0x4a,0xf3     @ veor  q13, q13, q13
+       .byte   0xfc,0xc1,0x4c,0xf3     @ veor  q14, q14, q14
+       .byte   0xfe,0xe1,0x4e,0xf3     @ veor  q15, q15, q15
 .Lwipe_done:
        mov     r0,sp
 .Lwipe_done:
        mov     r0,sp
+#if __ARM_ARCH__>=5
+       bx      lr
+#else
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
+#endif
 .size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
 .global        OPENSSL_instrument_bus
 .type  OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
        eor     r0,r0,r0
 .size  OPENSSL_wipe_cpu,.-OPENSSL_wipe_cpu
 
 .global        OPENSSL_instrument_bus
 .type  OPENSSL_instrument_bus,%function
 OPENSSL_instrument_bus:
        eor     r0,r0,r0
+#if __ARM_ARCH__>=5
+       bx      lr
+#else
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
+#endif
 .size  OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 
 .global        OPENSSL_instrument_bus2
 .type  OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
        eor     r0,r0,r0
 .size  OPENSSL_instrument_bus,.-OPENSSL_instrument_bus
 
 .global        OPENSSL_instrument_bus2
 .type  OPENSSL_instrument_bus2,%function
 OPENSSL_instrument_bus2:
        eor     r0,r0,r0
+#if __ARM_ARCH__>=5
+       bx      lr
+#else
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
        tst     lr,#1
        moveq   pc,lr
        .word   0xe12fff1e      @ bx    lr
+#endif
 .size  OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 
 .align 5
 .size  OPENSSL_instrument_bus2,.-OPENSSL_instrument_bus2
 
 .align 5
index c66495040cc59d7984766f83f07c8ed391e7590f..b781afbf89bebfad9a1d1c29c5491effffc66672 100644 (file)
@@ -202,7 +202,7 @@ bn_GF2m_mul_2x2:
        veor            $r, $r, $t2
 
        vst1.32         {$r}, [r0]
        veor            $r, $r, $t2
 
        vst1.32         {$r}, [r0]
-       bx      lr
+       ret             @ bx lr
 .align 4
 .Lialu:
 #endif
 .align 4
 .Lialu:
 #endif
@@ -273,6 +273,7 @@ foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;
 
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/\`([^\`]*)\`/eval $1/geo;
 
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 
        print $_,"\n";
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 
        print $_,"\n";
index fe81f9b6f67b33b31ac701d6990ecd37d754ddab..72bad8e3083f984f33aac665af93fe61a1883429 100644 (file)
@@ -230,9 +230,14 @@ bn_mul_mont:
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
        ldmia   sp!,{r4-r12,lr}         @ restore registers
        add     sp,sp,#2*4              @ skip over {r0,r2}
        mov     r0,#1
-.Labrt:        tst     lr,#1
+.Labrt:
+#if __ARM_ARCH__>=5
+       ret                             @ bx lr
+#else
+       tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {
 .size  bn_mul_mont,.-bn_mul_mont
 ___
 {
@@ -650,7 +655,7 @@ bn_mul8x_mont_neon:
        sub     sp,ip,#96
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
        sub     sp,ip,#96
         vldmia  sp!,{d8-d15}
         ldmia   sp!,{r4-r11}
-       bx      lr
+       ret                                             @ bx lr
 .size  bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
 ___
 .size  bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
 #endif
 ___
@@ -665,5 +670,6 @@ ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx  lr/gm;
 print $code;
 close STDOUT;
 print $code;
 close STDOUT;
index 0b0dcc8a68f64b52fb22aa0402dc02dfd0d0e451..0023bf994bf33306afa9e2760b3b55176fff7c08 100644 (file)
@@ -386,7 +386,7 @@ gcm_init_neon:
        veor            $IN,$IN,$t0             @ twisted H
        vstmia          r0,{$IN}
 
        veor            $IN,$IN,$t0             @ twisted H
        vstmia          r0,{$IN}
 
-       bx      lr
+       ret                                     @ bx lr
 .size  gcm_init_neon,.-gcm_init_neon
 
 .global        gcm_gmult_neon
 .size  gcm_init_neon,.-gcm_init_neon
 
 .global        gcm_gmult_neon
@@ -470,7 +470,7 @@ $code.=<<___;
        vst1.64         $Xl#hi,[$Xi,:64]!       @ write out Xi
        vst1.64         $Xl#lo,[$Xi,:64]
 
        vst1.64         $Xl#hi,[$Xi,:64]!       @ write out Xi
        vst1.64         $Xl#lo,[$Xi,:64]
 
-       bx      lr
+       ret                                     @ bx lr
 .size  gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
 .size  gcm_ghash_neon,.-gcm_ghash_neon
 #endif
 ___
@@ -484,6 +484,7 @@ foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/geo;
 
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
        s/\`([^\`]*)\`/eval $1/geo;
 
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 
        print $_,"\n";
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
 
        print $_,"\n";
index 43a1b9fd7f65a936af36209a235a3fa866619a8f..50bd07b331d60bf09b3630397d66625b92cd38c4 100644 (file)
@@ -631,7 +631,7 @@ $code.=<<___;
        vst1.32         {$E\[0]},[$ctx]
 
        vldmia  sp!,{d8-d15}
        vst1.32         {$E\[0]},[$ctx]
 
        vldmia  sp!,{d8-d15}
-       bx      lr
+       ret                                     @ bx lr
 .size  sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
 .size  sha1_block_data_order_armv8,.-sha1_block_data_order_armv8
 #endif
 ___
@@ -648,13 +648,18 @@ ___
     sub unsha1 {
        my ($mnemonic,$arg)=@_;
 
     sub unsha1 {
        my ($mnemonic,$arg)=@_;
 
-       $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-       &&
-       sprintf ".long\t0x%08x\t@ %s %s",
-                       $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-                                         |(($2&7)<<17)|(($2&8)<<4)
-                                         |(($3&7)<<1) |(($3&8)<<2),
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # ARMv7 instructions are always encoded little-endian, so
+           # emit the opcode byte by byte. The correct solution is to use
+           # the .inst directive, but older assemblers don't implement it:-(
+           sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
                        $mnemonic,$arg;
+       }
     }
 }
 
     }
 }
 
@@ -664,6 +669,7 @@ foreach (split($/,$code)) {
 
        s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
 
 
        s/\b(sha1\w+)\s+(q.*)/unsha1($1,$2)/geo;
 
+       s/\bret\b/bx    lr/o            or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/o;      # make it possible to compile with -march=armv4
 
        print $_,$/;
        s/\bbx\s+lr\b/.word\t0xe12fff1e/o;      # make it possible to compile with -march=armv4
 
        print $_,$/;
index 5e5c54ec18549958235a710ba2f8efdf521a0f65..505ca8f350fa959eb614134e8d251962bda87f42 100644 (file)
@@ -608,7 +608,7 @@ $code.=<<___;
 
        vst1.32         {$ABCD,$EFGH},[$ctx]
 
 
        vst1.32         {$ABCD,$EFGH},[$ctx]
 
-       bx      lr
+       ret             @ bx lr
 .size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
 .size  sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
 #endif
 ___
@@ -626,13 +626,18 @@ ___
     sub unsha256 {
        my ($mnemonic,$arg)=@_;
 
     sub unsha256 {
        my ($mnemonic,$arg)=@_;
 
-       $arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o
-       &&
-       sprintf ".long\t0x%08x\t@ %s %s",
-                       $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
-                                         |(($2&7)<<17)|(($2&8)<<4)
-                                         |(($3&7)<<1) |(($3&8)<<2),
+       if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
+           my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                        |(($2&7)<<17)|(($2&8)<<4)
+                                        |(($3&7)<<1) |(($3&8)<<2);
+           # ARMv7 instructions are always encoded little-endian, so
+           # emit the opcode byte by byte. The correct solution is to use
+           # the .inst directive, but older assemblers don't implement it:-(
+           sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+                       $word&0xff,($word>>8)&0xff,
+                       ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
                        $mnemonic,$arg;
+       }
     }
 }
 
     }
 }
 
@@ -642,6 +647,7 @@ foreach (split($/,$code)) {
 
        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
 
 
        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
 
+       s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
 
        print $_,"\n";
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
 
        print $_,"\n";
index d3065794b343890487187ab32e71234712ce155c..1d5275b91704cfb1af482c26d41502d1a86450fb 100644 (file)
@@ -584,7 +584,7 @@ $code.=<<___;
        bne             .Loop_neon
 
        vldmia  sp!,{d8-d15}            @ epilogue
        bne             .Loop_neon
 
        vldmia  sp!,{d8-d15}            @ epilogue
-       bx      lr
+       ret                             @ bx lr
 #endif
 ___
 }
 #endif
 ___
 }
@@ -597,5 +597,6 @@ ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
+$code =~ s/\bret\b/bx  lr/gm;
 print $code;
 close STDOUT; # enforce flush
 print $code;
 close STDOUT; # enforce flush