C64x+ assembly pack: improve EABI support.
author Andy Polyakov <appro@openssl.org>
Wed, 28 Nov 2012 13:19:10 +0000 (13:19 +0000)
committer Andy Polyakov <appro@openssl.org>
Wed, 28 Nov 2012 13:19:10 +0000 (13:19 +0000)
Configure
TABLE
crypto/aes/asm/aes-c64xplus.pl
crypto/bn/asm/bn-c64xplus.asm
crypto/bn/asm/c64xplus-gf2m.pl
crypto/c64xpluscpuid.pl
crypto/modes/asm/ghash-c64xplus.pl
crypto/sha/asm/sha1-c64xplus.pl
crypto/sha/asm/sha256-c64xplus.pl
crypto/sha/asm/sha512-c64xplus.pl

index 94fa7a8821457ddc7b8dfa921882eb1b6026df36..95c701af3196ecc1692c9b4cf417b676251084cd 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -410,7 +410,7 @@ my %table=(
 "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
 #
 # TI_CGT_C6000_7.3.x is a requirement
 "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
 #
 # TI_CGT_C6000_7.3.x is a requirement
-"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",
+"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",
 
 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 
 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
diff --git a/TABLE b/TABLE
index 52d88b998fb07f6f4049db084c12f3d39152fcf3..9d1a80bf94d060a1571c552fe0dbf8646e1432ca 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -3995,7 +3995,7 @@ $multilib     =
 
 *** linux-c64xplus
 $cc           = cl6x
 
 *** linux-c64xplus
 $cc           = cl6x
-$cflags       = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT
+$cflags       = --linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT
 $unistd       = 
 $thread_cflag = -D_REENTRANT
 $sys_id       = 
 $unistd       = 
 $thread_cflag = -D_REENTRANT
 $sys_id       = 
index ad0c15a36fde2bcbbed39e30d5895b9937bc04d4..cc14ae315799c6bceaaad3158157f57b6f822415 100644 (file)
@@ -46,6 +46,11 @@ $code=<<___;
        .text
        .if     __TI_EABI__
        .nocmp
        .text
        .if     __TI_EABI__
        .nocmp
+       .asg    AES_encrypt,_AES_encrypt
+       .asg    AES_decrypt,_AES_decrypt
+       .asg    AES_set_encrypt_key,_AES_set_encrypt_key
+       .asg    AES_set_decrypt_key,_AES_set_decrypt_key
+       .asg    AES_ctr32_encrypt,_AES_ctr32_encrypt
        .endif
 
        .asg    B3,RA
        .endif
 
        .asg    B3,RA
@@ -1021,7 +1026,11 @@ ___
 }
 # Tables are kept in endian-neutral manner
 $code.=<<___;
 }
 # Tables are kept in endian-neutral manner
 $code.=<<___;
+       .if     __TI_EABI__
+       .sect   ".text:aes_asm.const"
+       .else
        .sect   ".const:aes_asm"
        .sect   ".const:aes_asm"
+       .endif
        .align  128
 AES_Te:
        .byte   0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84
        .align  128
 AES_Te:
        .byte   0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84
@@ -1359,3 +1368,4 @@ AES_Td4:
 ___
 
 print $code;
 ___
 
 print $code;
+close STDOUT;
index 161547c3b0a486bba45632bf0272f9ad691b3ee7..f07b09e439bb2f2d6363761fe812af2de0dee586 100644 (file)
 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
 ;;====================================================================
        .text
 ;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
 ;;====================================================================
        .text
+       .if     __TI_EABI__
+       .asg    bn_mul_add_words,_bn_mul_add_words
+       .asg    bn_mul_words,_bn_mul_words
+       .asg    bn_sqr_words,_bn_sqr_words
+       .asg    bn_add_words,_bn_add_words
+       .asg    bn_sub_words,_bn_sub_words
+       .asg    bn_div_words,_bn_div_words
+       .asg    bn_sqr_comba8,_bn_sqr_comba8
+       .asg    bn_mul_comba8,_bn_mul_comba8
+       .asg    bn_sqr_comba4,_bn_sqr_comba4
+       .asg    bn_mul_comba4,_bn_mul_comba4
+       .endif
 
        .asg    B3,RA
        .asg    A4,ARG0
 
        .asg    B3,RA
        .asg    A4,ARG0
@@ -158,14 +170,39 @@ _bn_sub_words:
        .endasmfunc
 
        .global _bn_div_words
        .endasmfunc
 
        .global _bn_div_words
-       .global __divull
 _bn_div_words:
        .asmfunc
 _bn_div_words:
        .asmfunc
-       CALLP   __divull,A3     ; jump to rts64plus.lib
-||     MV      ARG0,A5
-||     MV      ARG1,ARG0
-||     MV      ARG2,ARG1
-||     ZERO    B5
+       LMBD    1,A6,A0         ; leading zero bits in dv
+       LMBD    1,A4,A1         ; leading zero bits in hi
+||     MVK     32,B0
+       CMPLTU  A1,A0,A2
+||     ADD     A0,B0,B0
+  [ A2]        BNOP    RA
+||[ A2]        MVK     -1,A4           ; return overflow
+||[!A2]        MV      A4,A3           ; reassign hi
+  [!A2]        MV      B4,A4           ; reassign lo, will be quotient
+||[!A2]        MVC     B0,ILC
+  [!A2]        SHL     A6,A0,A6        ; normalize dv
+||     MVK     1,A1
+
+  [!A2]        CMPLTU  A3,A6,A1        ; hi<dv?
+||[!A2]        SHL     A4,1,A5:A4      ; lo<<1
+  [!A1]        SUB     A3,A6,A3        ; hi-=dv
+||[!A1]        OR      1,A4,A4
+  [!A2]        SHRU    A3,31,A1        ; upper bit
+||[!A2]        ADDAH   A5,A3,A3        ; hi<<1|lo>>31
+
+       SPLOOP  3
+  [!A1]        CMPLTU  A3,A6,A1        ; hi<dv?
+||[ A1]        ZERO    A1
+||     SHL     A4,1,A5:A4      ; lo<<1
+  [!A1]        SUB     A3,A6,A3        ; hi-=dv
+||[!A1]        OR      1,A4,A4         ; quotient
+       SHRU    A3,31,A1        ; upper bit
+||     ADDAH   A5,A3,A3        ; hi<<1|lo>>31
+       SPKERNEL
+
+       BNOP    RA,5
        .endasmfunc
 
 ;;====================================================================
        .endasmfunc
 
 ;;====================================================================
@@ -256,7 +293,7 @@ _bn_mul_comba4:
 ||     LDW     *A5++,B6        ; ap[0]
 ||     MV      A0,A3           ; const A3=M
        .else
 ||     LDW     *A5++,B6        ; ap[0]
 ||     MV      A0,A3           ; const A3=M
        .else
-       ;; This alternative is exercise in fully unrolled Comba
+       ;; This alternative is an exercise in fully unrolled Comba
        ;; algorithm implementation that operates at n*(n+1)+12, or
        ;; as little as 32 cycles...
        LDW     *ARG1[0],B16    ; a[0]
        ;; algorithm implementation that operates at n*(n+1)+12, or
        ;; as little as 32 cycles...
        LDW     *ARG1[0],B16    ; a[0]
index cef83942c91840cc4e9ceedaf86fc57c1a8b0ed4..1b3ecc2c94546e4ba1094abe57cffa33c0afe05a 100644 (file)
@@ -107,6 +107,9 @@ ___
 }
 $code.=<<___;
        .text
 }
 $code.=<<___;
        .text
+       .if     __TI_EABI__
+       .asg    bn_GF2m_mul_2x2,_bn_GF2m_mul_2x2
+       .endif
 
        .global _bn_GF2m_mul_2x2
 _bn_GF2m_mul_2x2:
 
        .global _bn_GF2m_mul_2x2
 _bn_GF2m_mul_2x2:
index 067b693d5c7e2d5aa8d745082fc6da6a022ff4c7..0ee0a4e86fe4a63f77f38e4a6a88378d9208bec1 100644 (file)
@@ -6,6 +6,14 @@ open STDOUT,">$output";
 
 $code.=<<___;
        .text
 
 $code.=<<___;
        .text
+       .if     __TI_EABI__
+       .asg    OPENSSL_rdtsc,_OPENSSL_rdtsc
+       .asg    OPENSSL_cleanse,_OPENSSL_cleanse
+       .asg    OPENSSL_atomic_add,_OPENSSL_atomic_add
+       .asg    OPENSSL_wipe_cpu,_OPENSSL_wipe_cpu
+       .asg    OPENSSL_instrument_bus,_OPENSSL_instrument_bus
+       .asg    OPENSSL_instrument_bus2,_OPENSSL_instrument_bus2
+       .endif
 
        .asg    B3,RA
 
 
        .asg    B3,RA
 
index 1ac4d927d00c4098aff486194c459e287fdfdedf..409b0d61b9cf22e46215d815aa38a982761f7e06 100644 (file)
@@ -35,6 +35,11 @@ open STDOUT,">$output";
 
 $code.=<<___;
        .text
 
 $code.=<<___;
        .text
+       .if     __TI_EABI__
+       .asg    gcm_gmult_1bit,_gcm_gmult_1bit
+       .asg    gcm_gmult_4bit,_gcm_gmult_4bit
+       .asg    gcm_ghash_4bit,_gcm_ghash_4bit
+       .endif
 
        .asg    B3,RA
 
 
        .asg    B3,RA
 
@@ -144,7 +149,7 @@ ___
 #    8/2                                         S1  L1x S2      |        ....
 #####...                                         ................|............
 $code.=<<___;
 #    8/2                                         S1  L1x S2      |        ....
 #####...                                         ................|............
 $code.=<<___;
-       XORMPY  $H0,$xia,$H0x           ; 0     ; H·Xi[i]
+       XORMPY  $H0,$xia,$H0x           ; 0     ; H·(Xi[i]<<1)
 ||     XORMPY  $H01u,$xib,$H01y
 || [A0]        LDBU    *--${xip},$x0
        XORMPY  $H1,$xia,$H1x           ; 1
 ||     XORMPY  $H01u,$xib,$H01y
 || [A0]        LDBU    *--${xip},$x0
        XORMPY  $H1,$xia,$H1x           ; 1
@@ -153,7 +158,7 @@ $code.=<<___;
        XORMPY  $H3,$xia,$H3x           ; 3
 ||     XORMPY  $H3u,$xib,$H3y
 ||[!A0]        MVK.D   15,A0                           ; *--${xip} counter
        XORMPY  $H3,$xia,$H3x           ; 3
 ||     XORMPY  $H3u,$xib,$H3y
 ||[!A0]        MVK.D   15,A0                           ; *--${xip} counter
-       XOR.L   $H0x,$Z0,$Z0            ; 4     ; Z^=H·Xi[i]
+       XOR.L   $H0x,$Z0,$Z0            ; 4     ; Z^=H·(Xi[i]<<1)
 || [A0]        SUB.S   A0,1,A0
        XOR.L   $H1x,$Z1,$Z1            ; 5
 ||     AND.D   $H01y,$FF000000,$H0z
 || [A0]        SUB.S   A0,1,A0
        XOR.L   $H1x,$Z1,$Z1            ; 5
 ||     AND.D   $H01y,$FF000000,$H0z
index 87000d1e8f6a9cc0904d5b22bd210b4b203eff76..456f80a86e407bf745fe392c905d0eca517e5172 100644 (file)
@@ -38,6 +38,9 @@ open STDOUT,">$output";
 
 $code=<<___;
        .text
 
 $code=<<___;
        .text
+       .if     __TI_EABI__
+       .asg    sha1_block_data_order,_sha1_block_data_order
+       .endif
 
        .asg    B3,RA
        .asg    A15,FP
 
        .asg    B3,RA
        .asg    A15,FP
index 5a057868b4c9332c427579152d6a54a10364155e..798f78309b77e993e78ca435fe2b1ea795764b98 100644 (file)
@@ -40,6 +40,7 @@ $code.=<<___;
        .text
        .if     __TI_EABI__
        .nocmp
        .text
        .if     __TI_EABI__
        .nocmp
+       .asg    sha256_block_data_order,_sha256_block_data_order
        .endif
 
        .asg    B3,RA
        .endif
 
        .asg    B3,RA
@@ -275,7 +276,11 @@ outerloop?:
 ||     STW     $H,*${CTXB}[7]
        .endasmfunc
 
 ||     STW     $H,*${CTXB}[7]
        .endasmfunc
 
+       .if     __TI_EABI__
+       .sect   ".text:sha_asm.const"
+       .else
        .sect   ".const:sha_asm"
        .sect   ".const:sha_asm"
+       .endif
        .align  128
 K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
        .align  128
 K256:
        .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
@@ -300,3 +305,4 @@ K256:
 ___
 
 print $code;
 ___
 
 print $code;
+close STDOUT;
index e4e7c042fd4a1c97e6052df2520dc7eb6e1ff965..77a62523e565bbff8186274a8e1bbc217e41a11f 100644 (file)
@@ -48,6 +48,7 @@ $code.=<<___;
        .text
        .if     __TI_EABI__
        .nocmp
        .text
        .if     __TI_EABI__
        .nocmp
+       .asg    sha512_block_data_order,_sha512_block_data_order
        .endif
 
        .asg    B3,RA
        .endif
 
        .asg    B3,RA
@@ -370,7 +371,11 @@ break?:
        NOP     2                               ; wait till FP is committed
        .endasmfunc
 
        NOP     2                               ; wait till FP is committed
        .endasmfunc
 
+       .if     __TI_EABI__
+       .sect   ".text:sha_asm.const"
+       .else
        .sect   ".const:sha_asm"
        .sect   ".const:sha_asm"
+       .endif
        .align  128
 K512:
        .uword  0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
        .align  128
 K512:
        .uword  0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd