C64x+ assembly pack: add RC4 module.
author Andy Polyakov <appro@openssl.org>
Sun, 4 May 2014 14:39:59 +0000 (16:39 +0200)
committer Andy Polyakov <appro@openssl.org>
Sun, 4 May 2014 14:39:59 +0000 (16:39 +0200)
Configure
TABLE
crypto/rc4/asm/rc4-c64xplus.pl [new file with mode: 0644]

index ba0be31..7918287 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -414,7 +414,7 @@ my %table=(
 "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
 #
 # TI_CGT_C6000_7.3.x is a requirement
-"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",
+"linux-c64xplus","cl6x:--linux -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o::rc4-c64xplus.o:::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",
 
 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
diff --git a/TABLE b/TABLE
index 8a46fb7..bf2dba3 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -1652,7 +1652,7 @@ $multilib     =
 
 *** debug-VC-WIN32
 $cc           = cl
-$cflags       = -W3 -WX -Gs0 -GF -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE
+$cflags       = -W3 -Gs0 -GF -Gy -Zi -nologo -DOPENSSL_SYSNAME_WIN32 -DWIN32_LEAN_AND_MEAN -DL_ENDIAN -D_CRT_SECURE_NO_DEPRECATE
 $unistd       = 
 $thread_cflag = 
 $sys_id       = WIN32
@@ -4174,7 +4174,7 @@ $bf_obj       =
 $md5_obj      = 
 $sha1_obj     = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o
 $cast_obj     = 
-$rc4_obj      = 
+$rc4_obj      = rc4-c64xplus.o
 $rmd160_obj   = 
 $rc5_obj      = 
 $wp_obj       = 
diff --git a/crypto/rc4/asm/rc4-c64xplus.pl b/crypto/rc4/asm/rc4-c64xplus.pl
new file mode 100644 (file)
index 0000000..6e5fe05
--- /dev/null
@@ -0,0 +1,183 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# RC4 for C64x+.
+#
+# April 2014
+#
+# RC4 subroutine processes one byte in 7.0 cycles, which is 3x faster
+# than TI CGT-generated code. The loop is scheduled in such a way that
+# there is only one reference to memory in each cycle. This is done
+# to avoid L1D memory banking conflicts, see SPRU871 TI publication
+# for further details. Otherwise it should be possible to schedule
+# the loop for iteration interval of 6...
+
+($KEY,$LEN,$INP,$OUT)=("A4","B4","A6","B6");  # registers used below as key pointer, length, input pointer, output pointer
+
+($KEYA,$XX,$TY,$xx,$ONE,$ret)=map("A$_",(5,7,8,9,1,2));  # symbolic names for A5,A7,A8,A9,A1,A2
+($KEYB,$YY,$TX,$tx,$SUM,$dat)=map("B$_",(5,7,8,9,1,2));  # symbolic names for B5,B7,B8,B9,B1,B2
+
+$code.=<<___;
+       .text
+
+       .if     .ASSEMBLER_VERSION<7000000
+       .asg    0,__TI_EABI__
+       .endif
+       .if     __TI_EABI__
+       .nocmp
+       .asg    RC4,_RC4
+       .asg    RC4_set_key,_RC4_set_key
+       .asg    RC4_options,_RC4_options
+       .endif
+
+       .global _RC4
+       .align  16
+_RC4:
+       .asmfunc
+       MV      $LEN,B0
+  [!B0]        BNOP    B3                      ; if (len==0) return;
+||[B0] ADD     $KEY,2,$KEYA
+||[B0] ADD     $KEY,2,$KEYB
+  [B0] MVK     1,$ONE
+||[B0] LDBU    *${KEYA}[-2],$XX        ; key->x
+  [B0] LDBU    *${KEYB}[-1],$YY        ; key->y
+||     NOP     4
+
+       ADD4    $ONE,$XX,$XX
+       LDBU    *${KEYA}[$XX],$TX
+||     MVC     $LEN,ILC
+       NOP     4
+;;==================================================
+       SPLOOP  7
+||     ADD4    $TX,$YY,$YY
+
+       LDBU    *${KEYB}[$YY],$TY
+||     MVD     $XX,$xx
+||     ADD4    $ONE,$XX,$XX
+       LDBU    *${KEYA}[$XX],$tx
+       CMPEQ   $YY,$XX,B0
+||     NOP     3
+       STB     $TX,*${KEYB}[$YY]
+||[B0] ADD4    $TX,$YY,$YY
+       STB     $TY,*${KEYA}[$xx]
+||[!B0]        ADD4    $tx,$YY,$YY
+||[!B0]        MVD     $tx,$TX
+       ADD4    $TY,$TX,$SUM            ; [0,0] $TX is not replaced by $tx yet!
+||     NOP     2
+       LDBU    *$INP++,$dat
+||     NOP     2
+       LDBU    *${KEYB}[$SUM],$ret
+||     NOP     5
+       XOR.L   $dat,$ret,$ret
+       SPKERNEL
+||     STB     $ret,*$OUT++
+;;==================================================
+       SUB4    $XX,$ONE,$XX
+||     NOP     5
+       STB     $XX,*${KEYA}[-2]        ; key->x
+||     SUB4    $YY,$TX,$YY
+||     BNOP    B3      
+       STB     $YY,*${KEYB}[-1]        ; key->y
+||     NOP     5
+       .endasmfunc
+
+       .global _RC4_set_key
+       .align  16
+_RC4_set_key:
+       .asmfunc
+       .if     .BIG_ENDIAN
+       MVK     0x00000404,$ONE
+||     MVK     0x00000203,B0
+       MVKH    0x04040000,$ONE
+||     MVKH    0x00010000,B0
+       .else
+       MVK     0x00000404,$ONE
+||     MVK     0x00000100,B0
+       MVKH    0x04040000,$ONE
+||     MVKH    0x03020000,B0
+       .endif
+       ADD     $KEY,2,$KEYA
+||     ADD     $KEY,2,$KEYB
+||     ADD     $INP,$LEN,$ret          ; end of input
+       LDBU    *${INP}++,$dat
+||     MVK     0,$TX
+       STH     $TX,*${KEY}++           ; key->x=key->y=0
+||     MV      B0,A0
+||     MVK     64-4,B0
+
+;;==================================================
+       SPLOOPD 1
+||     MVC     B0,ILC
+
+       STNW    A0,*${KEY}++
+||     ADD4    $ONE,A0,A0
+       SPKERNEL
+;;==================================================
+
+       MVK     0,$YY
+||     MVK     0,$XX
+       MVK     1,$ONE
+||     MVK     256-1,B0
+
+;;==================================================
+       SPLOOPD 8
+||     MVC     B0,ILC
+
+       ADD4    $dat,$YY,$YY
+||     CMPEQ   $INP,$ret,A0            ; end of input?
+       LDBU    *${KEYB}[$YY],$TY
+||     MVD     $XX,$xx
+||     ADD4    $ONE,$XX,$XX
+       LDBU    *${KEYA}[$XX],$tx
+||[A0] SUB     $INP,$LEN,$INP          ; rewind
+       LDBU    *${INP}++,$dat
+||     CMPEQ   $YY,$XX,B0
+||     NOP     3
+       STB     $TX,*${KEYB}[$YY]
+||[B0] ADD4    $TX,$YY,$YY
+       STB     $TY,*${KEYA}[$xx]
+||[!B0]        ADD4    $tx,$YY,$YY
+||[!B0]        MV      $tx,$TX
+       SPKERNEL
+;;==================================================
+
+       BNOP    B3,5
+       .endasmfunc
+
+       .global _RC4_options
+       .align  16
+_RC4_options:
+_rc4_options:
+       .asmfunc
+       BNOP    B3,1
+       ADDKPC  _rc4_options,B4
+       .if     __TI_EABI__
+       MVKL    \$PCR_OFFSET(rc4_options,_rc4_options),A4
+       MVKH    \$PCR_OFFSET(rc4_options,_rc4_options),A4
+       .else
+       MVKL    (rc4_options-_rc4_options),A4
+       MVKH    (rc4_options-_rc4_options),A4
+       .endif
+       ADD     B4,A4,A4
+       .endasmfunc
+
+       .if     __TI_EABI__
+       .sect   ".text:rc4_options.const"
+       .else
+       .sect   ".const:rc4_options"
+       .endif
+       .align  4
+rc4_options:
+       .cstring "rc4(sploop,char)"
+       .cstring "RC4 for C64+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;  # emit the generated assembly on STDOUT
+close STDOUT or die "error closing STDOUT: $!";  # buffered write errors only surface at close; fail loudly instead of leaving a truncated .s file