C64x+ assembler pack. linux-c64xplus build is *not* tested nor can it be
authorAndy Polyakov <appro@openssl.org>
Wed, 18 Apr 2012 13:01:36 +0000 (13:01 +0000)
committerAndy Polyakov <appro@openssl.org>
Wed, 18 Apr 2012 13:01:36 +0000 (13:01 +0000)
tested, because kernel is not in shape to handle it *yet*. The code is
committed mostly to stimulate the kernel development.

Configure
TABLE
crypto/aes/asm/aes-c64xplus.pl [new file with mode: 0644]
crypto/bn/asm/bn-c64xplus.asm [new file with mode: 0644]
crypto/bn/asm/c64xplus-gf2m.pl [new file with mode: 0644]
crypto/c64xpluscpuid.pl [new file with mode: 0644]
crypto/modes/asm/ghash-c64xplus.pl [new file with mode: 0644]
crypto/sha/asm/sha1-c64xplus.pl [new file with mode: 0644]
crypto/sha/asm/sha256-c64xplus.pl [new file with mode: 0644]
crypto/sha/asm/sha512-c64xplus.pl [new file with mode: 0644]

index 5900e18..bae77cb 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -399,6 +399,9 @@ my %table=(
 "linux-alpha+bwx-gcc","gcc:-O3 -DL_ENDIAN -DTERMIO::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL:${alpha_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
 "linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
 "linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
+#
+# TI_CGT_C6000_7.3.x is a requirement
+"linux-c64xplus","cl6x:--linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT::-D_REENTRANT:::BN_LLONG:c64xpluscpuid.o:bn-c64xplus.o c64xplus-gf2m.o::aes-c64xplus.o aes_cbc.o aes_ctr.o:::sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o:::::::ghash-c64xplus.o::void:dlfcn:linux-shared:--pic:-z --sysv --shared:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):true",
 
 # Android: linux-* but without -DTERMIO and pointers to headers and libs.
 "android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
diff --git a/TABLE b/TABLE
index 74c891a..d5f0b1f 100644 (file)
--- a/TABLE
+++ b/TABLE
@@ -3927,6 +3927,39 @@ $ranlib       =
 $arflags      = 
 $multilib     = 
 
+*** linux-c64xplus
+$cc           = cl6x
+$cflags       = --linux --strip_coff_underscore -ea=.s -eo=.o -mv6400+ -o2 -ox -ms -pden -DOPENSSL_SMALL_FOOTPRINT
+$unistd       = 
+$thread_cflag = -D_REENTRANT
+$sys_id       = 
+$lflags       = 
+$bn_ops       = BN_LLONG
+$cpuid_obj    = c64xpluscpuid.o
+$bn_obj       = bn-c64xplus.o c64xplus-gf2m.o
+$des_obj      = 
+$aes_obj      = aes-c64xplus.o aes_cbc.o aes_ctr.o
+$bf_obj       = 
+$md5_obj      = 
+$sha1_obj     = sha1-c64xplus.o sha256-c64xplus.o sha512-c64xplus.o
+$cast_obj     = 
+$rc4_obj      = 
+$rmd160_obj   = 
+$rc5_obj      = 
+$wp_obj       = 
+$cmll_obj     = 
+$modes_obj    = ghash-c64xplus.o
+$engines_obj  = 
+$perlasm_scheme = void
+$dso_scheme   = dlfcn
+$shared_target= linux-shared
+$shared_cflag = --pic
+$shared_ldflag = -z --sysv --shared
+$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
+$ranlib       = true
+$arflags      = 
+$multilib     = 
+
 *** linux-elf
 $cc           = gcc
 $cflags       = -DL_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall
diff --git a/crypto/aes/asm/aes-c64xplus.pl b/crypto/aes/asm/aes-c64xplus.pl
new file mode 100644 (file)
index 0000000..ad0c15a
--- /dev/null
@@ -0,0 +1,1361 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# [Endian-neutral] AES for C64x+.
+#
+# Even though SPLOOPs are scheduled for 13 cycles, and thus expected
+# performance is ~8.5 cycles per byte processed with 128-bit key,
+# measured performance turned to be ~10 cycles per byte. Discrepancy
+# must be caused by limitations of L1D memory banking(*), see SPRU871
+# TI publication for further details. If any consolation it's still
+# ~20% faster than TI's linear assembly module anyway... Compared to
+# aes_core.c compiled with cl6x 6.0 with -mv6400+ -o2 options this
+# code is 3.75x faster and almost 3x smaller (tables included).
+#
+# (*)  This means that there might be subtle correlation between data
+#      and timing and one can wonder if it can be ... attacked:-(
+#      On the other hand this also means that *if* one chooses to
+#      implement *4* T-tables variant [instead of 1 T-table as in
+#      this implementation, or in addition to], then one ought to
+#      *interleave* them. Even though it complicates addressing,
+#      references to interleaved tables would be guaranteed not to
+#      clash. I reckon that it should be possible to break 8 cycles
+#      per byte "barrier," i.e. improve by ~20%, naturally at the
+#      cost of 8x increased pressure on L1D. 8x because you'd have
+#      to interleave both Te and Td tables...
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($TEA,$TEB)=("A5","B5");
+($KPA,$KPB)=("A3","B1");
+@K=("A6","B6","A7","B7");
+@s=("A8","B8","A9","B9");
+@Te0=@Td0=("A16","B16","A17","B17");
+@Te1=@Td1=("A18","B18","A19","B19");
+@Te2=@Td2=("A20","B20","A21","B21");
+@Te3=@Td3=("A22","B22","A23","B23");
+
+$code=<<___;
+       .text
+       .if     __TI_EABI__
+       .nocmp
+       .endif
+
+       .asg    B3,RA
+       .asg    A4,INP
+       .asg    B4,OUT
+       .asg    A6,KEY
+       .asg    A4,RET
+       .asg    B15,SP
+
+       .eval   24,EXT0
+       .eval   16,EXT1
+       .eval   8,EXT2
+       .eval   0,EXT3
+       .eval   8,TBL1
+       .eval   16,TBL2
+       .eval   24,TBL3
+
+       .if     .BIG_ENDIAN
+       .eval   24-EXT0,EXT0
+       .eval   24-EXT1,EXT1
+       .eval   24-EXT2,EXT2
+       .eval   24-EXT3,EXT3
+       .eval   32-TBL1,TBL1
+       .eval   32-TBL2,TBL2
+       .eval   32-TBL3,TBL3
+       .endif
+
+       .global _AES_encrypt
+_AES_encrypt:
+       .asmfunc
+       MVK     1,B2
+__encrypt:
+       .if     __TI_EABI__
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    \$PCR_OFFSET(AES_Te,_AES_encrypt),$TEA
+||     ADDKPC  _AES_encrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    \$PCR_OFFSET(AES_Te,_AES_encrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .else
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    (AES_Te-_AES_encrypt),$TEA
+||     ADDKPC  _AES_encrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    (AES_Te-_AES_encrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .endif
+       LDW     *$KPA++[2],$Te0[0]              ; zero round key
+||     LDW     *$KPB++[2],$Te0[1]
+||     MVK     60,A0
+||     ADD     B0,$TEA,$TEA                    ; AES_Te
+       LDW     *KEY[A0],B0                     ; rounds
+||     MVK     1024,A0                         ; sizeof(AES_Te)
+       LDW     *$KPA++[2],$Te0[2]
+||     LDW     *$KPB++[2],$Te0[3]
+||     MV      $TEA,$TEB
+       NOP
+       .if     .BIG_ENDIAN
+       MV      A9,$s[0]
+||     MV      A8,$s[1]
+||     MV      B9,$s[2]
+||     MV      B8,$s[3]
+       .else
+       MV      A8,$s[0]
+||     MV      A9,$s[1]
+||     MV      B8,$s[2]
+||     MV      B9,$s[3]
+       .endif
+       XOR     $Te0[0],$s[0],$s[0]
+||     XOR     $Te0[1],$s[1],$s[1]
+||     LDW     *$KPA++[2],$K[0]                ; 1st round key
+||     LDW     *$KPB++[2],$K[1]
+       SUB     B0,2,B0
+
+       SPLOOPD 13
+||     MVC     B0,ILC
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+;;====================================================================
+       EXTU    $s[1],EXT1,24,$Te1[1]
+||     EXTU    $s[0],EXT3,24,$Te3[0]
+       LDW     *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
+||     LDW     *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
+||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Te0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[1],EXT3,24,$Te3[1]
+||     EXTU    $s[0],EXT1,24,$Te1[0]
+       LDW     *${TEB}[$Te3[1]],$Te3[1]        ; Te3[s1>>24],  t2
+||     LDW     *${TEA}[$Te1[0]],$Te1[0]        ; Te1[s0>>8],   t3
+||     EXTU    $s[2],EXT2,24,$Te2[2]
+||     EXTU    $s[3],EXT2,24,$Te2[3]
+       LDW     *${TEA}[$Te2[2]],$Te2[2]        ; Te2[s2>>16],  t0
+||     LDW     *${TEB}[$Te2[3]],$Te2[3]        ; Te2[s3>>16],  t1
+||     EXTU    $s[3],EXT3,24,$Te3[3]
+||     EXTU    $s[2],EXT1,24,$Te1[2]
+       LDW     *${TEB}[$Te3[3]],$Te3[3]        ; Te3[s3>>24],  t0
+||     LDW     *${TEA}[$Te1[2]],$Te1[2]        ; Te1[s2>>8],   t1
+||     EXTU    $s[0],EXT2,24,$Te2[0]
+||     EXTU    $s[1],EXT2,24,$Te2[1]
+       LDW     *${TEA}[$Te2[0]],$Te2[0]        ; Te2[s0>>16],  t2
+||     LDW     *${TEB}[$Te2[1]],$Te2[1]        ; Te2[s1>>16],  t3
+||     EXTU    $s[3],EXT1,24,$Te1[3]
+||     EXTU    $s[2],EXT3,24,$Te3[2]
+       LDW     *${TEB}[$Te1[3]],$Te1[3]        ; Te1[s3>>8],   t2
+||     LDW     *${TEA}[$Te3[2]],$Te3[2]        ; Te3[s2>>24],  t3
+||     ROTL    $Te1[1],TBL1,$Te3[0]            ; t0
+||     ROTL    $Te3[0],TBL3,$Te1[1]            ; t1
+||     EXTU    $s[0],EXT0,24,$Te0[0]
+||     EXTU    $s[1],EXT0,24,$Te0[1]
+       LDW     *${TEA}[$Te0[0]],$Te0[0]        ; Te0[s0],      t0
+||     LDW     *${TEB}[$Te0[1]],$Te0[1]        ; Te0[s1],      t1
+||     ROTL    $Te3[1],TBL3,$Te1[0]            ; t2
+||     ROTL    $Te1[0],TBL1,$Te3[1]            ; t3
+||     EXTU    $s[2],EXT0,24,$Te0[2]
+||     EXTU    $s[3],EXT0,24,$Te0[3]
+       LDW     *${TEA}[$Te0[2]],$Te0[2]        ; Te0[s2],      t2
+||     LDW     *${TEB}[$Te0[3]],$Te0[3]        ; Te0[s3],      t3
+||     ROTL    $Te2[2],TBL2,$Te2[2]            ; t0
+||     ROTL    $Te2[3],TBL2,$Te2[3]            ; t1
+||     XOR     $K[0],$Te3[0],$s[0]
+||     XOR     $K[1],$Te1[1],$s[1]
+       ROTL    $Te3[3],TBL3,$Te1[2]            ; t0
+||     ROTL    $Te1[2],TBL1,$Te3[3]            ; t1
+||     XOR     $K[2],$Te1[0],$s[2]
+||     XOR     $K[3],$Te3[1],$s[3]
+||     LDW     *$KPA++[2],$K[0]                ; next round key
+||     LDW     *$KPB++[2],$K[1]
+       ROTL    $Te2[0],TBL2,$Te2[0]            ; t2
+||     ROTL    $Te2[1],TBL2,$Te2[1]            ; t3
+||     XOR     $s[0],$Te2[2],$s[0]
+||     XOR     $s[1],$Te2[3],$s[1]
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+       ROTL    $Te1[3],TBL1,$Te3[2]            ; t2
+||     ROTL    $Te3[2],TBL3,$Te1[3]            ; t3
+||     XOR     $s[0],$Te1[2],$s[0]
+||     XOR     $s[1],$Te3[3],$s[1]
+       XOR     $s[2],$Te2[0],$s[2]
+||     XOR     $s[3],$Te2[1],$s[3]
+||     XOR     $s[0],$Te0[0],$s[0]
+||     XOR     $s[1],$Te0[1],$s[1]
+       SPKERNEL
+||     XOR.L   $s[2],$Te3[2],$s[2]
+||     XOR.L   $s[3],$Te1[3],$s[3]
+;;====================================================================
+       ADD.D   ${TEA},A0,${TEA}                ; point to Te4
+||     ADD.D   ${TEB},A0,${TEB}
+||     EXTU    $s[1],EXT1,24,$Te1[1]
+||     EXTU    $s[0],EXT3,24,$Te3[0]
+       LDBU    *${TEB}[$Te1[1]],$Te1[1]        ; Te1[s1>>8],   t0
+||     LDBU    *${TEA}[$Te3[0]],$Te3[0]        ; Te3[s0>>24],  t1
+||     XOR     $s[2],$Te0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Te0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[0],EXT0,24,$Te0[0]
+||     EXTU    $s[1],EXT0,24,$Te0[1]
+       LDBU    *${TEA}[$Te0[0]],$Te0[0]        ; Te0[s0],      t0
+||     LDBU    *${TEB}[$Te0[1]],$Te0[1]        ; Te0[s1],      t1
+||     EXTU    $s[3],EXT3,24,$Te3[3]
+||     EXTU    $s[2],EXT1,24,$Te1[2]
+       LDBU    *${TEB}[$Te3[3]],$Te3[3]        ; Te3[s3>>24],  t0
+||     LDBU    *${TEA}[$Te1[2]],$Te1[2]        ; Te1[s2>>8],   t1
+||     EXTU    $s[2],EXT2,24,$Te2[2]
+||     EXTU    $s[3],EXT2,24,$Te2[3]
+       LDBU    *${TEA}[$Te2[2]],$Te2[2]        ; Te2[s2>>16],  t0
+||     LDBU    *${TEB}[$Te2[3]],$Te2[3]        ; Te2[s3>>16],  t1
+||     EXTU    $s[1],EXT3,24,$Te3[1]
+||     EXTU    $s[0],EXT1,24,$Te1[0]
+       LDBU    *${TEB}[$Te3[1]],$Te3[1]        ; Te3[s1>>24],  t2
+||     LDBU    *${TEA}[$Te1[0]],$Te1[0]        ; Te1[s0>>8],   t3
+||     EXTU    $s[3],EXT1,24,$Te1[3]
+||     EXTU    $s[2],EXT3,24,$Te3[2]
+       LDBU    *${TEB}[$Te1[3]],$Te1[3]        ; Te1[s3>>8],   t2
+||     LDBU    *${TEA}[$Te3[2]],$Te3[2]        ; Te3[s2>>24],  t3
+||     EXTU    $s[2],EXT0,24,$Te0[2]
+||     EXTU    $s[3],EXT0,24,$Te0[3]
+       LDBU    *${TEA}[$Te0[2]],$Te0[2]        ; Te0[s2],      t2
+||     LDBU    *${TEB}[$Te0[3]],$Te0[3]        ; Te0[s3],      t3
+||     EXTU    $s[0],EXT2,24,$Te2[0]
+||     EXTU    $s[1],EXT2,24,$Te2[1]
+       LDBU    *${TEA}[$Te2[0]],$Te2[0]        ; Te2[s0>>16],  t2
+||     LDBU    *${TEB}[$Te2[1]],$Te2[1]        ; Te2[s1>>16],  t3
+
+       .if     .BIG_ENDIAN
+       PACK2   $Te0[0],$Te1[1],$Te0[0]
+||     PACK2   $Te0[1],$Te1[2],$Te0[1]
+       PACK2   $Te2[2],$Te3[3],$Te2[2]
+||     PACK2   $Te2[3],$Te3[0],$Te2[3]
+       PACKL4  $Te0[0],$Te2[2],$Te0[0]
+||     PACKL4  $Te0[1],$Te2[3],$Te0[1]
+       XOR     $K[0],$Te0[0],$Te0[0]           ; s[0]
+||     XOR     $K[1],$Te0[1],$Te0[1]           ; s[1]
+
+       PACK2   $Te0[2],$Te1[3],$Te0[2]
+||     PACK2   $Te0[3],$Te1[0],$Te0[3]
+       PACK2   $Te2[0],$Te3[1],$Te2[0]
+||     PACK2   $Te2[1],$Te3[2],$Te2[1]
+||     BNOP    RA
+       PACKL4  $Te0[2],$Te2[0],$Te0[2]
+||     PACKL4  $Te0[3],$Te2[1],$Te0[3]
+       XOR     $K[2],$Te0[2],$Te0[2]           ; s[2]
+||     XOR     $K[3],$Te0[3],$Te0[3]           ; s[3]
+
+       MV      $Te0[0],A9
+||     MV      $Te0[1],A8
+       MV      $Te0[2],B9
+||     MV      $Te0[3],B8
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .else
+       PACK2   $Te1[1],$Te0[0],$Te1[1]
+||     PACK2   $Te1[2],$Te0[1],$Te1[2]
+       PACK2   $Te3[3],$Te2[2],$Te3[3]
+||     PACK2   $Te3[0],$Te2[3],$Te3[0]
+       PACKL4  $Te3[3],$Te1[1],$Te1[1]
+||     PACKL4  $Te3[0],$Te1[2],$Te1[2]
+       XOR     $K[0],$Te1[1],$Te1[1]           ; s[0]
+||     XOR     $K[1],$Te1[2],$Te1[2]           ; s[1]
+
+       PACK2   $Te1[3],$Te0[2],$Te1[3]
+||     PACK2   $Te1[0],$Te0[3],$Te1[0]
+       PACK2   $Te3[1],$Te2[0],$Te3[1]
+||     PACK2   $Te3[2],$Te2[1],$Te3[2]
+||     BNOP    RA
+       PACKL4  $Te3[1],$Te1[3],$Te1[3]
+||     PACKL4  $Te3[2],$Te1[0],$Te1[0]
+       XOR     $K[2],$Te1[3],$Te1[3]           ; s[2]
+||     XOR     $K[3],$Te1[0],$Te1[0]           ; s[3]
+
+       MV      $Te1[1],A8
+||     MV      $Te1[2],A9
+       MV      $Te1[3],B8
+||     MV      $Te1[0],B9
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .endif
+       .endasmfunc
+
+       .global _AES_decrypt
+_AES_decrypt:
+       .asmfunc
+       MVK     1,B2
+__decrypt:
+       .if     __TI_EABI__
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    \$PCR_OFFSET(AES_Td,_AES_decrypt),$TEA
+||     ADDKPC  _AES_decrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    \$PCR_OFFSET(AES_Td,_AES_decrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .else
+   [B2]        LDNDW   *INP++,A9:A8                    ; load input
+||     MVKL    (AES_Td-_AES_decrypt),$TEA
+||     ADDKPC  _AES_decrypt,B0
+   [B2]        LDNDW   *INP++,B9:B8
+||     MVKH    (AES_Td-_AES_decrypt),$TEA
+||     ADD     0,KEY,$KPA
+||     ADD     4,KEY,$KPB
+       .endif
+       LDW     *$KPA++[2],$Td0[0]              ; zero round key
+||     LDW     *$KPB++[2],$Td0[1]
+||     MVK     60,A0
+||     ADD     B0,$TEA,$TEA                    ; AES_Td
+       LDW     *KEY[A0],B0                     ; rounds
+||     MVK     1024,A0                         ; sizeof(AES_Td)
+       LDW     *$KPA++[2],$Td0[2]
+||     LDW     *$KPB++[2],$Td0[3]
+||     MV      $TEA,$TEB
+       NOP
+       .if     .BIG_ENDIAN
+       MV      A9,$s[0]
+||     MV      A8,$s[1]
+||     MV      B9,$s[2]
+||     MV      B8,$s[3]
+       .else
+       MV      A8,$s[0]
+||     MV      A9,$s[1]
+||     MV      B8,$s[2]
+||     MV      B9,$s[3]
+       .endif
+       XOR     $Td0[0],$s[0],$s[0]
+||     XOR     $Td0[1],$s[1],$s[1]
+||     LDW     *$KPA++[2],$K[0]                ; 1st round key
+||     LDW     *$KPB++[2],$K[1]
+       SUB     B0,2,B0
+
+       SPLOOPD 13
+||     MVC     B0,ILC
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+;;====================================================================
+       EXTU    $s[1],EXT3,24,$Td3[1]
+||     EXTU    $s[0],EXT1,24,$Td1[0]
+       LDW     *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
+||     LDW     *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
+||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Td0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[1],EXT1,24,$Td1[1]
+||     EXTU    $s[0],EXT3,24,$Td3[0]
+       LDW     *${TEB}[$Td1[1]],$Td1[1]        ; Td1[s1>>8],   t2
+||     LDW     *${TEA}[$Td3[0]],$Td3[0]        ; Td3[s0>>24],  t3
+||     EXTU    $s[2],EXT2,24,$Td2[2]
+||     EXTU    $s[3],EXT2,24,$Td2[3]
+       LDW     *${TEA}[$Td2[2]],$Td2[2]        ; Td2[s2>>16],  t0
+||     LDW     *${TEB}[$Td2[3]],$Td2[3]        ; Td2[s3>>16],  t1
+||     EXTU    $s[3],EXT1,24,$Td1[3]
+||     EXTU    $s[2],EXT3,24,$Td3[2]
+       LDW     *${TEB}[$Td1[3]],$Td1[3]        ; Td1[s3>>8],   t0
+||     LDW     *${TEA}[$Td3[2]],$Td3[2]        ; Td3[s2>>24],  t1
+||     EXTU    $s[0],EXT2,24,$Td2[0]
+||     EXTU    $s[1],EXT2,24,$Td2[1]
+       LDW     *${TEA}[$Td2[0]],$Td2[0]        ; Td2[s0>>16],  t2
+||     LDW     *${TEB}[$Td2[1]],$Td2[1]        ; Td2[s1>>16],  t3
+||     EXTU    $s[3],EXT3,24,$Td3[3]
+||     EXTU    $s[2],EXT1,24,$Td1[2]
+       LDW     *${TEB}[$Td3[3]],$Td3[3]        ; Td3[s3>>24],  t2
+||     LDW     *${TEA}[$Td1[2]],$Td1[2]        ; Td1[s2>>8],   t3
+||     ROTL    $Td3[1],TBL3,$Td1[0]            ; t0
+||     ROTL    $Td1[0],TBL1,$Td3[1]            ; t1
+||     EXTU    $s[0],EXT0,24,$Td0[0]
+||     EXTU    $s[1],EXT0,24,$Td0[1]
+       LDW     *${TEA}[$Td0[0]],$Td0[0]        ; Td0[s0],      t0
+||     LDW     *${TEB}[$Td0[1]],$Td0[1]        ; Td0[s1],      t1
+||     ROTL    $Td1[1],TBL1,$Td3[0]            ; t2
+||     ROTL    $Td3[0],TBL3,$Td1[1]            ; t3
+||     EXTU    $s[2],EXT0,24,$Td0[2]
+||     EXTU    $s[3],EXT0,24,$Td0[3]
+       LDW     *${TEA}[$Td0[2]],$Td0[2]        ; Td0[s2],      t2
+||     LDW     *${TEB}[$Td0[3]],$Td0[3]        ; Td0[s3],      t3
+||     ROTL    $Td2[2],TBL2,$Td2[2]            ; t0
+||     ROTL    $Td2[3],TBL2,$Td2[3]            ; t1
+||     XOR     $K[0],$Td1[0],$s[0]
+||     XOR     $K[1],$Td3[1],$s[1]
+       ROTL    $Td1[3],TBL1,$Td3[2]            ; t0
+||     ROTL    $Td3[2],TBL3,$Td1[3]            ; t1
+||     XOR     $K[2],$Td3[0],$s[2]
+||     XOR     $K[3],$Td1[1],$s[3]
+||     LDW     *$KPA++[2],$K[0]                ; next round key
+||     LDW     *$KPB++[2],$K[1]
+       ROTL    $Td2[0],TBL2,$Td2[0]            ; t2
+||     ROTL    $Td2[1],TBL2,$Td2[1]            ; t3
+||     XOR     $s[0],$Td2[2],$s[0]
+||     XOR     $s[1],$Td2[3],$s[1]
+||     LDW     *$KPA++[2],$K[2]
+||     LDW     *$KPB++[2],$K[3]
+       ROTL    $Td3[3],TBL3,$Td1[2]            ; t2
+||     ROTL    $Td1[2],TBL1,$Td3[3]            ; t3
+||     XOR     $s[0],$Td3[2],$s[0]
+||     XOR     $s[1],$Td1[3],$s[1]
+       XOR     $s[2],$Td2[0],$s[2]
+||     XOR     $s[3],$Td2[1],$s[3]
+||     XOR     $s[0],$Td0[0],$s[0]
+||     XOR     $s[1],$Td0[1],$s[1]
+       SPKERNEL
+||     XOR.L   $s[2],$Td1[2],$s[2]
+||     XOR.L   $s[3],$Td3[3],$s[3]
+;;====================================================================
+       ADD.D   ${TEA},A0,${TEA}                ; point to Td4
+||     ADD.D   ${TEB},A0,${TEB}
+||     EXTU    $s[1],EXT3,24,$Td3[1]
+||     EXTU    $s[0],EXT1,24,$Td1[0]
+       LDBU    *${TEB}[$Td3[1]],$Td3[1]        ; Td3[s1>>24],  t0
+||     LDBU    *${TEA}[$Td1[0]],$Td1[0]        ; Td1[s0>>8],   t1
+||     XOR     $s[2],$Td0[2],$s[2]             ; modulo-scheduled
+||     XOR     $s[3],$Td0[3],$s[3]             ; modulo-scheduled
+||     EXTU    $s[0],EXT0,24,$Td0[0]
+||     EXTU    $s[1],EXT0,24,$Td0[1]
+       LDBU    *${TEA}[$Td0[0]],$Td0[0]        ; Td0[s0],      t0
+||     LDBU    *${TEB}[$Td0[1]],$Td0[1]        ; Td0[s1],      t1
+||     EXTU    $s[2],EXT2,24,$Td2[2]
+||     EXTU    $s[3],EXT2,24,$Td2[3]
+       LDBU    *${TEA}[$Td2[2]],$Td2[2]        ; Td2[s2>>16],  t0
+||     LDBU    *${TEB}[$Td2[3]],$Td2[3]        ; Td2[s3>>16],  t1
+||     EXTU    $s[3],EXT1,24,$Td1[3]
+||     EXTU    $s[2],EXT3,24,$Td3[2]
+       LDBU    *${TEB}[$Td1[3]],$Td1[3]        ; Td1[s3>>8],   t0
+||     LDBU    *${TEA}[$Td3[2]],$Td3[2]        ; Td3[s2>>24],  t1
+||     EXTU    $s[1],EXT1,24,$Td1[1]
+||     EXTU    $s[0],EXT3,24,$Td3[0]
+       LDBU    *${TEB}[$Td1[1]],$Td1[1]        ; Td1[s1>>8],   t2
+||     LDBU    *${TEA}[$Td3[0]],$Td3[0]        ; Td3[s0>>24],  t3
+||     EXTU    $s[0],EXT2,24,$Td2[0]
+||     EXTU    $s[1],EXT2,24,$Td2[1]
+       LDBU    *${TEA}[$Td2[0]],$Td2[0]        ; Td2[s0>>16],  t2
+||     LDBU    *${TEB}[$Td2[1]],$Td2[1]        ; Td2[s1>>16],  t3
+||     EXTU    $s[3],EXT3,24,$Td3[3]
+||     EXTU    $s[2],EXT1,24,$Td1[2]
+       LDBU    *${TEB}[$Td3[3]],$Td3[3]        ; Td3[s3>>24],  t2
+||     LDBU    *${TEA}[$Td1[2]],$Td1[2]        ; Td1[s2>>8],   t3
+||     EXTU    $s[2],EXT0,24,$Td0[2]
+||     EXTU    $s[3],EXT0,24,$Td0[3]
+       LDBU    *${TEA}[$Td0[2]],$Td0[2]        ; Td0[s2],      t2
+||     LDBU    *${TEB}[$Td0[3]],$Td0[3]        ; Td0[s3],      t3
+
+       .if     .BIG_ENDIAN
+       PACK2   $Td0[0],$Td1[3],$Td0[0]
+||     PACK2   $Td0[1],$Td1[0],$Td0[1]
+       PACK2   $Td2[2],$Td3[1],$Td2[2]
+||     PACK2   $Td2[3],$Td3[2],$Td2[3]
+       PACKL4  $Td0[0],$Td2[2],$Td0[0]
+||     PACKL4  $Td0[1],$Td2[3],$Td0[1]
+       XOR     $K[0],$Td0[0],$Td0[0]           ; s[0]
+||     XOR     $K[1],$Td0[1],$Td0[1]           ; s[1]
+
+       PACK2   $Td0[2],$Td1[1],$Td0[2]
+||     PACK2   $Td0[3],$Td1[2],$Td0[3]
+       PACK2   $Td2[0],$Td3[3],$Td2[0]
+||     PACK2   $Td2[1],$Td3[0],$Td2[1]
+||     BNOP    RA
+       PACKL4  $Td0[2],$Td2[0],$Td0[2]
+||     PACKL4  $Td0[3],$Td2[1],$Td0[3]
+       XOR     $K[2],$Td0[2],$Td0[2]           ; s[2]
+||     XOR     $K[3],$Td0[3],$Td0[3]           ; s[3]
+
+       MV      $Td0[0],A9
+||     MV      $Td0[1],A8
+       MV      $Td0[2],B9
+||     MV      $Td0[3],B8
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .else
+       PACK2   $Td1[3],$Td0[0],$Td1[3]
+||     PACK2   $Td1[0],$Td0[1],$Td1[0]
+       PACK2   $Td3[1],$Td2[2],$Td3[1]
+||     PACK2   $Td3[2],$Td2[3],$Td3[2]
+       PACKL4  $Td3[1],$Td1[3],$Td1[3]
+||     PACKL4  $Td3[2],$Td1[0],$Td1[0]
+       XOR     $K[0],$Td1[3],$Td1[3]           ; s[0]
+||     XOR     $K[1],$Td1[0],$Td1[0]           ; s[1]
+
+       PACK2   $Td1[1],$Td0[2],$Td1[1]
+||     PACK2   $Td1[2],$Td0[3],$Td1[2]
+       PACK2   $Td3[3],$Td2[0],$Td3[3]
+||     PACK2   $Td3[0],$Td2[1],$Td3[0]
+||     BNOP    RA
+       PACKL4  $Td3[3],$Td1[1],$Td1[1]
+||     PACKL4  $Td3[0],$Td1[2],$Td1[2]
+       XOR     $K[2],$Td1[1],$Td1[1]           ; s[2]
+||     XOR     $K[3],$Td1[2],$Td1[2]           ; s[3]
+
+       MV      $Td1[3],A8
+||     MV      $Td1[0],A9
+       MV      $Td1[1],B8
+||     MV      $Td1[2],B9
+|| [B2]        STNDW   A9:A8,*OUT++
+   [B2]        STNDW   B9:B8,*OUT++
+       .endif
+       .endasmfunc
+___
+{
+my @K=(@K,@s);                 # extended key
+my @Te4=map("B$_",(16..19));
+
+my @Kx9=@Te0;                  # used in AES_set_decrypt_key
+my @KxB=@Te1;
+my @KxD=@Te2;
+my @KxE=@Te3;
+
+$code.=<<___;
+       .asg    OUT,BITS
+
+       .global _AES_set_encrypt_key
+_AES_set_encrypt_key:
+__set_encrypt_key:
+       .asmfunc
+       MV      INP,A0
+||     SHRU    BITS,5,BITS                     ; 128-192-256 -> 4-6-8
+||     MV      KEY,A1
+  [!A0]        B       RA
+||[!A0]        MVK     -1,RET
+||[!A0]        MVK     1,A1                            ; only one B RA
+  [!A1]        B       RA
+||[!A1]        MVK     -1,RET
+||[!A1]        MVK     0,A0
+||     MVK     0,B0
+||     MVK     0,A1
+   [A0]        LDNDW   *INP++,A9:A8
+|| [A0]        CMPEQ   4,BITS,B0
+|| [A0]        CMPLT   3,BITS,A1
+   [B0]        B       key128?
+|| [A1]        LDNDW   *INP++,B9:B8
+|| [A0]        CMPEQ   6,BITS,B0
+|| [A0]        CMPLT   5,BITS,A1
+   [B0]        B       key192?
+|| [A1]        LDNDW   *INP++,B17:B16
+|| [A0]        CMPEQ   8,BITS,B0
+|| [A0]        CMPLT   7,BITS,A1
+   [B0]        B       key256?
+|| [A1]        LDNDW   *INP++,B19:B18
+
+       .if     __TI_EABI__
+   [A0]        ADD     0,KEY,$KPA
+|| [A0]        ADD     4,KEY,$KPB
+|| [A0]        MVKL    \$PCR_OFFSET(AES_Te4,_AES_set_encrypt_key),$TEA
+|| [A0]        ADDKPC  _AES_set_encrypt_key,B6
+   [A0]        MVKH    \$PCR_OFFSET(AES_Te4,_AES_set_encrypt_key),$TEA
+   [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .else
+   [A0]        ADD     0,KEY,$KPA
+|| [A0]        ADD     4,KEY,$KPB
+|| [A0]        MVKL    (AES_Te4-_AES_set_encrypt_key),$TEA
+|| [A0]        ADDKPC  _AES_set_encrypt_key,B6
+   [A0]        MVKH    (AES_Te4-_AES_set_encrypt_key),$TEA
+   [A0]        ADD     B6,$TEA,$TEA                    ; AES_Te4
+       .endif
+       NOP
+       NOP
+
+       BNOP    RA,5
+||     MVK     -2,RET                          ; unknown bit lenght
+||     MVK     0,B0                            ; redundant
+;;====================================================================
+;;====================================================================
+key128?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$Te4[2]
+||     MV      B8,$K[3]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$Te4[2]
+||     MV      B9,$K[3]
+       .endif
+
+       MVK     256,A0
+||     MVK     9,B0
+
+       SPLOOPD 14
+||     MVC     B0,ILC
+||     MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[2]
+||     EXTU    $K[3],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[3],A0
+||     EXTU    $K[3],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[3],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+
+       XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+       XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+       SPKERNEL
+;;====================================================================
+       BNOP    RA
+       MV      $Te4[2],$K[2]
+||     STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     10,B0                           ; rounds
+       STW     B0,*++${KPB}[15]
+       MVK     0,RET
+;;====================================================================
+;;====================================================================
+key192?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$K[2]
+||     MV      B8,$K[3]
+       MV      B17,$Te4[2]
+||     MV      B16,$K[5]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$K[2]
+||     MV      B9,$K[3]
+       MV      B16,$Te4[2]
+||     MV      B17,$K[5]
+       .endif
+
+       MVK     256,A0
+||     MVK     6,B0
+       MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+loop192?:
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[4]
+||     EXTU    $K[5],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[5],A0
+||     EXTU    $K[5],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[5],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       STW     $K[4],*$KPA++[2]
+||     STW     $K[5],*$KPB++[2]
+
+       XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+       BDEC    loop192?,B0
+||     XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+       MV      $Te4[2],$K[2]
+||     XOR     $K[3],$K[4],$Te4[2]             ; K[4]
+       XOR     $Te4[2],$K[5],$K[5]             ; K[5]
+;;====================================================================
+       BNOP    RA
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     12,B0                           ; rounds
+       STW     B0,*++${KPB}[7]
+       MVK     0,RET
+;;====================================================================
+;;====================================================================
+key256?:
+       .if     .BIG_ENDIAN
+       MV      A9,$K[0]
+||     MV      A8,$K[1]
+||     MV      B9,$K[2]
+||     MV      B8,$K[3]
+       MV      B17,$K[4]
+||     MV      B16,$K[5]
+||     MV      B19,$Te4[2]
+||     MV      B18,$K[7]
+       .else
+       MV      A8,$K[0]
+||     MV      A9,$K[1]
+||     MV      B8,$K[2]
+||     MV      B9,$K[3]
+       MV      B16,$K[4]
+||     MV      B17,$K[5]
+||     MV      B18,$Te4[2]
+||     MV      B19,$K[7]
+       .endif
+
+       MVK     256,A0
+||     MVK     6,B0
+       MV      $TEA,$TEB
+||     ADD     $TEA,A0,A30                     ; rcon
+;;====================================================================
+loop256?:
+       LDW     *A30++[1],A31                   ; rcon[i]
+||     MV      $Te4[2],$K[6]
+||     EXTU    $K[7],EXT1,24,$Te4[0]
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[7],A0
+||     EXTU    $K[7],EXT2,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT3,24,A0
+||     EXTU    $K[7],EXT0,24,$Te4[3]
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       .endif
+
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       STW     $K[4],*$KPA++[2]
+||     STW     $K[5],*$KPB++[2]
+       STW     $K[6],*$KPA++[2]
+||     STW     $K[7],*$KPB++[2]
+||     XOR     A31,$K[0],$K[0]                 ; ^=rcon[i]
+       .if     .BIG_ENDIAN
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+||[!B0]        B       done256?
+       .else
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+||     PACK2   $Te4[3],A0,$Te4[3]
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+||[!B0]        B       done256?
+       .endif
+       XOR     $Te4[3],$K[0],$Te4[0]           ; K[0]
+       XOR     $Te4[0],$K[1],$K[1]             ; K[1]
+       MV      $Te4[0],$K[0]
+||     XOR     $K[1],$K[2],$Te4[2]             ; K[2]
+       XOR     $Te4[2],$K[3],$K[3]             ; K[3]
+
+       MV      $Te4[2],$K[2]
+|| [B0]        EXTU    $K[3],EXT0,24,$Te4[0]
+|| [B0]        SUB     B0,1,B0
+       LDBU    *${TEB}[$Te4[0]],$Te4[0]
+||     MV      $K[3],A0
+||     EXTU    $K[3],EXT1,24,$Te4[1]
+       LDBU    *${TEB}[$Te4[1]],$Te4[1]
+||     EXTU    A0,EXT2,24,A0
+||     EXTU    $K[3],EXT3,24,$Te4[3]
+
+       .if     .BIG_ENDIAN
+       LDBU    *${TEA}[A0],$Te4[3]
+||     LDBU    *${TEB}[$Te4[3]],A0
+       NOP     3
+       PACK2   $Te4[0],$Te4[1],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+||     B       loop256?
+       PACKL4  $Te4[1],$Te4[3],$Te4[3]
+       .else
+       LDBU    *${TEA}[A0],A0
+||     LDBU    *${TEB}[$Te4[3]],$Te4[3]
+       NOP     3
+       PACK2   $Te4[1],$Te4[0],$Te4[1]
+       PACK2   $Te4[3],A0,$Te4[3]
+||     B       loop256?
+       PACKL4  $Te4[3],$Te4[1],$Te4[3]
+       .endif
+
+       XOR     $Te4[3],$K[4],$Te4[0]           ; K[4]
+       XOR     $Te4[0],$K[5],$K[5]             ; K[5]
+       MV      $Te4[0],$K[4]
+||     XOR     $K[5],$K[6],$Te4[2]             ; K[6]
+       XOR     $Te4[2],$K[7],$K[7]             ; K[7]
+;;====================================================================
+done256?:
+       BNOP    RA
+       STW     $K[0],*$KPA++[2]
+||     STW     $K[1],*$KPB++[2]
+       STW     $K[2],*$KPA++[2]
+||     STW     $K[3],*$KPB++[2]
+       MVK     14,B0                           ; rounds
+       STW     B0,*--${KPB}[1]
+       MVK     0,RET
+       .endasmfunc
+
+       .global _AES_set_decrypt_key
+_AES_set_decrypt_key:
+       .asmfunc
+       B       __set_encrypt_key               ; guarantee local call
+       MV      KEY,B30                         ; B30 is not modified
+       MV      RA, B31                         ; B31 is not modified
+       ADDKPC  ret?,RA,2
+ret?:                                          ; B0 holds rounds or zero
+  [!B0]        BNOP    B31                             ; return if zero
+   [B0]        SHL     B0,4,A0                         ; offset to last round key
+   [B0]        SHRU    B0,1,B1
+   [B0]        SUB     B1,1,B1
+   [B0]        MVK     0x0000001B,B3                   ; AES polynomial
+   [B0]        MVKH    0x07000000,B3
+
+       SPLOOPD 9                               ; flip round keys
+||     MVC     B1,ILC
+||     MV      B30,$KPA
+||     ADD     B30,A0,$KPB
+||     MVK     16,A0                           ; sizeof(round key)
+;;====================================================================
+       LDW     *${KPA}[0],A16
+||     LDW     *${KPB}[0],B16
+       LDW     *${KPA}[1],A17
+||     LDW     *${KPB}[1],B17
+       LDW     *${KPA}[2],A18
+||     LDW     *${KPB}[2],B18
+       LDW     *${KPA}[3],A19
+||     ADD     $KPA,A0,$KPA
+||     LDW     *${KPB}[3],B19
+||     SUB     $KPB,A0,$KPB
+       NOP
+       STW     B16,*${KPA}[-4]
+||     STW     A16,*${KPB}[4]
+       STW     B17,*${KPA}[-3]
+||     STW     A17,*${KPB}[5]
+       STW     B18,*${KPA}[-2]
+||     STW     A18,*${KPB}[6]
+       STW     B19,*${KPA}[-1]
+||     STW     A19,*${KPB}[7]
+       SPKERNEL
+;;====================================================================
+       SUB     B0,1,B0                         ; skip last round
+||     ADD     B30,A0,$KPA                     ; skip first round
+||     ADD     B30,A0,$KPB
+||     MVC     GFPGFR,B30                      ; save GFPGFR
+       LDW     *${KPA}[0],$K[0]
+||     LDW     *${KPB}[1],$K[1]
+||     MVC     B3,GFPGFR
+       LDW     *${KPA}[2],$K[2]
+||     LDW     *${KPB}[3],$K[3]
+       MVK     0x00000909,A24
+||     MVK     0x00000B0B,B24
+       MVKH    0x09090000,A24
+||     MVKH    0x0B0B0000,B24
+       MVC     B0,ILC
+||     SUB     B0,1,B0
+
+       GMPY4   $K[0],A24,$Kx9[0]               ; ·0x09
+||     GMPY4   $K[1],A24,$Kx9[1]
+||     MVK     0x00000D0D,A25
+||     MVK     0x00000E0E,B25
+       GMPY4   $K[2],A24,$Kx9[2]
+||     GMPY4   $K[3],A24,$Kx9[3]
+||     MVKH    0x0D0D0000,A25
+||     MVKH    0x0E0E0000,B25
+
+       GMPY4   $K[0],B24,$KxB[0]               ; ·0x0B
+||     GMPY4   $K[1],B24,$KxB[1]
+       GMPY4   $K[2],B24,$KxB[2]
+||     GMPY4   $K[3],B24,$KxB[3]
+
+       SPLOOP  11                              ; InvMixColumns
+;;====================================================================
+       GMPY4   $K[0],A25,$KxD[0]               ; ·0x0D
+||     GMPY4   $K[1],A25,$KxD[1]
+||     SWAP2   $Kx9[0],$Kx9[0]                 ; rotate by 16
+||     SWAP2   $Kx9[1],$Kx9[1]
+||     MV      $K[0],$s[0]                     ; this or DINT
+||     MV      $K[1],$s[1]
+|| [B0]        LDW     *${KPA}[4],$K[0]
+|| [B0]        LDW     *${KPB}[5],$K[1]
+       GMPY4   $K[2],A25,$KxD[2]
+||     GMPY4   $K[3],A25,$KxD[3]
+||     SWAP2   $Kx9[2],$Kx9[2]
+||     SWAP2   $Kx9[3],$Kx9[3]
+||     MV      $K[2],$s[2]
+||     MV      $K[3],$s[3]
+|| [B0]        LDW     *${KPA}[6],$K[2]
+|| [B0]        LDW     *${KPB}[7],$K[3]
+
+       GMPY4   $s[0],B25,$KxE[0]               ; ·0x0E
+||     GMPY4   $s[1],B25,$KxE[1]
+||     XOR     $Kx9[0],$KxB[0],$KxB[0]
+||     XOR     $Kx9[1],$KxB[1],$KxB[1]
+       GMPY4   $s[2],B25,$KxE[2]
+||     GMPY4   $s[3],B25,$KxE[3]
+||     XOR     $Kx9[2],$KxB[2],$KxB[2]
+||     XOR     $Kx9[3],$KxB[3],$KxB[3]
+
+       ROTL    $KxB[0],TBL3,$KxB[0]
+||     ROTL    $KxB[1],TBL3,$KxB[1]
+||     SWAP2   $KxD[0],$KxD[0]                 ; rotate by 16
+||     SWAP2   $KxD[1],$KxD[1]
+       ROTL    $KxB[2],TBL3,$KxB[2]
+||     ROTL    $KxB[3],TBL3,$KxB[3]
+||     SWAP2   $KxD[2],$KxD[2]
+||     SWAP2   $KxD[3],$KxD[3]
+
+       XOR     $KxE[0],$KxD[0],$KxE[0]
+||     XOR     $KxE[1],$KxD[1],$KxE[1]
+|| [B0]        GMPY4   $K[0],A24,$Kx9[0]               ; ·0x09
+|| [B0]        GMPY4   $K[1],A24,$Kx9[1]
+||     ADDAW   $KPA,4,$KPA
+       XOR     $KxE[2],$KxD[2],$KxE[2]
+||     XOR     $KxE[3],$KxD[3],$KxE[3]
+|| [B0]        GMPY4   $K[2],A24,$Kx9[2]
+|| [B0]        GMPY4   $K[3],A24,$Kx9[3]
+||     ADDAW   $KPB,4,$KPB
+
+       XOR     $KxB[0],$KxE[0],$KxE[0]
+||     XOR     $KxB[1],$KxE[1],$KxE[1]
+|| [B0]        GMPY4   $K[0],B24,$KxB[0]               ; ·0x0B
+|| [B0]        GMPY4   $K[1],B24,$KxB[1]
+       XOR     $KxB[2],$KxE[2],$KxE[2]
+||     XOR     $KxB[3],$KxE[3],$KxE[3]
+|| [B0]        GMPY4   $K[2],B24,$KxB[2]
+|| [B0]        GMPY4   $K[3],B24,$KxB[3]
+||     STW     $KxE[0],*${KPA}[-4]
+||     STW     $KxE[1],*${KPB}[-3]
+       STW     $KxE[2],*${KPA}[-2]
+||     STW     $KxE[3],*${KPB}[-1]
+|| [B0]        SUB     B0,1,B0
+       SPKERNEL
+;;====================================================================
+       BNOP    B31,3
+       MVC     B30,GFPGFR                      ; restore GFPGFR(*)
+       MVK     0,RET
+       .endasmfunc
+___
+# (*)  Even though ABI doesn't specify GFPGFR as non-volatile, there
+#      are code samples out there that *assume* its default value.
+}
+{
+my ($inp,$out,$blocks,$key,$ivp)=("A4","B4","A6","B6","A8");
+$code.=<<___;
+       .global _AES_ctr32_encrypt
+_AES_ctr32_encrypt:
+       .asmfunc
+       LDNDW   *${ivp}[0],A31:A30      ; load counter value
+||     MV      $blocks,A2              ; reassign $blocks
+||     DMV     RA,$key,B27:B26         ; reassign RA and $key
+       LDNDW   *${ivp}[1],B31:B30
+||     MVK     0,B2                    ; don't let __encrypt load input
+||     MVK     0,A1                    ; and postpone writing output
+       .if     .BIG_ENDIAN
+       NOP
+       .else
+       NOP     4
+       SWAP2   B31,B31                 ; keep least significant 32 bits
+       SWAP4   B31,B31                 ; in host byte order
+       .endif
+ctr32_loop?:
+   [A2]        BNOP    __encrypt
+|| [A1]        XOR     A29,A9,A9               ; input^Ek(counter)
+|| [A1]        XOR     A28,A8,A8
+|| [A2]        LDNDW   *INP++,A29:A28          ; load input
+  [!A2]        BNOP    B27                     ; return
+|| [A1]        XOR     B29,B9,B9
+|| [A1]        XOR     B28,B8,B8
+|| [A2]        LDNDW   *INP++,B29:B28
+       .if     .BIG_ENDIAN
+   [A1]        STNDW   A9:A8,*OUT++            ; save output
+|| [A2]        DMV     A31,A30,A9:A8           ; pass counter value to __encrypt
+   [A1]        STNDW   B9:B8,*OUT++
+|| [A2]        DMV     B31,B30,B9:B8
+|| [A2]        ADD     B30,1,B30               ; counter++
+       .else
+   [A1]        STNDW   A9:A8,*OUT++            ; save output
+|| [A2]        DMV     A31,A30,A9:A8
+|| [A2]        SWAP2   B31,B0
+|| [A2]        ADD     B31,1,B31               ; counter++
+   [A1]        STNDW   B9:B8,*OUT++
+|| [A2]        MV      B30,B8
+|| [A2]        SWAP4   B0,B9
+       .endif
+   [A2]        ADDKPC  ctr32_loop?,RA          ; return to ctr32_loop?
+|| [A2]        MV      B26,KEY                 ; pass $key
+|| [A2]        SUB     A2,1,A2                 ; $blocks--
+||[!A1]        MVK     1,A1
+       NOP
+       NOP
+       .endasmfunc
+___
+}
+# Tables are kept in endian-neutral manner
+$code.=<<___;
+       .sect   ".const:aes_asm"
+       .align  128
+AES_Te:
+       .byte   0xc6,0x63,0x63,0xa5,    0xf8,0x7c,0x7c,0x84
+       .byte   0xee,0x77,0x77,0x99,    0xf6,0x7b,0x7b,0x8d
+       .byte   0xff,0xf2,0xf2,0x0d,    0xd6,0x6b,0x6b,0xbd
+       .byte   0xde,0x6f,0x6f,0xb1,    0x91,0xc5,0xc5,0x54
+       .byte   0x60,0x30,0x30,0x50,    0x02,0x01,0x01,0x03
+       .byte   0xce,0x67,0x67,0xa9,    0x56,0x2b,0x2b,0x7d
+       .byte   0xe7,0xfe,0xfe,0x19,    0xb5,0xd7,0xd7,0x62
+       .byte   0x4d,0xab,0xab,0xe6,    0xec,0x76,0x76,0x9a
+       .byte   0x8f,0xca,0xca,0x45,    0x1f,0x82,0x82,0x9d
+       .byte   0x89,0xc9,0xc9,0x40,    0xfa,0x7d,0x7d,0x87
+       .byte   0xef,0xfa,0xfa,0x15,    0xb2,0x59,0x59,0xeb
+       .byte   0x8e,0x47,0x47,0xc9,    0xfb,0xf0,0xf0,0x0b
+       .byte   0x41,0xad,0xad,0xec,    0xb3,0xd4,0xd4,0x67
+       .byte   0x5f,0xa2,0xa2,0xfd,    0x45,0xaf,0xaf,0xea
+       .byte   0x23,0x9c,0x9c,0xbf,    0x53,0xa4,0xa4,0xf7
+       .byte   0xe4,0x72,0x72,0x96,    0x9b,0xc0,0xc0,0x5b
+       .byte   0x75,0xb7,0xb7,0xc2,    0xe1,0xfd,0xfd,0x1c
+       .byte   0x3d,0x93,0x93,0xae,    0x4c,0x26,0x26,0x6a
+       .byte   0x6c,0x36,0x36,0x5a,    0x7e,0x3f,0x3f,0x41
+       .byte   0xf5,0xf7,0xf7,0x02,    0x83,0xcc,0xcc,0x4f
+       .byte   0x68,0x34,0x34,0x5c,    0x51,0xa5,0xa5,0xf4
+       .byte   0xd1,0xe5,0xe5,0x34,    0xf9,0xf1,0xf1,0x08
+       .byte   0xe2,0x71,0x71,0x93,    0xab,0xd8,0xd8,0x73
+       .byte   0x62,0x31,0x31,0x53,    0x2a,0x15,0x15,0x3f
+       .byte   0x08,0x04,0x04,0x0c,    0x95,0xc7,0xc7,0x52
+       .byte   0x46,0x23,0x23,0x65,    0x9d,0xc3,0xc3,0x5e
+       .byte   0x30,0x18,0x18,0x28,    0x37,0x96,0x96,0xa1
+       .byte   0x0a,0x05,0x05,0x0f,    0x2f,0x9a,0x9a,0xb5
+       .byte   0x0e,0x07,0x07,0x09,    0x24,0x12,0x12,0x36
+       .byte   0x1b,0x80,0x80,0x9b,    0xdf,0xe2,0xe2,0x3d
+       .byte   0xcd,0xeb,0xeb,0x26,    0x4e,0x27,0x27,0x69
+       .byte   0x7f,0xb2,0xb2,0xcd,    0xea,0x75,0x75,0x9f
+       .byte   0x12,0x09,0x09,0x1b,    0x1d,0x83,0x83,0x9e
+       .byte   0x58,0x2c,0x2c,0x74,    0x34,0x1a,0x1a,0x2e
+       .byte   0x36,0x1b,0x1b,0x2d,    0xdc,0x6e,0x6e,0xb2
+       .byte   0xb4,0x5a,0x5a,0xee,    0x5b,0xa0,0xa0,0xfb
+       .byte   0xa4,0x52,0x52,0xf6,    0x76,0x3b,0x3b,0x4d
+       .byte   0xb7,0xd6,0xd6,0x61,    0x7d,0xb3,0xb3,0xce
+       .byte   0x52,0x29,0x29,0x7b,    0xdd,0xe3,0xe3,0x3e
+       .byte   0x5e,0x2f,0x2f,0x71,    0x13,0x84,0x84,0x97
+       .byte   0xa6,0x53,0x53,0xf5,    0xb9,0xd1,0xd1,0x68
+       .byte   0x00,0x00,0x00,0x00,    0xc1,0xed,0xed,0x2c
+       .byte   0x40,0x20,0x20,0x60,    0xe3,0xfc,0xfc,0x1f
+       .byte   0x79,0xb1,0xb1,0xc8,    0xb6,0x5b,0x5b,0xed
+       .byte   0xd4,0x6a,0x6a,0xbe,    0x8d,0xcb,0xcb,0x46
+       .byte   0x67,0xbe,0xbe,0xd9,    0x72,0x39,0x39,0x4b
+       .byte   0x94,0x4a,0x4a,0xde,    0x98,0x4c,0x4c,0xd4
+       .byte   0xb0,0x58,0x58,0xe8,    0x85,0xcf,0xcf,0x4a
+       .byte   0xbb,0xd0,0xd0,0x6b,    0xc5,0xef,0xef,0x2a
+       .byte   0x4f,0xaa,0xaa,0xe5,    0xed,0xfb,0xfb,0x16
+       .byte   0x86,0x43,0x43,0xc5,    0x9a,0x4d,0x4d,0xd7
+       .byte   0x66,0x33,0x33,0x55,    0x11,0x85,0x85,0x94
+       .byte   0x8a,0x45,0x45,0xcf,    0xe9,0xf9,0xf9,0x10
+       .byte   0x04,0x02,0x02,0x06,    0xfe,0x7f,0x7f,0x81
+       .byte   0xa0,0x50,0x50,0xf0,    0x78,0x3c,0x3c,0x44
+       .byte   0x25,0x9f,0x9f,0xba,    0x4b,0xa8,0xa8,0xe3
+       .byte   0xa2,0x51,0x51,0xf3,    0x5d,0xa3,0xa3,0xfe
+       .byte   0x80,0x40,0x40,0xc0,    0x05,0x8f,0x8f,0x8a
+       .byte   0x3f,0x92,0x92,0xad,    0x21,0x9d,0x9d,0xbc
+       .byte   0x70,0x38,0x38,0x48,    0xf1,0xf5,0xf5,0x04
+       .byte   0x63,0xbc,0xbc,0xdf,    0x77,0xb6,0xb6,0xc1
+       .byte   0xaf,0xda,0xda,0x75,    0x42,0x21,0x21,0x63
+       .byte   0x20,0x10,0x10,0x30,    0xe5,0xff,0xff,0x1a
+       .byte   0xfd,0xf3,0xf3,0x0e,    0xbf,0xd2,0xd2,0x6d
+       .byte   0x81,0xcd,0xcd,0x4c,    0x18,0x0c,0x0c,0x14
+       .byte   0x26,0x13,0x13,0x35,    0xc3,0xec,0xec,0x2f
+       .byte   0xbe,0x5f,0x5f,0xe1,    0x35,0x97,0x97,0xa2
+       .byte   0x88,0x44,0x44,0xcc,    0x2e,0x17,0x17,0x39
+       .byte   0x93,0xc4,0xc4,0x57,    0x55,0xa7,0xa7,0xf2
+       .byte   0xfc,0x7e,0x7e,0x82,    0x7a,0x3d,0x3d,0x47
+       .byte   0xc8,0x64,0x64,0xac,    0xba,0x5d,0x5d,0xe7
+       .byte   0x32,0x19,0x19,0x2b,    0xe6,0x73,0x73,0x95
+       .byte   0xc0,0x60,0x60,0xa0,    0x19,0x81,0x81,0x98
+       .byte   0x9e,0x4f,0x4f,0xd1,    0xa3,0xdc,0xdc,0x7f
+       .byte   0x44,0x22,0x22,0x66,    0x54,0x2a,0x2a,0x7e
+       .byte   0x3b,0x90,0x90,0xab,    0x0b,0x88,0x88,0x83
+       .byte   0x8c,0x46,0x46,0xca,    0xc7,0xee,0xee,0x29
+       .byte   0x6b,0xb8,0xb8,0xd3,    0x28,0x14,0x14,0x3c
+       .byte   0xa7,0xde,0xde,0x79,    0xbc,0x5e,0x5e,0xe2
+       .byte   0x16,0x0b,0x0b,0x1d,    0xad,0xdb,0xdb,0x76
+       .byte   0xdb,0xe0,0xe0,0x3b,    0x64,0x32,0x32,0x56
+       .byte   0x74,0x3a,0x3a,0x4e,    0x14,0x0a,0x0a,0x1e
+       .byte   0x92,0x49,0x49,0xdb,    0x0c,0x06,0x06,0x0a
+       .byte   0x48,0x24,0x24,0x6c,    0xb8,0x5c,0x5c,0xe4
+       .byte   0x9f,0xc2,0xc2,0x5d,    0xbd,0xd3,0xd3,0x6e
+       .byte   0x43,0xac,0xac,0xef,    0xc4,0x62,0x62,0xa6
+       .byte   0x39,0x91,0x91,0xa8,    0x31,0x95,0x95,0xa4
+       .byte   0xd3,0xe4,0xe4,0x37,    0xf2,0x79,0x79,0x8b
+       .byte   0xd5,0xe7,0xe7,0x32,    0x8b,0xc8,0xc8,0x43
+       .byte   0x6e,0x37,0x37,0x59,    0xda,0x6d,0x6d,0xb7
+       .byte   0x01,0x8d,0x8d,0x8c,    0xb1,0xd5,0xd5,0x64
+       .byte   0x9c,0x4e,0x4e,0xd2,    0x49,0xa9,0xa9,0xe0
+       .byte   0xd8,0x6c,0x6c,0xb4,    0xac,0x56,0x56,0xfa
+       .byte   0xf3,0xf4,0xf4,0x07,    0xcf,0xea,0xea,0x25
+       .byte   0xca,0x65,0x65,0xaf,    0xf4,0x7a,0x7a,0x8e
+       .byte   0x47,0xae,0xae,0xe9,    0x10,0x08,0x08,0x18
+       .byte   0x6f,0xba,0xba,0xd5,    0xf0,0x78,0x78,0x88
+       .byte   0x4a,0x25,0x25,0x6f,    0x5c,0x2e,0x2e,0x72
+       .byte   0x38,0x1c,0x1c,0x24,    0x57,0xa6,0xa6,0xf1
+       .byte   0x73,0xb4,0xb4,0xc7,    0x97,0xc6,0xc6,0x51
+       .byte   0xcb,0xe8,0xe8,0x23,    0xa1,0xdd,0xdd,0x7c
+       .byte   0xe8,0x74,0x74,0x9c,    0x3e,0x1f,0x1f,0x21
+       .byte   0x96,0x4b,0x4b,0xdd,    0x61,0xbd,0xbd,0xdc
+       .byte   0x0d,0x8b,0x8b,0x86,    0x0f,0x8a,0x8a,0x85
+       .byte   0xe0,0x70,0x70,0x90,    0x7c,0x3e,0x3e,0x42
+       .byte   0x71,0xb5,0xb5,0xc4,    0xcc,0x66,0x66,0xaa
+       .byte   0x90,0x48,0x48,0xd8,    0x06,0x03,0x03,0x05
+       .byte   0xf7,0xf6,0xf6,0x01,    0x1c,0x0e,0x0e,0x12
+       .byte   0xc2,0x61,0x61,0xa3,    0x6a,0x35,0x35,0x5f
+       .byte   0xae,0x57,0x57,0xf9,    0x69,0xb9,0xb9,0xd0
+       .byte   0x17,0x86,0x86,0x91,    0x99,0xc1,0xc1,0x58
+       .byte   0x3a,0x1d,0x1d,0x27,    0x27,0x9e,0x9e,0xb9
+       .byte   0xd9,0xe1,0xe1,0x38,    0xeb,0xf8,0xf8,0x13
+       .byte   0x2b,0x98,0x98,0xb3,    0x22,0x11,0x11,0x33
+       .byte   0xd2,0x69,0x69,0xbb,    0xa9,0xd9,0xd9,0x70
+       .byte   0x07,0x8e,0x8e,0x89,    0x33,0x94,0x94,0xa7
+       .byte   0x2d,0x9b,0x9b,0xb6,    0x3c,0x1e,0x1e,0x22
+       .byte   0x15,0x87,0x87,0x92,    0xc9,0xe9,0xe9,0x20
+       .byte   0x87,0xce,0xce,0x49,    0xaa,0x55,0x55,0xff
+       .byte   0x50,0x28,0x28,0x78,    0xa5,0xdf,0xdf,0x7a
+       .byte   0x03,0x8c,0x8c,0x8f,    0x59,0xa1,0xa1,0xf8
+       .byte   0x09,0x89,0x89,0x80,    0x1a,0x0d,0x0d,0x17
+       .byte   0x65,0xbf,0xbf,0xda,    0xd7,0xe6,0xe6,0x31
+       .byte   0x84,0x42,0x42,0xc6,    0xd0,0x68,0x68,0xb8
+       .byte   0x82,0x41,0x41,0xc3,    0x29,0x99,0x99,0xb0
+       .byte   0x5a,0x2d,0x2d,0x77,    0x1e,0x0f,0x0f,0x11
+       .byte   0x7b,0xb0,0xb0,0xcb,    0xa8,0x54,0x54,0xfc
+       .byte   0x6d,0xbb,0xbb,0xd6,    0x2c,0x16,0x16,0x3a
+AES_Te4:
+       .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
+       .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
+       .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
+       .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
+       .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
+       .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
+       .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
+       .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
+       .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
+       .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
+       .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
+       .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
+       .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
+       .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
+       .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
+       .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
+       .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
+       .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
+       .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
+       .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
+       .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
+       .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
+       .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
+       .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
+       .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
+       .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
+       .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
+       .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
+       .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
+       .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
+       .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
+       .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
+rcon:
+       .byte   0x01,0x00,0x00,0x00,    0x02,0x00,0x00,0x00
+       .byte   0x04,0x00,0x00,0x00,    0x08,0x00,0x00,0x00
+       .byte   0x10,0x00,0x00,0x00,    0x20,0x00,0x00,0x00
+       .byte   0x40,0x00,0x00,0x00,    0x80,0x00,0x00,0x00
+       .byte   0x1B,0x00,0x00,0x00,    0x36,0x00,0x00,0x00
+       .align  128
+AES_Td:
+       .byte   0x51,0xf4,0xa7,0x50,    0x7e,0x41,0x65,0x53
+       .byte   0x1a,0x17,0xa4,0xc3,    0x3a,0x27,0x5e,0x96
+       .byte   0x3b,0xab,0x6b,0xcb,    0x1f,0x9d,0x45,0xf1
+       .byte   0xac,0xfa,0x58,0xab,    0x4b,0xe3,0x03,0x93
+       .byte   0x20,0x30,0xfa,0x55,    0xad,0x76,0x6d,0xf6
+       .byte   0x88,0xcc,0x76,0x91,    0xf5,0x02,0x4c,0x25
+       .byte   0x4f,0xe5,0xd7,0xfc,    0xc5,0x2a,0xcb,0xd7
+       .byte   0x26,0x35,0x44,0x80,    0xb5,0x62,0xa3,0x8f
+       .byte   0xde,0xb1,0x5a,0x49,    0x25,0xba,0x1b,0x67
+       .byte   0x45,0xea,0x0e,0x98,    0x5d,0xfe,0xc0,0xe1
+       .byte   0xc3,0x2f,0x75,0x02,    0x81,0x4c,0xf0,0x12
+       .byte   0x8d,0x46,0x97,0xa3,    0x6b,0xd3,0xf9,0xc6
+       .byte   0x03,0x8f,0x5f,0xe7,    0x15,0x92,0x9c,0x95
+       .byte   0xbf,0x6d,0x7a,0xeb,    0x95,0x52,0x59,0xda
+       .byte   0xd4,0xbe,0x83,0x2d,    0x58,0x74,0x21,0xd3
+       .byte   0x49,0xe0,0x69,0x29,    0x8e,0xc9,0xc8,0x44
+       .byte   0x75,0xc2,0x89,0x6a,    0xf4,0x8e,0x79,0x78
+       .byte   0x99,0x58,0x3e,0x6b,    0x27,0xb9,0x71,0xdd
+       .byte   0xbe,0xe1,0x4f,0xb6,    0xf0,0x88,0xad,0x17
+       .byte   0xc9,0x20,0xac,0x66,    0x7d,0xce,0x3a,0xb4
+       .byte   0x63,0xdf,0x4a,0x18,    0xe5,0x1a,0x31,0x82
+       .byte   0x97,0x51,0x33,0x60,    0x62,0x53,0x7f,0x45
+       .byte   0xb1,0x64,0x77,0xe0,    0xbb,0x6b,0xae,0x84
+       .byte   0xfe,0x81,0xa0,0x1c,    0xf9,0x08,0x2b,0x94
+       .byte   0x70,0x48,0x68,0x58,    0x8f,0x45,0xfd,0x19
+       .byte   0x94,0xde,0x6c,0x87,    0x52,0x7b,0xf8,0xb7
+       .byte   0xab,0x73,0xd3,0x23,    0x72,0x4b,0x02,0xe2
+       .byte   0xe3,0x1f,0x8f,0x57,    0x66,0x55,0xab,0x2a
+       .byte   0xb2,0xeb,0x28,0x07,    0x2f,0xb5,0xc2,0x03
+       .byte   0x86,0xc5,0x7b,0x9a,    0xd3,0x37,0x08,0xa5
+       .byte   0x30,0x28,0x87,0xf2,    0x23,0xbf,0xa5,0xb2
+       .byte   0x02,0x03,0x6a,0xba,    0xed,0x16,0x82,0x5c
+       .byte   0x8a,0xcf,0x1c,0x2b,    0xa7,0x79,0xb4,0x92
+       .byte   0xf3,0x07,0xf2,0xf0,    0x4e,0x69,0xe2,0xa1
+       .byte   0x65,0xda,0xf4,0xcd,    0x06,0x05,0xbe,0xd5
+       .byte   0xd1,0x34,0x62,0x1f,    0xc4,0xa6,0xfe,0x8a
+       .byte   0x34,0x2e,0x53,0x9d,    0xa2,0xf3,0x55,0xa0
+       .byte   0x05,0x8a,0xe1,0x32,    0xa4,0xf6,0xeb,0x75
+       .byte   0x0b,0x83,0xec,0x39,    0x40,0x60,0xef,0xaa
+       .byte   0x5e,0x71,0x9f,0x06,    0xbd,0x6e,0x10,0x51
+       .byte   0x3e,0x21,0x8a,0xf9,    0x96,0xdd,0x06,0x3d
+       .byte   0xdd,0x3e,0x05,0xae,    0x4d,0xe6,0xbd,0x46
+       .byte   0x91,0x54,0x8d,0xb5,    0x71,0xc4,0x5d,0x05
+       .byte   0x04,0x06,0xd4,0x6f,    0x60,0x50,0x15,0xff
+       .byte   0x19,0x98,0xfb,0x24,    0xd6,0xbd,0xe9,0x97
+       .byte   0x89,0x40,0x43,0xcc,    0x67,0xd9,0x9e,0x77
+       .byte   0xb0,0xe8,0x42,0xbd,    0x07,0x89,0x8b,0x88
+       .byte   0xe7,0x19,0x5b,0x38,    0x79,0xc8,0xee,0xdb
+       .byte   0xa1,0x7c,0x0a,0x47,    0x7c,0x42,0x0f,0xe9
+       .byte   0xf8,0x84,0x1e,0xc9,    0x00,0x00,0x00,0x00
+       .byte   0x09,0x80,0x86,0x83,    0x32,0x2b,0xed,0x48
+       .byte   0x1e,0x11,0x70,0xac,    0x6c,0x5a,0x72,0x4e
+       .byte   0xfd,0x0e,0xff,0xfb,    0x0f,0x85,0x38,0x56
+       .byte   0x3d,0xae,0xd5,0x1e,    0x36,0x2d,0x39,0x27
+       .byte   0x0a,0x0f,0xd9,0x64,    0x68,0x5c,0xa6,0x21
+       .byte   0x9b,0x5b,0x54,0xd1,    0x24,0x36,0x2e,0x3a
+       .byte   0x0c,0x0a,0x67,0xb1,    0x93,0x57,0xe7,0x0f
+       .byte   0xb4,0xee,0x96,0xd2,    0x1b,0x9b,0x91,0x9e
+       .byte   0x80,0xc0,0xc5,0x4f,    0x61,0xdc,0x20,0xa2
+       .byte   0x5a,0x77,0x4b,0x69,    0x1c,0x12,0x1a,0x16
+       .byte   0xe2,0x93,0xba,0x0a,    0xc0,0xa0,0x2a,0xe5
+       .byte   0x3c,0x22,0xe0,0x43,    0x12,0x1b,0x17,0x1d
+       .byte   0x0e,0x09,0x0d,0x0b,    0xf2,0x8b,0xc7,0xad
+       .byte   0x2d,0xb6,0xa8,0xb9,    0x14,0x1e,0xa9,0xc8
+       .byte   0x57,0xf1,0x19,0x85,    0xaf,0x75,0x07,0x4c
+       .byte   0xee,0x99,0xdd,0xbb,    0xa3,0x7f,0x60,0xfd
+       .byte   0xf7,0x01,0x26,0x9f,    0x5c,0x72,0xf5,0xbc
+       .byte   0x44,0x66,0x3b,0xc5,    0x5b,0xfb,0x7e,0x34
+       .byte   0x8b,0x43,0x29,0x76,    0xcb,0x23,0xc6,0xdc
+       .byte   0xb6,0xed,0xfc,0x68,    0xb8,0xe4,0xf1,0x63
+       .byte   0xd7,0x31,0xdc,0xca,    0x42,0x63,0x85,0x10
+       .byte   0x13,0x97,0x22,0x40,    0x84,0xc6,0x11,0x20
+       .byte   0x85,0x4a,0x24,0x7d,    0xd2,0xbb,0x3d,0xf8
+       .byte   0xae,0xf9,0x32,0x11,    0xc7,0x29,0xa1,0x6d
+       .byte   0x1d,0x9e,0x2f,0x4b,    0xdc,0xb2,0x30,0xf3
+       .byte   0x0d,0x86,0x52,0xec,    0x77,0xc1,0xe3,0xd0
+       .byte   0x2b,0xb3,0x16,0x6c,    0xa9,0x70,0xb9,0x99
+       .byte   0x11,0x94,0x48,0xfa,    0x47,0xe9,0x64,0x22
+       .byte   0xa8,0xfc,0x8c,0xc4,    0xa0,0xf0,0x3f,0x1a
+       .byte   0x56,0x7d,0x2c,0xd8,    0x22,0x33,0x90,0xef
+       .byte   0x87,0x49,0x4e,0xc7,    0xd9,0x38,0xd1,0xc1
+       .byte   0x8c,0xca,0xa2,0xfe,    0x98,0xd4,0x0b,0x36
+       .byte   0xa6,0xf5,0x81,0xcf,    0xa5,0x7a,0xde,0x28
+       .byte   0xda,0xb7,0x8e,0x26,    0x3f,0xad,0xbf,0xa4
+       .byte   0x2c,0x3a,0x9d,0xe4,    0x50,0x78,0x92,0x0d
+       .byte   0x6a,0x5f,0xcc,0x9b,    0x54,0x7e,0x46,0x62
+       .byte   0xf6,0x8d,0x13,0xc2,    0x90,0xd8,0xb8,0xe8
+       .byte   0x2e,0x39,0xf7,0x5e,    0x82,0xc3,0xaf,0xf5
+       .byte   0x9f,0x5d,0x80,0xbe,    0x69,0xd0,0x93,0x7c
+       .byte   0x6f,0xd5,0x2d,0xa9,    0xcf,0x25,0x12,0xb3
+       .byte   0xc8,0xac,0x99,0x3b,    0x10,0x18,0x7d,0xa7
+       .byte   0xe8,0x9c,0x63,0x6e,    0xdb,0x3b,0xbb,0x7b
+       .byte   0xcd,0x26,0x78,0x09,    0x6e,0x59,0x18,0xf4
+       .byte   0xec,0x9a,0xb7,0x01,    0x83,0x4f,0x9a,0xa8
+       .byte   0xe6,0x95,0x6e,0x65,    0xaa,0xff,0xe6,0x7e
+       .byte   0x21,0xbc,0xcf,0x08,    0xef,0x15,0xe8,0xe6
+       .byte   0xba,0xe7,0x9b,0xd9,    0x4a,0x6f,0x36,0xce
+       .byte   0xea,0x9f,0x09,0xd4,    0x29,0xb0,0x7c,0xd6
+       .byte   0x31,0xa4,0xb2,0xaf,    0x2a,0x3f,0x23,0x31
+       .byte   0xc6,0xa5,0x94,0x30,    0x35,0xa2,0x66,0xc0
+       .byte   0x74,0x4e,0xbc,0x37,    0xfc,0x82,0xca,0xa6
+       .byte   0xe0,0x90,0xd0,0xb0,    0x33,0xa7,0xd8,0x15
+       .byte   0xf1,0x04,0x98,0x4a,    0x41,0xec,0xda,0xf7
+       .byte   0x7f,0xcd,0x50,0x0e,    0x17,0x91,0xf6,0x2f
+       .byte   0x76,0x4d,0xd6,0x8d,    0x43,0xef,0xb0,0x4d
+       .byte   0xcc,0xaa,0x4d,0x54,    0xe4,0x96,0x04,0xdf
+       .byte   0x9e,0xd1,0xb5,0xe3,    0x4c,0x6a,0x88,0x1b
+       .byte   0xc1,0x2c,0x1f,0xb8,    0x46,0x65,0x51,0x7f
+       .byte   0x9d,0x5e,0xea,0x04,    0x01,0x8c,0x35,0x5d
+       .byte   0xfa,0x87,0x74,0x73,    0xfb,0x0b,0x41,0x2e
+       .byte   0xb3,0x67,0x1d,0x5a,    0x92,0xdb,0xd2,0x52
+       .byte   0xe9,0x10,0x56,0x33,    0x6d,0xd6,0x47,0x13
+       .byte   0x9a,0xd7,0x61,0x8c,    0x37,0xa1,0x0c,0x7a
+       .byte   0x59,0xf8,0x14,0x8e,    0xeb,0x13,0x3c,0x89
+       .byte   0xce,0xa9,0x27,0xee,    0xb7,0x61,0xc9,0x35
+       .byte   0xe1,0x1c,0xe5,0xed,    0x7a,0x47,0xb1,0x3c
+       .byte   0x9c,0xd2,0xdf,0x59,    0x55,0xf2,0x73,0x3f
+       .byte   0x18,0x14,0xce,0x79,    0x73,0xc7,0x37,0xbf
+       .byte   0x53,0xf7,0xcd,0xea,    0x5f,0xfd,0xaa,0x5b
+       .byte   0xdf,0x3d,0x6f,0x14,    0x78,0x44,0xdb,0x86
+       .byte   0xca,0xaf,0xf3,0x81,    0xb9,0x68,0xc4,0x3e
+       .byte   0x38,0x24,0x34,0x2c,    0xc2,0xa3,0x40,0x5f
+       .byte   0x16,0x1d,0xc3,0x72,    0xbc,0xe2,0x25,0x0c
+       .byte   0x28,0x3c,0x49,0x8b,    0xff,0x0d,0x95,0x41
+       .byte   0x39,0xa8,0x01,0x71,    0x08,0x0c,0xb3,0xde
+       .byte   0xd8,0xb4,0xe4,0x9c,    0x64,0x56,0xc1,0x90
+       .byte   0x7b,0xcb,0x84,0x61,    0xd5,0x32,0xb6,0x70
+       .byte   0x48,0x6c,0x5c,0x74,    0xd0,0xb8,0x57,0x42
+AES_Td4:
+       .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
+       .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
+       .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
+       .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
+       .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
+       .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
+       .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
+       .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
+       .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
+       .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
+       .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
+       .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
+       .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
+       .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
+       .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
+       .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
+       .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
+       .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
+       .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
+       .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
+       .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
+       .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
+       .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
+       .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
+       .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
+       .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
+       .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
+       .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
+       .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
+       .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
+       .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
+       .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
+       .cstring "AES for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
diff --git a/crypto/bn/asm/bn-c64xplus.asm b/crypto/bn/asm/bn-c64xplus.asm
new file mode 100644 (file)
index 0000000..161547c
--- /dev/null
@@ -0,0 +1,333 @@
+;;====================================================================
+;; Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+;; project.
+;;
+;; Rights for redistribution and usage in source and binary forms are
+;; granted according to the OpenSSL license. Warranty of any kind is
+;; disclaimed.
+;;====================================================================
+;; Compiler-generated multiply-n-add SPLOOP runs at 12*n cycles, n
+;; being the number of 32-bit words, addition - 8*n. Corresponding 4x
+;; unrolled SPLOOP-free loops - at ~8*n and ~5*n. Below assembler
+;; SPLOOPs spin at ... 2*n cycles [plus epilogue].
+;;====================================================================
+       .text
+
+       .asg    B3,RA
+       .asg    A4,ARG0
+       .asg    B4,ARG1
+       .asg    A6,ARG2
+       .asg    B6,ARG3
+       .asg    A8,ARG4
+       .asg    B8,ARG5
+       .asg    A4,RET
+       .asg    A15,FP
+       .asg    B14,DP
+       .asg    B15,SP
+
+       .global _bn_mul_add_words
+_bn_mul_add_words:
+       .asmfunc
+       MV      ARG2,B0
+  [!B0]        BNOP    RA
+||[!B0]        MVK     0,RET
+   [B0]        MVC     B0,ILC
+   [B0]        ZERO    A19             ; high part of accumulator
+|| [B0]        MV      ARG0,A2
+|| [B0]        MV      ARG3,A3
+       NOP     3
+
+       SPLOOP  2               ; 2*n+10
+;;====================================================================
+       LDW     *ARG1++,B7      ; ap[i]
+       NOP     3
+       LDW     *ARG0++,A7      ; rp[i]
+       MPY32U  B7,A3,A17:A16
+       NOP     3               ; [2,0] in epilogue
+       ADDU    A16,A7,A21:A20
+       ADDU    A19,A21:A20,A19:A18
+||     MV.S    A17,A23
+       SPKERNEL 2,1            ; leave slot for "return value"
+||     STW     A18,*A2++       ; rp[i]
+||     ADD     A19,A23,A19
+;;====================================================================
+       BNOP    RA,4
+       MV      A19,RET         ; return value
+       .endasmfunc
+
+       .global _bn_mul_words
+_bn_mul_words:
+       .asmfunc
+       MV      ARG2,B0
+  [!B0]        BNOP    RA
+||[!B0]        MVK     0,RET
+   [B0]        MVC     B0,ILC
+   [B0]        ZERO    A19             ; high part of accumulator
+       NOP     3
+
+       SPLOOP  2               ; 2*n+10
+;;====================================================================
+       LDW     *ARG1++,A7      ; ap[i]
+       NOP     4
+       MPY32U  A7,ARG3,A17:A16
+       NOP     4               ; [2,0] in epiloque
+       ADDU    A19,A16,A19:A18
+||     MV.S    A17,A21
+       SPKERNEL 2,1            ; leave slot for "return value"
+||     STW     A18,*ARG0++     ; rp[i]
+||     ADD.L   A19,A21,A19
+;;====================================================================
+       BNOP    RA,4
+       MV      A19,RET         ; return value
+       .endasmfunc
+
+       .global _bn_sqr_words
+_bn_sqr_words:
+       .asmfunc
+       MV      ARG2,B0
+  [!B0]        BNOP    RA
+||[!B0]        MVK     0,RET
+   [B0]        MVC     B0,ILC
+   [B0]        MV      ARG0,B2
+|| [B0]        ADD     4,ARG0,ARG0
+       NOP     3
+
+       SPLOOP  2               ; 2*n+10
+;;====================================================================
+       LDW     *ARG1++,B7      ; ap[i]
+       NOP     4
+       MPY32U  B7,B7,B1:B0
+       NOP     3               ; [2,0] in epilogue
+       STW     B0,*B2++(8)     ; rp[2*i]
+       MV      B1,A1
+       SPKERNEL 2,0            ; fully overlap BNOP RA,5
+||     STW     A1,*ARG0++(8)   ; rp[2*i+1]
+;;====================================================================
+       BNOP    RA,5
+       .endasmfunc
+
+       .global _bn_add_words
+_bn_add_words:
+       .asmfunc
+       MV      ARG3,B0
+  [!B0]        BNOP    RA
+||[!B0]        MVK     0,RET
+   [B0]        MVC     B0,ILC
+   [B0]        ZERO    A1              ; carry flag
+|| [B0]        MV      ARG0,A3
+       NOP     3
+
+       SPLOOP  2               ; 2*n+6
+;;====================================================================
+       LDW     *ARG2++,A7      ; bp[i]
+||     LDW     *ARG1++,B7      ; ap[i]
+       NOP     4
+       ADDU    A7,B7,A9:A8
+       ADDU    A1,A9:A8,A1:A0
+       SPKERNEL 0,0            ; fully overlap BNOP RA,5
+||     STW     A0,*A3++        ; write result
+||     MV      A1,RET          ; keep carry flag in RET
+;;====================================================================
+       BNOP    RA,5
+       .endasmfunc
+
+       .global _bn_sub_words
+_bn_sub_words:
+       .asmfunc
+       MV      ARG3,B0
+  [!B0]        BNOP    RA
+||[!B0]        MVK     0,RET
+   [B0]        MVC     B0,ILC
+   [B0]        ZERO    A2              ; borrow flag
+|| [B0]        MV      ARG0,A3
+       NOP     3
+
+       SPLOOP  2               ; 2*n+6
+;;====================================================================
+       LDW     *ARG2++,A7      ; bp[i]
+||     LDW     *ARG1++,B7      ; ap[i]
+       NOP     4
+       SUBU    B7,A7,A1:A0
+  [A2] SUB     A1:A0,1,A1:A0
+       SPKERNEL 0,1            ; leave slot for "return borrow flag"
+||     STW     A0,*A3++        ; write result
+||     AND     1,A1,A2         ; pass on borrow flag
+;;====================================================================
+       BNOP    RA,4
+       AND     1,A1,RET        ; return borrow flag
+       .endasmfunc
+
+       .global _bn_div_words
+       .global __divull
+_bn_div_words:
+       .asmfunc
+       CALLP   __divull,A3     ; jump to rts64plus.lib
+||     MV      ARG0,A5
+||     MV      ARG1,ARG0
+||     MV      ARG2,ARG1
+||     ZERO    B5
+       .endasmfunc
+
+;;====================================================================
+;; Not really Comba algorithm, just straightforward NxM... Dedicated
+;; fully unrolled real Comba implementations are asymptotically 2x
+;; faster, but naturally larger undertaking. Purpose of this exercise
+;; was rather to learn to master nested SPLOOPs...
+;;====================================================================
+       .global _bn_sqr_comba8
+       .global _bn_mul_comba8
+_bn_sqr_comba8:
+       MV      ARG1,ARG2
+_bn_mul_comba8:
+       .asmfunc
+       MVK     8,B0            ; N, RILC
+||     MVK     8,A0            ; M, outer loop counter
+||     MV      ARG1,A5         ; copy ap
+||     MV      ARG0,B4         ; copy rp
+||     ZERO    B19             ; high part of accumulator
+       MVC     B0,RILC
+||     SUB     B0,2,B1         ; N-2, initial ILC
+||     SUB     B0,1,B2         ; const B2=N-1
+||     LDW     *A5++,B6        ; ap[0]
+||     MV      A0,A3           ; const A3=M
+sploopNxM?:                    ; for best performance arrange M<=N
+   [A0]        SPLOOPD 2               ; 2*n+10
+||     MVC     B1,ILC
+||     ADDAW   B4,B0,B5
+||     ZERO    B7
+||     LDW     *A5++,A9        ; pre-fetch ap[1]
+||     ZERO    A1
+||     SUB     A0,1,A0
+;;====================================================================
+;; SPLOOP from bn_mul_add_words, but with flipped A<>B register files.
+;; This is because of Advisory 15 from TI publication SPRZ247I.
+       LDW     *ARG2++,A7      ; bp[i]
+       NOP     3
+   [A1]        LDW     *B5++,B7        ; rp[i]
+       MPY32U  A7,B6,B17:B16
+       NOP     3
+       ADDU    B16,B7,B21:B20
+       ADDU    B19,B21:B20,B19:B18
+||     MV.S    B17,B23
+       SPKERNEL
+||     STW     B18,*B4++       ; rp[i]
+||     ADD.S   B19,B23,B19
+;;====================================================================
+outer?:                                ; m*2*(n+1)+10
+       SUBAW   ARG2,A3,ARG2    ; rewind bp to bp[0]
+       SPMASKR
+||     CMPGT   A0,1,A2         ; done pre-fetching ap[i+1]?
+       MVD     A9,B6           ; move through .M unit(*)
+   [A2]        LDW     *A5++,A9        ; pre-fetch ap[i+1]
+       SUBAW   B5,B2,B5        ; rewind rp to rp[1]
+       MVK     1,A1
+   [A0]        BNOP.S1 outer?,4
+|| [A0]        SUB.L   A0,1,A0
+       STW     B19,*B4--[B2]   ; rewind rp tp rp[1]
+||     ZERO.S  B19             ; high part of accumulator
+;; end of outer?
+       BNOP    RA,5            ; return
+       .endasmfunc
+;; (*) It should be noted that B6 is used as input to MPY32U in
+;;     chronologically next cycle in *preceding* SPLOOP iteration.
+;;     Normally such arrangement would require DINT, but at this
+;;     point SPLOOP is draining and interrupts are disabled
+;;     implicitly.
+
+       .global _bn_sqr_comba4
+       .global _bn_mul_comba4
+_bn_sqr_comba4:
+       MV      ARG1,ARG2
+_bn_mul_comba4:
+       .asmfunc
+       .if     0
+       BNOP    sploopNxM?,3
+       ;; Above mentioned m*2*(n+1)+10 does not apply in n=m=4 case,
+       ;; because of read-after-write penalties, it's rather
+       ;; n*2*(n+3)+10, or 66 cycles [plus various overheads]...
+       MVK     4,B0            ; N, RILC
+||     MVK     4,A0            ; M, outer loop counter
+||     MV      ARG1,A5         ; copy ap
+||     MV      ARG0,B4         ; copy rp
+||     ZERO    B19             ; high part of accumulator
+       MVC     B0,RILC
+||     SUB     B0,2,B1         ; first ILC
+||     SUB     B0,1,B2         ; const B2=N-1
+||     LDW     *A5++,B6        ; ap[0]
+||     MV      A0,A3           ; const A3=M
+       .else
+       ;; This alternative is exercise in fully unrolled Comba
+       ;; algorithm implementation that operates at n*(n+1)+12, or
+       ;; as little as 32 cycles...
+       LDW     *ARG1[0],B16    ; a[0]
+||     LDW     *ARG2[0],A16    ; b[0]
+       LDW     *ARG1[1],B17    ; a[1]
+||     LDW     *ARG2[1],A17    ; b[1]
+       LDW     *ARG1[2],B18    ; a[2]
+||     LDW     *ARG2[2],A18    ; b[2]
+       LDW     *ARG1[3],B19    ; a[3]
+||     LDW     *ARG2[3],A19    ; b[3]
+       NOP
+       MPY32U  A16,B16,A1:A0   ; a[0]*b[0]
+       MPY32U  A17,B16,A23:A22 ; a[0]*b[1]
+       MPY32U  A16,B17,A25:A24 ; a[1]*b[0]
+       MPY32U  A16,B18,A27:A26 ; a[2]*b[0]
+       STW     A0,*ARG0[0]
+||     MPY32U  A17,B17,A29:A28 ; a[1]*b[1]
+       MPY32U  A18,B16,A31:A30 ; a[0]*b[2]
+||     ADDU    A22,A1,A1:A0
+       MV      A23,B0
+||     MPY32U  A19,B16,A21:A20 ; a[3]*b[0]
+||     ADDU    A24,A1:A0,A1:A0
+       ADDU    A25,B0,B1:B0
+||     STW     A0,*ARG0[1]
+||     MPY32U  A18,B17,A23:A22 ; a[2]*b[1]
+||     ADDU    A26,A1,A9:A8
+       ADDU    A27,B1,B9:B8
+||     MPY32U  A17,B18,A25:A24 ; a[1]*b[2]
+||     ADDU    A28,A9:A8,A9:A8
+       ADDU    A29,B9:B8,B9:B8
+||     MPY32U  A16,B19,A27:A26 ; a[0]*b[3]
+||     ADDU    A30,A9:A8,A9:A8
+       ADDU    A31,B9:B8,B9:B8
+||     ADDU    B0,A9:A8,A9:A8
+       STW     A8,*ARG0[2]
+||     ADDU    A20,A9,A1:A0
+       ADDU    A21,B9,B1:B0
+||     MPY32U  A19,B17,A21:A20 ; a[3]*b[1]
+||     ADDU    A22,A1:A0,A1:A0
+       ADDU    A23,B1:B0,B1:B0
+||     MPY32U  A18,B18,A23:A22 ; a[2]*b[2]
+||     ADDU    A24,A1:A0,A1:A0
+       ADDU    A25,B1:B0,B1:B0
+||     MPY32U  A17,B19,A25:A24 ; a[1]*b[3]
+||     ADDU    A26,A1:A0,A1:A0
+       ADDU    A27,B1:B0,B1:B0
+||     ADDU    B8,A1:A0,A1:A0
+       STW     A0,*ARG0[3]
+||     MPY32U  A19,B18,A27:A26 ; a[3]*b[2]
+||     ADDU    A20,A1,A9:A8
+       ADDU    A21,B1,B9:B8
+||     MPY32U  A18,B19,A29:A28 ; a[2]*b[3]
+||     ADDU    A22,A9:A8,A9:A8
+       ADDU    A23,B9:B8,B9:B8
+||     MPY32U  A19,B19,A31:A30 ; a[3]*b[3]
+||     ADDU    A24,A9:A8,A9:A8
+       ADDU    A25,B9:B8,B9:B8
+||     ADDU    B0,A9:A8,A9:A8
+       STW     A8,*ARG0[4]
+||     ADDU    A26,A9,A1:A0
+       ADDU    A27,B9,B1:B0
+||     ADDU    A28,A1:A0,A1:A0
+       ADDU    A29,B1:B0,B1:B0
+||     BNOP    RA
+||     ADDU    B8,A1:A0,A1:A0
+       STW     A0,*ARG0[5]
+||     ADDU    A30,A1,A9:A8
+       ADD     A31,B1,B8
+       ADDU    B0,A9:A8,A9:A8  ; removed || to avoid cross-path stall below
+       ADD     B8,A9,A9
+||     STW     A8,*ARG0[6]
+       STW     A9,*ARG0[7]
+       .endif
+       .endasmfunc
diff --git a/crypto/bn/asm/c64xplus-gf2m.pl b/crypto/bn/asm/c64xplus-gf2m.pl
new file mode 100644 (file)
index 0000000..cef8394
--- /dev/null
@@ -0,0 +1,146 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# February 2012
+#
+# The module implements bn_GF2m_mul_2x2 polynomial multiplication
+# used in bn_gf2m.c. It's kind of low-hanging mechanical port from
+# C for the time being... The subroutine runs in 37 cycles, which is
+# 4.5x faster than compiler-generated code. Though comparison is
+# totally unfair, because this module utilizes Galois Field Multiply
+# instruction.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($rp,$a1,$a0,$b1,$b0)=("A4","B4","A6","B6","A8");   # argument vector
+
+($Alo,$Alox0,$Alox1,$Alox2,$Alox3)=map("A$_",(16..20));
+($Ahi,$Ahix0,$Ahix1,$Ahix2,$Ahix3)=map("B$_",(16..20));
+($B_0,$B_1,$B_2,$B_3)=("B5","A5","A7","B7");
+($A,$B)=($Alo,$B_1);
+$xFF="B1";
+
+sub mul_1x1_upper {
+my ($A,$B)=@_;
+$code.=<<___;
+       EXTU    $B,8,24,$B_2            ; smash $B to 4 bytes
+||     AND     $B,$xFF,$B_0
+||     SHRU    $B,24,$B_3
+       SHRU    $A,16,   $Ahi           ; smash $A to two halfwords
+||     EXTU    $A,16,16,$Alo
+
+       XORMPY  $Alo,$B_2,$Alox2        ; 16x8 bits muliplication
+||     XORMPY  $Ahi,$B_2,$Ahix2
+||     EXTU    $B,16,24,$B_1
+       XORMPY  $Alo,$B_0,$Alox0
+||     XORMPY  $Ahi,$B_0,$Ahix0
+       XORMPY  $Alo,$B_3,$Alox3
+||     XORMPY  $Ahi,$B_3,$Ahix3
+       XORMPY  $Alo,$B_1,$Alox1
+||     XORMPY  $Ahi,$B_1,$Ahix1
+___
+}
+sub mul_1x1_merged {
+my ($OUTlo,$OUThi,$A,$B)=@_;
+$code.=<<___;
+        EXTU   $B,8,24,$B_2            ; smash $B to 4 bytes
+||      AND    $B,$xFF,$B_0
+||      SHRU   $B,24,$B_3
+        SHRU   $A,16,   $Ahi           ; smash $A to two halfwords
+||      EXTU   $A,16,16,$Alo
+
+       XOR     $Ahix0,$Alox2,$Ahix0
+||     MV      $Ahix2,$OUThi
+||      XORMPY $Alo,$B_2,$Alox2
+        XORMPY $Ahi,$B_2,$Ahix2
+||      EXTU   $B,16,24,$B_1
+||      XORMPY $Alo,$B_0,A1            ; $Alox0
+       XOR     $Ahix1,$Alox3,$Ahix1
+||     SHL     $Ahix0,16,$OUTlo
+||     SHRU    $Ahix0,16,$Ahix0
+       XOR     $Alox0,$OUTlo,$OUTlo
+||     XOR     $Ahix0,$OUThi,$OUThi
+||      XORMPY $Ahi,$B_0,$Ahix0
+||      XORMPY $Alo,$B_3,$Alox3
+||     SHL     $Alox1,8,$Alox1
+||     SHL     $Ahix3,8,$Ahix3
+       XOR     $Alox1,$OUTlo,$OUTlo
+||     XOR     $Ahix3,$OUThi,$OUThi
+||      XORMPY $Ahi,$B_3,$Ahix3
+||     SHL     $Ahix1,24,$Alox1
+||     SHRU    $Ahix1,8, $Ahix1
+       XOR     $Alox1,$OUTlo,$OUTlo
+||     XOR     $Ahix1,$OUThi,$OUThi
+||      XORMPY $Alo,$B_1,$Alox1
+||      XORMPY $Ahi,$B_1,$Ahix1
+||      MV     A1,$Alox0
+___
+}
+sub mul_1x1_lower {
+my ($OUTlo,$OUThi)=@_;
+$code.=<<___;
+       ;NOP
+       XOR     $Ahix0,$Alox2,$Ahix0
+||     MV      $Ahix2,$OUThi
+       NOP
+       XOR     $Ahix1,$Alox3,$Ahix1
+||     SHL     $Ahix0,16,$OUTlo
+||     SHRU    $Ahix0,16,$Ahix0
+       XOR     $Alox0,$OUTlo,$OUTlo
+||     XOR     $Ahix0,$OUThi,$OUThi
+||     SHL     $Alox1,8,$Alox1
+||     SHL     $Ahix3,8,$Ahix3
+       XOR     $Alox1,$OUTlo,$OUTlo
+||     XOR     $Ahix3,$OUThi,$OUThi
+||     SHL     $Ahix1,24,$Alox1
+||     SHRU    $Ahix1,8, $Ahix1
+       XOR     $Alox1,$OUTlo,$OUTlo
+||     XOR     $Ahix1,$OUThi,$OUThi
+___
+}
+$code.=<<___;
+       .text
+
+       .global _bn_GF2m_mul_2x2
+_bn_GF2m_mul_2x2:
+       .asmfunc
+       MVK     0xFF,$xFF
+___
+       &mul_1x1_upper($a0,$b0);                # a0·b0
+$code.=<<___;
+||     MV      $b1,$B
+       MV      $a1,$A
+___
+       &mul_1x1_merged("A28","B28",$A,$B);     # a0·b0/a1·b1
+$code.=<<___;
+||     XOR     $b0,$b1,$B
+       XOR     $a0,$a1,$A
+___
+       &mul_1x1_merged("A31","B31",$A,$B);     # a1·b1/(a0+a1)·(b0+b1)
+$code.=<<___;
+       XOR     A28,A31,A29
+||     XOR     B28,B31,B29                     ; a0·b0+a1·b1
+___
+       &mul_1x1_lower("A30","B30");            # (a0+a1)·(b0+b1)
+$code.=<<___;
+||     BNOP    B3
+       XOR     A29,A30,A30
+||     XOR     B29,B30,B30                     ; (a0+a1)·(b0+b1)-a0·b0-a1·b1
+       XOR     B28,A30,A30
+||     STW     A28,*${rp}[0]
+       XOR     B30,A31,A31
+||     STW     A30,*${rp}[1]
+       STW     A31,*${rp}[2]
+       STW     B31,*${rp}[3]
+       .endasmfunc
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/c64xpluscpuid.pl b/crypto/c64xpluscpuid.pl
new file mode 100644 (file)
index 0000000..067b693
--- /dev/null
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+#
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+$code.=<<___;
+       .text
+
+       .asg    B3,RA
+
+       .global _OPENSSL_rdtsc
+_OPENSSL_rdtsc:
+       .asmfunc
+       B       RA
+       MVC     TSCL,B0
+       MVC     TSCH,B1
+  [!B0]        MVC     B0,TSCL         ; start TSC
+       MV      B0,A4
+       MV      B1,A5
+       .endasmfunc
+
+       .global _OPENSSL_cleanse
+_OPENSSL_cleanse:
+       .asmfunc
+       ZERO    A3:A2
+||     ZERO    B2
+||     SHRU    B4,3,B0         ; is length >= 8
+||     ADD     1,A4,B6
+  [!B0]        BNOP    RA
+||     ZERO    A1
+||     ZERO    B1
+   [B0]        MVC     B0,ILC
+||[!B0]        CMPLT   0,B4,A1
+||[!B0]        CMPLT   1,B4,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+||[!B0]        CMPLT   2,B4,A1
+||[!B0]        CMPLT   3,B4,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+||[!B0]        CMPLT   4,B4,A1
+||[!B0]        CMPLT   5,B4,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+||[!B0]        CMPLT   6,B4,A1
+   [A1]        STB     A2,*A4++[2]
+
+       SPLOOP  1
+       STNDW   A3:A2,*A4++
+||     SUB     B4,8,B4
+       SPKERNEL
+
+       MV      B4,B0           ; remaining bytes
+||     ADD     1,A4,B6
+||     BNOP    RA
+   [B0]        CMPLT   0,B0,A1
+|| [B0]        CMPLT   1,B0,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+|| [B0]        CMPLT   2,B0,A1
+|| [B0]        CMPLT   3,B0,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+|| [B0]        CMPLT   4,B0,A1
+|| [B0]        CMPLT   5,B0,B1
+   [A1]        STB     A2,*A4++[2]
+|| [B1] STB    B2,*B6++[2]
+|| [B0]        CMPLT   6,B0,A1
+   [A1]        STB     A2,*A4++[2]
+       .endasmfunc
+
+       .global _OPENSSL_atomic_add
+_OPENSSL_atomic_add:
+       .asmfunc
+       MV      A4,B0
+atomic_add?:
+       LL      *B0,B5
+       NOP     4
+       ADD     B4,B5,B5
+       SL      B5,*B0
+       CMTL    *B0,B1
+       NOP     4
+  [!B1]        B       atomic_add?
+   [B1]        BNOP    RA,4
+       MV      B5,A4
+       .endasmfunc
+
+       .global _OPENSSL_wipe_cpu
+_OPENSSL_wipe_cpu:
+       .asmfunc
+       ZERO    A0
+||     ZERO    B0
+||     ZERO    A1
+||     ZERO    B1
+       ZERO    A3:A2
+||     MVD     B0,B2
+||     ZERO    A4
+||     ZERO    B4
+||     ZERO    A5
+||     ZERO    B5
+||     BNOP    RA
+       ZERO    A7:A6
+||     ZERO    B7:B6
+||     ZERO    A8
+||     ZERO    B8
+||     ZERO    A9
+||     ZERO    B9
+       ZERO    A17:A16
+||     ZERO    B17:B16
+||     ZERO    A18
+||     ZERO    B18
+||     ZERO    A19
+||     ZERO    B19
+       ZERO    A21:A20
+||     ZERO    B21:B20
+||     ZERO    A22
+||     ZERO    B22
+||     ZERO    A23
+||     ZERO    B23
+       ZERO    A25:A24
+||     ZERO    B25:B24
+||     ZERO    A26
+||     ZERO    B26
+||     ZERO    A27
+||     ZERO    B27
+       ZERO    A29:A28
+||     ZERO    B29:B28
+||     ZERO    A30
+||     ZERO    B30
+||     ZERO    A31
+||     ZERO    B31
+       .endasmfunc
+
+CLFLUSH        .macro  CONTROL,ADDR,LEN
+       B       passthrough?
+||     STW     ADDR,*CONTROL[0]
+       STW     LEN,*CONTROL[1]
+spinlock?:
+       LDW     *CONTROL[1],A0
+       NOP     3
+passthrough?:
+       NOP
+  [A0] BNOP    spinlock?,5
+       .endm
+
+       .global _OPENSSL_instrument_bus
+_OPENSSL_instrument_bus:
+       .asmfunc
+       MV      B4,B0                   ; reassign sizeof(output)
+||     MV      A4,B4                   ; reassign output
+||     MVK     0x00004030,A3
+       MV      B0,A4                   ; return value
+||     MVK     1,A1
+||     MVKH    0x01840000,A3           ; L1DWIBAR
+       MVC     TSCL,B8                 ; collect 1st tick
+||     MVK     0x00004010,A5
+       MV      B8,B9                   ; lasttick = tick
+||     MVK     0,B7                    ; lastdiff = 0
+||     MVKH    0x01840000,A5           ; L2WIBAR
+       CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
+       CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
+       LL      *B4,B5
+       NOP     4
+       ADD     B7,B5,B5
+       SL      B5,*B4
+       CMTL    *B4,B1
+       NOP     4
+       STW     B5,*B4
+bus_loop1?:
+       MVC     TSCL,B8
+|| [B0]        SUB     B0,1,B0
+       SUB     B8,B9,B7                ; lastdiff = tick - lasttick
+||     MV      B8,B9                   ; lasttick = tick
+       CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
+       CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
+       LL      *B4,B5
+       NOP     4
+       ADD     B7,B5,B5
+       SL      B5,*B4
+       CMTL    *B4,B1
+       STW     B5,*B4                  ; [!B1] is removed to flatten samples
+||     ADDK    4,B4
+|| [B0]        BNOP    bus_loop1?,5
+
+       BNOP    RA,5
+       .endasmfunc
+
+       .global _OPENSSL_instrument_bus2
+_OPENSSL_instrument_bus2:
+       .asmfunc
+       MV      A6,B0                   ; reassign max
+||     MV      B4,A6                   ; reassing sizeof(output)
+||     MVK     0x00004030,A3
+       MV      A4,B4                   ; reassign output
+||     MVK     0,A4                    ; return value
+||     MVK     1,A1
+||     MVKH    0x01840000,A3           ; L1DWIBAR
+
+       MVC     TSCL,B8                 ; collect 1st tick
+||     MVK     0x00004010,A5
+       MV      B8,B9                   ; lasttick = tick
+||     MVK     0,B7                    ; lastdiff = 0
+||     MVKH    0x01840000,A5           ; L2WIBAR
+       CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
+       CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
+       LL      *B4,B5
+       NOP     4
+       ADD     B7,B5,B5
+       SL      B5,*B4
+       CMTL    *B4,B1
+       NOP     4
+       STW     B5,*B4
+
+       MVC     TSCL,B8                 ; collect 1st diff
+       SUB     B8,B9,B7                ; lastdiff = tick - lasttick
+||     MV      B8,B9                   ; lasttick = tick
+||     SUB     B0,1,B0
+bus_loop2?:
+       CLFLUSH A3,B4,A1                ; write-back and invalidate L1D line
+       CLFLUSH A5,B4,A1                ; write-back and invalidate L2 line
+       LL      *B4,B5
+       NOP     4
+       ADD     B7,B5,B5
+       SL      B5,*B4
+       CMTL    *B4,B1
+       STW     B5,*B4                  ; [!B1] is removed to flatten samples
+||[!B0]        BNOP    bus_loop2_done?,2
+||     SUB     B0,1,B0
+       MVC     TSCL,B8
+       SUB     B8,B9,B8
+||     MV      B8,B9
+       CMPEQ   B8,B7,B2
+||     MV      B8,B7
+  [!B2]        ADDAW   B4,1,B4
+||[!B2]        ADDK    1,A4
+       CMPEQ   A4,A6,A2
+  [!A2]        BNOP    bus_loop2?,5
+
+bus_loop2_done?:
+       BNOP    RA,5
+       .endasmfunc
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/modes/asm/ghash-c64xplus.pl b/crypto/modes/asm/ghash-c64xplus.pl
new file mode 100644 (file)
index 0000000..1ac4d92
--- /dev/null
@@ -0,0 +1,231 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# December 2011
+#
+# The module implements GCM GHASH function and underlying single
+# multiplication operation in GF(2^128). Even though subroutines
+# have _4bit suffix, they are not using any tables, but rely on
+# hardware Galois Field Multiply support. Streamed GHASH processes
+# byte in ~7 cycles, which is >6x faster than "4-bit" table-driven
+# code compiled with TI's cl6x 6.0 with -mv6400+ -o2 flags. We are
+# comparing apples vs. oranges, but compiler surely could have done
+# better, because theoretical [though not necessarily achievable]
+# estimate for "4-bit" table-driven implementation is ~12 cycles.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($Xip,$Htable,$inp,$len)=("A4","B4","A6","B6");        # arguments
+
+($Z0,$Z1,$Z2,$Z3,      $H0, $H1, $H2, $H3,
+                       $H0x,$H1x,$H2x,$H3x)=map("A$_",(16..27));
+($H01u,$H01y,$H2u,$H3u,        $H0y,$H1y,$H2y,$H3y,
+                       $H0z,$H1z,$H2z,$H3z)=map("B$_",(16..27));
+($FF000000,$E10000)=("B30","B31");
+($xip,$x0,$x1,$xib)=map("B$_",(6..9)); # $xip zaps $len
+ $xia="A9";
+($rem,$res)=("B4","B5");               # $rem zaps $Htable
+
+$code.=<<___;
+       .text
+
+       .asg    B3,RA
+
+       .if     0
+       .global _gcm_gmult_1bit
+_gcm_gmult_1bit:
+       ADDAD   $Htable,2,$Htable
+       .endif
+       .global _gcm_gmult_4bit
+_gcm_gmult_4bit:
+       .asmfunc
+       LDDW    *${Htable}[-1],$H1:$H0  ; H.lo
+       LDDW    *${Htable}[-2],$H3:$H2  ; H.hi
+||     MV      $Xip,${xip}             ; reassign Xi
+||     MVK     15,B1                   ; SPLOOPD constant
+
+       MVK     0xE1,$E10000
+||     LDBU    *++${xip}[15],$x1       ; Xi[15]
+       MVK     0xFF,$FF000000
+||     LDBU    *--${xip},$x0           ; Xi[14]
+       SHL     $E10000,16,$E10000      ; [pre-shifted] reduction polynomial
+       SHL     $FF000000,24,$FF000000  ; upper byte mask
+||     BNOP    ghash_loop?
+||     MVK     1,B0                    ; take a single spin
+
+       PACKH2  $H0,$H1,$xia            ; pack H0' and H1's upper bytes
+       AND     $H2,$FF000000,$H2u      ; H2's upper byte
+       AND     $H3,$FF000000,$H3u      ; H3's upper byte
+||     SHRU    $H2u,8,$H2u
+       SHRU    $H3u,8,$H3u
+||     ZERO    $Z1:$Z0
+       SHRU2   $xia,8,$H01u
+||     ZERO    $Z3:$Z2
+       .endasmfunc
+
+       .global _gcm_ghash_4bit
+_gcm_ghash_4bit:
+       .asmfunc
+       LDDW    *${Htable}[-1],$H1:$H0  ; H.lo
+||     SHRU    $len,4,B0               ; reassign len
+       LDDW    *${Htable}[-2],$H3:$H2  ; H.hi
+||     MV      $Xip,${xip}             ; reassign Xi
+||     MVK     15,B1                   ; SPLOOPD constant
+
+       MVK     0xE1,$E10000
+|| [B0]        LDNDW   *${inp}[1],$H1x:$H0x
+       MVK     0xFF,$FF000000
+|| [B0]        LDNDW   *${inp}++[2],$H3x:$H2x
+       SHL     $E10000,16,$E10000      ; [pre-shifted] reduction polynomial
+||     LDDW    *${xip}[1],$Z1:$Z0
+       SHL     $FF000000,24,$FF000000  ; upper byte mask
+||     LDDW    *${xip}[0],$Z3:$Z2
+
+       PACKH2  $H0,$H1,$xia            ; pack H0' and H1's upper bytes
+       AND     $H2,$FF000000,$H2u      ; H2's upper byte
+       AND     $H3,$FF000000,$H3u      ; H3's upper byte
+||     SHRU    $H2u,8,$H2u
+       SHRU    $H3u,8,$H3u
+       SHRU2   $xia,8,$H01u
+
+|| [B0]        XOR     $H0x,$Z0,$Z0            ; Xi^=inp
+|| [B0]        XOR     $H1x,$Z1,$Z1
+       .if     .LITTLE_ENDIAN
+   [B0]        XOR     $H2x,$Z2,$Z2
+|| [B0]        XOR     $H3x,$Z3,$Z3
+|| [B0]        SHRU    $Z1,24,$xia             ; Xi[15], avoid cross-path stall
+       STDW    $Z1:$Z0,*${xip}[1]
+|| [B0]        SHRU    $Z1,16,$x0              ; Xi[14]
+|| [B0]        ZERO    $Z1:$Z0
+       .else
+   [B0]        XOR     $H2x,$Z2,$Z2
+|| [B0]        XOR     $H3x,$Z3,$Z3
+|| [B0]        MV      $Z0,$xia                ; Xi[15], avoid cross-path stall
+       STDW    $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU   $Z0,8,$x0               ; Xi[14]
+|| [B0]        ZERO    $Z1:$Z0
+       .endif
+       STDW    $Z3:$Z2,*${xip}[0]
+|| [B0]        ZERO    $Z3:$Z2
+|| [B0]        MV      $xia,$x1
+   [B0]        ADDK    14,${xip}
+
+ghash_loop?:
+       SPLOOPD 6                       ; 6*16+7
+||     MVC     B1,ILC
+|| [B0]        SUB     B0,1,B0
+||     ZERO    A0
+||     ADD     $x1,$x1,$xib            ; SHL   $x1,1,$xib
+||     SHL     $x1,1,$xia
+___
+\f
+########____________________________
+#  0    D2.     M1          M2      |
+#  1            M1                  |
+#  2            M1          M2      |
+#  3        D1. M1          M2      |
+#  4        S1. L1                  |
+#  5    S2  S1x L1          D2  L2  |____________________________
+#  6/0          L1  S1      L2  S2x |D2.     M1          M2      |
+#  7/1          L1  S1  D1x S2  M2  |        M1                  |
+#  8/2              S1  L1x S2      |        M1          M2      |
+#  9/3              S1  L1x         |    D1. M1          M2      |
+# 10/4                  D1x         |    S1. L1                  |
+# 11/5                              |S2  S1x L1          D2  L2  |____________
+# 12/6/0                D1x       __|        L1  S1      L2  S2x |D2.     ....
+#    7/1                                     L1  S1  D1x S2  M2  |        ....
+#    8/2                                         S1  L1x S2      |        ....
+#####...                                         ................|............
+$code.=<<___;
+       XORMPY  $H0,$xia,$H0x           ; 0     ; H·Xi[i]
+||     XORMPY  $H01u,$xib,$H01y
+|| [A0]        LDBU    *--${xip},$x0
+       XORMPY  $H1,$xia,$H1x           ; 1
+       XORMPY  $H2,$xia,$H2x           ; 2
+||     XORMPY  $H2u,$xib,$H2y
+       XORMPY  $H3,$xia,$H3x           ; 3
+||     XORMPY  $H3u,$xib,$H3y
+||[!A0]        MVK.D   15,A0                           ; *--${xip} counter
+       XOR.L   $H0x,$Z0,$Z0            ; 4     ; Z^=H·Xi[i]
+|| [A0]        SUB.S   A0,1,A0
+       XOR.L   $H1x,$Z1,$Z1            ; 5
+||     AND.D   $H01y,$FF000000,$H0z
+||     SWAP2.L $H01y,$H1y              ;       ; SHL   $H01y,16,$H1y
+||     SHL     $x0,1,$xib
+||     SHL     $x0,1,$xia
+
+       XOR.L   $H2x,$Z2,$Z2            ; 6/0   ; [0,0] in epilogue
+||     SHL     $Z0,1,$rem              ;       ; rem=Z<<1
+||     SHRMB.S $Z1,$Z0,$Z0             ;       ; Z>>=8
+||     AND.L   $H1y,$FF000000,$H1z
+       XOR.L   $H3x,$Z3,$Z3            ; 7/1
+||     SHRMB.S $Z2,$Z1,$Z1
+||     XOR.D   $H0z,$Z0,$Z0                    ; merge upper byte products
+||     AND.S   $H2y,$FF000000,$H2z
+||     XORMPY  $E10000,$rem,$res       ;       ; implicit rem&0x1FE
+       XOR.L   $H1z,$Z1,$Z1            ; 8/2
+||     SHRMB.S $Z3,$Z2,$Z2
+||     AND.S   $H3y,$FF000000,$H3z
+       XOR.L   $H2z,$Z2,$Z2            ; 9/3
+||     SHRU    $Z3,8,$Z3
+       XOR.D   $H3z,$Z3,$Z3            ; 10/4
+       NOP                             ; 11/5
+
+       SPKERNEL 0,2
+||     XOR.D   $res,$Z3,$Z3            ; 12/6/0; Z^=res
+
+       ; input pre-fetch is possible where D1 slot is available...
+   [B0]        LDNDW   *${inp}[1],$H1x:$H0x    ; 8/-
+   [B0]        LDNDW   *${inp}++[2],$H3x:$H2x  ; 9/-
+       NOP                             ; 10/-
+       .if     .LITTLE_ENDIAN
+       SWAP2   $Z0,$Z1                 ; 11/-
+||     SWAP4   $Z1,$Z0
+       SWAP4   $Z1,$Z1                 ; 12/-
+||     SWAP2   $Z0,$Z0
+       SWAP2   $Z2,$Z3
+||     SWAP4   $Z3,$Z2
+||[!B0]        BNOP    RA
+       SWAP4   $Z3,$Z3
+||     SWAP2   $Z2,$Z2
+|| [B0]        BNOP    ghash_loop?
+   [B0]        XOR     $H0x,$Z0,$Z0            ; Xi^=inp
+|| [B0]        XOR     $H1x,$Z1,$Z1
+   [B0]        XOR     $H2x,$Z2,$Z2
+|| [B0]        XOR     $H3x,$Z3,$Z3
+|| [B0]        SHRU    $Z1,24,$xia             ; Xi[15], avoid cross-path stall
+       STDW    $Z1:$Z0,*${xip}[1]
+|| [B0]        SHRU    $Z1,16,$x0              ; Xi[14]
+|| [B0]        ZERO    $Z1:$Z0
+       .else
+  [!B0]        BNOP    RA                      ; 11/-
+   [B0]        BNOP    ghash_loop?             ; 12/-
+   [B0]        XOR     $H0x,$Z0,$Z0            ; Xi^=inp
+|| [B0]        XOR     $H1x,$Z1,$Z1
+   [B0]        XOR     $H2x,$Z2,$Z2
+|| [B0]        XOR     $H3x,$Z3,$Z3
+|| [B0]        MV      $Z0,$xia                ; Xi[15], avoid cross-path stall
+       STDW    $Z1:$Z0,*${xip}[1]
+|| [B0] SHRU   $Z0,8,$x0               ; Xi[14]
+|| [B0]        ZERO    $Z1:$Z0
+       .endif
+       STDW    $Z3:$Z2,*${xip}[0]
+|| [B0]        ZERO    $Z3:$Z2
+|| [B0]        MV      $xia,$x1
+   [B0]        ADDK    14,${xip}
+       .endasmfunc
+
+       .sect   .const
+       .cstring "GHASH for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha1-c64xplus.pl b/crypto/sha/asm/sha1-c64xplus.pl
new file mode 100644 (file)
index 0000000..87000d1
--- /dev/null
@@ -0,0 +1,323 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA1 for C64x+.
+#
+# November 2011
+#
+# If compared to compiler-generated code with similar characteristics,
+# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
+# this implementation is 25% smaller and >2x faster. In absolute terms
+# performance is (quite impressive) ~6.5 cycles per processed byte.
+# Fully unrolled assembler would be ~5x larger and is likely to be
+# ~15% faster. It would be free from references to intermediate ring
+# buffer, but put more pressure on L1P [both because the code would be
+# larger and won't be using SPLOOP buffer]. There are no plans to
+# realize fully unrolled variant though...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTX,$INP,$NUM) = ("A4","B4","A6");           # arguments
+
+($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
+($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
+($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
+($XPA,$XPB) = ("A5","B5");                     # X circular buffer
+($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9)); # zaps $NUM
+
+$code=<<___;
+       .text
+
+       .asg    B3,RA
+       .asg    A15,FP
+       .asg    B15,SP
+
+       .if     .BIG_ENDIAN
+       .asg    MV,SWAP2
+       .asg    MV,SWAP4
+       .endif
+
+       .global _sha1_block_data_order
+_sha1_block_data_order:
+       .asmfunc stack_usage(64)
+       MV      $NUM,A0                 ; reassign $NUM
+||     MVK     -64,B0
+  [!A0]        BNOP    RA                      ; if ($NUM==0) return;
+|| [A0]        STW     FP,*SP--[16]            ; save frame pointer and alloca(64)
+|| [A0]        MV      SP,FP
+   [A0]        LDW     *${CTX}[0],$A           ; load A-E...
+|| [A0]        AND     B0,SP,SP                ; align stack at 64 bytes
+   [A0]        LDW     *${CTX}[1],$B
+|| [A0]        SUBAW   SP,2,SP                 ; reserve two words above buffer
+   [A0]        LDW     *${CTX}[2],$C
+|| [A0]        MVK     0x00404,B0
+   [A0]        LDW     *${CTX}[3],$D
+|| [A0]        MVKH    0x50000,B0              ; 0x050404, 64 bytes for $XP[AB]
+   [A0]        LDW     *${CTX}[4],$E
+|| [A0]        MVC     B0,AMR                  ; setup circular addressing
+       LDNW    *${INP}++,$TX1          ; pre-fetch input
+       NOP     1
+
+loop?:
+       MVK     0x00007999,$K
+||     ADDAW   SP,2,$XPA
+||     SUB     A0,1,A0
+||     MVK     13,B0
+       MVKH    0x5a820000,$K           ; K_00_19
+||     ADDAW   SP,2,$XPB
+||     MV      $A,$Actx
+||     MV      $B,$Bctx
+;;==================================================
+       SPLOOPD 5                       ; BODY_00_13
+||     MV      $C,$Cctx
+||     MV      $D,$Dctx
+||     MV      $E,$Ectx
+||     MVC     B0,ILC
+
+       ROTL    $A,5,$Arot
+||     AND     $C,$B,$F
+||     ANDN    $D,$B,$F0
+||     ADD     $K,$E,$T                ; T=E+K
+
+       XOR     $F0,$F,$F               ; F_00_19(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+||     SWAP2   $TX1,$TX2
+||     LDNW    *${INP}++,$TX1
+
+       ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     SWAP4   $TX2,$TX3               ; byte swap
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+
+       ADD     $TX3,$T,$A              ; A=T+Xi
+||     STW     $TX3,*${XPB}++
+       SPKERNEL
+;;==================================================
+       ROTL    $A,5,$Arot              ; BODY_14
+||     AND     $C,$B,$F
+||     ANDN    $D,$B,$F0
+||     ADD     $K,$E,$T                ; T=E+K
+
+       XOR     $F0,$F,$F               ; F_00_19(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+||     SWAP2   $TX1,$TX2
+||     LDNW    *${INP}++,$TX1
+
+       ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     SWAP4   $TX2,$TX2               ; byte swap
+||     LDW     *${XPA}++,$X0           ; fetches from X ring buffer are
+||     LDW     *${XPB}[4],$X2          ; 2 iterations ahead
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+||     LDW     *${XPA}[7],$X8
+||     MV      $TX3,$X13               ; ||    LDW     *${XPB}[15],$X13
+||     MV      $TX2,$TX3
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+||     STW     $TX2,*${XPB}++
+;;==================================================
+       ROTL    $A,5,$Arot              ; BODY_15
+||     AND     $C,$B,$F
+||     ANDN    $D,$B,$F0
+||     ADD     $K,$E,$T                ; T=E+K
+
+       XOR     $F0,$F,$F               ; F_00_19(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+||     SWAP2   $TX1,$TX2
+
+       ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     SWAP4   $TX2,$TX2               ; byte swap
+||     XOR     $X0,$X2,$TX0            ; Xupdate XORs are 1 iteration ahead
+||     LDW     *${XPA}++,$X0
+||     LDW     *${XPB}[4],$X2
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+||     XOR     $X8,$X13,$TX1
+||     LDW     *${XPA}[7],$X8
+||     MV      $TX3,$X13               ; ||    LDW     *${XPB}[15],$X13
+||     MV      $TX2,$TX3
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+||     STW     $TX2,*${XPB}++
+||     XOR     $TX0,$TX1,$TX1
+||     MVK     3,B0
+;;==================================================
+       SPLOOPD 5                       ; BODY_16_19
+||     MVC     B0,ILC
+
+       ROTL    $A,5,$Arot
+||     AND     $C,$B,$F
+||     ANDN    $D,$B,$F0
+||     ADD     $K,$E,$T                ; T=E+K
+||     ROTL    $TX1,1,$TX2             ; Xupdate output
+
+       XOR     $F0,$F,$F               ; F_00_19(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+
+       ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     XOR     $X0,$X2,$TX0
+||     LDW     *${XPA}++,$X0
+||     LDW     *${XPB}[4],$X2
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+||     XOR     $X8,$X13,$TX1
+||     LDW     *${XPA}[7],$X8
+||     MV      $TX3,$X13               ; ||    LDW     *${XPB}[15],$X13
+||     MV      $TX2,$TX3
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+||     STW     $TX2,*${XPB}++
+||     XOR     $TX0,$TX1,$TX1
+       SPKERNEL
+
+       MVK     0xffffeba1,$K
+||     MVK     19,B0
+       MVKH    0x6ed90000,$K           ; K_20_39
+___
+sub BODY_20_39 {
+$code.=<<___;
+;;==================================================
+       SPLOOPD 5                       ; BODY_20_39
+||     MVC     B0,ILC
+
+       ROTL    $A,5,$Arot
+||     XOR     $B,$C,$F
+||     ADD     $K,$E,$T                ; T=E+K
+||     ROTL    $TX1,1,$TX2             ; Xupdate output
+
+       XOR     $D,$F,$F                ; F_20_39(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+
+       ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     XOR     $X0,$X2,$TX0
+||     LDW     *${XPA}++,$X0
+||     LDW     *${XPB}[4],$X2
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+||     XOR     $X8,$X13,$TX1
+||     LDW     *${XPA}[7],$X8
+||     MV      $TX3,$X13               ; ||    LDW     *${XPB}[15],$X13
+||     MV      $TX2,$TX3
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+||     STW     $TX2,*${XPB}++          ; last one is redundant
+||     XOR     $TX0,$TX1,$TX1
+       SPKERNEL
+___
+$code.=<<___ if (!shift);
+       MVK     0xffffbcdc,$K
+       MVKH    0x8f1b0000,$K           ; K_40_59
+___
+}      &BODY_20_39();
+$code.=<<___;
+;;==================================================
+       SPLOOPD 5                       ; BODY_40_59
+||     MVC     B0,ILC
+||     AND     $B,$C,$F
+||     AND     $B,$D,$F0
+
+       ROTL    $A,5,$Arot
+||     XOR     $F0,$F,$F
+||     AND     $C,$D,$F0
+||     ADD     $K,$E,$T                ; T=E+K
+||     ROTL    $TX1,1,$TX2             ; Xupdate output
+
+       XOR     $F0,$F,$F               ; F_40_59(B,C,D)
+||     MV      $D,$E                   ; E=D
+||     MV      $C,$D                   ; D=C
+
+       ADD     $F,$T,$T                ; T+=F_40_59(B,C,D)
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+||     XOR     $X0,$X2,$TX0
+||     LDW     *${XPA}++,$X0
+||     LDW     *${XPB}[4],$X2
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     MV      $A,$B                   ; B=A
+||     XOR     $X8,$X13,$TX1
+||     LDW     *${XPA}[7],$X8
+||     MV      $TX3,$X13               ; ||    LDW     *${XPB}[15],$X13
+||     MV      $TX2,$TX3
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+||     STW     $TX2,*${XPB}++
+||     XOR     $TX0,$TX1,$TX1
+||     AND     $B,$C,$F
+||     AND     $B,$D,$F0
+       SPKERNEL
+
+       MVK     0xffffc1d6,$K
+||     MVK     18,B0
+       MVKH    0xca620000,$K           ; K_60_79
+___
+       &BODY_20_39(-1);                # BODY_60_78
+$code.=<<___;
+;;==================================================
+   [A0]        B       loop?
+||     ROTL    $A,5,$Arot              ; BODY_79
+||     XOR     $B,$C,$F
+||     ROTL    $TX1,1,$TX2             ; Xupdate output
+
+   [A0]        LDNW    *${INP}++,$TX1          ; pre-fetch input
+||     ADD     $K,$E,$T                ; T=E+K
+||     XOR     $D,$F,$F                ; F_20_39(B,C,D)
+
+       ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
+||     ADD     $Ectx,$D,$E             ; E=D,E+=Ectx
+||     ADD     $Dctx,$C,$D             ; D=C,D+=Dctx
+||     ROTL    $B,30,$C                ; C=ROL(B,30)
+
+       ADD     $Arot,$T,$T             ; T+=ROL(A,5)
+||     ADD     $Bctx,$A,$B             ; B=A,B+=Bctx
+
+       ADD     $TX2,$T,$A              ; A=T+Xi
+
+       ADD     $Actx,$A,$A             ; A+=Actx
+||     ADD     $Cctx,$C,$C             ; C+=Cctx
+;; end of loop?
+
+       BNOP    RA                      ; return
+||     MV      FP,SP                   ; restore stack pointer
+||     LDW     *FP[0],FP               ; restore frame pointer
+       STW     $A,*${CTX}[0]           ; emit A-E...
+||     MVK     0,B0
+       STW     $B,*${CTX}[1]
+||     MVC     B0,AMR                  ; clear AMR
+       STW     $C,*${CTX}[2]
+       STW     $D,*${CTX}[3]
+       STW     $E,*${CTX}[4]
+       .endasmfunc
+
+       .sect   .const
+       .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
+close STDOUT;
diff --git a/crypto/sha/asm/sha256-c64xplus.pl b/crypto/sha/asm/sha256-c64xplus.pl
new file mode 100644 (file)
index 0000000..5a05786
--- /dev/null
@@ -0,0 +1,302 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA256 for C64x+.
+#
+# January 2012
+#
+# Performance is just below 10 cycles per processed byte, which is
+# almost 40% faster than compiler-generated code. Unroll is unlikely
+# to give more than ~8% improvement...
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K256="A3";
+
+($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
+       =map("A$_",(16..31));
+($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
+       =map("B$_",(16..31));
+
+($Xia,$Xib)=("A5","B5");                       # circular/ring buffer
+ $CTXB=$t2e;
+
+($Xn,$X0,$K)=("B7","B8","B9");
+($Maj,$Ch)=($T2,"B6");
+
+$code.=<<___;
+       .text
+       .if     __TI_EABI__
+       .nocmp
+       .endif
+
+       .asg    B3,RA
+       .asg    A15,FP
+       .asg    B15,SP
+
+       .if     .BIG_ENDIAN
+       .asg    SWAP2,MV
+       .asg    SWAP4,MV
+       .endif
+
+       .global _sha256_block_data_order
+_sha256_block_data_order:
+       .asmfunc stack_usage(64)
+       MV      $NUM,A0                         ; reassign $NUM
+||     MVK     -64,B0
+  [!A0]        BNOP    RA                              ; if ($NUM==0) return;
+|| [A0]        STW     FP,*SP--[16]                    ; save frame pointer and alloca(64)
+|| [A0]        MV      SP,FP
+   [A0]        ADDKPC  _sha256_block_data_order,B2
+|| [A0]        AND     B0,SP,SP                        ; align stack at 64 bytes
+       .if     __TI_EABI__
+   [A0]        MVK     0x00404,B1
+|| [A0]        MVKL    \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
+   [A0]        MVKH    0x50000,B1
+|| [A0]        MVKH    \$PCR_OFFSET(K256,_sha256_block_data_order),$K256
+       .else
+   [A0]        MVK     0x00404,B1
+|| [A0]        MVKL    (K256-_sha256_block_data_order),$K256
+   [A0]        MVKH    0x50000,B1
+|| [A0]        MVKH    (K256-_sha256_block_data_order),$K256
+       .endif
+   [A0]        MVC     B1,AMR                          ; setup circular addressing
+|| [A0]        MV      SP,$Xia
+   [A0]        MV      SP,$Xib
+|| [A0]        ADD     B2,$K256,$K256
+|| [A0]        MV      $CTXA,$CTXB
+|| [A0]        SUBAW   SP,2,SP                         ; reserve two words above buffer
+       LDW     *${CTXA}[0],$A                  ; load ctx
+||     LDW     *${CTXB}[4],$E
+       LDW     *${CTXA}[1],$B
+||     LDW     *${CTXB}[5],$F
+       LDW     *${CTXA}[2],$C
+||     LDW     *${CTXB}[6],$G
+       LDW     *${CTXA}[3],$D
+||     LDW     *${CTXB}[7],$H
+
+       LDNW    *$INP++,$Xn                     ; pre-fetch input
+       LDW     *$K256++,$K                     ; pre-fetch K256[0]
+       MVK     14,B0                           ; loop counters
+       MVK     47,B1
+||     ADDAW   $Xia,9,$Xia
+outerloop?:
+       SUB     A0,1,A0
+||     MV      $A,$Actx
+||     MV      $E,$Ectx
+||     MVD     $B,$Bctx
+||     MVD     $F,$Fctx
+       MV      $C,$Cctx
+||     MV      $G,$Gctx
+||     MVD     $D,$Dctx
+||     MVD     $H,$Hctx
+||     SWAP4   $Xn,$X0
+
+       SPLOOPD 8                               ; BODY_00_14
+||     MVC     B0,ILC
+||     SWAP2   $X0,$X0
+
+       LDNW    *$INP++,$Xn
+||     ROTL    $A,30,$S0
+||     OR      $A,$B,$Maj
+||     AND     $A,$B,$t2a
+||     ROTL    $E,26,$S1
+||     AND     $F,$E,$Ch
+||     ANDN    $G,$E,$t2e
+       ROTL    $A,19,$t0a
+||     AND     $C,$Maj,$Maj
+||     ROTL    $E,21,$t0e
+||     XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
+       ROTL    $A,10,$t1a
+||     OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||     ROTL    $E,7,$t1e
+||     ADD     $K,$H,$T1                       ; T1 = h + K256[i]
+       ADD     $X0,$T1,$T1                     ; T1 += X[i];
+||     STW     $X0,*$Xib++
+||     XOR     $t0a,$S0,$S0
+||     XOR     $t0e,$S1,$S1
+       XOR     $t1a,$S0,$S0                    ; Sigma0(a)
+||     XOR     $t1e,$S1,$S1                    ; Sigma1(e)
+||     LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
+||     ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
+       ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
+||     ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
+||     ROTL    $G,0,$H                         ; h = g
+||     MV      $F,$G                           ; g = f
+||     MV      $X0,$X14
+||     SWAP4   $Xn,$X0
+       SWAP2   $X0,$X0
+||     MV      $E,$F                           ; f = e
+||     ADD     $D,$T1,$E                       ; e = d + T1
+||     MV      $C,$D                           ; d = c
+       MV      $B,$C                           ; c = b
+||     MV      $A,$B                           ; b = a
+||     ADD     $T1,$T2,$A                      ; a = T1 + T2
+       SPKERNEL
+
+       ROTL    $A,30,$S0                       ; BODY_15
+||     OR      $A,$B,$Maj
+||     AND     $A,$B,$t2a
+||     ROTL    $E,26,$S1
+||     AND     $F,$E,$Ch
+||     ANDN    $G,$E,$t2e
+||     LDW     *${Xib}[1],$Xn                  ; modulo-scheduled
+       ROTL    $A,19,$t0a
+||     AND     $C,$Maj,$Maj
+||     ROTL    $E,21,$t0e
+||     XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
+||     LDW     *${Xib}[2],$X1                  ; modulo-scheduled
+       ROTL    $A,10,$t1a
+||     OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||     ROTL    $E,7,$t1e
+||     ADD     $K,$H,$T1                       ; T1 = h + K256[i]
+       ADD     $X0,$T1,$T1                     ; T1 += X[i];
+||     STW     $X0,*$Xib++
+||     XOR     $t0a,$S0,$S0
+||     XOR     $t0e,$S1,$S1
+       XOR     $t1a,$S0,$S0                    ; Sigma0(a)
+||     XOR     $t1e,$S1,$S1                    ; Sigma1(e)
+||     LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
+||     ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
+       ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
+||     ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
+||     ROTL    $G,0,$H                         ; h = g
+||     MV      $F,$G                           ; g = f
+||     MV      $X0,$X15
+       MV      $E,$F                           ; f = e
+||     ADD     $D,$T1,$E                       ; e = d + T1
+||     MV      $C,$D                           ; d = c
+||     MV      $Xn,$X0                         ; modulo-scheduled
+||     LDW     *$Xia,$X9                       ; modulo-scheduled
+||     ROTL    $X1,25,$t0e                     ; modulo-scheduled
+||     ROTL    $X14,15,$t0a                    ; modulo-scheduled
+       SHRU    $X1,3,$s0                       ; modulo-scheduled
+||     SHRU    $X14,10,$s1                     ; modulo-scheduled
+||     ROTL    $B,0,$C                         ; c = b
+||     MV      $A,$B                           ; b = a
+||     ADD     $T1,$T2,$A                      ; a = T1 + T2
+
+       SPLOOPD 10                              ; BODY_16_63
+||     MVC     B1,ILC
+||     ROTL    $X1,14,$t1e                     ; modulo-scheduled
+||     ROTL    $X14,13,$t1a                    ; modulo-scheduled
+
+       XOR     $t0e,$s0,$s0
+||     XOR     $t0a,$s1,$s1
+||     MV      $X15,$X14
+||     MV      $X1,$Xn
+       XOR     $t1e,$s0,$s0                    ; sigma0(X[i+1])
+||     XOR     $t1a,$s1,$s1                    ; sigma1(X[i+14])
+||     LDW     *${Xib}[2],$X1                  ; module-scheduled
+       ROTL    $A,30,$S0
+||     OR      $A,$B,$Maj
+||     AND     $A,$B,$t2a
+||     ROTL    $E,26,$S1
+||     AND     $F,$E,$Ch
+||     ANDN    $G,$E,$t2e
+||     ADD     $X9,$X0,$X0                     ; X[i] += X[i+9]
+       ROTL    $A,19,$t0a
+||     AND     $C,$Maj,$Maj
+||     ROTL    $E,21,$t0e
+||     XOR     $t2e,$Ch,$Ch                    ; Ch(e,f,g) = (e&f)^(~e&g)
+||     ADD     $s0,$X0,$X0                     ; X[i] += sigma1(X[i+1])
+       ROTL    $A,10,$t1a
+||     OR      $t2a,$Maj,$Maj                  ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||     ROTL    $E,7,$t1e
+||     ADD     $H,$K,$T1                       ; T1 = h + K256[i]
+||     ADD     $s1,$X0,$X0                     ; X[i] += sigma1(X[i+14])
+       XOR     $t0a,$S0,$S0
+||     XOR     $t0e,$S1,$S1
+||     ADD     $X0,$T1,$T1                     ; T1 += X[i]
+||     STW     $X0,*$Xib++
+       XOR     $t1a,$S0,$S0                    ; Sigma0(a)
+||     XOR     $t1e,$S1,$S1                    ; Sigma1(e)
+||     ADD     $Ch,$T1,$T1                     ; T1 += Ch(e,f,g)
+||     MV      $X0,$X15
+||     ROTL    $G,0,$H                         ; h = g
+||     LDW     *$K256++,$K                     ; pre-fetch K256[i+1]
+       ADD     $S1,$T1,$T1                     ; T1 += Sigma1(e)
+||     ADD     $S0,$Maj,$T2                    ; T2 = Sigma0(a) + Maj(a,b,c)
+||     MV      $F,$G                           ; g = f
+||     MV      $Xn,$X0                         ; modulo-scheduled
+||     LDW     *++$Xia,$X9                     ; modulo-scheduled
+||     ROTL    $X1,25,$t0e                     ; module-scheduled
+||     ROTL    $X14,15,$t0a                    ; modulo-scheduled
+       ROTL    $X1,14,$t1e                     ; modulo-scheduled
+||     ROTL    $X14,13,$t1a                    ; modulo-scheduled
+||     MV      $E,$F                           ; f = e
+||     ADD     $D,$T1,$E                       ; e = d + T1
+||     MV      $C,$D                           ; d = c
+||     MV      $B,$C                           ; c = b
+       MV      $A,$B                           ; b = a
+||     ADD     $T1,$T2,$A                      ; a = T1 + T2
+||     SHRU    $X1,3,$s0                       ; modulo-scheduled
+||     SHRU    $X14,10,$s1                     ; modulo-scheduled
+       SPKERNEL
+
+   [A0]        B       outerloop?
+|| [A0]        LDNW    *$INP++,$Xn                     ; pre-fetch input
+|| [A0]        ADDK    -260,$K256                      ; rewind K256
+||     ADD     $Actx,$A,$A                     ; accumulate ctx
+||     ADD     $Ectx,$E,$E
+||     ADD     $Bctx,$B,$B
+       ADD     $Fctx,$F,$F
+||     ADD     $Cctx,$C,$C
+||     ADD     $Gctx,$G,$G
+||     ADD     $Dctx,$D,$D
+||     ADD     $Hctx,$H,$H
+|| [A0]        LDW     *$K256++,$K                     ; pre-fetch K256[0]
+
+  [!A0]        BNOP    RA
+||[!A0]        MV      $CTXA,$CTXB
+  [!A0]        MV      FP,SP                           ; restore stack pointer
+||[!A0]        LDW     *FP[0],FP                       ; restore frame pointer
+  [!A0]        STW     $A,*${CTXA}[0]                  ; save ctx
+||[!A0]        STW     $E,*${CTXB}[4]
+||[!A0]        MVK     0,B0
+  [!A0]        STW     $B,*${CTXA}[1]
+||[!A0]        STW     $F,*${CTXB}[5]
+||[!A0]        MVC     B0,AMR                          ; clear AMR
+       STW     $C,*${CTXA}[2]
+||     STW     $G,*${CTXB}[6]
+       STW     $D,*${CTXA}[3]
+||     STW     $H,*${CTXB}[7]
+       .endasmfunc
+
+       .sect   ".const:sha_asm"
+       .align  128
+K256:
+       .uword  0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
+       .uword  0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
+       .uword  0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
+       .uword  0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
+       .uword  0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
+       .uword  0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
+       .uword  0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
+       .uword  0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
+       .uword  0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
+       .uword  0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
+       .uword  0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
+       .uword  0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
+       .uword  0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
+       .uword  0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
+       .uword  0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
+       .uword  0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
+       .cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+
+___
+
+print $code;
diff --git a/crypto/sha/asm/sha512-c64xplus.pl b/crypto/sha/asm/sha512-c64xplus.pl
new file mode 100644 (file)
index 0000000..e4e7c04
--- /dev/null
@@ -0,0 +1,421 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# SHA512 for C64x+.
+#
+# January 2012
+#
+# Performance is 19 cycles per processed byte. Compared to block
+# transform function from sha512.c compiled with cl6x with -mv6400+
+# -o2 -DOPENSSL_SMALL_FOOTPRINT it's almost 7x faster and 2x smaller.
+# Loop unroll won't make it, this implementation, any faster, because
+# it's effectively dominated by SHRU||SHL pairs and you can't schedule
+# more of them.
+#
+# !!! Note that this module uses AMR, which means that all interrupt
+# service routines are expected to preserve it and for own well-being
+# zero it upon entry.
+
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
+open STDOUT,">$output";
+
+($CTXA,$INP,$NUM) = ("A4","B4","A6");            # arguments
+ $K512="A3";
+
+($Ahi,$Actxhi,$Bhi,$Bctxhi,$Chi,$Cctxhi,$Dhi,$Dctxhi,
+ $Ehi,$Ectxhi,$Fhi,$Fctxhi,$Ghi,$Gctxhi,$Hhi,$Hctxhi)=map("A$_",(16..31));
+($Alo,$Actxlo,$Blo,$Bctxlo,$Clo,$Cctxlo,$Dlo,$Dctxlo,
+ $Elo,$Ectxlo,$Flo,$Fctxlo,$Glo,$Gctxlo,$Hlo,$Hctxlo)=map("B$_",(16..31));
+
+($S1hi,$CHhi,$S0hi,$t0hi)=map("A$_",(10..13));
+($S1lo,$CHlo,$S0lo,$t0lo)=map("B$_",(10..13));
+($T1hi,         $T2hi)=         ("A6","A7");
+($T1lo,$T1carry,$T2lo,$T2carry)=("B6","B7","B8","B9");
+($Khi,$Klo)=("A9","A8");
+($MAJhi,$MAJlo)=($T2hi,$T2lo);
+($t1hi,$t1lo)=($Khi,"B2");
+ $CTXB=$t1lo;
+
+($Xihi,$Xilo)=("A5","B5");                     # circular/ring buffer
+
+$code.=<<___;
+       .text
+       .if     __TI_EABI__
+       .nocmp
+       .endif
+
+       .asg    B3,RA
+       .asg    A15,FP
+       .asg    B15,SP
+
+       .if     .BIG_ENDIAN
+       .asg    $Khi,KHI
+       .asg    $Klo,KLO
+       .else
+       .asg    $Khi,KLO
+       .asg    $Klo,KHI
+       .endif
+
+       .global _sha512_block_data_order
+_sha512_block_data_order:
+       .asmfunc stack_usage(40+128)
+       MV      $NUM,A0                         ; reassign $NUM
+||     MVK     -128,B0
+  [!A0]        BNOP    RA                              ; if ($NUM==0) return;
+|| [A0]        STW     FP,*SP--(40)                    ; save frame pointer
+|| [A0]        MV      SP,FP
+   [A0]        STDW    B13:B12,*SP[4]
+|| [A0]        MVK     0x00404,B1
+   [A0]        STDW    B11:B10,*SP[3]
+|| [A0]        STDW    A13:A12,*FP[-3]
+|| [A0]        MVKH    0x60000,B1
+   [A0]        STDW    A11:A10,*SP[1]
+|| [A0]        MVC     B1,AMR                          ; setup circular addressing
+|| [A0]        ADD     B0,SP,SP                        ; alloca(128)
+       .if     __TI_EABI__
+   [A0]        AND     B0,SP,SP                        ; align stack at 128 bytes
+|| [A0]        ADDKPC  _sha512_block_data_order,B1
+|| [A0]        MVKL    \$PCR_OFFSET(K512,_sha512_block_data_order),$K512
+   [A0]        MVKH    \$PCR_OFFSET(K512,_sha512_block_data_order),$K512
+|| [A0]        SUBAW   SP,2,SP                         ; reserve two words above buffer
+       .else
+   [A0]        AND     B0,SP,SP                        ; align stack at 128 bytes
+|| [A0]        ADDKPC  _sha512_block_data_order,B1
+|| [A0]        MVKL    (K512-_sha512_block_data_order),$K512
+   [A0]        MVKH    (K512-_sha512_block_data_order),$K512
+|| [A0]        SUBAW   SP,2,SP                         ; reserve two words above buffer
+       .endif
+       ADDAW   SP,3,$Xilo
+       ADDAW   SP,2,$Xihi
+
+||     MV      $CTXA,$CTXB
+       LDW     *${CTXA}[0^.LITTLE_ENDIAN],$Ahi ; load ctx
+||     LDW     *${CTXB}[1^.LITTLE_ENDIAN],$Alo
+||     ADD     B1,$K512,$K512
+       LDW     *${CTXA}[2^.LITTLE_ENDIAN],$Bhi
+||     LDW     *${CTXB}[3^.LITTLE_ENDIAN],$Blo
+       LDW     *${CTXA}[4^.LITTLE_ENDIAN],$Chi
+||     LDW     *${CTXB}[5^.LITTLE_ENDIAN],$Clo
+       LDW     *${CTXA}[6^.LITTLE_ENDIAN],$Dhi
+||     LDW     *${CTXB}[7^.LITTLE_ENDIAN],$Dlo
+       LDW     *${CTXA}[8^.LITTLE_ENDIAN],$Ehi
+||     LDW     *${CTXB}[9^.LITTLE_ENDIAN],$Elo
+       LDW     *${CTXA}[10^.LITTLE_ENDIAN],$Fhi
+||     LDW     *${CTXB}[11^.LITTLE_ENDIAN],$Flo
+       LDW     *${CTXA}[12^.LITTLE_ENDIAN],$Ghi
+||     LDW     *${CTXB}[13^.LITTLE_ENDIAN],$Glo
+       LDW     *${CTXA}[14^.LITTLE_ENDIAN],$Hhi
+||     LDW     *${CTXB}[15^.LITTLE_ENDIAN],$Hlo
+
+       LDNDW   *$INP++,B11:B10                 ; pre-fetch input
+       LDDW    *$K512++,$Khi:$Klo              ; pre-fetch K512[0]
+outerloop?:
+       MVK     15,B0                           ; loop counters
+||     MVK     64,B1
+||     SUB     A0,1,A0
+       MV      $Ahi,$Actxhi
+||     MV      $Alo,$Actxlo
+||     MV      $Bhi,$Bctxhi
+||     MV      $Blo,$Bctxlo
+||     MV      $Chi,$Cctxhi
+||     MV      $Clo,$Cctxlo
+||     MVD     $Dhi,$Dctxhi
+||     MVD     $Dlo,$Dctxlo
+       MV      $Ehi,$Ectxhi
+||     MV      $Elo,$Ectxlo
+||     MV      $Fhi,$Fctxhi
+||     MV      $Flo,$Fctxlo
+||     MV      $Ghi,$Gctxhi
+||     MV      $Glo,$Gctxlo
+||     MVD     $Hhi,$Hctxhi
+||     MVD     $Hlo,$Hctxlo
+loop0_15?:
+       .if     .BIG_ENDIAN
+       MV      B11,$T1hi
+||     MV      B10,$T1lo
+       .else
+       SWAP4   B10,$T1hi
+||     SWAP4   B11,$T1lo
+       SWAP2   $T1hi,$T1hi
+||     SWAP2   $T1lo,$T1lo
+       .endif
+loop16_79?:
+       STW     $T1hi,*$Xihi++[2]
+||     STW     $T1lo,*$Xilo++[2]                       ; X[i] = T1
+||     ADD     $Hhi,$T1hi,$T1hi
+||     ADDU    $Hlo,$T1lo,$T1carry:$T1lo               ; T1 += h
+||     SHRU    $Ehi,14,$S1hi
+||     SHL     $Ehi,32-14,$S1lo
+       XOR     $Fhi,$Ghi,$CHhi
+||     XOR     $Flo,$Glo,$CHlo
+||     ADD     KHI,$T1hi,$T1hi
+||     ADDU    KLO,$T1carry:$T1lo,$T1carry:$T1lo       ; T1 += K512[i]
+||     SHRU    $Elo,14,$t0lo
+||     SHL     $Elo,32-14,$t0hi
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     AND     $Ehi,$CHhi,$CHhi
+||     AND     $Elo,$CHlo,$CHlo
+||     ROTL    $Ghi,0,$Hhi
+||     ROTL    $Glo,0,$Hlo                             ; h = g
+||     SHRU    $Ehi,18,$t0hi
+||     SHL     $Ehi,32-18,$t0lo
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     XOR     $Ghi,$CHhi,$CHhi
+||     XOR     $Glo,$CHlo,$CHlo                        ; Ch(e,f,g) = ((f^g)&e)^g
+||     ROTL    $Fhi,0,$Ghi
+||     ROTL    $Flo,0,$Glo                             ; g = f
+||     SHRU    $Elo,18,$t0lo
+||     SHL     $Elo,32-18,$t0hi
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     OR      $Ahi,$Bhi,$MAJhi
+||     OR      $Alo,$Blo,$MAJlo
+||     ROTL    $Ehi,0,$Fhi
+||     ROTL    $Elo,0,$Flo                             ; f = e
+||     SHRU    $Ehi,41-32,$t0lo
+||     SHL     $Ehi,64-41,$t0hi
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     AND     $Chi,$MAJhi,$MAJhi
+||     AND     $Clo,$MAJlo,$MAJlo
+||     ROTL    $Dhi,0,$Ehi
+||     ROTL    $Dlo,0,$Elo                             ; e = d
+||     SHRU    $Elo,41-32,$t0hi
+||     SHL     $Elo,64-41,$t0lo
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo                       ; Sigma1(e)
+||     AND     $Ahi,$Bhi,$t1hi
+||     AND     $Alo,$Blo,$t1lo
+||     ROTL    $Chi,0,$Dhi
+||     ROTL    $Clo,0,$Dlo                             ; d = c
+||     SHRU    $Ahi,28,$S0hi
+||     SHL     $Ahi,32-28,$S0lo
+       OR      $t1hi,$MAJhi,$MAJhi
+||     OR      $t1lo,$MAJlo,$MAJlo                     ; Maj(a,b,c) = ((a|b)&c)|(a&b)
+||     ADD     $CHhi,$T1hi,$T1hi
+||     ADDU    $CHlo,$T1carry:$T1lo,$T1carry:$T1lo     ; T1 += Ch(e,f,g)
+||     ROTL    $Bhi,0,$Chi
+||     ROTL    $Blo,0,$Clo                             ; c = b
+||     SHRU    $Alo,28,$t0lo
+||     SHL     $Alo,32-28,$t0hi
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     ADD     $S1hi,$T1hi,$T1hi
+||     ADDU    $S1lo,$T1carry:$T1lo,$T1carry:$T1lo     ; T1 += Sigma1(e)
+||     ROTL    $Ahi,0,$Bhi
+||     ROTL    $Alo,0,$Blo                             ; b = a
+||     SHRU    $Ahi,34-32,$t0lo
+||     SHL     $Ahi,64-34,$t0hi
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     ADD     $MAJhi,$T1hi,$T2hi
+||     ADDU    $MAJlo,$T1carry:$T1lo,$T2carry:$T2lo    ; T2 = T1+Maj(a,b,c)
+||     SHRU    $Alo,34-32,$t0hi
+||     SHL     $Alo,64-34,$t0lo
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     ADD     $Ehi,$T1hi,$T1hi
+||     ADDU    $Elo,$T1carry:$T1lo,$T1carry:$T1lo      ; T1 += e
+|| [B0]        BNOP    loop0_15?
+||     SHRU    $Ahi,39-32,$t0lo
+||     SHL     $Ahi,64-39,$t0hi
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+|| [B0]        LDNDW   *$INP++,B11:B10                         ; pre-fetch input
+||[!B1]        BNOP    break?
+||     SHRU    $Alo,39-32,$t0hi
+||     SHL     $Alo,64-39,$t0lo
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo                       ; Sigma0(a)
+||     ADD     $T1carry,$T1hi,$Ehi
+||     MV      $T1lo,$Elo                              ; e = T1
+||[!B0]        LDW     *${Xihi}[28],$T1hi
+||[!B0]        LDW     *${Xilo}[28],$T1lo                      ; X[i+14]
+       ADD     $S0hi,$T2hi,$T2hi
+||     ADDU    $S0lo,$T2carry:$T2lo,$T2carry:$T2lo     ; T2 += Sigma0(a)
+|| [B1]        LDDW    *$K512++,$Khi:$Klo                      ; pre-fetch K512[i]
+       NOP                                             ; avoid cross-path stall
+       ADD     $T2carry,$T2hi,$Ahi
+||     MV      $T2lo,$Alo                              ; a = T2
+|| [B0]        SUB     B0,1,B0
+;;===== branch to loop00_15? is taken here
+       NOP
+;;===== branch to break? is taken here
+       LDW     *${Xihi}[2],$T2hi
+||     LDW     *${Xilo}[2],$T2lo                       ; X[i+1]
+||     SHRU    $T1hi,19,$S1hi
+||     SHL     $T1hi,32-19,$S1lo
+       SHRU    $T1lo,19,$t0lo
+||     SHL     $T1lo,32-19,$t0hi
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     SHRU    $T1hi,61-32,$t0lo
+||     SHL     $T1hi,64-61,$t0hi
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     SHRU    $T1lo,61-32,$t0hi
+||     SHL     $T1lo,64-61,$t0lo
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     SHRU    $T1hi,6,$t0hi
+||     SHL     $T1hi,32-6,$t0lo
+       XOR     $t0hi,$S1hi,$S1hi
+||     XOR     $t0lo,$S1lo,$S1lo
+||     SHRU    $T1lo,6,$t0lo
+||     LDW     *${Xihi}[18],$T1hi
+||     LDW     *${Xilo}[18],$T1lo                      ; X[i+9]
+       XOR     $t0lo,$S1lo,$S1lo                       ; sigma1(Xi[i+14])
+
+||     LDW     *${Xihi}[0],$CHhi
+||     LDW     *${Xilo}[0],$CHlo                       ; X[i]
+||     SHRU    $T2hi,1,$S0hi
+||     SHL     $T2hi,32-1,$S0lo
+       SHRU    $T2lo,1,$t0lo
+||     SHL     $T2lo,32-1,$t0hi
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     SHRU    $T2hi,8,$t0hi
+||     SHL     $T2hi,32-8,$t0lo
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     SHRU    $T2lo,8,$t0lo
+||     SHL     $T2lo,32-8,$t0hi
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     ADD     $S1hi,$T1hi,$T1hi
+||     ADDU    $S1lo,$T1lo,$T1carry:$T1lo              ; T1 = X[i+9]+sigma1()
+|| [B1]        BNOP    loop16_79?
+||     SHRU    $T2hi,7,$t0hi
+||     SHL     $T2hi,32-7,$t0lo
+       XOR     $t0hi,$S0hi,$S0hi
+||     XOR     $t0lo,$S0lo,$S0lo
+||     ADD     $CHhi,$T1hi,$T1hi
+||     ADDU    $CHlo,$T1carry:$T1lo,$T1carry:$T1lo     ; T1 += X[i]
+||     SHRU    $T2lo,7,$t0lo
+       XOR     $t0lo,$S0lo,$S0lo                       ; sigma0(Xi[i+1]
+
+       ADD     $S0hi,$T1hi,$T1hi
+||     ADDU    $S0lo,$T1carry:$T1lo,$T1carry:$T1lo     ; T1 += sigma0()
+|| [B1]        SUB     B1,1,B1
+       NOP                                             ; avoid cross-path stall
+       ADD     $T1carry,$T1hi,$T1hi
+;;===== branch to loop16_79? is taken here
+
+break?:
+       ADD     $Ahi,$Actxhi,$Ahi               ; accumulate ctx
+||     ADDU    $Alo,$Actxlo,$Actxlo:$Alo
+|| [A0]        LDNDW   *$INP++,B11:B10                 ; pre-fetch input
+|| [A0]        ADDK    -640,$K512                      ; rewind pointer to K512
+       ADD     $Bhi,$Bctxhi,$Bhi
+||     ADDU    $Blo,$Bctxlo,$Bctxlo:$Blo
+|| [A0]        LDDW    *$K512++,$Khi:$Klo              ; pre-fetch K512[0]
+       ADD     $Chi,$Cctxhi,$Chi
+||     ADDU    $Clo,$Cctxlo,$Cctxlo:$Clo
+||     ADD     $Actxlo,$Ahi,$Ahi
+||[!A0]        MV      $CTXA,$CTXB
+       ADD     $Dhi,$Dctxhi,$Dhi
+||     ADDU    $Dlo,$Dctxlo,$Dctxlo:$Dlo
+||     ADD     $Bctxlo,$Bhi,$Bhi
+||[!A0]        STW     $Ahi,*${CTXA}[0^.LITTLE_ENDIAN] ; save ctx
+||[!A0]        STW     $Alo,*${CTXB}[1^.LITTLE_ENDIAN]
+       ADD     $Ehi,$Ectxhi,$Ehi
+||     ADDU    $Elo,$Ectxlo,$Ectxlo:$Elo
+||     ADD     $Cctxlo,$Chi,$Chi
+|| [A0]        BNOP    outerloop?
+||[!A0]        STW     $Bhi,*${CTXA}[2^.LITTLE_ENDIAN]
+||[!A0]        STW     $Blo,*${CTXB}[3^.LITTLE_ENDIAN]
+       ADD     $Fhi,$Fctxhi,$Fhi
+||     ADDU    $Flo,$Fctxlo,$Fctxlo:$Flo
+||     ADD     $Dctxlo,$Dhi,$Dhi
+||[!A0]        STW     $Chi,*${CTXA}[4^.LITTLE_ENDIAN]
+||[!A0]        STW     $Clo,*${CTXB}[5^.LITTLE_ENDIAN]
+       ADD     $Ghi,$Gctxhi,$Ghi
+||     ADDU    $Glo,$Gctxlo,$Gctxlo:$Glo
+||     ADD     $Ectxlo,$Ehi,$Ehi
+||[!A0]        STW     $Dhi,*${CTXA}[6^.LITTLE_ENDIAN]
+||[!A0]        STW     $Dlo,*${CTXB}[7^.LITTLE_ENDIAN]
+       ADD     $Hhi,$Hctxhi,$Hhi
+||     ADDU    $Hlo,$Hctxlo,$Hctxlo:$Hlo
+||     ADD     $Fctxlo,$Fhi,$Fhi
+||[!A0]        STW     $Ehi,*${CTXA}[8^.LITTLE_ENDIAN]
+||[!A0]        STW     $Elo,*${CTXB}[9^.LITTLE_ENDIAN]
+       ADD     $Gctxlo,$Ghi,$Ghi
+||[!A0]        STW     $Fhi,*${CTXA}[10^.LITTLE_ENDIAN]
+||[!A0]        STW     $Flo,*${CTXB}[11^.LITTLE_ENDIAN]
+       ADD     $Hctxlo,$Hhi,$Hhi
+||[!A0]        STW     $Ghi,*${CTXA}[12^.LITTLE_ENDIAN]
+||[!A0]        STW     $Glo,*${CTXB}[13^.LITTLE_ENDIAN]
+;;===== branch to outerloop? is taken here
+
+       STW     $Hhi,*${CTXA}[14^.LITTLE_ENDIAN]
+||     STW     $Hlo,*${CTXB}[15^.LITTLE_ENDIAN]
+||     MVK     -40,B0
+       ADD     FP,B0,SP                        ; destroy circular buffer
+||     LDDW    *FP[-4],A11:A10
+       LDDW    *SP[2],A13:A12
+||     LDDW    *FP[-2],B11:B10
+       LDDW    *SP[4],B13:B12
+||     BNOP    RA
+       LDW     *++SP(40),FP                    ; restore frame pointer
+       MVK     0,B0
+       MVC     B0,AMR                          ; clear AMR
+       NOP     2                               ; wait till FP is committed
+       .endasmfunc
+
+       .sect   ".const:sha_asm"
+       .align  128
+K512:
+       .uword  0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
+       .uword  0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
+       .uword  0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
+       .uword  0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
+       .uword  0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
+       .uword  0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
+       .uword  0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
+       .uword  0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
+       .uword  0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
+       .uword  0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
+       .uword  0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
+       .uword  0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
+       .uword  0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
+       .uword  0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
+       .uword  0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
+       .uword  0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
+       .uword  0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
+       .uword  0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
+       .uword  0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
+       .uword  0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
+       .uword  0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
+       .uword  0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
+       .uword  0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
+       .uword  0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
+       .uword  0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
+       .uword  0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
+       .uword  0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
+       .uword  0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
+       .uword  0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
+       .uword  0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
+       .uword  0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
+       .uword  0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
+       .uword  0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
+       .uword  0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
+       .uword  0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
+       .uword  0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
+       .uword  0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
+       .uword  0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
+       .uword  0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
+       .uword  0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
+       .cstring "SHA512 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
+       .align  4
+___
+
+print $code;
+close STDOUT;