Add "teaser" AES module for PowerISA 2.07.
authorAndy Polyakov <appro@openssl.org>
Mon, 12 May 2014 08:35:29 +0000 (10:35 +0200)
committerAndy Polyakov <appro@openssl.org>
Mon, 12 May 2014 08:35:29 +0000 (10:35 +0200)
"Teaser" means that it's not integrated yet and purpose of this
commit is primarily informational, to exhibit design choices,
such as how to handle alignment and endianness. In other words
it's proof-of-concept code that EVP module will build upon.

crypto/aes/asm/aesp8-ppc.pl [new file with mode: 0755]
crypto/perlasm/ppc-xlate.pl

diff --git a/crypto/aes/asm/aesp8-ppc.pl b/crypto/aes/asm/aesp8-ppc.pl
new file mode 100755 (executable)
index 0000000..f2e51cd
--- /dev/null
@@ -0,0 +1,722 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for AES instructions as per PowerISA
+# specification version 2.07, first implemented by POWER8 processor.
+# The module is endian-agnostic in sense that it supports both big-
+# and little-endian cases. As well as alignment-agnostic, and it is
+# guaranteed not to cause alignment exceptions. [One of options was
+# to use VSX loads and stores, which tolerate unaligned references,
+# but even then specification doesn't prohibit exceptions on page
+# boundaries.]
+
+$flavour = shift;
+
+if ($flavour =~ /64/) {
+       $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
+       $STU    ="stdu";
+       $POP    ="ld";
+       $PUSH   ="std";
+} elsif ($flavour =~ /32/) {
+       $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
+       $STU    ="stwu";
+       $POP    ="lwz";
+       $PUSH   ="stw";
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
+die "can't locate ppc-xlate.pl";
+
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
+
+$FRAME=8*$SIZE_T;
+$prefix="AES";
+
+$sp="r1";
+$vrsave="r12";
+
+{{{
+my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
+my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
+my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
+
+$code.=<<___;
+.machine       "any"
+
+.text
+
+.align 7
+rcon:
+.long  0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
+.long  0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
+.long  0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
+.long  0,0,0,0                                         ?asis
+Lconsts:
+       mflr    r0
+       bcl     20,31,\$+4
+       mflr    $ptr     #vvvvv "distance between . and rcon
+       addi    $ptr,$ptr,-0x48
+       mtlr    r0
+       blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.asciz "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
+
+.globl .${prefix}_set_encrypt_key
+.align 5
+.${prefix}_set_encrypt_key:
+Lset_encrypt_key:
+       mflr            r11
+       li              r0,0xfff
+       $PUSH           r11,$LRSAVE($sp)
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       bl              Lconsts
+       mtlr            r11
+
+       neg             r9,$inp
+       lvx             $in0,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       lvsr            $key,0,r9               # borrow $key
+       li              r8,0x20
+       cmpwi           $bits,192
+       lvx             $in1,0,$inp
+___
+$code.=<<___           if ($LITTLE_ENDIAN);
+       vspltisb        $mask,0x0f              # borrow $mask
+       vxor            $key,$key,$mask         # adjust for byte swap
+___
+$code.=<<___;
+       lvx             $rcon,0,$ptr
+       lvx             $mask,r8,$ptr
+       addi            $ptr,$ptr,0x10
+       vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
+       li              $cnt,8
+       vxor            $zero,$zero,$zero
+       mtctr           $cnt
+
+       ?lvsr           $outperm,0,$out
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$zero,$outmask,$outperm
+
+       blt             Loop128
+       addi            $inp,$inp,8
+       beq             L192
+       addi            $inp,$inp,8
+       b               L256
+
+.align 4
+Loop128:
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+       bdnz            Loop128
+
+       lvx             $rcon,0,$ptr            # last two round keys
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+
+       vperm           $key,$in0,$in0,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+
+       addi            $inp,$out,15            # 15 is not typo
+       addi            $out,$out,0x50
+
+       li              $rounds,10
+       b               Ldone
+
+.align 4
+L192:
+       lvx             $tmp,0,$inp
+       li              $cnt,4
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       vspltisb        $key,8                  # borrow $key
+       mtctr           $cnt
+       vsububm         $mask,$mask,$key        # adjust the mask
+
+Loop192:
+       vperm           $key,$in1,$in1,$mask    # roate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+       vcipherlast     $key,$key,$rcon
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+
+        vsldoi         $stage,$zero,$in1,8
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vsldoi         $stage,$stage,$in0,8
+
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+        vsldoi         $stage,$in0,$in1,8
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+        vperm          $outtail,$stage,$stage,$outperm # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vspltw          $tmp,$in0,3
+       vxor            $tmp,$tmp,$in1
+       vsldoi          $in1,$zero,$in1,12      # >>32
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in1,$in1,$tmp
+       vxor            $in0,$in0,$key
+       vxor            $in1,$in1,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdnz            Loop192
+
+       li              $rounds,12
+       addi            $out,$out,0x20
+       b               Ldone
+
+.align 4
+L256:
+       lvx             $tmp,0,$inp
+       li              $cnt,7
+       li              $rounds,14
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+       vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
+       mtctr           $cnt
+
+Loop256:
+       vperm           $key,$in1,$in1,$mask    # rotate-n-splat
+       vsldoi          $tmp,$zero,$in0,12      # >>32
+        vperm          $outtail,$in1,$in1,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+       vcipherlast     $key,$key,$rcon
+        stvx           $stage,0,$out
+        addi           $out,$out,16
+
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in0,$in0,$tmp
+        vadduwm        $rcon,$rcon,$rcon
+       vxor            $in0,$in0,$key
+        vperm          $outtail,$in0,$in0,$outperm     # rotate
+        vsel           $stage,$outhead,$outtail,$outmask
+        vmr            $outhead,$outtail
+        stvx           $stage,0,$out
+        addi           $inp,$out,15            # 15 is not typo
+        addi           $out,$out,16
+       bdz             Ldone
+
+       vspltw          $key,$in0,3             # just splat
+       vsldoi          $tmp,$zero,$in1,12      # >>32
+       vsbox           $key,$key
+
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+       vsldoi          $tmp,$zero,$tmp,12      # >>32
+       vxor            $in1,$in1,$tmp
+
+       vxor            $in1,$in1,$key
+       b               Loop256
+
+.align 4
+Ldone:
+       lvx             $in1,0,$inp             # redundant in aligned case
+       vsel            $in1,$outhead,$in1,$outmask
+       stvx            $in1,0,$inp
+       xor             r3,r3,r3                # return value
+       mtspr           256,$vrsave
+       stw             $rounds,0($out)
+
+       blr
+       .long           0
+       .byte           0,12,0x14,1,0,0,3,0
+.size  .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
+
+.globl .${prefix}_set_decrypt_key
+.align 5
+.${prefix}_set_decrypt_key:
+       $STU            $sp,-$FRAME($sp)
+       mflr            r10
+       $PUSH           r10,$FRAME+$LRSAVE($sp)
+       bl              Lset_encrypt_key
+       mtlr            r10
+
+       slwi            $cnt,$rounds,4
+       subi            $inp,$out,240           # first round key
+       srwi            $rounds,$rounds,1
+       add             $out,$inp,$cnt          # last round key
+       mtctr           $rounds
+
+Ldeckey:
+       lwz             r0, 0($inp)
+       lwz             r6, 4($inp)
+       lwz             r7, 8($inp)
+       lwz             r8, 12($inp)
+       addi            $inp,$inp,16
+       lwz             r9, 0($out)
+       lwz             r10,4($out)
+       lwz             r11,8($out)
+       lwz             r12,12($out)
+       stw             r0, 0($out)
+       stw             r6, 4($out)
+       stw             r7, 8($out)
+       stw             r8, 12($out)
+       subi            $out,$out,16
+       stw             r9, -16($inp)
+       stw             r10,-12($inp)
+       stw             r11,-8($inp)
+       stw             r12,-4($inp)
+       bdnz            Ldeckey
+
+       xor             r3,r3,r3                # return value
+       addi            $sp,$sp,$FRAME
+       blr
+       .long           0
+       .byte           0,12,4,1,0x80,0,3,0
+.size  .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
+___
+}}}
+{{{
+my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
+
+$code.=<<___;
+.globl .${prefix}_encrypt
+.align 5
+.${prefix}_encrypt:
+       lwz             $rounds,240($key)
+       li              r0,0x3f
+       mfspr           $vrsave,256
+       li              $idx,15                 # 15 is not typo
+       mtspr           256,r0
+
+       lvx             v0,0,$inp
+       neg             r11,$out
+       lvx             v1,$idx,$inp
+       lvsl            v2,0,$inp               # inpperm
+       `"vspltisb      v4,0x0f"                if ($LITTLE_ENDIAN)`
+       ?lvsl           v3,0,r11                # outperm
+       `"vxor          v2,v2,v4"               if ($LITTLE_ENDIAN)`
+       li              $idx,16
+       vperm           v0,v0,v1,v2             # align [and byte swap in LE]
+       lvx             v1,0,$key
+       ?lvsl           v5,0,$key               # keyperm
+       srwi            $rounds,$rounds,1
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       subi            $rounds,$rounds,1
+       ?vperm          v1,v1,v2,v5             # align round key
+
+       vxor            v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Loop_enc:
+       ?vperm          v2,v2,v1,v5
+       vcipher         v0,v0,v2
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          v1,v1,v2,v5
+       vcipher         v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_enc
+
+       ?vperm          v2,v2,v1,v5
+       vcipher         v0,v0,v2
+       lvx             v2,$idx,$key
+       ?vperm          v1,v1,v2,v5
+       vcipherlast     v0,v0,v1
+
+       vspltisb        v2,-1
+       vxor            v1,v1,v1
+       li              $idx,15                 # 15 is not typo
+       ?vperm          v2,v1,v2,v3             # outmask
+       `"vxor          v3,v3,v4"               if ($LITTLE_ENDIAN)`
+       lvx             v1,0,$out               # outhead
+       vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
+       vsel            v1,v1,v0,v2
+       lvx             v4,$idx,$out
+       stvx            v1,0,$out
+       vsel            v0,v0,v4,v2
+       stvx            v0,$idx,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,3,0
+.size  .${prefix}_encrypt,.-.${prefix}_encrypt
+
+.globl .${prefix}_decrypt
+.align 5
+.${prefix}_decrypt:
+       lwz             $rounds,240($key)
+       li              r0,0x3f
+       mfspr           $vrsave,256
+       li              $idx,15                 # 15 is not typo
+       mtspr           256,r0
+
+       lvx             v0,0,$inp
+       neg             r11,$out
+       lvx             v1,$idx,$inp
+       lvsl            v2,0,$inp               # inpperm
+       `"vspltisb      v4,0x0f"                if ($LITTLE_ENDIAN)`
+       ?lvsl           v3,0,r11                # outperm
+       `"vxor          v2,v2,v4"               if ($LITTLE_ENDIAN)`
+       li              $idx,16
+       vperm           v0,v0,v1,v2             # align [and byte swap in LE]
+       lvx             v1,0,$key
+       ?lvsl           v5,0,$key               # keyperm
+       srwi            $rounds,$rounds,1
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       subi            $rounds,$rounds,1
+       ?vperm          v1,v1,v2,v5             # align round key
+
+       vxor            v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       mtctr           $rounds
+
+Loop_dec:
+       ?vperm          v2,v2,v1,v5
+       vncipher        v0,v0,v2
+       lvx             v2,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          v1,v1,v2,v5
+       vncipher        v0,v0,v1
+       lvx             v1,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_dec
+
+       ?vperm          v2,v2,v1,v5
+       vncipher        v0,v0,v2
+       lvx             v2,$idx,$key
+       ?vperm          v1,v1,v2,v5
+       vncipherlast    v0,v0,v1
+
+       vspltisb        v2,-1
+       vxor            v1,v1,v1
+       li              $idx,15                 # 15 is not typo
+       ?vperm          v2,v1,v2,v3             # outmask
+       `"vxor          v3,v3,v4"               if ($LITTLE_ENDIAN)`
+       lvx             v1,0,$out               # outhead
+       vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
+       vsel            v1,v1,v0,v2
+       lvx             v4,$idx,$out
+       stvx            v1,0,$out
+       vsel            v0,v0,v4,v2
+       stvx            v0,$idx,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,3,0
+.size  .${prefix}_decrypt,.-.${prefix}_decrypt
+___
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$ivec,$tmp)=map("v$_",(0..4));
+my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=map("v$_",(5..10));
+
+$code.=<<___;
+.globl .${prefix}_cbc_encrypt
+.align 5
+.${prefix}_cbc_encrypt:
+       subic.          $len,$len,16
+       bltlr-
+
+       cmpwi           $enc,0                  # test direction
+       li              r0,0x7ff
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       li              $idx,15
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       `"vspltisb      $tmp,0x0f"              if ($LITTLE_ENDIAN)`
+
+       lvx             $ivec,0,$ivp            # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+       `"vxor          $inpperm,$inpperm,$tmp" if ($LITTLE_ENDIAN)`
+       vperm           $ivec,$ivec,$inptail,$inpperm
+
+       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
+       lwz             $rounds,240($key)
+
+       lvsl            $inpperm,0,$inp         # prepare for unaligned load
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       `"vxor          $inpperm,$inpperm,$tmp" if ($LITTLE_ENDIAN)`
+
+       ?lvsr           $outperm,0,$out         # prepare for unaligned store
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       `"vxor          $outperm,$outperm,$tmp" if ($LITTLE_ENDIAN)`
+
+       srwi            $rounds,$rounds,1
+       li              $idx,16
+       subi            $rounds,$rounds,1
+       beq             Lcbc_dec
+
+Lcbc_enc:
+       vmr             $inout,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       mtctr           $rounds
+
+       lvx             $rndkey0,0,$key
+        vperm          $inout,$inout,$inptail,$inpperm
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       vxor            $inout,$inout,$ivec
+
+Loop_cbc_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_cbc_enc
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipherlast     $ivec,$inout,$rndkey0
+       sub.            $len,$len,$idx          # len -=16
+
+       vperm           $tmp,$ivec,$ivec,$outperm
+       vsel            $inout,$outhead,$tmp,$outmask
+       vmr             $outhead,$tmp
+       stvx            $inout,0,$out
+       addi            $out,$out,16
+       bge             Lcbc_enc
+
+       b               Lcbc_done
+
+.align 4
+Lcbc_dec:
+       vmr             $tmp,$inptail
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,16
+       mtctr           $rounds
+
+       lvx             $rndkey0,0,$key
+        vperm          $tmp,$tmp,$inptail,$inpperm
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$tmp,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+
+Loop_cbc_dec:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipher        $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_cbc_dec
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vncipher        $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       li              $idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vncipherlast    $inout,$inout,$rndkey0
+       sub.            $len,$len,$idx          # len -=16
+
+       vxor            $inout,$inout,$ivec
+       vmr             $ivec,$tmp
+       vperm           $tmp,$inout,$inout,$outperm
+       vsel            $inout,$outhead,$tmp,$outmask
+       vmr             $outhead,$tmp
+       stvx            $inout,0,$out
+       addi            $out,$out,16
+       bge             Lcbc_dec
+
+Lcbc_done:
+       addi            $out,$out,-1
+       lvx             $inout,0,$out           # redundant in aligned case
+       vsel            $inout,$outhead,$inout,$outmask
+       stvx            $inout,0,$out
+
+       neg             $enc,$ivp               # write [unaligned] iv
+       li              $idx,15                 # 15 is not typo
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       vspltisb        $outmask,-1
+       `"vspltisb      $tmp,0x0f"              if ($LITTLE_ENDIAN)`
+       ?lvsl           $outperm,0,$enc
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       `"vxor          $outperm,$outperm,$tmp" if ($LITTLE_ENDIAN)`
+       lvx             $outhead,0,$ivp
+       vperm           $ivec,$ivec,$ivec,$outperm
+       vsel            $inout,$outhead,$ivec,$outmask
+       lvx             $inptail,$idx,$ivp
+       stvx            $inout,0,$ivp
+       vsel            $inout,$ivec,$inptail,$outmask
+       stvx            $inout,$idx,$ivp
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,6,0
+.size  .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
+___
+}}}
+
+my $consts=1;
+foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+       # constants table endian-specific conversion
+       if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
+           my $conv=$3;
+           my @bytes=();
+
+           # convert to endian-agnostic format
+           if ($1 eq "long") {
+             foreach (split(/,\s*/,$2)) {
+               my $l = /^0/?oct:int;
+               push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
+             }
+           } else {
+               @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
+           }
+
+           # little-endian conversion
+           if ($flavour =~ /le$/o) {
+               SWITCH: for($conv)  {
+                   /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
+                   /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
+               }
+           }
+
+           #emit
+           print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
+           next;
+       }
+       $consts=0 if (m/Lconsts:/o);    # end of table
+
+       # instructions prefixed with '?' are endian-specific and need
+       # to be adjusted accordingly...
+       if ($flavour =~ /le$/o) {       # little-endian
+           s/\?lvsr/lvsl/o or
+           s/\?lvsl/lvsr/o or
+           s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
+           s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
+           s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
+       } else {                        # big-endian
+           s/\?([a-z]+)/$1/o;
+       }
+
+        print $_,"\n";
+}
+
+close STDOUT;
index a67eef5..50252df 100755 (executable)
@@ -151,6 +151,39 @@ my $vmr = sub {
     "  vor     $vx,$vy,$vy";
 };
 
+# PowerISA 2.06 stuff
+sub vsxmem_op {
+    my ($f, $vrt, $ra, $rb, $op) = @_;
+    "  .long   ".sprintf "0x%X",(31<<26)|($vrt<<21)|($ra<<16)|($rb<<11)|($op*2+1);
+}
+# made-up unaligned memory reference AltiVec/VMX instructions
+my $lvx_u      = sub { vsxmem_op(@_, 844); };  # lxvd2x
+my $stvx_u     = sub { vsxmem_op(@_, 972); };  # stxvd2x
+my $lvdx_u     = sub { vsxmem_op(@_, 588); };  # lxsdx
+my $stvdx_u    = sub { vsxmem_op(@_, 716); };  # stxsdx
+
+# PowerISA 2.07 stuff
+sub vcrypto_op {
+    my ($f, $vrt, $vra, $vrb, $op) = @_;
+    "  .long   ".sprintf "0x%X",(4<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|$op;
+}
+my $vcipher    = sub { vcrypto_op(@_, 1288); };
+my $vcipherlast        = sub { vcrypto_op(@_, 1289); };
+my $vncipher   = sub { vcrypto_op(@_, 1352); };
+my $vncipherlast= sub { vcrypto_op(@_, 1353); };
+my $vsbox      = sub { vcrypto_op(@_, 0, 1480); };
+my $vshasigmad = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1730); };
+my $vshasigmaw = sub { my ($st,$six)=splice(@_,-2); vcrypto_op(@_, $st<<4|$six, 1666); };
+my $vpmsumb    = sub { vcrypto_op(@_, 1032); };
+my $vpmsumd    = sub { vcrypto_op(@_, 1224); };
+my $vpmsubh    = sub { vcrypto_op(@_, 1096); };
+my $vpmsumw    = sub { vcrypto_op(@_, 1160); };
+
+my $mtsle      = sub {
+    my ($f, $arg) = @_;
+    "  .long   ".sprintf "0x%X",(31<<26)|($arg<<21)|(147*2);
+};
+
 while($line=<>) {
 
     $line =~ s|[#!;].*$||;     # get rid of asm-style comments...