aesp8-ppc.pl: add CTR mode.
[openssl.git] / crypto / aes / asm / aesp8-ppc.pl
index f5c2be3d406b7d49698e513d725a2ca1aaf59e6e..b660cd5b2cdee200d560f2922afdba69521705f3 100755 (executable)
@@ -30,6 +30,7 @@ if ($flavour =~ /64/) {
        $POP    ="ld";
        $PUSH   ="std";
        $UCMP   ="cmpld";
+       $SHL    ="sldi";
 } elsif ($flavour =~ /32/) {
        $SIZE_T =4;
        $LRSAVE =$SIZE_T;
@@ -37,6 +38,7 @@ if ($flavour =~ /64/) {
        $POP    ="lwz";
        $PUSH   ="stw";
        $UCMP   ="cmplw";
+       $SHL    ="slwi";
 } else { die "nonsense $flavour"; }
 
 $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
@@ -1211,6 +1213,658 @@ Lcbc_dec8x_done:
 ___
 }}     }}}
 
+#########################################################################
+{{{    # CTR procedure[s]                                              #
+my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
+my ($rndkey0,$rndkey1,$inout,$tmp)=            map("v$_",(0..3));
+my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
+                                               map("v$_",(4..11));
+my $dat=$tmp;
+
+$code.=<<___;
+.globl .${prefix}_ctr32_encrypt_blocks
+.align 5
+.${prefix}_ctr32_encrypt_blocks:
+       ${UCMP}i        $len,1
+       bltlr-
+
+       lis             r0,0xfff0
+       mfspr           $vrsave,256
+       mtspr           256,r0
+
+       li              $idx,15
+       vxor            $rndkey0,$rndkey0,$rndkey0
+       le?vspltisb     $tmp,0x0f
+
+       lvx             $ivec,0,$ivp            # load [unaligned] iv
+       lvsl            $inpperm,0,$ivp
+       lvx             $inptail,$idx,$ivp
+        vspltisb       $one,1
+       le?vxor         $inpperm,$inpperm,$tmp
+       vperm           $ivec,$ivec,$inptail,$inpperm
+        vsldoi         $one,$rndkey0,$one,1
+
+       neg             r11,$inp
+       ?lvsl           $keyperm,0,$key         # prepare for unaligned key
+       lwz             $rounds,240($key)
+
+       lvsr            $inpperm,0,r11          # prepare for unaligned load
+       lvx             $inptail,0,$inp
+       addi            $inp,$inp,15            # 15 is not typo
+       le?vxor         $inpperm,$inpperm,$tmp
+
+       srwi            $rounds,$rounds,1
+       li              $idx,16
+       subi            $rounds,$rounds,1
+
+       ${UCMP}i        $len,8
+       bge             _aesp8_ctr32_encrypt8x
+
+       ?lvsr           $outperm,0,$out         # prepare for unaligned store
+       vspltisb        $outmask,-1
+       lvx             $outhead,0,$out
+       ?vperm          $outmask,$rndkey0,$outmask,$outperm
+       le?vxor         $outperm,$outperm,$tmp
+
+       lvx             $rndkey0,0,$key
+       mtctr           $rounds
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vxor            $inout,$ivec,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       b               Loop_ctr32_enc
+
+.align 5
+Loop_ctr32_enc:
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+       addi            $idx,$idx,16
+       ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vcipher         $inout,$inout,$rndkey0
+       lvx             $rndkey0,$idx,$key
+       addi            $idx,$idx,16
+       bdnz            Loop_ctr32_enc
+
+       vadduwm         $ivec,$ivec,$one
+        vmr            $dat,$inptail
+        lvx            $inptail,0,$inp
+        addi           $inp,$inp,16
+        subic.         $len,$len,1             # blocks--
+
+       ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
+       vcipher         $inout,$inout,$rndkey1
+       lvx             $rndkey1,$idx,$key
+        vperm          $dat,$dat,$inptail,$inpperm
+        li             $idx,16
+       ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
+        lvx            $rndkey0,0,$key
+       vxor            $dat,$dat,$rndkey1      # last round key
+       vcipherlast     $inout,$inout,$dat
+
+        lvx            $rndkey1,$idx,$key
+        addi           $idx,$idx,16
+       vperm           $inout,$inout,$inout,$outperm
+       vsel            $dat,$outhead,$inout,$outmask
+        mtctr          $rounds
+        ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
+       vmr             $outhead,$inout
+        vxor           $inout,$ivec,$rndkey0
+        lvx            $rndkey0,$idx,$key
+        addi           $idx,$idx,16
+       stvx            $dat,0,$out
+       addi            $out,$out,16
+       bne             Loop_ctr32_enc
+
+       addi            $out,$out,-1
+       lvx             $inout,0,$out           # redundant in aligned case
+       vsel            $inout,$outhead,$inout,$outmask
+       stvx            $inout,0,$out
+
+       mtspr           256,$vrsave
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0,0,6,0
+       .long           0
+___
+#########################################################################
+{{     # Optimized CTR procedure                                       #
+my $key_="r11";
+my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
+my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
+my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
+my $rndkey0="v23";     # v24-v25 rotating buffer for first found keys
+                       # v26-v31 last 6 round keys
+my ($tmp,$keyperm)=($in3,$in4);        # aliases with "caller", redundant assignment
+my ($two,$three,$four)=($outhead,$outperm,$outmask);
+
+$code.=<<___;
+.align 5
+_aesp8_ctr32_encrypt8x:
+       $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
+       li              r10,`$FRAME+8*16+15`
+       li              r11,`$FRAME+8*16+31`
+       stvx            v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       stvx            v21,r11,$sp
+       addi            r11,r11,32
+       stvx            v22,r10,$sp
+       addi            r10,r10,32
+       stvx            v23,r11,$sp
+       addi            r11,r11,32
+       stvx            v24,r10,$sp
+       addi            r10,r10,32
+       stvx            v25,r11,$sp
+       addi            r11,r11,32
+       stvx            v26,r10,$sp
+       addi            r10,r10,32
+       stvx            v27,r11,$sp
+       addi            r11,r11,32
+       stvx            v28,r10,$sp
+       addi            r10,r10,32
+       stvx            v29,r11,$sp
+       addi            r11,r11,32
+       stvx            v30,r10,$sp
+       stvx            v31,r11,$sp
+       li              r0,-1
+       stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
+       li              $x10,0x10
+       $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       li              $x20,0x20
+       $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       li              $x30,0x30
+       $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       li              $x40,0x40
+       $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       li              $x50,0x50
+       $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       li              $x60,0x60
+       $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       li              $x70,0x70
+       mtspr           256,r0
+
+       subi            $rounds,$rounds,3       # -4 in total
+
+       lvx             $rndkey0,$x00,$key      # load key schedule
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       lvx             v31,$x00,$key
+       ?vperm          $rndkey0,$rndkey0,v30,$keyperm
+       addi            $key_,$sp,$FRAME+15
+       mtctr           $rounds
+
+Load_ctr32_enc_key:
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v30,$x10,$key
+       addi            $key,$key,0x20
+       stvx            v24,$x00,$key_          # off-load round[1]
+       ?vperm          v25,v31,v30,$keyperm
+       lvx             v31,$x00,$key
+       stvx            v25,$x10,$key_          # off-load round[2]
+       addi            $key_,$key_,0x20
+       bdnz            Load_ctr32_enc_key
+
+       lvx             v26,$x10,$key
+       ?vperm          v24,v30,v31,$keyperm
+       lvx             v27,$x20,$key
+       stvx            v24,$x00,$key_          # off-load round[3]
+       ?vperm          v25,v31,v26,$keyperm
+       lvx             v28,$x30,$key
+       stvx            v25,$x10,$key_          # off-load round[4]
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       ?vperm          v26,v26,v27,$keyperm
+       lvx             v29,$x40,$key
+       ?vperm          v27,v27,v28,$keyperm
+       lvx             v30,$x50,$key
+       ?vperm          v28,v28,v29,$keyperm
+       lvx             v31,$x60,$key
+       ?vperm          v29,v29,v30,$keyperm
+       lvx             $out0,$x70,$key         # borrow $out0
+       ?vperm          v30,v30,v31,$keyperm
+       lvx             v24,$x00,$key_          # pre-load round[1]
+       ?vperm          v31,v31,$out0,$keyperm
+       lvx             v25,$x10,$key_          # pre-load round[2]
+
+       vadduwm         $two,$one,$one
+       subi            $inp,$inp,15            # undo "caller"
+       $SHL            $len,$len,4
+
+       vadduwm         $out1,$ivec,$one        # counter values ...
+       vadduwm         $out2,$ivec,$two
+       vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
+        le?li          $idx,8
+       vadduwm         $out3,$out1,$two
+       vxor            $out1,$out1,$rndkey0
+        le?lvsl        $inpperm,0,$idx
+       vadduwm         $out4,$out2,$two
+       vxor            $out2,$out2,$rndkey0
+        le?vspltisb    $tmp,0x0f
+       vadduwm         $out5,$out3,$two
+       vxor            $out3,$out3,$rndkey0
+        le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
+       vadduwm         $out6,$out4,$two
+       vxor            $out4,$out4,$rndkey0
+       vadduwm         $out7,$out5,$two
+       vxor            $out5,$out5,$rndkey0
+       vadduwm         $ivec,$out6,$two        # next counter value
+       vxor            $out6,$out6,$rndkey0
+       vxor            $out7,$out7,$rndkey0
+
+       mtctr           $rounds
+       b               Loop_ctr32_enc8x
+.align 5
+Loop_ctr32_enc8x:
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+       vcipher         $out6,$out6,v24
+       vcipher         $out7,$out7,v24
+Loop_ctr32_enc8x_middle:
+       lvx             v24,$x20,$key_          # round[3]
+       addi            $key_,$key_,0x20
+
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+       vcipher         $out6,$out6,v25
+       vcipher         $out7,$out7,v25
+       lvx             v25,$x10,$key_          # round[4]
+       bdnz            Loop_ctr32_enc8x
+
+       subic           r11,$len,256            # $len-256, borrow $key_
+       vcipher         $out0,$out0,v24
+       vcipher         $out1,$out1,v24
+       vcipher         $out2,$out2,v24
+       vcipher         $out3,$out3,v24
+       vcipher         $out4,$out4,v24
+       vcipher         $out5,$out5,v24
+       vcipher         $out6,$out6,v24
+       vcipher         $out7,$out7,v24
+
+       subfe           r0,r0,r0                # borrow?-1:0
+       vcipher         $out0,$out0,v25
+       vcipher         $out1,$out1,v25
+       vcipher         $out2,$out2,v25
+       vcipher         $out3,$out3,v25
+       vcipher         $out4,$out4,v25
+       vcipher         $out5,$out5,v25
+       vcipher         $out6,$out6,v25
+       vcipher         $out7,$out7,v25
+
+       and             r0,r0,r11
+       addi            $key_,$sp,$FRAME+15     # rewind $key_
+       vcipher         $out0,$out0,v26
+       vcipher         $out1,$out1,v26
+       vcipher         $out2,$out2,v26
+       vcipher         $out3,$out3,v26
+       vcipher         $out4,$out4,v26
+       vcipher         $out5,$out5,v26
+       vcipher         $out6,$out6,v26
+       vcipher         $out7,$out7,v26
+       lvx             v24,$x00,$key_          # re-pre-load round[1]
+
+       subic           $len,$len,129           # $len-=129
+       vcipher         $out0,$out0,v27
+       addi            $len,$len,1             # $len-=128 really
+       vcipher         $out1,$out1,v27
+       vcipher         $out2,$out2,v27
+       vcipher         $out3,$out3,v27
+       vcipher         $out4,$out4,v27
+       vcipher         $out5,$out5,v27
+       vcipher         $out6,$out6,v27
+       vcipher         $out7,$out7,v27
+       lvx             v25,$x10,$key_          # re-pre-load round[2]
+
+       vcipher         $out0,$out0,v28
+        lvx_u          $in0,$x00,$inp          # load input
+       vcipher         $out1,$out1,v28
+        lvx_u          $in1,$x10,$inp
+       vcipher         $out2,$out2,v28
+        lvx_u          $in2,$x20,$inp
+       vcipher         $out3,$out3,v28
+        lvx_u          $in3,$x30,$inp
+       vcipher         $out4,$out4,v28
+        lvx_u          $in4,$x40,$inp
+       vcipher         $out5,$out5,v28
+        lvx_u          $in5,$x50,$inp
+       vcipher         $out6,$out6,v28
+        lvx_u          $in6,$x60,$inp
+       vcipher         $out7,$out7,v28
+        lvx_u          $in7,$x70,$inp
+        addi           $inp,$inp,0x80
+
+       vcipher         $out0,$out0,v29
+        le?vperm       $in0,$in0,$in0,$inpperm
+       vcipher         $out1,$out1,v29
+        le?vperm       $in1,$in1,$in1,$inpperm
+       vcipher         $out2,$out2,v29
+        le?vperm       $in2,$in2,$in2,$inpperm
+       vcipher         $out3,$out3,v29
+        le?vperm       $in3,$in3,$in3,$inpperm
+       vcipher         $out4,$out4,v29
+        le?vperm       $in4,$in4,$in4,$inpperm
+       vcipher         $out5,$out5,v29
+        le?vperm       $in5,$in5,$in5,$inpperm
+       vcipher         $out6,$out6,v29
+        le?vperm       $in6,$in6,$in6,$inpperm
+       vcipher         $out7,$out7,v29
+        le?vperm       $in7,$in7,$in7,$inpperm
+
+       add             $inp,$inp,r0            # $inp is adjusted in such
+                                               # way that at exit from the
+                                               # loop inX-in7 are loaded
+                                               # with last "words"
+       subfe.          r0,r0,r0                # borrow?-1:0
+       vcipher         $out0,$out0,v30
+        vxor           $in0,$in0,v31           # xor with last round key
+       vcipher         $out1,$out1,v30
+        vxor           $in1,$in1,v31
+       vcipher         $out2,$out2,v30
+        vxor           $in2,$in2,v31
+       vcipher         $out3,$out3,v30
+        vxor           $in3,$in3,v31
+       vcipher         $out4,$out4,v30
+        vxor           $in4,$in4,v31
+       vcipher         $out5,$out5,v30
+        vxor           $in5,$in5,v31
+       vcipher         $out6,$out6,v30
+        vxor           $in6,$in6,v31
+       vcipher         $out7,$out7,v30
+        vxor           $in7,$in7,v31
+
+       bne             Lctr32_enc8x_break      # did $len-129 borrow?
+
+       vcipherlast     $in0,$out0,$in0
+       vcipherlast     $in1,$out1,$in1
+        vadduwm        $out1,$ivec,$one        # counter values ...
+       vcipherlast     $in2,$out2,$in2
+        vadduwm        $out2,$ivec,$two
+        vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
+       vcipherlast     $in3,$out3,$in3
+        vadduwm        $out3,$out1,$two
+        vxor           $out1,$out1,$rndkey0
+       vcipherlast     $in4,$out4,$in4
+        vadduwm        $out4,$out2,$two
+        vxor           $out2,$out2,$rndkey0
+       vcipherlast     $in5,$out5,$in5
+        vadduwm        $out5,$out3,$two
+        vxor           $out3,$out3,$rndkey0
+       vcipherlast     $in6,$out6,$in6
+        vadduwm        $out6,$out4,$two
+        vxor           $out4,$out4,$rndkey0
+       vcipherlast     $in7,$out7,$in7
+        vadduwm        $out7,$out5,$two
+        vxor           $out5,$out5,$rndkey0
+       le?vperm        $in0,$in0,$in0,$inpperm
+        vadduwm        $ivec,$out6,$two        # next counter value
+        vxor           $out6,$out6,$rndkey0
+       le?vperm        $in1,$in1,$in1,$inpperm
+        vxor           $out7,$out7,$rndkey0
+       mtctr           $rounds
+
+        vcipher        $out0,$out0,v24
+       stvx_u          $in0,$x00,$out
+       le?vperm        $in2,$in2,$in2,$inpperm
+        vcipher        $out1,$out1,v24
+       stvx_u          $in1,$x10,$out
+       le?vperm        $in3,$in3,$in3,$inpperm
+        vcipher        $out2,$out2,v24
+       stvx_u          $in2,$x20,$out
+       le?vperm        $in4,$in4,$in4,$inpperm
+        vcipher        $out3,$out3,v24
+       stvx_u          $in3,$x30,$out
+       le?vperm        $in5,$in5,$in5,$inpperm
+        vcipher        $out4,$out4,v24
+       stvx_u          $in4,$x40,$out
+       le?vperm        $in6,$in6,$in6,$inpperm
+        vcipher        $out5,$out5,v24
+       stvx_u          $in5,$x50,$out
+       le?vperm        $in7,$in7,$in7,$inpperm
+        vcipher        $out6,$out6,v24
+       stvx_u          $in6,$x60,$out
+        vcipher        $out7,$out7,v24
+       stvx_u          $in7,$x70,$out
+       addi            $out,$out,0x80
+
+       b               Loop_ctr32_enc8x_middle
+
+.align 5
+Lctr32_enc8x_break:
+       cmpwi           $len,-0x60
+       blt             Lctr32_enc8x_one
+       nop
+       beq             Lctr32_enc8x_two
+       cmpwi           $len,-0x40
+       blt             Lctr32_enc8x_three
+       nop
+       beq             Lctr32_enc8x_four
+       cmpwi           $len,-0x20
+       blt             Lctr32_enc8x_five
+       nop
+       beq             Lctr32_enc8x_six
+       cmpwi           $len,0x00
+       blt             Lctr32_enc8x_seven
+
+Lctr32_enc8x_eight:
+       vcipherlast     $out0,$out0,$in0
+       vcipherlast     $out1,$out1,$in1
+       vcipherlast     $out2,$out2,$in2
+       vcipherlast     $out3,$out3,$in3
+       vcipherlast     $out4,$out4,$in4
+       vcipherlast     $out5,$out5,$in5
+       vcipherlast     $out6,$out6,$in6
+       vcipherlast     $out7,$out7,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x50,$out
+       le?vperm        $out7,$out7,$out7,$inpperm
+       stvx_u          $out6,$x60,$out
+       stvx_u          $out7,$x70,$out
+       addi            $out,$out,0x80
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_seven:
+       vcipherlast     $out0,$out0,$in1
+       vcipherlast     $out1,$out1,$in2
+       vcipherlast     $out2,$out2,$in3
+       vcipherlast     $out3,$out3,$in4
+       vcipherlast     $out4,$out4,$in5
+       vcipherlast     $out5,$out5,$in6
+       vcipherlast     $out6,$out6,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       le?vperm        $out6,$out6,$out6,$inpperm
+       stvx_u          $out5,$x50,$out
+       stvx_u          $out6,$x60,$out
+       addi            $out,$out,0x70
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_six:
+       vcipherlast     $out0,$out0,$in2
+       vcipherlast     $out1,$out1,$in3
+       vcipherlast     $out2,$out2,$in4
+       vcipherlast     $out3,$out3,$in5
+       vcipherlast     $out4,$out4,$in6
+       vcipherlast     $out5,$out5,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       le?vperm        $out5,$out5,$out5,$inpperm
+       stvx_u          $out4,$x40,$out
+       stvx_u          $out5,$x50,$out
+       addi            $out,$out,0x60
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_five:
+       vcipherlast     $out0,$out0,$in3
+       vcipherlast     $out1,$out1,$in4
+       vcipherlast     $out2,$out2,$in5
+       vcipherlast     $out3,$out3,$in6
+       vcipherlast     $out4,$out4,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       le?vperm        $out4,$out4,$out4,$inpperm
+       stvx_u          $out3,$x30,$out
+       stvx_u          $out4,$x40,$out
+       addi            $out,$out,0x50
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_four:
+       vcipherlast     $out0,$out0,$in4
+       vcipherlast     $out1,$out1,$in5
+       vcipherlast     $out2,$out2,$in6
+       vcipherlast     $out3,$out3,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       le?vperm        $out3,$out3,$out3,$inpperm
+       stvx_u          $out2,$x20,$out
+       stvx_u          $out3,$x30,$out
+       addi            $out,$out,0x40
+       b               Lctr32_enc8x_done
+
+.align 5
+Lctr32_enc8x_three:
+       vcipherlast     $out0,$out0,$in5
+       vcipherlast     $out1,$out1,$in6
+       vcipherlast     $out2,$out2,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       le?vperm        $out2,$out2,$out2,$inpperm
+       stvx_u          $out1,$x10,$out
+       stvx_u          $out2,$x20,$out
+       addi            $out,$out,0x30
+       b               Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_two:
+       vcipherlast     $out0,$out0,$in6
+       vcipherlast     $out1,$out1,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       le?vperm        $out1,$out1,$out1,$inpperm
+       stvx_u          $out0,$x00,$out
+       stvx_u          $out1,$x10,$out
+       addi            $out,$out,0x20
+       b               Lcbc_dec8x_done
+
+.align 5
+Lctr32_enc8x_one:
+       vcipherlast     $out0,$out0,$in7
+
+       le?vperm        $out0,$out0,$out0,$inpperm
+       stvx_u          $out0,0,$out
+       addi            $out,$out,0x10
+
+Lctr32_enc8x_done:
+       li              r10,`$FRAME+15`
+       li              r11,`$FRAME+31`
+       stvx            $inpperm,r10,$sp        # wipe copies of round keys
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+       stvx            $inpperm,r10,$sp
+       addi            r10,r10,32
+       stvx            $inpperm,r11,$sp
+       addi            r11,r11,32
+
+       mtspr           256,$vrsave
+       lvx             v20,r10,$sp             # ABI says so
+       addi            r10,r10,32
+       lvx             v21,r11,$sp
+       addi            r11,r11,32
+       lvx             v22,r10,$sp
+       addi            r10,r10,32
+       lvx             v23,r11,$sp
+       addi            r11,r11,32
+       lvx             v24,r10,$sp
+       addi            r10,r10,32
+       lvx             v25,r11,$sp
+       addi            r11,r11,32
+       lvx             v26,r10,$sp
+       addi            r10,r10,32
+       lvx             v27,r11,$sp
+       addi            r11,r11,32
+       lvx             v28,r10,$sp
+       addi            r10,r10,32
+       lvx             v29,r11,$sp
+       addi            r11,r11,32
+       lvx             v30,r10,$sp
+       lvx             v31,r11,$sp
+       $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
+       $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
+       $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
+       $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
+       $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
+       $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
+       addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
+       blr
+       .long           0
+       .byte           0,12,0x14,0,0x80,6,6,0
+       .long           0
+.size  .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
+___
+}}     }}}
+
 my $consts=1;
 foreach(split("\n",$code)) {
         s/\`([^\`]*)\`/eval($1)/geo;