aesp8-ppc.pl: optimize CBC decrypt even further.
[openssl.git] / crypto / aes / asm / aesp8-ppc.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for AES instructions as per PowerISA
11 # specification version 2.07, first implemented by POWER8 processor.
12 # The module is endian-agnostic in the sense that it supports both big-
13 # and little-endian cases. Data alignment in parallelizable modes is
14 # handled with VSX loads and stores, which implies MSR.VSX flag being
15 # set. It should also be noted that ISA specification doesn't prohibit
16 # alignment exceptions for these instructions on page boundaries.
17 # Initially alignment was handled in pure AltiVec/VMX way [when data
18 # is aligned programmatically, which in turn guarantees exception-
19 # free execution], but it turned out to hamper performance when vcipher
20 # instructions are interleaved. It's reckoned that eventual
21 # misalignment penalties at page boundaries are on average lower
22 # than additional overhead in pure AltiVec approach.
23
24 $flavour = shift;       # 1st argument: must contain "64" or "32"; a trailing "le" selects little-endian
25
26 if ($flavour =~ /64/) {         # 64-bit flavour: 8-byte $SIZE_T, doubleword mnemonics
27         $SIZE_T =8;
28         $LRSAVE =2*$SIZE_T;
29         $STU    ="stdu";
30         $POP    ="ld";
31         $PUSH   ="std";
32         $UCMP   ="cmpld";
33 } elsif ($flavour =~ /32/) {    # 32-bit flavour: 4-byte $SIZE_T, word mnemonics
34         $SIZE_T =4;
35         $LRSAVE =$SIZE_T;
36         $STU    ="stwu";
37         $POP    ="lwz";
38         $PUSH   ="stw";
39         $UCMP   ="cmplw";
40 } else { die "nonsense $flavour"; }
41
42 $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;       # $SIZE_T when the flavour ends in "le", else 0
43
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # directory part of this script's own path
45 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
47 die "can't locate ppc-xlate.pl";
48
49 open(STDOUT,"| $^X $xlate $flavour ".shift) or die "can't call $xlate: $!";     # 2nd argument is passed on to ppc-xlate.pl; parenthesised open + low-precedence "or" so a failed open really dies
50
51 $FRAME=8*$SIZE_T;
52 $prefix="AES";          # prefix of the emitted global symbols, e.g. .${prefix}_set_encrypt_key
53
54 $sp="r1";
55 $vrsave="r12";
56
57 #########################################################################
58 {{{     Key setup procedures                                            #
59 my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));       # general-purpose registers r3..r8
60 my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));   # vector registers v0..v6
61 my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));     # v7..v11, used by the vperm/vsel/stvx store sequences
62
63 $code.=<<___;   # appends the rcon table, Lconsts, ${prefix}_set_encrypt_key and ${prefix}_set_decrypt_key
64 .machine        "any"
65
66 .text
67
68 .align  7
69 rcon:
70 .long   0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
71 .long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
72 .long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
73 .long   0,0,0,0                                         ?asis
74 Lconsts:
75         mflr    r0
76         bcl     20,31,\$+4
77         mflr    $ptr     #vvvvv "distance between . and rcon
78         addi    $ptr,$ptr,-0x48
79         mtlr    r0
80         blr
81         .long   0
82         .byte   0,12,0x14,0,0,0,0,0
83 .asciz  "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
84
85 .globl  .${prefix}_set_encrypt_key
86 .align  5
87 .${prefix}_set_encrypt_key:
88 Lset_encrypt_key:
89         mflr            r11
90         lis             r0,0xfff0
91         $PUSH           r11,$LRSAVE($sp)
92         mfspr           $vrsave,256
93         mtspr           256,r0
94
95         bl              Lconsts
96         mtlr            r11
97
98         neg             r9,$inp
99         lvx             $in0,0,$inp
100         addi            $inp,$inp,15            # 15 is not typo
101         lvsr            $key,0,r9               # borrow $key
102         li              r8,0x20
103         cmpwi           $bits,192
104         lvx             $in1,0,$inp
105         le?vspltisb     $mask,0x0f              # borrow $mask
106         lvx             $rcon,0,$ptr
107         le?vxor         $key,$key,$mask         # adjust for byte swap
108         lvx             $mask,r8,$ptr
109         addi            $ptr,$ptr,0x10
110         vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
111         li              $cnt,8
112         vxor            $zero,$zero,$zero
113         mtctr           $cnt
114
115         ?lvsr           $outperm,0,$out
116         vspltisb        $outmask,-1
117         lvx             $outhead,0,$out
118         ?vperm          $outmask,$zero,$outmask,$outperm
119
120         blt             Loop128
121         addi            $inp,$inp,8
122         beq             L192
123         addi            $inp,$inp,8
124         b               L256
125
126 .align  4
127 Loop128:
128         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
129         vsldoi          $tmp,$zero,$in0,12      # >>32
130          vperm          $outtail,$in0,$in0,$outperm     # rotate
131          vsel           $stage,$outhead,$outtail,$outmask
132          vmr            $outhead,$outtail
133         vcipherlast     $key,$key,$rcon
134          stvx           $stage,0,$out
135          addi           $out,$out,16
136
137         vxor            $in0,$in0,$tmp
138         vsldoi          $tmp,$zero,$tmp,12      # >>32
139         vxor            $in0,$in0,$tmp
140         vsldoi          $tmp,$zero,$tmp,12      # >>32
141         vxor            $in0,$in0,$tmp
142          vadduwm        $rcon,$rcon,$rcon
143         vxor            $in0,$in0,$key
144         bdnz            Loop128
145
146         lvx             $rcon,0,$ptr            # last two round keys
147
148         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
149         vsldoi          $tmp,$zero,$in0,12      # >>32
150          vperm          $outtail,$in0,$in0,$outperm     # rotate
151          vsel           $stage,$outhead,$outtail,$outmask
152          vmr            $outhead,$outtail
153         vcipherlast     $key,$key,$rcon
154          stvx           $stage,0,$out
155          addi           $out,$out,16
156
157         vxor            $in0,$in0,$tmp
158         vsldoi          $tmp,$zero,$tmp,12      # >>32
159         vxor            $in0,$in0,$tmp
160         vsldoi          $tmp,$zero,$tmp,12      # >>32
161         vxor            $in0,$in0,$tmp
162          vadduwm        $rcon,$rcon,$rcon
163         vxor            $in0,$in0,$key
164
165         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
166         vsldoi          $tmp,$zero,$in0,12      # >>32
167          vperm          $outtail,$in0,$in0,$outperm     # rotate
168          vsel           $stage,$outhead,$outtail,$outmask
169          vmr            $outhead,$outtail
170         vcipherlast     $key,$key,$rcon
171          stvx           $stage,0,$out
172          addi           $out,$out,16
173
174         vxor            $in0,$in0,$tmp
175         vsldoi          $tmp,$zero,$tmp,12      # >>32
176         vxor            $in0,$in0,$tmp
177         vsldoi          $tmp,$zero,$tmp,12      # >>32
178         vxor            $in0,$in0,$tmp
179         vxor            $in0,$in0,$key
180          vperm          $outtail,$in0,$in0,$outperm     # rotate
181          vsel           $stage,$outhead,$outtail,$outmask
182          vmr            $outhead,$outtail
183          stvx           $stage,0,$out
184
185         addi            $inp,$out,15            # 15 is not typo
186         addi            $out,$out,0x50
187
188         li              $rounds,10
189         b               Ldone
190
191 .align  4
192 L192:
193         lvx             $tmp,0,$inp
194         li              $cnt,4
195          vperm          $outtail,$in0,$in0,$outperm     # rotate
196          vsel           $stage,$outhead,$outtail,$outmask
197          vmr            $outhead,$outtail
198          stvx           $stage,0,$out
199          addi           $out,$out,16
200         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
201         vspltisb        $key,8                  # borrow $key
202         mtctr           $cnt
203         vsububm         $mask,$mask,$key        # adjust the mask
204
205 Loop192:
206         vperm           $key,$in1,$in1,$mask    # roate-n-splat
207         vsldoi          $tmp,$zero,$in0,12      # >>32
208         vcipherlast     $key,$key,$rcon
209
210         vxor            $in0,$in0,$tmp
211         vsldoi          $tmp,$zero,$tmp,12      # >>32
212         vxor            $in0,$in0,$tmp
213         vsldoi          $tmp,$zero,$tmp,12      # >>32
214         vxor            $in0,$in0,$tmp
215
216          vsldoi         $stage,$zero,$in1,8
217         vspltw          $tmp,$in0,3
218         vxor            $tmp,$tmp,$in1
219         vsldoi          $in1,$zero,$in1,12      # >>32
220          vadduwm        $rcon,$rcon,$rcon
221         vxor            $in1,$in1,$tmp
222         vxor            $in0,$in0,$key
223         vxor            $in1,$in1,$key
224          vsldoi         $stage,$stage,$in0,8
225
226         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
227         vsldoi          $tmp,$zero,$in0,12      # >>32
228          vperm          $outtail,$stage,$stage,$outperm # rotate
229          vsel           $stage,$outhead,$outtail,$outmask
230          vmr            $outhead,$outtail
231         vcipherlast     $key,$key,$rcon
232          stvx           $stage,0,$out
233          addi           $out,$out,16
234
235          vsldoi         $stage,$in0,$in1,8
236         vxor            $in0,$in0,$tmp
237         vsldoi          $tmp,$zero,$tmp,12      # >>32
238          vperm          $outtail,$stage,$stage,$outperm # rotate
239          vsel           $stage,$outhead,$outtail,$outmask
240          vmr            $outhead,$outtail
241         vxor            $in0,$in0,$tmp
242         vsldoi          $tmp,$zero,$tmp,12      # >>32
243         vxor            $in0,$in0,$tmp
244          stvx           $stage,0,$out
245          addi           $out,$out,16
246
247         vspltw          $tmp,$in0,3
248         vxor            $tmp,$tmp,$in1
249         vsldoi          $in1,$zero,$in1,12      # >>32
250          vadduwm        $rcon,$rcon,$rcon
251         vxor            $in1,$in1,$tmp
252         vxor            $in0,$in0,$key
253         vxor            $in1,$in1,$key
254          vperm          $outtail,$in0,$in0,$outperm     # rotate
255          vsel           $stage,$outhead,$outtail,$outmask
256          vmr            $outhead,$outtail
257          stvx           $stage,0,$out
258          addi           $inp,$out,15            # 15 is not typo
259          addi           $out,$out,16
260         bdnz            Loop192
261
262         li              $rounds,12
263         addi            $out,$out,0x20
264         b               Ldone
265
266 .align  4
267 L256:
268         lvx             $tmp,0,$inp
269         li              $cnt,7
270         li              $rounds,14
271          vperm          $outtail,$in0,$in0,$outperm     # rotate
272          vsel           $stage,$outhead,$outtail,$outmask
273          vmr            $outhead,$outtail
274          stvx           $stage,0,$out
275          addi           $out,$out,16
276         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
277         mtctr           $cnt
278
279 Loop256:
280         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
281         vsldoi          $tmp,$zero,$in0,12      # >>32
282          vperm          $outtail,$in1,$in1,$outperm     # rotate
283          vsel           $stage,$outhead,$outtail,$outmask
284          vmr            $outhead,$outtail
285         vcipherlast     $key,$key,$rcon
286          stvx           $stage,0,$out
287          addi           $out,$out,16
288
289         vxor            $in0,$in0,$tmp
290         vsldoi          $tmp,$zero,$tmp,12      # >>32
291         vxor            $in0,$in0,$tmp
292         vsldoi          $tmp,$zero,$tmp,12      # >>32
293         vxor            $in0,$in0,$tmp
294          vadduwm        $rcon,$rcon,$rcon
295         vxor            $in0,$in0,$key
296          vperm          $outtail,$in0,$in0,$outperm     # rotate
297          vsel           $stage,$outhead,$outtail,$outmask
298          vmr            $outhead,$outtail
299          stvx           $stage,0,$out
300          addi           $inp,$out,15            # 15 is not typo
301          addi           $out,$out,16
302         bdz             Ldone
303
304         vspltw          $key,$in0,3             # just splat
305         vsldoi          $tmp,$zero,$in1,12      # >>32
306         vsbox           $key,$key
307
308         vxor            $in1,$in1,$tmp
309         vsldoi          $tmp,$zero,$tmp,12      # >>32
310         vxor            $in1,$in1,$tmp
311         vsldoi          $tmp,$zero,$tmp,12      # >>32
312         vxor            $in1,$in1,$tmp
313
314         vxor            $in1,$in1,$key
315         b               Loop256
316
317 .align  4
318 Ldone:
319         lvx             $in1,0,$inp             # redundant in aligned case
320         vsel            $in1,$outhead,$in1,$outmask
321         stvx            $in1,0,$inp
322         xor             r3,r3,r3                # return value
323         mtspr           256,$vrsave
324         stw             $rounds,0($out)
325
326         blr
327         .long           0
328         .byte           0,12,0x14,1,0,0,3,0
329         .long           0
330 .size   .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
331
332 .globl  .${prefix}_set_decrypt_key
333 .align  5
334 .${prefix}_set_decrypt_key:
335         $STU            $sp,-$FRAME($sp)
336         mflr            r10
337         $PUSH           r10,$FRAME+$LRSAVE($sp)
338         bl              Lset_encrypt_key
339         mtlr            r10
340
341         slwi            $cnt,$rounds,4
342         subi            $inp,$out,240           # first round key
343         srwi            $rounds,$rounds,1
344         add             $out,$inp,$cnt          # last round key
345         mtctr           $rounds
346
347 Ldeckey:
348         lwz             r0, 0($inp)
349         lwz             r6, 4($inp)
350         lwz             r7, 8($inp)
351         lwz             r8, 12($inp)
352         addi            $inp,$inp,16
353         lwz             r9, 0($out)
354         lwz             r10,4($out)
355         lwz             r11,8($out)
356         lwz             r12,12($out)
357         stw             r0, 0($out)
358         stw             r6, 4($out)
359         stw             r7, 8($out)
360         stw             r8, 12($out)
361         subi            $out,$out,16
362         stw             r9, -16($inp)
363         stw             r10,-12($inp)
364         stw             r11,-8($inp)
365         stw             r12,-4($inp)
366         bdnz            Ldeckey
367
368         xor             r3,r3,r3                # return value
369         addi            $sp,$sp,$FRAME
370         blr
371         .long           0
372         .byte           0,12,4,1,0x80,0,3,0
373         .long           0
374 .size   .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
375 ___
376 }}}
377 #########################################################################
378 {{{     Single block en- and decrypt procedures                         #
379 sub gen_block {         # gen_block($dir): append the .${prefix}_${dir}crypt routine to $code; no "()" prototype, since the sub takes one argument
380 my $dir = shift;        # "en" or "de"; used in the symbol and loop-label names
381 my $n   = $dir eq "de" ? "n" : "";      # "n" turns v${n}cipher/v${n}cipherlast into vncipher/vncipherlast
382 my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));     # general-purpose registers r3..r7
383
384 $code.=<<___;
385 .globl  .${prefix}_${dir}crypt
386 .align  5
387 .${prefix}_${dir}crypt:
388         lwz             $rounds,240($key)
389         lis             r0,0xfc00
390         mfspr           $vrsave,256
391         li              $idx,15                 # 15 is not typo
392         mtspr           256,r0
393
394         lvx             v0,0,$inp
395         neg             r11,$out
396         lvx             v1,$idx,$inp
397         lvsl            v2,0,$inp               # inpperm
398         le?vspltisb     v4,0x0f
399         ?lvsl           v3,0,r11                # outperm
400         le?vxor         v2,v2,v4
401         li              $idx,16
402         vperm           v0,v0,v1,v2             # align [and byte swap in LE]
403         lvx             v1,0,$key
404         ?lvsl           v5,0,$key               # keyperm
405         srwi            $rounds,$rounds,1
406         lvx             v2,$idx,$key
407         addi            $idx,$idx,16
408         subi            $rounds,$rounds,1
409         ?vperm          v1,v1,v2,v5             # align round key
410
411         vxor            v0,v0,v1
412         lvx             v1,$idx,$key
413         addi            $idx,$idx,16
414         mtctr           $rounds
415
416 Loop_${dir}c:
417         ?vperm          v2,v2,v1,v5
418         v${n}cipher     v0,v0,v2
419         lvx             v2,$idx,$key
420         addi            $idx,$idx,16
421         ?vperm          v1,v1,v2,v5
422         v${n}cipher     v0,v0,v1
423         lvx             v1,$idx,$key
424         addi            $idx,$idx,16
425         bdnz            Loop_${dir}c
426
427         ?vperm          v2,v2,v1,v5
428         v${n}cipher     v0,v0,v2
429         lvx             v2,$idx,$key
430         ?vperm          v1,v1,v2,v5
431         v${n}cipherlast v0,v0,v1
432
433         vspltisb        v2,-1
434         vxor            v1,v1,v1
435         li              $idx,15                 # 15 is not typo
436         ?vperm          v2,v1,v2,v3             # outmask
437         le?vxor         v3,v3,v4
438         lvx             v1,0,$out               # outhead
439         vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
440         vsel            v1,v1,v0,v2
441         lvx             v4,$idx,$out
442         stvx            v1,0,$out
443         vsel            v0,v0,v4,v2
444         stvx            v0,$idx,$out
445
446         mtspr           256,$vrsave
447         blr
448         .long           0
449         .byte           0,12,0x14,0,0,0,3,0
450         .long           0
451 .size   .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
452 ___
453 }
454 gen_block("en");        # emits .${prefix}_encrypt
455 gen_block("de");        # emits .${prefix}_decrypt
456 }}}
457 #########################################################################
458 {{{     CBC en- and decrypt procedures                                  #
459 my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));     # general-purpose registers r3..r10
460 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));      # vector registers v0..v3
461 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
462                                                 map("v$_",(4..10));     # vector registers v4..v10
463 $code.=<<___;   # appends .${prefix}_cbc_encrypt; its decrypt path branches to _aesp8_cbc_decrypt8x when $len is at least 128
464 .globl  .${prefix}_cbc_encrypt
465 .align  5
466 .${prefix}_cbc_encrypt:
467         ${UCMP}i        $len,16
468         bltlr-
469
470         cmpwi           $enc,0                  # test direction
471         lis             r0,0xffe0
472         mfspr           $vrsave,256
473         mtspr           256,r0
474
475         li              $idx,15
476         vxor            $rndkey0,$rndkey0,$rndkey0
477         le?vspltisb     $tmp,0x0f
478
479         lvx             $ivec,0,$ivp            # load [unaligned] iv
480         lvsl            $inpperm,0,$ivp
481         lvx             $inptail,$idx,$ivp
482         le?vxor         $inpperm,$inpperm,$tmp
483         vperm           $ivec,$ivec,$inptail,$inpperm
484
485         neg             r11,$inp
486         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
487         lwz             $rounds,240($key)
488
489         lvsr            $inpperm,0,r11          # prepare for unaligned load
490         lvx             $inptail,0,$inp
491         addi            $inp,$inp,15            # 15 is not typo
492         le?vxor         $inpperm,$inpperm,$tmp
493
494         ?lvsr           $outperm,0,$out         # prepare for unaligned store
495         vspltisb        $outmask,-1
496         lvx             $outhead,0,$out
497         ?vperm          $outmask,$rndkey0,$outmask,$outperm
498         le?vxor         $outperm,$outperm,$tmp
499
500         srwi            $rounds,$rounds,1
501         li              $idx,16
502         subi            $rounds,$rounds,1
503         beq             Lcbc_dec
504
505 Lcbc_enc:
506         vmr             $inout,$inptail
507         lvx             $inptail,0,$inp
508         addi            $inp,$inp,16
509         mtctr           $rounds
510         subi            $len,$len,16            # len-=16
511
512         lvx             $rndkey0,0,$key
513          vperm          $inout,$inout,$inptail,$inpperm
514         lvx             $rndkey1,$idx,$key
515         addi            $idx,$idx,16
516         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
517         vxor            $inout,$inout,$rndkey0
518         lvx             $rndkey0,$idx,$key
519         addi            $idx,$idx,16
520         vxor            $inout,$inout,$ivec
521
522 Loop_cbc_enc:
523         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
524         vcipher         $inout,$inout,$rndkey1
525         lvx             $rndkey1,$idx,$key
526         addi            $idx,$idx,16
527         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
528         vcipher         $inout,$inout,$rndkey0
529         lvx             $rndkey0,$idx,$key
530         addi            $idx,$idx,16
531         bdnz            Loop_cbc_enc
532
533         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
534         vcipher         $inout,$inout,$rndkey1
535         lvx             $rndkey1,$idx,$key
536         li              $idx,16
537         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
538         vcipherlast     $ivec,$inout,$rndkey0
539         ${UCMP}i        $len,16
540
541         vperm           $tmp,$ivec,$ivec,$outperm
542         vsel            $inout,$outhead,$tmp,$outmask
543         vmr             $outhead,$tmp
544         stvx            $inout,0,$out
545         addi            $out,$out,16
546         bge             Lcbc_enc
547
548         b               Lcbc_done
549
550 .align  4
551 Lcbc_dec:
552         ${UCMP}i        $len,128
553         bge             _aesp8_cbc_decrypt8x
554         vmr             $tmp,$inptail
555         lvx             $inptail,0,$inp
556         addi            $inp,$inp,16
557         mtctr           $rounds
558         subi            $len,$len,16            # len-=16
559
560         lvx             $rndkey0,0,$key
561          vperm          $tmp,$tmp,$inptail,$inpperm
562         lvx             $rndkey1,$idx,$key
563         addi            $idx,$idx,16
564         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
565         vxor            $inout,$tmp,$rndkey0
566         lvx             $rndkey0,$idx,$key
567         addi            $idx,$idx,16
568
569 Loop_cbc_dec:
570         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
571         vncipher        $inout,$inout,$rndkey1
572         lvx             $rndkey1,$idx,$key
573         addi            $idx,$idx,16
574         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
575         vncipher        $inout,$inout,$rndkey0
576         lvx             $rndkey0,$idx,$key
577         addi            $idx,$idx,16
578         bdnz            Loop_cbc_dec
579
580         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
581         vncipher        $inout,$inout,$rndkey1
582         lvx             $rndkey1,$idx,$key
583         li              $idx,16
584         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
585         vncipherlast    $inout,$inout,$rndkey0
586         ${UCMP}i        $len,16
587
588         vxor            $inout,$inout,$ivec
589         vmr             $ivec,$tmp
590         vperm           $tmp,$inout,$inout,$outperm
591         vsel            $inout,$outhead,$tmp,$outmask
592         vmr             $outhead,$tmp
593         stvx            $inout,0,$out
594         addi            $out,$out,16
595         bge             Lcbc_dec
596
597 Lcbc_done:
598         addi            $out,$out,-1
599         lvx             $inout,0,$out           # redundant in aligned case
600         vsel            $inout,$outhead,$inout,$outmask
601         stvx            $inout,0,$out
602
603         neg             $enc,$ivp               # write [unaligned] iv
604         li              $idx,15                 # 15 is not typo
605         vxor            $rndkey0,$rndkey0,$rndkey0
606         vspltisb        $outmask,-1
607         le?vspltisb     $tmp,0x0f
608         ?lvsl           $outperm,0,$enc
609         ?vperm          $outmask,$rndkey0,$outmask,$outperm
610         le?vxor         $outperm,$outperm,$tmp
611         lvx             $outhead,0,$ivp
612         vperm           $ivec,$ivec,$ivec,$outperm
613         vsel            $inout,$outhead,$ivec,$outmask
614         lvx             $inptail,$idx,$ivp
615         stvx            $inout,0,$ivp
616         vsel            $inout,$ivec,$inptail,$outmask
617         stvx            $inout,$idx,$ivp
618
619         mtspr           256,$vrsave
620         blr
621         .long           0
622         .byte           0,12,0x14,0,0,0,6,0
623         .long           0
624 ___
625 #########################################################################
626 {{      Optimized CBC decrypt procedure                                 #
627 my $key_="r11";         # general-purpose register r11
628 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));   # r0, r8 and r26..r31
629 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));  # v0..v3 and v10..v13
630 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));       # v14..v21
631 my $rndkey0="v23";      # v24-v25 rotating buffer for first round keys
632                         # v26-v31 last 6 round keys
633 my ($tmp,$keyperm)=($in3,$in4); # v3 and v10, the same registers the enclosing block ("caller") uses for $tmp/$keyperm, so the assignment is redundant
634
635 $code.=<<___;
636 .align  5
637 _aesp8_cbc_decrypt8x:
638         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
639         li              r10,`$FRAME+8*16+15`
640         li              r11,`$FRAME+8*16+31`
641         stvx            v20,r10,$sp             # ABI says so
642         addi            r10,r10,32
643         stvx            v21,r11,$sp
644         addi            r11,r11,32
645         stvx            v22,r10,$sp
646         addi            r10,r10,32
647         stvx            v23,r11,$sp
648         addi            r11,r11,32
649         stvx            v24,r10,$sp
650         addi            r10,r10,32
651         stvx            v25,r11,$sp
652         addi            r11,r11,32
653         stvx            v26,r10,$sp
654         addi            r10,r10,32
655         stvx            v27,r11,$sp
656         addi            r11,r11,32
657         stvx            v28,r10,$sp
658         addi            r10,r10,32
659         stvx            v29,r11,$sp
660         addi            r11,r11,32
661         stvx            v30,r10,$sp
662         stvx            v31,r11,$sp
663         li              r0,-1
664         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
665         li              $x10,0x10
666         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
667         li              $x20,0x20
668         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
669         li              $x30,0x30
670         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
671         li              $x40,0x40
672         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
673         li              $x50,0x50
674         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
675         li              $x60,0x60
676         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
677         li              $x70,0x70
678         mtspr           256,r0
679
680         subi            $rounds,$rounds,3       # -4 in total
681         subi            $len,$len,128           # bias
682
683         lvx             $rndkey0,$x00,$key      # load key schedule
684         lvx             v30,$x10,$key
685         addi            $key,$key,0x20
686         lvx             v31,$x00,$key
687         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
688         addi            $key_,$sp,$FRAME+15
689         mtctr           $rounds
690
691 Load_cbc_dec_key:
692         ?vperm          v24,v30,v31,$keyperm
693         lvx             v30,$x10,$key
694         addi            $key,$key,0x20
695         stvx            v24,$x00,$key_          # off-load round[1]
696         ?vperm          v25,v31,v30,$keyperm
697         lvx             v31,$x00,$key
698         stvx            v25,$x10,$key_          # off-load round[2]
699         addi            $key_,$key_,0x20
700         bdnz            Load_cbc_dec_key
701
702         lvx             v26,$x10,$key
703         ?vperm          v24,v30,v31,$keyperm
704         lvx             v27,$x20,$key
705         stvx            v24,$x00,$key_          # off-load round[3]
706         ?vperm          v25,v31,v26,$keyperm
707         lvx             v28,$x30,$key
708         stvx            v25,$x10,$key_          # off-load round[4]
709         addi            $key_,$sp,$FRAME+15     # rewind $key_
710         ?vperm          v26,v26,v27,$keyperm
711         lvx             v29,$x40,$key
712         ?vperm          v27,v27,v28,$keyperm
713         lvx             v30,$x50,$key
714         ?vperm          v28,v28,v29,$keyperm
715         lvx             v31,$x60,$key
716         ?vperm          v29,v29,v30,$keyperm
717         lvx             $out0,$x70,$key         # borrow $out0
718         ?vperm          v30,v30,v31,$keyperm
719         lvx             v24,$x00,$key_          # pre-load round[1]
720         ?vperm          v31,v31,$out0,$keyperm
721         lvx             v25,$x10,$key_          # pre-load round[2]
722
723         #lvx            $inptail,0,$inp         # "caller" already did this
724         #addi           $inp,$inp,15            # 15 is not typo
725         subi            $inp,$inp,15            # undo "caller"
726
727          le?li          $idx,8
728         lvx_u           $in0,$x00,$inp          # load first 8 "words"
729          le?lvsl        $inpperm,0,$idx
730          le?vspltisb    $tmp,0x0f
731         lvx_u           $in1,$x10,$inp
732          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
733         lvx_u           $in2,$x20,$inp
734          le?vperm       $in0,$in0,$in0,$inpperm
735         lvx_u           $in3,$x30,$inp
736          le?vperm       $in1,$in1,$in1,$inpperm
737         lvx_u           $in4,$x40,$inp
738          le?vperm       $in2,$in2,$in2,$inpperm
739         vxor            $out0,$in0,$rndkey0
740         lvx_u           $in5,$x50,$inp
741          le?vperm       $in3,$in3,$in3,$inpperm
742         vxor            $out1,$in1,$rndkey0
743         lvx_u           $in6,$x60,$inp
744          le?vperm       $in4,$in4,$in4,$inpperm
745         vxor            $out2,$in2,$rndkey0
746         lvx_u           $in7,$x70,$inp
747         addi            $inp,$inp,0x80
748          le?vperm       $in5,$in5,$in5,$inpperm
749         vxor            $out3,$in3,$rndkey0
750          le?vperm       $in6,$in6,$in6,$inpperm
751         vxor            $out4,$in4,$rndkey0
752          le?vperm       $in7,$in7,$in7,$inpperm
753         vxor            $out5,$in5,$rndkey0
754         vxor            $out6,$in6,$rndkey0
755         vxor            $out7,$in7,$rndkey0
756
757         mtctr           $rounds
758         b               Loop_cbc_dec8x
759 .align  5
760 Loop_cbc_dec8x:
761         vncipher        $out0,$out0,v24
762         vncipher        $out1,$out1,v24
763         vncipher        $out2,$out2,v24
764         vncipher        $out3,$out3,v24
765         vncipher        $out4,$out4,v24
766         vncipher        $out5,$out5,v24
767         vncipher        $out6,$out6,v24
768         vncipher        $out7,$out7,v24
769         lvx             v24,$x20,$key_          # round[3]
770         addi            $key_,$key_,0x20
771
772         vncipher        $out0,$out0,v25
773         vncipher        $out1,$out1,v25
774         vncipher        $out2,$out2,v25
775         vncipher        $out3,$out3,v25
776         vncipher        $out4,$out4,v25
777         vncipher        $out5,$out5,v25
778         vncipher        $out6,$out6,v25
779         vncipher        $out7,$out7,v25
780         lvx             v25,$x10,$key_          # round[4]
781         bdnz            Loop_cbc_dec8x
782
783         subic           $len,$len,128           # $len-=128
784         vncipher        $out0,$out0,v24
785         vncipher        $out1,$out1,v24
786         vncipher        $out2,$out2,v24
787         vncipher        $out3,$out3,v24
788         vncipher        $out4,$out4,v24
789         vncipher        $out5,$out5,v24
790         vncipher        $out6,$out6,v24
791         vncipher        $out7,$out7,v24
792
793         subfe.          r0,r0,r0                # borrow?-1:0
794         vncipher        $out0,$out0,v25
795         vncipher        $out1,$out1,v25
796         vncipher        $out2,$out2,v25
797         vncipher        $out3,$out3,v25
798         vncipher        $out4,$out4,v25
799         vncipher        $out5,$out5,v25
800         vncipher        $out6,$out6,v25
801         vncipher        $out7,$out7,v25
802
803         and             r0,r0,$len
804         vncipher        $out0,$out0,v26
805         vncipher        $out1,$out1,v26
806         vncipher        $out2,$out2,v26
807         vncipher        $out3,$out3,v26
808         vncipher        $out4,$out4,v26
809         vncipher        $out5,$out5,v26
810         vncipher        $out6,$out6,v26
811         vncipher        $out7,$out7,v26
812
813         add             $inp,$inp,r0            # $inp is adjusted in such
814                                                 # way that at exit from the
815                                                 # loop inX-in7 are loaded
816                                                 # with last "words"
817         vncipher        $out0,$out0,v27
818         vncipher        $out1,$out1,v27
819         vncipher        $out2,$out2,v27
820         vncipher        $out3,$out3,v27
821         vncipher        $out4,$out4,v27
822         vncipher        $out5,$out5,v27
823         vncipher        $out6,$out6,v27
824         vncipher        $out7,$out7,v27
825
826         addi            $key_,$sp,$FRAME+15     # rewind $key_
827         vncipher        $out0,$out0,v28
828         vncipher        $out1,$out1,v28
829         vncipher        $out2,$out2,v28
830         vncipher        $out3,$out3,v28
831         vncipher        $out4,$out4,v28
832         vncipher        $out5,$out5,v28
833         vncipher        $out6,$out6,v28
834         vncipher        $out7,$out7,v28
835         lvx             v24,$x00,$key_          # re-pre-load round[1]
836
837         vncipher        $out0,$out0,v29
838         vncipher        $out1,$out1,v29
839         vncipher        $out2,$out2,v29
840         vncipher        $out3,$out3,v29
841         vncipher        $out4,$out4,v29
842         vncipher        $out5,$out5,v29
843         vncipher        $out6,$out6,v29
844         vncipher        $out7,$out7,v29
845         lvx             v25,$x10,$key_          # re-pre-load round[2]
846
847         vncipher        $out0,$out0,v30
848          vxor           $ivec,$ivec,v31         # xor with last round key
849         vncipher        $out1,$out1,v30
850          vxor           $in0,$in0,v31
851         vncipher        $out2,$out2,v30
852          vxor           $in1,$in1,v31
853         vncipher        $out3,$out3,v30
854          vxor           $in2,$in2,v31
855         vncipher        $out4,$out4,v30
856          vxor           $in3,$in3,v31
857         vncipher        $out5,$out5,v30
858          vxor           $in4,$in4,v31
859         vncipher        $out6,$out6,v30
860          vxor           $in5,$in5,v31
861         vncipher        $out7,$out7,v30
862          vxor           $in6,$in6,v31
863
864         vncipherlast    $out0,$out0,$ivec
865         vncipherlast    $out1,$out1,$in0
866          lvx_u          $in0,$x00,$inp          # load next input block
867         vncipherlast    $out2,$out2,$in1
868          lvx_u          $in1,$x10,$inp
869         vncipherlast    $out3,$out3,$in2
870          le?vperm       $in0,$in0,$in0,$inpperm
871          lvx_u          $in2,$x20,$inp
872         vncipherlast    $out4,$out4,$in3
873          le?vperm       $in1,$in1,$in1,$inpperm
874          lvx_u          $in3,$x30,$inp
875         vncipherlast    $out5,$out5,$in4
876          le?vperm       $in2,$in2,$in2,$inpperm
877          lvx_u          $in4,$x40,$inp
878         vncipherlast    $out6,$out6,$in5
879          le?vperm       $in3,$in3,$in3,$inpperm
880          lvx_u          $in5,$x50,$inp
881         vncipherlast    $out7,$out7,$in6
882          le?vperm       $in4,$in4,$in4,$inpperm
883          lvx_u          $in6,$x60,$inp
884         vmr             $ivec,$in7
885          le?vperm       $in5,$in5,$in5,$inpperm
886          lvx_u          $in7,$x70,$inp
887          addi           $inp,$inp,0x80
888
889         le?vperm        $out0,$out0,$out0,$inpperm
890         le?vperm        $out1,$out1,$out1,$inpperm
891         stvx_u          $out0,$x00,$out
892          le?vperm       $in6,$in6,$in6,$inpperm
893          vxor           $out0,$in0,$rndkey0
894         le?vperm        $out2,$out2,$out2,$inpperm
895         stvx_u          $out1,$x10,$out
896          le?vperm       $in7,$in7,$in7,$inpperm
897          vxor           $out1,$in1,$rndkey0
898         le?vperm        $out3,$out3,$out3,$inpperm
899         stvx_u          $out2,$x20,$out
900          vxor           $out2,$in2,$rndkey0
901         le?vperm        $out4,$out4,$out4,$inpperm
902         stvx_u          $out3,$x30,$out
903          vxor           $out3,$in3,$rndkey0
904         le?vperm        $out5,$out5,$out5,$inpperm
905         stvx_u          $out4,$x40,$out
906          vxor           $out4,$in4,$rndkey0
907         le?vperm        $out6,$out6,$out6,$inpperm
908         stvx_u          $out5,$x50,$out
909          vxor           $out5,$in5,$rndkey0
910         le?vperm        $out7,$out7,$out7,$inpperm
911         stvx_u          $out6,$x60,$out
912          vxor           $out6,$in6,$rndkey0
913         stvx_u          $out7,$x70,$out
914         addi            $out,$out,0x80
915          vxor           $out7,$in7,$rndkey0
916
917         mtctr           $rounds
918         beq             Loop_cbc_dec8x          # did $len-=128 borrow?
919
920         addic.          $len,$len,128
921         beq             Lcbc_dec8x_done
922         nop
923         nop
924
925 Loop_cbc_dec8x_tail:                            # up to 7 "words" tail...
926         vncipher        $out1,$out1,v24
927         vncipher        $out2,$out2,v24
928         vncipher        $out3,$out3,v24
929         vncipher        $out4,$out4,v24
930         vncipher        $out5,$out5,v24
931         vncipher        $out6,$out6,v24
932         vncipher        $out7,$out7,v24
933         lvx             v24,$x20,$key_          # round[3]
934         addi            $key_,$key_,0x20
935
936         vncipher        $out1,$out1,v25
937         vncipher        $out2,$out2,v25
938         vncipher        $out3,$out3,v25
939         vncipher        $out4,$out4,v25
940         vncipher        $out5,$out5,v25
941         vncipher        $out6,$out6,v25
942         vncipher        $out7,$out7,v25
943         lvx             v25,$x10,$key_          # round[4]
944         bdnz            Loop_cbc_dec8x_tail
945
946         vncipher        $out1,$out1,v24
947         vncipher        $out2,$out2,v24
948         vncipher        $out3,$out3,v24
949         vncipher        $out4,$out4,v24
950         vncipher        $out5,$out5,v24
951         vncipher        $out6,$out6,v24
952         vncipher        $out7,$out7,v24
953
954         vncipher        $out1,$out1,v25
955         vncipher        $out2,$out2,v25
956         vncipher        $out3,$out3,v25
957         vncipher        $out4,$out4,v25
958         vncipher        $out5,$out5,v25
959         vncipher        $out6,$out6,v25
960         vncipher        $out7,$out7,v25
961
962         vncipher        $out1,$out1,v26
963         vncipher        $out2,$out2,v26
964         vncipher        $out3,$out3,v26
965         vncipher        $out4,$out4,v26
966         vncipher        $out5,$out5,v26
967         vncipher        $out6,$out6,v26
968         vncipher        $out7,$out7,v26
969
970         vncipher        $out1,$out1,v27
971         vncipher        $out2,$out2,v27
972         vncipher        $out3,$out3,v27
973         vncipher        $out4,$out4,v27
974         vncipher        $out5,$out5,v27
975         vncipher        $out6,$out6,v27
976         vncipher        $out7,$out7,v27
977
978         vncipher        $out1,$out1,v28
979         vncipher        $out2,$out2,v28
980         vncipher        $out3,$out3,v28
981         vncipher        $out4,$out4,v28
982         vncipher        $out5,$out5,v28
983         vncipher        $out6,$out6,v28
984         vncipher        $out7,$out7,v28
985
986         vncipher        $out1,$out1,v29
987         vncipher        $out2,$out2,v29
988         vncipher        $out3,$out3,v29
989         vncipher        $out4,$out4,v29
990         vncipher        $out5,$out5,v29
991         vncipher        $out6,$out6,v29
992         vncipher        $out7,$out7,v29
993
994         vncipher        $out1,$out1,v30
995          vxor           $ivec,$ivec,v31         # last round key
996         vncipher        $out2,$out2,v30
997          vxor           $in1,$in1,v31
998         vncipher        $out3,$out3,v30
999          vxor           $in2,$in2,v31
1000         vncipher        $out4,$out4,v30
1001          vxor           $in3,$in3,v31
1002         vncipher        $out5,$out5,v30
1003          vxor           $in4,$in4,v31
1004         vncipher        $out6,$out6,v30
1005          vxor           $in5,$in5,v31
1006         vncipher        $out7,$out7,v30
1007          vxor           $in6,$in6,v31
1008
1009         cmplwi          $len,32                 # switch($len)
1010         blt             Lcbc_dec8x_one
1011         nop
1012         beq             Lcbc_dec8x_two
1013         cmplwi          $len,64
1014         blt             Lcbc_dec8x_three
1015         nop
1016         beq             Lcbc_dec8x_four
1017         cmplwi          $len,96
1018         blt             Lcbc_dec8x_five
1019         nop
1020         beq             Lcbc_dec8x_six
1021
1022 Lcbc_dec8x_seven:
1023         vncipherlast    $out1,$out1,$ivec
1024         vncipherlast    $out2,$out2,$in1
1025         vncipherlast    $out3,$out3,$in2
1026         vncipherlast    $out4,$out4,$in3
1027         vncipherlast    $out5,$out5,$in4
1028         vncipherlast    $out6,$out6,$in5
1029         vncipherlast    $out7,$out7,$in6
1030         vmr             $ivec,$in7
1031
1032         le?vperm        $out1,$out1,$out1,$inpperm
1033         le?vperm        $out2,$out2,$out2,$inpperm
1034         stvx_u          $out1,$x00,$out
1035         le?vperm        $out3,$out3,$out3,$inpperm
1036         stvx_u          $out2,$x10,$out
1037         le?vperm        $out4,$out4,$out4,$inpperm
1038         stvx_u          $out3,$x20,$out
1039         le?vperm        $out5,$out5,$out5,$inpperm
1040         stvx_u          $out4,$x30,$out
1041         le?vperm        $out6,$out6,$out6,$inpperm
1042         stvx_u          $out5,$x40,$out
1043         le?vperm        $out7,$out7,$out7,$inpperm
1044         stvx_u          $out6,$x50,$out
1045         stvx_u          $out7,$x60,$out
1046         addi            $out,$out,0x70
1047         b               Lcbc_dec8x_done
1048
1049 .align  5
1050 Lcbc_dec8x_six:
1051         vncipherlast    $out2,$out2,$ivec
1052         vncipherlast    $out3,$out3,$in2
1053         vncipherlast    $out4,$out4,$in3
1054         vncipherlast    $out5,$out5,$in4
1055         vncipherlast    $out6,$out6,$in5
1056         vncipherlast    $out7,$out7,$in6
1057         vmr             $ivec,$in7
1058
1059         le?vperm        $out2,$out2,$out2,$inpperm
1060         le?vperm        $out3,$out3,$out3,$inpperm
1061         stvx_u          $out2,$x00,$out
1062         le?vperm        $out4,$out4,$out4,$inpperm
1063         stvx_u          $out3,$x10,$out
1064         le?vperm        $out5,$out5,$out5,$inpperm
1065         stvx_u          $out4,$x20,$out
1066         le?vperm        $out6,$out6,$out6,$inpperm
1067         stvx_u          $out5,$x30,$out
1068         le?vperm        $out7,$out7,$out7,$inpperm
1069         stvx_u          $out6,$x40,$out
1070         stvx_u          $out7,$x50,$out
1071         addi            $out,$out,0x60
1072         b               Lcbc_dec8x_done
1073
1074 .align  5
1075 Lcbc_dec8x_five:
1076         vncipherlast    $out3,$out3,$ivec
1077         vncipherlast    $out4,$out4,$in3
1078         vncipherlast    $out5,$out5,$in4
1079         vncipherlast    $out6,$out6,$in5
1080         vncipherlast    $out7,$out7,$in6
1081         vmr             $ivec,$in7
1082
1083         le?vperm        $out3,$out3,$out3,$inpperm
1084         le?vperm        $out4,$out4,$out4,$inpperm
1085         stvx_u          $out3,$x00,$out
1086         le?vperm        $out5,$out5,$out5,$inpperm
1087         stvx_u          $out4,$x10,$out
1088         le?vperm        $out6,$out6,$out6,$inpperm
1089         stvx_u          $out5,$x20,$out
1090         le?vperm        $out7,$out7,$out7,$inpperm
1091         stvx_u          $out6,$x30,$out
1092         stvx_u          $out7,$x40,$out
1093         addi            $out,$out,0x50
1094         b               Lcbc_dec8x_done
1095
1096 .align  5
1097 Lcbc_dec8x_four:
1098         vncipherlast    $out4,$out4,$ivec
1099         vncipherlast    $out5,$out5,$in4
1100         vncipherlast    $out6,$out6,$in5
1101         vncipherlast    $out7,$out7,$in6
1102         vmr             $ivec,$in7
1103
1104         le?vperm        $out4,$out4,$out4,$inpperm
1105         le?vperm        $out5,$out5,$out5,$inpperm
1106         stvx_u          $out4,$x00,$out
1107         le?vperm        $out6,$out6,$out6,$inpperm
1108         stvx_u          $out5,$x10,$out
1109         le?vperm        $out7,$out7,$out7,$inpperm
1110         stvx_u          $out6,$x20,$out
1111         stvx_u          $out7,$x30,$out
1112         addi            $out,$out,0x40
1113         b               Lcbc_dec8x_done
1114
1115 .align  5
1116 Lcbc_dec8x_three:
1117         vncipherlast    $out5,$out5,$ivec
1118         vncipherlast    $out6,$out6,$in5
1119         vncipherlast    $out7,$out7,$in6
1120         vmr             $ivec,$in7
1121
1122         le?vperm        $out5,$out5,$out5,$inpperm
1123         le?vperm        $out6,$out6,$out6,$inpperm
1124         stvx_u          $out5,$x00,$out
1125         le?vperm        $out7,$out7,$out7,$inpperm
1126         stvx_u          $out6,$x10,$out
1127         stvx_u          $out7,$x20,$out
1128         addi            $out,$out,0x30
1129         b               Lcbc_dec8x_done
1130
1131 .align  5
1132 Lcbc_dec8x_two:
1133         vncipherlast    $out6,$out6,$ivec
1134         vncipherlast    $out7,$out7,$in6
1135         vmr             $ivec,$in7
1136
1137         le?vperm        $out6,$out6,$out6,$inpperm
1138         le?vperm        $out7,$out7,$out7,$inpperm
1139         stvx_u          $out6,$x00,$out
1140         stvx_u          $out7,$x10,$out
1141         addi            $out,$out,0x20
1142         b               Lcbc_dec8x_done
1143
1144 .align  5
1145 Lcbc_dec8x_one:
1146         vncipherlast    $out7,$out7,$ivec
1147         vmr             $ivec,$in7
1148
1149         le?vperm        $out7,$out7,$out7,$inpperm
1150         stvx_u          $out7,0,$out
1151         addi            $out,$out,0x10
1152
1153 Lcbc_dec8x_done:
1154         le?vperm        $ivec,$ivec,$ivec,$inpperm
1155         stvx_u          $ivec,0,$ivp            # write [unaligned] iv
1156
1157         li              r10,`$FRAME+15`
1158         li              r11,`$FRAME+31`
1159         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1160         addi            r10,r10,32
1161         stvx            $inpperm,r11,$sp
1162         addi            r11,r11,32
1163         stvx            $inpperm,r10,$sp
1164         addi            r10,r10,32
1165         stvx            $inpperm,r11,$sp
1166         addi            r11,r11,32
1167         stvx            $inpperm,r10,$sp
1168         addi            r10,r10,32
1169         stvx            $inpperm,r11,$sp
1170         addi            r11,r11,32
1171         stvx            $inpperm,r10,$sp
1172         addi            r10,r10,32
1173         stvx            $inpperm,r11,$sp
1174         addi            r11,r11,32
1175
1176         mtspr           256,$vrsave
1177         lvx             v20,r10,$sp             # ABI says so
1178         addi            r10,r10,32
1179         lvx             v21,r11,$sp
1180         addi            r11,r11,32
1181         lvx             v22,r10,$sp
1182         addi            r10,r10,32
1183         lvx             v23,r11,$sp
1184         addi            r11,r11,32
1185         lvx             v24,r10,$sp
1186         addi            r10,r10,32
1187         lvx             v25,r11,$sp
1188         addi            r11,r11,32
1189         lvx             v26,r10,$sp
1190         addi            r10,r10,32
1191         lvx             v27,r11,$sp
1192         addi            r11,r11,32
1193         lvx             v28,r10,$sp
1194         addi            r10,r10,32
1195         lvx             v29,r11,$sp
1196         addi            r11,r11,32
1197         lvx             v30,r10,$sp
1198         lvx             v31,r11,$sp
1199         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1200         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1201         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1202         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1203         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1204         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1205         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1206         blr
1207         .long           0
1208         .byte           0,12,0x14,0,0x80,6,6,0
1209         .long           0
1210 .size   .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1211 ___
1212 }}      }}}
1213
# Post-process the generated assembly text held in $code, one line at a
# time, and print the result to STDOUT:
#   - `...` spans are replaced by the value of the enclosed Perl expression;
#   - .long/.byte lines carrying a trailing ?xxx marker are re-emitted as
#     plain .byte lists (endian-adjusted), until a line containing
#     "Lconsts:" has been seen;
#   - le?/be?/? prefixed instructions are resolved for the endianness
#     selected by $flavour (a trailing "le" means little-endian).
my $consts=1;                           # still inside the constants table?
my $little_endian=($flavour =~ /le$/);  # loop-invariant, so test it once

foreach (split("\n",$code)) {
        # fold compile-time arithmetic written between backticks
        s/\`([^\`]*)\`/eval($1)/ge;

        # constants table endian-specific conversion
        if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/) {
            # copy the captures right away, later matches would clobber them
            my ($type,$list,$conv)=($1,$2,$3);
            my @bytes=();

            # convert to endian-agnostic format
            foreach my $item (split(/,\s*/,$list)) {
                # a leading 0 marks 0x../0.. notation, which oct() decodes
                my $value = ($item =~ /^0/) ? oct($item) : int($item);

                if ($type eq "long") {
                    # split 32-bit word, most significant byte first
                    push @bytes,($value>>24)&0xff,($value>>16)&0xff,
                                ($value>>8)&0xff,$value&0xff;
                } else {
                    push @bytes,$value;
                }
            }

            # little-endian conversion
            if ($little_endian) {
                if ($conv =~ /\?inv/) {
                    # xor every byte with 0xf
                    @bytes=map { $_^0xf } @bytes;
                } elsif ($conv =~ /\?rev/) {
                    # reverse byte order of the whole line
                    @bytes=reverse(@bytes);
                }
            }

            # emit
            print ".byte\t",join(',',map { sprintf("0x%02x",$_) } @bytes),"\n";
            next;
        }
        $consts=0 if (m/Lconsts:/);     # end of table

        # instructions prefixed with '?' are endian-specific and need
        # to be adjusted accordingly; only the first rule that matches
        # is applied to a given line...
        if ($little_endian) {           # little-endian
            s/le\?//            or      # keep little-endian-only line
            s/be\?/#be#/        or      # comment out big-endian-only line
            s/\?lvsr/lvsl/      or      # lvsr and lvsl trade places
            s/\?lvsl/lvsr/      or
            # vperm: swap the two source vectors
            s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or
            # vsldoi: swap the two source vectors, shift becomes 16-N
            s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/ or
            # vspltw: word index becomes 3-N
            s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/;
        } else {                        # big-endian
            s/le\?/#le#/        or      # comment out little-endian-only line
            s/be\?//            or      # keep big-endian-only line
            s/\?([a-z]+)/$1/;           # '?' mnemonics are used as written
        }

        print $_,"\n";
}

# Buffered output is only flushed here, so a write failure surfaces at
# close time.  Abort instead of silently leaving truncated output behind.
# NOTE(review): STDOUT is presumably redirected to the ppc-xlate.pl
# translator earlier in the file (not visible here); if so, close also
# reports a failure of that process -- confirm against the open() call.
close STDOUT or die "error closing STDOUT: $!";