aesp8-ppc.pl: rigid input verification in key setup.
[openssl.git] / crypto / aes / asm / aesp8-ppc.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for AES instructions as per PowerISA
11 # specification version 2.07, first implemented by POWER8 processor.
12 # The module is endian-agnostic in sense that it supports both big-
13 # and little-endian cases. Data alignment in parallelizable modes is
14 # handled with VSX loads and stores, which implies MSR.VSX flag being
15 # set. It should also be noted that ISA specification doesn't prohibit
16 # alignment exceptions for these instructions on page boundaries.
17 # Initially alignment was handled in pure AltiVec/VMX way [when data
18 # is aligned programmatically, which in turn guarantees exception-
19 # free execution], but it turned to hamper performance when vcipher
20 # instructions are interleaved. It's reckoned that eventual
21 # misalignment penalties at page boundaries are in average lower
22 # than additional overhead in pure AltiVec approach.
23
# Target selection: the first argv word (e.g. "linux64le", "aix32") chooses
# 64- vs 32-bit ABI constants and mnemonics used throughout the templates.
24 $flavour = shift;
25
26 if ($flavour =~ /64/) {
27         $SIZE_T =8;             # sizeof(pointer)
28         $LRSAVE =2*$SIZE_T;     # LR save slot offset in caller's frame
29         $STU    ="stdu";
30         $POP    ="ld";
31         $PUSH   ="std";
32         $UCMP   ="cmpld";
33         $SHL    ="sldi";
34 } elsif ($flavour =~ /32/) {
35         $SIZE_T =4;
36         $LRSAVE =$SIZE_T;
37         $STU    ="stwu";
38         $POP    ="lwz";
39         $PUSH   ="stw";
40         $UCMP   ="cmplw";
41         $SHL    ="slwi";
42 } else { die "nonsense $flavour"; }
43
# Non-zero (= $SIZE_T) for little-endian flavours, zero otherwise.
44 $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
45
# Locate the ppc-xlate.pl pre-processor next to this script or under
# ../../perlasm; die early if it cannot be found.
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
49 die "can't locate ppc-xlate.pl";
50
# Pipe all generated code through the xlate script.  BUGFIX: this used
# '|| die', but '.' binds tighter than '||', so the always-true command
# string -- not open()'s return value -- was being tested, and a failed
# open went unreported.  Low-precedence 'or' applies the check to open().
51 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
52
53 $FRAME=8*$SIZE_T;       # minimal stack frame, in bytes
54 $prefix="aes_p8";       # symbol prefix for every exported routine
55
56 $sp="r1";
57 $vrsave="r12";
58
59 #########################################################################
60 {{{     # Key setup procedures                                          #
61 my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));       # GPRs: args inp/bits/out (r3..r5), temps ptr/cnt/rounds (r6..r8)
62 my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));   # v0..v6: key-expansion work registers
63 my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));     # v7..v11: unaligned-output store pipeline
64
# Template for aes_p8_set_encrypt_key / aes_p8_set_decrypt_key.  It is
# expanded with the register names above and post-processed by ppc-xlate.pl
# (see the piped STDOUT above), which resolves the "?"- and "le?"-prefixed
# conditional mnemonics and the ?rev/?asis data tags.
65 $code.=<<___;
66 .machine        "any"
67
68 .text
69
70 .align  7
71 rcon:
72 .long   0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
73 .long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
74 .long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
75 .long   0,0,0,0                                         ?asis
76 Lconsts:
77         mflr    r0
78         bcl     20,31,\$+4
79         mflr    $ptr     #vvvvv "distance between . and rcon
80         addi    $ptr,$ptr,-0x48
81         mtlr    r0
82         blr
83         .long   0
84         .byte   0,12,0x14,0,0,0,0,0
85 .asciz  "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
86
87 .globl  .${prefix}_set_encrypt_key
88 .align  5
89 .${prefix}_set_encrypt_key:
90 Lset_encrypt_key:
91         mflr            r11
92         $PUSH           r11,$LRSAVE($sp)
93
94         li              $ptr,-1
95         ${UCMP}i        $inp,0
96         beq-            Lenc_key_abort          # if ($inp==0) return -1;
97         ${UCMP}i        $out,0
98         beq-            Lenc_key_abort          # if ($out==0) return -1;
99         li              $ptr,-2
100         cmpwi           $bits,128
101         blt-            Lenc_key_abort
102         cmpwi           $bits,256
103         bgt-            Lenc_key_abort
104         andi.           r0,$bits,0x3f
105         bne-            Lenc_key_abort
106
107         lis             r0,0xfff0
108         mfspr           $vrsave,256
109         mtspr           256,r0
110
111         bl              Lconsts
112         mtlr            r11
113
114         neg             r9,$inp
115         lvx             $in0,0,$inp
116         addi            $inp,$inp,15            # 15 is not typo
117         lvsr            $key,0,r9               # borrow $key
118         li              r8,0x20
119         cmpwi           $bits,192
120         lvx             $in1,0,$inp
121         le?vspltisb     $mask,0x0f              # borrow $mask
122         lvx             $rcon,0,$ptr
123         le?vxor         $key,$key,$mask         # adjust for byte swap
124         lvx             $mask,r8,$ptr
125         addi            $ptr,$ptr,0x10
126         vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
127         li              $cnt,8
128         vxor            $zero,$zero,$zero
129         mtctr           $cnt
130
131         ?lvsr           $outperm,0,$out
132         vspltisb        $outmask,-1
133         lvx             $outhead,0,$out
134         ?vperm          $outmask,$zero,$outmask,$outperm
135
136         blt             Loop128
137         addi            $inp,$inp,8
138         beq             L192
139         addi            $inp,$inp,8
140         b               L256
141
142 .align  4
143 Loop128:
144         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
145         vsldoi          $tmp,$zero,$in0,12      # >>32
146          vperm          $outtail,$in0,$in0,$outperm     # rotate
147          vsel           $stage,$outhead,$outtail,$outmask
148          vmr            $outhead,$outtail
149         vcipherlast     $key,$key,$rcon
150          stvx           $stage,0,$out
151          addi           $out,$out,16
152
153         vxor            $in0,$in0,$tmp
154         vsldoi          $tmp,$zero,$tmp,12      # >>32
155         vxor            $in0,$in0,$tmp
156         vsldoi          $tmp,$zero,$tmp,12      # >>32
157         vxor            $in0,$in0,$tmp
158          vadduwm        $rcon,$rcon,$rcon
159         vxor            $in0,$in0,$key
160         bdnz            Loop128
161
162         lvx             $rcon,0,$ptr            # last two round keys
163
164         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
165         vsldoi          $tmp,$zero,$in0,12      # >>32
166          vperm          $outtail,$in0,$in0,$outperm     # rotate
167          vsel           $stage,$outhead,$outtail,$outmask
168          vmr            $outhead,$outtail
169         vcipherlast     $key,$key,$rcon
170          stvx           $stage,0,$out
171          addi           $out,$out,16
172
173         vxor            $in0,$in0,$tmp
174         vsldoi          $tmp,$zero,$tmp,12      # >>32
175         vxor            $in0,$in0,$tmp
176         vsldoi          $tmp,$zero,$tmp,12      # >>32
177         vxor            $in0,$in0,$tmp
178          vadduwm        $rcon,$rcon,$rcon
179         vxor            $in0,$in0,$key
180
181         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
182         vsldoi          $tmp,$zero,$in0,12      # >>32
183          vperm          $outtail,$in0,$in0,$outperm     # rotate
184          vsel           $stage,$outhead,$outtail,$outmask
185          vmr            $outhead,$outtail
186         vcipherlast     $key,$key,$rcon
187          stvx           $stage,0,$out
188          addi           $out,$out,16
189
190         vxor            $in0,$in0,$tmp
191         vsldoi          $tmp,$zero,$tmp,12      # >>32
192         vxor            $in0,$in0,$tmp
193         vsldoi          $tmp,$zero,$tmp,12      # >>32
194         vxor            $in0,$in0,$tmp
195         vxor            $in0,$in0,$key
196          vperm          $outtail,$in0,$in0,$outperm     # rotate
197          vsel           $stage,$outhead,$outtail,$outmask
198          vmr            $outhead,$outtail
199          stvx           $stage,0,$out
200
201         addi            $inp,$out,15            # 15 is not typo
202         addi            $out,$out,0x50
203
204         li              $rounds,10
205         b               Ldone
206
207 .align  4
208 L192:
209         lvx             $tmp,0,$inp
210         li              $cnt,4
211          vperm          $outtail,$in0,$in0,$outperm     # rotate
212          vsel           $stage,$outhead,$outtail,$outmask
213          vmr            $outhead,$outtail
214          stvx           $stage,0,$out
215          addi           $out,$out,16
216         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
217         vspltisb        $key,8                  # borrow $key
218         mtctr           $cnt
219         vsububm         $mask,$mask,$key        # adjust the mask
220
221 Loop192:
222         vperm           $key,$in1,$in1,$mask    # roate-n-splat
223         vsldoi          $tmp,$zero,$in0,12      # >>32
224         vcipherlast     $key,$key,$rcon
225
226         vxor            $in0,$in0,$tmp
227         vsldoi          $tmp,$zero,$tmp,12      # >>32
228         vxor            $in0,$in0,$tmp
229         vsldoi          $tmp,$zero,$tmp,12      # >>32
230         vxor            $in0,$in0,$tmp
231
232          vsldoi         $stage,$zero,$in1,8
233         vspltw          $tmp,$in0,3
234         vxor            $tmp,$tmp,$in1
235         vsldoi          $in1,$zero,$in1,12      # >>32
236          vadduwm        $rcon,$rcon,$rcon
237         vxor            $in1,$in1,$tmp
238         vxor            $in0,$in0,$key
239         vxor            $in1,$in1,$key
240          vsldoi         $stage,$stage,$in0,8
241
242         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
243         vsldoi          $tmp,$zero,$in0,12      # >>32
244          vperm          $outtail,$stage,$stage,$outperm # rotate
245          vsel           $stage,$outhead,$outtail,$outmask
246          vmr            $outhead,$outtail
247         vcipherlast     $key,$key,$rcon
248          stvx           $stage,0,$out
249          addi           $out,$out,16
250
251          vsldoi         $stage,$in0,$in1,8
252         vxor            $in0,$in0,$tmp
253         vsldoi          $tmp,$zero,$tmp,12      # >>32
254          vperm          $outtail,$stage,$stage,$outperm # rotate
255          vsel           $stage,$outhead,$outtail,$outmask
256          vmr            $outhead,$outtail
257         vxor            $in0,$in0,$tmp
258         vsldoi          $tmp,$zero,$tmp,12      # >>32
259         vxor            $in0,$in0,$tmp
260          stvx           $stage,0,$out
261          addi           $out,$out,16
262
263         vspltw          $tmp,$in0,3
264         vxor            $tmp,$tmp,$in1
265         vsldoi          $in1,$zero,$in1,12      # >>32
266          vadduwm        $rcon,$rcon,$rcon
267         vxor            $in1,$in1,$tmp
268         vxor            $in0,$in0,$key
269         vxor            $in1,$in1,$key
270          vperm          $outtail,$in0,$in0,$outperm     # rotate
271          vsel           $stage,$outhead,$outtail,$outmask
272          vmr            $outhead,$outtail
273          stvx           $stage,0,$out
274          addi           $inp,$out,15            # 15 is not typo
275          addi           $out,$out,16
276         bdnz            Loop192
277
278         li              $rounds,12
279         addi            $out,$out,0x20
280         b               Ldone
281
282 .align  4
283 L256:
284         lvx             $tmp,0,$inp
285         li              $cnt,7
286         li              $rounds,14
287          vperm          $outtail,$in0,$in0,$outperm     # rotate
288          vsel           $stage,$outhead,$outtail,$outmask
289          vmr            $outhead,$outtail
290          stvx           $stage,0,$out
291          addi           $out,$out,16
292         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
293         mtctr           $cnt
294
295 Loop256:
296         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
297         vsldoi          $tmp,$zero,$in0,12      # >>32
298          vperm          $outtail,$in1,$in1,$outperm     # rotate
299          vsel           $stage,$outhead,$outtail,$outmask
300          vmr            $outhead,$outtail
301         vcipherlast     $key,$key,$rcon
302          stvx           $stage,0,$out
303          addi           $out,$out,16
304
305         vxor            $in0,$in0,$tmp
306         vsldoi          $tmp,$zero,$tmp,12      # >>32
307         vxor            $in0,$in0,$tmp
308         vsldoi          $tmp,$zero,$tmp,12      # >>32
309         vxor            $in0,$in0,$tmp
310          vadduwm        $rcon,$rcon,$rcon
311         vxor            $in0,$in0,$key
312          vperm          $outtail,$in0,$in0,$outperm     # rotate
313          vsel           $stage,$outhead,$outtail,$outmask
314          vmr            $outhead,$outtail
315          stvx           $stage,0,$out
316          addi           $inp,$out,15            # 15 is not typo
317          addi           $out,$out,16
318         bdz             Ldone
319
320         vspltw          $key,$in0,3             # just splat
321         vsldoi          $tmp,$zero,$in1,12      # >>32
322         vsbox           $key,$key
323
324         vxor            $in1,$in1,$tmp
325         vsldoi          $tmp,$zero,$tmp,12      # >>32
326         vxor            $in1,$in1,$tmp
327         vsldoi          $tmp,$zero,$tmp,12      # >>32
328         vxor            $in1,$in1,$tmp
329
330         vxor            $in1,$in1,$key
331         b               Loop256
332
333 .align  4
334 Ldone:
335         lvx             $in1,0,$inp             # redundant in aligned case
336         vsel            $in1,$outhead,$in1,$outmask
337         stvx            $in1,0,$inp
338         li              $ptr,0
339         mtspr           256,$vrsave
340         stw             $rounds,0($out)
341
342 Lenc_key_abort:
343         mr              r3,$ptr
344         blr
345         .long           0
346         .byte           0,12,0x14,1,0,0,3,0
347         .long           0
348 .size   .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
349
350 .globl  .${prefix}_set_decrypt_key
351 .align  5
352 .${prefix}_set_decrypt_key:
353         $STU            $sp,-$FRAME($sp)
354         mflr            r10
355         $PUSH           r10,$FRAME+$LRSAVE($sp)
356         bl              Lset_encrypt_key
357         mtlr            r10
358
359         cmpwi           r3,0
360         bne-            Ldec_key_abort
361
362         slwi            $cnt,$rounds,4
363         subi            $inp,$out,240           # first round key
364         srwi            $rounds,$rounds,1
365         add             $out,$inp,$cnt          # last round key
366         mtctr           $rounds
367
368 Ldeckey:
369         lwz             r0, 0($inp)
370         lwz             r6, 4($inp)
371         lwz             r7, 8($inp)
372         lwz             r8, 12($inp)
373         addi            $inp,$inp,16
374         lwz             r9, 0($out)
375         lwz             r10,4($out)
376         lwz             r11,8($out)
377         lwz             r12,12($out)
378         stw             r0, 0($out)
379         stw             r6, 4($out)
380         stw             r7, 8($out)
381         stw             r8, 12($out)
382         subi            $out,$out,16
383         stw             r9, -16($inp)
384         stw             r10,-12($inp)
385         stw             r11,-8($inp)
386         stw             r12,-4($inp)
387         bdnz            Ldeckey
388
389         xor             r3,r3,r3                # return value
390 Ldec_key_abort:
391         addi            $sp,$sp,$FRAME
392         blr
393         .long           0
394         .byte           0,12,4,1,0x80,0,3,0
395         .long           0
396 .size   .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
397 ___
398 }}}
399 #########################################################################
400 {{{     # Single block en- and decrypt procedures                       #
# gen_block("en"|"de") appends the single-block aes_p8_encrypt or
# aes_p8_decrypt routine to $code.  IDIOM FIX: the empty "()" prototype
# was removed -- it declared zero arguments while both call sites pass
# one, and only worked because the "&" call form bypasses prototype
# checking.  Plain calls are used instead; behavior is unchanged.
401 sub gen_block {
402 my $dir = shift;                                # "en" or "de"
403 my $n   = $dir eq "de" ? "n" : "";              # "n" selects v<n>cipher mnemonics
404 my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));     # GPRs: args r3..r5, temps r6..r7
405
406 $code.=<<___;
407 .globl  .${prefix}_${dir}crypt
408 .align  5
409 .${prefix}_${dir}crypt:
410         lwz             $rounds,240($key)
411         lis             r0,0xfc00
412         mfspr           $vrsave,256
413         li              $idx,15                 # 15 is not typo
414         mtspr           256,r0
415
416         lvx             v0,0,$inp
417         neg             r11,$out
418         lvx             v1,$idx,$inp
419         lvsl            v2,0,$inp               # inpperm
420         le?vspltisb     v4,0x0f
421         ?lvsl           v3,0,r11                # outperm
422         le?vxor         v2,v2,v4
423         li              $idx,16
424         vperm           v0,v0,v1,v2             # align [and byte swap in LE]
425         lvx             v1,0,$key
426         ?lvsl           v5,0,$key               # keyperm
427         srwi            $rounds,$rounds,1
428         lvx             v2,$idx,$key
429         addi            $idx,$idx,16
430         subi            $rounds,$rounds,1
431         ?vperm          v1,v1,v2,v5             # align round key
432
433         vxor            v0,v0,v1
434         lvx             v1,$idx,$key
435         addi            $idx,$idx,16
436         mtctr           $rounds
437
438 Loop_${dir}c:
439         ?vperm          v2,v2,v1,v5
440         v${n}cipher     v0,v0,v2
441         lvx             v2,$idx,$key
442         addi            $idx,$idx,16
443         ?vperm          v1,v1,v2,v5
444         v${n}cipher     v0,v0,v1
445         lvx             v1,$idx,$key
446         addi            $idx,$idx,16
447         bdnz            Loop_${dir}c
448
449         ?vperm          v2,v2,v1,v5
450         v${n}cipher     v0,v0,v2
451         lvx             v2,$idx,$key
452         ?vperm          v1,v1,v2,v5
453         v${n}cipherlast v0,v0,v1
454
455         vspltisb        v2,-1
456         vxor            v1,v1,v1
457         li              $idx,15                 # 15 is not typo
458         ?vperm          v2,v1,v2,v3             # outmask
459         le?vxor         v3,v3,v4
460         lvx             v1,0,$out               # outhead
461         vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
462         vsel            v1,v1,v0,v2
463         lvx             v4,$idx,$out
464         stvx            v1,0,$out
465         vsel            v0,v0,v4,v2
466         stvx            v0,$idx,$out
467
468         mtspr           256,$vrsave
469         blr
470         .long           0
471         .byte           0,12,0x14,0,0,0,3,0
472         .long           0
473 .size   .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
474 ___
475 }
476 gen_block("en");
477 gen_block("de");
478 }}}
479 #########################################################################
480 {{{     # CBC en- and decrypt procedures                                #
481 my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));     # GPRs: args in/out/len/key/ivp/enc (r3..r8), temps (r9..r10)
482 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));      # v0..v3: round keys, block, scratch
483 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
484                                                 map("v$_",(4..10));     # v4..v10: IV and unaligned load/store helpers
# Template for aes_p8_cbc_encrypt; lengths >= 128 bytes in the decrypt
# direction branch into _aesp8_cbc_decrypt8x (defined further below).
485 $code.=<<___;
486 .globl  .${prefix}_cbc_encrypt
487 .align  5
488 .${prefix}_cbc_encrypt:
489         ${UCMP}i        $len,16
490         bltlr-
491
492         cmpwi           $enc,0                  # test direction
493         lis             r0,0xffe0
494         mfspr           $vrsave,256
495         mtspr           256,r0
496
497         li              $idx,15
498         vxor            $rndkey0,$rndkey0,$rndkey0
499         le?vspltisb     $tmp,0x0f
500
501         lvx             $ivec,0,$ivp            # load [unaligned] iv
502         lvsl            $inpperm,0,$ivp
503         lvx             $inptail,$idx,$ivp
504         le?vxor         $inpperm,$inpperm,$tmp
505         vperm           $ivec,$ivec,$inptail,$inpperm
506
507         neg             r11,$inp
508         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
509         lwz             $rounds,240($key)
510
511         lvsr            $inpperm,0,r11          # prepare for unaligned load
512         lvx             $inptail,0,$inp
513         addi            $inp,$inp,15            # 15 is not typo
514         le?vxor         $inpperm,$inpperm,$tmp
515
516         ?lvsr           $outperm,0,$out         # prepare for unaligned store
517         vspltisb        $outmask,-1
518         lvx             $outhead,0,$out
519         ?vperm          $outmask,$rndkey0,$outmask,$outperm
520         le?vxor         $outperm,$outperm,$tmp
521
522         srwi            $rounds,$rounds,1
523         li              $idx,16
524         subi            $rounds,$rounds,1
525         beq             Lcbc_dec
526
527 Lcbc_enc:
528         vmr             $inout,$inptail
529         lvx             $inptail,0,$inp
530         addi            $inp,$inp,16
531         mtctr           $rounds
532         subi            $len,$len,16            # len-=16
533
534         lvx             $rndkey0,0,$key
535          vperm          $inout,$inout,$inptail,$inpperm
536         lvx             $rndkey1,$idx,$key
537         addi            $idx,$idx,16
538         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
539         vxor            $inout,$inout,$rndkey0
540         lvx             $rndkey0,$idx,$key
541         addi            $idx,$idx,16
542         vxor            $inout,$inout,$ivec
543
544 Loop_cbc_enc:
545         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
546         vcipher         $inout,$inout,$rndkey1
547         lvx             $rndkey1,$idx,$key
548         addi            $idx,$idx,16
549         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
550         vcipher         $inout,$inout,$rndkey0
551         lvx             $rndkey0,$idx,$key
552         addi            $idx,$idx,16
553         bdnz            Loop_cbc_enc
554
555         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
556         vcipher         $inout,$inout,$rndkey1
557         lvx             $rndkey1,$idx,$key
558         li              $idx,16
559         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
560         vcipherlast     $ivec,$inout,$rndkey0
561         ${UCMP}i        $len,16
562
563         vperm           $tmp,$ivec,$ivec,$outperm
564         vsel            $inout,$outhead,$tmp,$outmask
565         vmr             $outhead,$tmp
566         stvx            $inout,0,$out
567         addi            $out,$out,16
568         bge             Lcbc_enc
569
570         b               Lcbc_done
571
572 .align  4
573 Lcbc_dec:
574         ${UCMP}i        $len,128
575         bge             _aesp8_cbc_decrypt8x
576         vmr             $tmp,$inptail
577         lvx             $inptail,0,$inp
578         addi            $inp,$inp,16
579         mtctr           $rounds
580         subi            $len,$len,16            # len-=16
581
582         lvx             $rndkey0,0,$key
583          vperm          $tmp,$tmp,$inptail,$inpperm
584         lvx             $rndkey1,$idx,$key
585         addi            $idx,$idx,16
586         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
587         vxor            $inout,$tmp,$rndkey0
588         lvx             $rndkey0,$idx,$key
589         addi            $idx,$idx,16
590
591 Loop_cbc_dec:
592         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
593         vncipher        $inout,$inout,$rndkey1
594         lvx             $rndkey1,$idx,$key
595         addi            $idx,$idx,16
596         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
597         vncipher        $inout,$inout,$rndkey0
598         lvx             $rndkey0,$idx,$key
599         addi            $idx,$idx,16
600         bdnz            Loop_cbc_dec
601
602         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
603         vncipher        $inout,$inout,$rndkey1
604         lvx             $rndkey1,$idx,$key
605         li              $idx,16
606         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
607         vncipherlast    $inout,$inout,$rndkey0
608         ${UCMP}i        $len,16
609
610         vxor            $inout,$inout,$ivec
611         vmr             $ivec,$tmp
612         vperm           $tmp,$inout,$inout,$outperm
613         vsel            $inout,$outhead,$tmp,$outmask
614         vmr             $outhead,$tmp
615         stvx            $inout,0,$out
616         addi            $out,$out,16
617         bge             Lcbc_dec
618
619 Lcbc_done:
620         addi            $out,$out,-1
621         lvx             $inout,0,$out           # redundant in aligned case
622         vsel            $inout,$outhead,$inout,$outmask
623         stvx            $inout,0,$out
624
625         neg             $enc,$ivp               # write [unaligned] iv
626         li              $idx,15                 # 15 is not typo
627         vxor            $rndkey0,$rndkey0,$rndkey0
628         vspltisb        $outmask,-1
629         le?vspltisb     $tmp,0x0f
630         ?lvsl           $outperm,0,$enc
631         ?vperm          $outmask,$rndkey0,$outmask,$outperm
632         le?vxor         $outperm,$outperm,$tmp
633         lvx             $outhead,0,$ivp
634         vperm           $ivec,$ivec,$ivec,$outperm
635         vsel            $inout,$outhead,$ivec,$outmask
636         lvx             $inptail,$idx,$ivp
637         stvx            $inout,0,$ivp
638         vsel            $inout,$ivec,$inptail,$outmask
639         stvx            $inout,$idx,$ivp
640
641         mtspr           256,$vrsave
642         blr
643         .long           0
644         .byte           0,12,0x14,0,0,0,6,0
645         .long           0
646 ___
647 #########################################################################
648 {{      # Optimized CBC decrypt procedure                               #
649 my $key_="r11";
650 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
651 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
652 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
653 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
654                         # v26-v31 last 6 round keys
655 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
656
657 $code.=<<___;
658 .align  5
659 _aesp8_cbc_decrypt8x:
660         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
661         li              r10,`$FRAME+8*16+15`
662         li              r11,`$FRAME+8*16+31`
663         stvx            v20,r10,$sp             # ABI says so
664         addi            r10,r10,32
665         stvx            v21,r11,$sp
666         addi            r11,r11,32
667         stvx            v22,r10,$sp
668         addi            r10,r10,32
669         stvx            v23,r11,$sp
670         addi            r11,r11,32
671         stvx            v24,r10,$sp
672         addi            r10,r10,32
673         stvx            v25,r11,$sp
674         addi            r11,r11,32
675         stvx            v26,r10,$sp
676         addi            r10,r10,32
677         stvx            v27,r11,$sp
678         addi            r11,r11,32
679         stvx            v28,r10,$sp
680         addi            r10,r10,32
681         stvx            v29,r11,$sp
682         addi            r11,r11,32
683         stvx            v30,r10,$sp
684         stvx            v31,r11,$sp
685         li              r0,-1
686         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
687         li              $x10,0x10
688         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
689         li              $x20,0x20
690         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
691         li              $x30,0x30
692         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
693         li              $x40,0x40
694         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
695         li              $x50,0x50
696         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
697         li              $x60,0x60
698         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
699         li              $x70,0x70
700         mtspr           256,r0
701
702         subi            $rounds,$rounds,3       # -4 in total
703         subi            $len,$len,128           # bias
704
705         lvx             $rndkey0,$x00,$key      # load key schedule
706         lvx             v30,$x10,$key
707         addi            $key,$key,0x20
708         lvx             v31,$x00,$key
709         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
710         addi            $key_,$sp,$FRAME+15
711         mtctr           $rounds
712
713 Load_cbc_dec_key:
714         ?vperm          v24,v30,v31,$keyperm
715         lvx             v30,$x10,$key
716         addi            $key,$key,0x20
717         stvx            v24,$x00,$key_          # off-load round[1]
718         ?vperm          v25,v31,v30,$keyperm
719         lvx             v31,$x00,$key
720         stvx            v25,$x10,$key_          # off-load round[2]
721         addi            $key_,$key_,0x20
722         bdnz            Load_cbc_dec_key
723
724         lvx             v26,$x10,$key
725         ?vperm          v24,v30,v31,$keyperm
726         lvx             v27,$x20,$key
727         stvx            v24,$x00,$key_          # off-load round[3]
728         ?vperm          v25,v31,v26,$keyperm
729         lvx             v28,$x30,$key
730         stvx            v25,$x10,$key_          # off-load round[4]
731         addi            $key_,$sp,$FRAME+15     # rewind $key_
732         ?vperm          v26,v26,v27,$keyperm
733         lvx             v29,$x40,$key
734         ?vperm          v27,v27,v28,$keyperm
735         lvx             v30,$x50,$key
736         ?vperm          v28,v28,v29,$keyperm
737         lvx             v31,$x60,$key
738         ?vperm          v29,v29,v30,$keyperm
739         lvx             $out0,$x70,$key         # borrow $out0
740         ?vperm          v30,v30,v31,$keyperm
741         lvx             v24,$x00,$key_          # pre-load round[1]
742         ?vperm          v31,v31,$out0,$keyperm
743         lvx             v25,$x10,$key_          # pre-load round[2]
744
745         #lvx            $inptail,0,$inp         # "caller" already did this
746         #addi           $inp,$inp,15            # 15 is not typo
747         subi            $inp,$inp,15            # undo "caller"
748
749          le?li          $idx,8
750         lvx_u           $in0,$x00,$inp          # load first 8 "words"
751          le?lvsl        $inpperm,0,$idx
752          le?vspltisb    $tmp,0x0f
753         lvx_u           $in1,$x10,$inp
754          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
755         lvx_u           $in2,$x20,$inp
756          le?vperm       $in0,$in0,$in0,$inpperm
757         lvx_u           $in3,$x30,$inp
758          le?vperm       $in1,$in1,$in1,$inpperm
759         lvx_u           $in4,$x40,$inp
760          le?vperm       $in2,$in2,$in2,$inpperm
761         vxor            $out0,$in0,$rndkey0
762         lvx_u           $in5,$x50,$inp
763          le?vperm       $in3,$in3,$in3,$inpperm
764         vxor            $out1,$in1,$rndkey0
765         lvx_u           $in6,$x60,$inp
766          le?vperm       $in4,$in4,$in4,$inpperm
767         vxor            $out2,$in2,$rndkey0
768         lvx_u           $in7,$x70,$inp
769         addi            $inp,$inp,0x80
770          le?vperm       $in5,$in5,$in5,$inpperm
771         vxor            $out3,$in3,$rndkey0
772          le?vperm       $in6,$in6,$in6,$inpperm
773         vxor            $out4,$in4,$rndkey0
774          le?vperm       $in7,$in7,$in7,$inpperm
775         vxor            $out5,$in5,$rndkey0
776         vxor            $out6,$in6,$rndkey0
777         vxor            $out7,$in7,$rndkey0
778
779         mtctr           $rounds
780         b               Loop_cbc_dec8x
781 .align  5
782 Loop_cbc_dec8x:
783         vncipher        $out0,$out0,v24
784         vncipher        $out1,$out1,v24
785         vncipher        $out2,$out2,v24
786         vncipher        $out3,$out3,v24
787         vncipher        $out4,$out4,v24
788         vncipher        $out5,$out5,v24
789         vncipher        $out6,$out6,v24
790         vncipher        $out7,$out7,v24
791         lvx             v24,$x20,$key_          # round[3]
792         addi            $key_,$key_,0x20
793
794         vncipher        $out0,$out0,v25
795         vncipher        $out1,$out1,v25
796         vncipher        $out2,$out2,v25
797         vncipher        $out3,$out3,v25
798         vncipher        $out4,$out4,v25
799         vncipher        $out5,$out5,v25
800         vncipher        $out6,$out6,v25
801         vncipher        $out7,$out7,v25
802         lvx             v25,$x10,$key_          # round[4]
803         bdnz            Loop_cbc_dec8x
804
805         subic           $len,$len,128           # $len-=128
806         vncipher        $out0,$out0,v24
807         vncipher        $out1,$out1,v24
808         vncipher        $out2,$out2,v24
809         vncipher        $out3,$out3,v24
810         vncipher        $out4,$out4,v24
811         vncipher        $out5,$out5,v24
812         vncipher        $out6,$out6,v24
813         vncipher        $out7,$out7,v24
814
815         subfe.          r0,r0,r0                # borrow?-1:0
816         vncipher        $out0,$out0,v25
817         vncipher        $out1,$out1,v25
818         vncipher        $out2,$out2,v25
819         vncipher        $out3,$out3,v25
820         vncipher        $out4,$out4,v25
821         vncipher        $out5,$out5,v25
822         vncipher        $out6,$out6,v25
823         vncipher        $out7,$out7,v25
824
825         and             r0,r0,$len
826         vncipher        $out0,$out0,v26
827         vncipher        $out1,$out1,v26
828         vncipher        $out2,$out2,v26
829         vncipher        $out3,$out3,v26
830         vncipher        $out4,$out4,v26
831         vncipher        $out5,$out5,v26
832         vncipher        $out6,$out6,v26
833         vncipher        $out7,$out7,v26
834
835         add             $inp,$inp,r0            # $inp is adjusted in such
836                                                 # way that at exit from the
837                                                 # loop inX-in7 are loaded
838                                                 # with last "words"
839         vncipher        $out0,$out0,v27
840         vncipher        $out1,$out1,v27
841         vncipher        $out2,$out2,v27
842         vncipher        $out3,$out3,v27
843         vncipher        $out4,$out4,v27
844         vncipher        $out5,$out5,v27
845         vncipher        $out6,$out6,v27
846         vncipher        $out7,$out7,v27
847
848         addi            $key_,$sp,$FRAME+15     # rewind $key_
849         vncipher        $out0,$out0,v28
850         vncipher        $out1,$out1,v28
851         vncipher        $out2,$out2,v28
852         vncipher        $out3,$out3,v28
853         vncipher        $out4,$out4,v28
854         vncipher        $out5,$out5,v28
855         vncipher        $out6,$out6,v28
856         vncipher        $out7,$out7,v28
857         lvx             v24,$x00,$key_          # re-pre-load round[1]
858
859         vncipher        $out0,$out0,v29
860         vncipher        $out1,$out1,v29
861         vncipher        $out2,$out2,v29
862         vncipher        $out3,$out3,v29
863         vncipher        $out4,$out4,v29
864         vncipher        $out5,$out5,v29
865         vncipher        $out6,$out6,v29
866         vncipher        $out7,$out7,v29
867         lvx             v25,$x10,$key_          # re-pre-load round[2]
868
869         vncipher        $out0,$out0,v30
870          vxor           $ivec,$ivec,v31         # xor with last round key
871         vncipher        $out1,$out1,v30
872          vxor           $in0,$in0,v31
873         vncipher        $out2,$out2,v30
874          vxor           $in1,$in1,v31
875         vncipher        $out3,$out3,v30
876          vxor           $in2,$in2,v31
877         vncipher        $out4,$out4,v30
878          vxor           $in3,$in3,v31
879         vncipher        $out5,$out5,v30
880          vxor           $in4,$in4,v31
881         vncipher        $out6,$out6,v30
882          vxor           $in5,$in5,v31
883         vncipher        $out7,$out7,v30
884          vxor           $in6,$in6,v31
885
886         vncipherlast    $out0,$out0,$ivec
887         vncipherlast    $out1,$out1,$in0
888          lvx_u          $in0,$x00,$inp          # load next input block
889         vncipherlast    $out2,$out2,$in1
890          lvx_u          $in1,$x10,$inp
891         vncipherlast    $out3,$out3,$in2
892          le?vperm       $in0,$in0,$in0,$inpperm
893          lvx_u          $in2,$x20,$inp
894         vncipherlast    $out4,$out4,$in3
895          le?vperm       $in1,$in1,$in1,$inpperm
896          lvx_u          $in3,$x30,$inp
897         vncipherlast    $out5,$out5,$in4
898          le?vperm       $in2,$in2,$in2,$inpperm
899          lvx_u          $in4,$x40,$inp
900         vncipherlast    $out6,$out6,$in5
901          le?vperm       $in3,$in3,$in3,$inpperm
902          lvx_u          $in5,$x50,$inp
903         vncipherlast    $out7,$out7,$in6
904          le?vperm       $in4,$in4,$in4,$inpperm
905          lvx_u          $in6,$x60,$inp
906         vmr             $ivec,$in7
907          le?vperm       $in5,$in5,$in5,$inpperm
908          lvx_u          $in7,$x70,$inp
909          addi           $inp,$inp,0x80
910
911         le?vperm        $out0,$out0,$out0,$inpperm
912         le?vperm        $out1,$out1,$out1,$inpperm
913         stvx_u          $out0,$x00,$out
914          le?vperm       $in6,$in6,$in6,$inpperm
915          vxor           $out0,$in0,$rndkey0
916         le?vperm        $out2,$out2,$out2,$inpperm
917         stvx_u          $out1,$x10,$out
918          le?vperm       $in7,$in7,$in7,$inpperm
919          vxor           $out1,$in1,$rndkey0
920         le?vperm        $out3,$out3,$out3,$inpperm
921         stvx_u          $out2,$x20,$out
922          vxor           $out2,$in2,$rndkey0
923         le?vperm        $out4,$out4,$out4,$inpperm
924         stvx_u          $out3,$x30,$out
925          vxor           $out3,$in3,$rndkey0
926         le?vperm        $out5,$out5,$out5,$inpperm
927         stvx_u          $out4,$x40,$out
928          vxor           $out4,$in4,$rndkey0
929         le?vperm        $out6,$out6,$out6,$inpperm
930         stvx_u          $out5,$x50,$out
931          vxor           $out5,$in5,$rndkey0
932         le?vperm        $out7,$out7,$out7,$inpperm
933         stvx_u          $out6,$x60,$out
934          vxor           $out6,$in6,$rndkey0
935         stvx_u          $out7,$x70,$out
936         addi            $out,$out,0x80
937          vxor           $out7,$in7,$rndkey0
938
939         mtctr           $rounds
940         beq             Loop_cbc_dec8x          # did $len-=128 borrow?
941
942         addic.          $len,$len,128
943         beq             Lcbc_dec8x_done
944         nop
945         nop
946
947 Loop_cbc_dec8x_tail:                            # up to 7 "words" tail...
948         vncipher        $out1,$out1,v24
949         vncipher        $out2,$out2,v24
950         vncipher        $out3,$out3,v24
951         vncipher        $out4,$out4,v24
952         vncipher        $out5,$out5,v24
953         vncipher        $out6,$out6,v24
954         vncipher        $out7,$out7,v24
955         lvx             v24,$x20,$key_          # round[3]
956         addi            $key_,$key_,0x20
957
958         vncipher        $out1,$out1,v25
959         vncipher        $out2,$out2,v25
960         vncipher        $out3,$out3,v25
961         vncipher        $out4,$out4,v25
962         vncipher        $out5,$out5,v25
963         vncipher        $out6,$out6,v25
964         vncipher        $out7,$out7,v25
965         lvx             v25,$x10,$key_          # round[4]
966         bdnz            Loop_cbc_dec8x_tail
967
968         vncipher        $out1,$out1,v24
969         vncipher        $out2,$out2,v24
970         vncipher        $out3,$out3,v24
971         vncipher        $out4,$out4,v24
972         vncipher        $out5,$out5,v24
973         vncipher        $out6,$out6,v24
974         vncipher        $out7,$out7,v24
975
976         vncipher        $out1,$out1,v25
977         vncipher        $out2,$out2,v25
978         vncipher        $out3,$out3,v25
979         vncipher        $out4,$out4,v25
980         vncipher        $out5,$out5,v25
981         vncipher        $out6,$out6,v25
982         vncipher        $out7,$out7,v25
983
984         vncipher        $out1,$out1,v26
985         vncipher        $out2,$out2,v26
986         vncipher        $out3,$out3,v26
987         vncipher        $out4,$out4,v26
988         vncipher        $out5,$out5,v26
989         vncipher        $out6,$out6,v26
990         vncipher        $out7,$out7,v26
991
992         vncipher        $out1,$out1,v27
993         vncipher        $out2,$out2,v27
994         vncipher        $out3,$out3,v27
995         vncipher        $out4,$out4,v27
996         vncipher        $out5,$out5,v27
997         vncipher        $out6,$out6,v27
998         vncipher        $out7,$out7,v27
999
1000         vncipher        $out1,$out1,v28
1001         vncipher        $out2,$out2,v28
1002         vncipher        $out3,$out3,v28
1003         vncipher        $out4,$out4,v28
1004         vncipher        $out5,$out5,v28
1005         vncipher        $out6,$out6,v28
1006         vncipher        $out7,$out7,v28
1007
1008         vncipher        $out1,$out1,v29
1009         vncipher        $out2,$out2,v29
1010         vncipher        $out3,$out3,v29
1011         vncipher        $out4,$out4,v29
1012         vncipher        $out5,$out5,v29
1013         vncipher        $out6,$out6,v29
1014         vncipher        $out7,$out7,v29
1015
1016         vncipher        $out1,$out1,v30
1017          vxor           $ivec,$ivec,v31         # last round key
1018         vncipher        $out2,$out2,v30
1019          vxor           $in1,$in1,v31
1020         vncipher        $out3,$out3,v30
1021          vxor           $in2,$in2,v31
1022         vncipher        $out4,$out4,v30
1023          vxor           $in3,$in3,v31
1024         vncipher        $out5,$out5,v30
1025          vxor           $in4,$in4,v31
1026         vncipher        $out6,$out6,v30
1027          vxor           $in5,$in5,v31
1028         vncipher        $out7,$out7,v30
1029          vxor           $in6,$in6,v31
1030
1031         cmplwi          $len,32                 # switch($len)
1032         blt             Lcbc_dec8x_one
1033         nop
1034         beq             Lcbc_dec8x_two
1035         cmplwi          $len,64
1036         blt             Lcbc_dec8x_three
1037         nop
1038         beq             Lcbc_dec8x_four
1039         cmplwi          $len,96
1040         blt             Lcbc_dec8x_five
1041         nop
1042         beq             Lcbc_dec8x_six
1043
1044 Lcbc_dec8x_seven:
1045         vncipherlast    $out1,$out1,$ivec
1046         vncipherlast    $out2,$out2,$in1
1047         vncipherlast    $out3,$out3,$in2
1048         vncipherlast    $out4,$out4,$in3
1049         vncipherlast    $out5,$out5,$in4
1050         vncipherlast    $out6,$out6,$in5
1051         vncipherlast    $out7,$out7,$in6
1052         vmr             $ivec,$in7
1053
1054         le?vperm        $out1,$out1,$out1,$inpperm
1055         le?vperm        $out2,$out2,$out2,$inpperm
1056         stvx_u          $out1,$x00,$out
1057         le?vperm        $out3,$out3,$out3,$inpperm
1058         stvx_u          $out2,$x10,$out
1059         le?vperm        $out4,$out4,$out4,$inpperm
1060         stvx_u          $out3,$x20,$out
1061         le?vperm        $out5,$out5,$out5,$inpperm
1062         stvx_u          $out4,$x30,$out
1063         le?vperm        $out6,$out6,$out6,$inpperm
1064         stvx_u          $out5,$x40,$out
1065         le?vperm        $out7,$out7,$out7,$inpperm
1066         stvx_u          $out6,$x50,$out
1067         stvx_u          $out7,$x60,$out
1068         addi            $out,$out,0x70
1069         b               Lcbc_dec8x_done
1070
1071 .align  5
1072 Lcbc_dec8x_six:
1073         vncipherlast    $out2,$out2,$ivec
1074         vncipherlast    $out3,$out3,$in2
1075         vncipherlast    $out4,$out4,$in3
1076         vncipherlast    $out5,$out5,$in4
1077         vncipherlast    $out6,$out6,$in5
1078         vncipherlast    $out7,$out7,$in6
1079         vmr             $ivec,$in7
1080
1081         le?vperm        $out2,$out2,$out2,$inpperm
1082         le?vperm        $out3,$out3,$out3,$inpperm
1083         stvx_u          $out2,$x00,$out
1084         le?vperm        $out4,$out4,$out4,$inpperm
1085         stvx_u          $out3,$x10,$out
1086         le?vperm        $out5,$out5,$out5,$inpperm
1087         stvx_u          $out4,$x20,$out
1088         le?vperm        $out6,$out6,$out6,$inpperm
1089         stvx_u          $out5,$x30,$out
1090         le?vperm        $out7,$out7,$out7,$inpperm
1091         stvx_u          $out6,$x40,$out
1092         stvx_u          $out7,$x50,$out
1093         addi            $out,$out,0x60
1094         b               Lcbc_dec8x_done
1095
1096 .align  5
1097 Lcbc_dec8x_five:
1098         vncipherlast    $out3,$out3,$ivec
1099         vncipherlast    $out4,$out4,$in3
1100         vncipherlast    $out5,$out5,$in4
1101         vncipherlast    $out6,$out6,$in5
1102         vncipherlast    $out7,$out7,$in6
1103         vmr             $ivec,$in7
1104
1105         le?vperm        $out3,$out3,$out3,$inpperm
1106         le?vperm        $out4,$out4,$out4,$inpperm
1107         stvx_u          $out3,$x00,$out
1108         le?vperm        $out5,$out5,$out5,$inpperm
1109         stvx_u          $out4,$x10,$out
1110         le?vperm        $out6,$out6,$out6,$inpperm
1111         stvx_u          $out5,$x20,$out
1112         le?vperm        $out7,$out7,$out7,$inpperm
1113         stvx_u          $out6,$x30,$out
1114         stvx_u          $out7,$x40,$out
1115         addi            $out,$out,0x50
1116         b               Lcbc_dec8x_done
1117
1118 .align  5
1119 Lcbc_dec8x_four:
1120         vncipherlast    $out4,$out4,$ivec
1121         vncipherlast    $out5,$out5,$in4
1122         vncipherlast    $out6,$out6,$in5
1123         vncipherlast    $out7,$out7,$in6
1124         vmr             $ivec,$in7
1125
1126         le?vperm        $out4,$out4,$out4,$inpperm
1127         le?vperm        $out5,$out5,$out5,$inpperm
1128         stvx_u          $out4,$x00,$out
1129         le?vperm        $out6,$out6,$out6,$inpperm
1130         stvx_u          $out5,$x10,$out
1131         le?vperm        $out7,$out7,$out7,$inpperm
1132         stvx_u          $out6,$x20,$out
1133         stvx_u          $out7,$x30,$out
1134         addi            $out,$out,0x40
1135         b               Lcbc_dec8x_done
1136
1137 .align  5
1138 Lcbc_dec8x_three:
1139         vncipherlast    $out5,$out5,$ivec
1140         vncipherlast    $out6,$out6,$in5
1141         vncipherlast    $out7,$out7,$in6
1142         vmr             $ivec,$in7
1143
1144         le?vperm        $out5,$out5,$out5,$inpperm
1145         le?vperm        $out6,$out6,$out6,$inpperm
1146         stvx_u          $out5,$x00,$out
1147         le?vperm        $out7,$out7,$out7,$inpperm
1148         stvx_u          $out6,$x10,$out
1149         stvx_u          $out7,$x20,$out
1150         addi            $out,$out,0x30
1151         b               Lcbc_dec8x_done
1152
1153 .align  5
1154 Lcbc_dec8x_two:
1155         vncipherlast    $out6,$out6,$ivec
1156         vncipherlast    $out7,$out7,$in6
1157         vmr             $ivec,$in7
1158
1159         le?vperm        $out6,$out6,$out6,$inpperm
1160         le?vperm        $out7,$out7,$out7,$inpperm
1161         stvx_u          $out6,$x00,$out
1162         stvx_u          $out7,$x10,$out
1163         addi            $out,$out,0x20
1164         b               Lcbc_dec8x_done
1165
1166 .align  5
1167 Lcbc_dec8x_one:
1168         vncipherlast    $out7,$out7,$ivec
1169         vmr             $ivec,$in7
1170
1171         le?vperm        $out7,$out7,$out7,$inpperm
1172         stvx_u          $out7,0,$out
1173         addi            $out,$out,0x10
1174
1175 Lcbc_dec8x_done:
1176         le?vperm        $ivec,$ivec,$ivec,$inpperm
1177         stvx_u          $ivec,0,$ivp            # write [unaligned] iv
1178
1179         li              r10,`$FRAME+15`
1180         li              r11,`$FRAME+31`
1181         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1182         addi            r10,r10,32
1183         stvx            $inpperm,r11,$sp
1184         addi            r11,r11,32
1185         stvx            $inpperm,r10,$sp
1186         addi            r10,r10,32
1187         stvx            $inpperm,r11,$sp
1188         addi            r11,r11,32
1189         stvx            $inpperm,r10,$sp
1190         addi            r10,r10,32
1191         stvx            $inpperm,r11,$sp
1192         addi            r11,r11,32
1193         stvx            $inpperm,r10,$sp
1194         addi            r10,r10,32
1195         stvx            $inpperm,r11,$sp
1196         addi            r11,r11,32
1197
1198         mtspr           256,$vrsave
1199         lvx             v20,r10,$sp             # ABI says so
1200         addi            r10,r10,32
1201         lvx             v21,r11,$sp
1202         addi            r11,r11,32
1203         lvx             v22,r10,$sp
1204         addi            r10,r10,32
1205         lvx             v23,r11,$sp
1206         addi            r11,r11,32
1207         lvx             v24,r10,$sp
1208         addi            r10,r10,32
1209         lvx             v25,r11,$sp
1210         addi            r11,r11,32
1211         lvx             v26,r10,$sp
1212         addi            r10,r10,32
1213         lvx             v27,r11,$sp
1214         addi            r11,r11,32
1215         lvx             v28,r10,$sp
1216         addi            r10,r10,32
1217         lvx             v29,r11,$sp
1218         addi            r11,r11,32
1219         lvx             v30,r10,$sp
1220         lvx             v31,r11,$sp
1221         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1222         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1223         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1224         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1225         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1226         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1227         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1228         blr
1229         .long           0
1230         .byte           0,12,0x14,0,0x80,6,6,0
1231         .long           0
1232 .size   .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1233 ___
1234 }}      }}}
1235
1236 #########################################################################
1237 {{{     # CTR procedure[s]                                              #
# CTR procedure register map.  Function arguments arrive in r3-r10:
# inp/out data pointers, len = number of 16-byte blocks (see the
# "blocks--" countdown below), key schedule pointer, ivp = counter
# block pointer; rounds and idx are scratch.
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
# ivec = running counter block; inptail/inpperm support unaligned
# loads; outhead/outperm/outmask support unaligned stores; keyperm
# aligns the key schedule; one = the counter-increment constant.
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;			# $tmp doubles as the data register
1243
# Scalar (one block per iteration) CTR entry point.  The counter is
# bumped with vadduwm against a {0,...,0,1} vector, i.e. a 32-bit
# increment of the last word of the IV block - ctr32 semantics.
# Inputs of 8 or more blocks divert to the 8x-wide procedure.
$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1			# nothing to do for len<1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256		# preserve caller's vrsave
	mtspr		256,r0			# mark v0-v11 in use

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	 vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm	# align the iv
	 vsldoi		$one,$rndkey0,$one,1	# {0,...,0,1} increment constant

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)	# load round count from schedule

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1	# loop does 2 rounds/iteration
	li		$idx,16
	subi		$rounds,$rounds,1	# last 2 rounds done after loop

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x	# 8+ blocks: take wide path

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0	# counter xor round[0]
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one	# bump counter for next block
	 vmr		$dat,$inptail
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	 subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	 vperm		$dat,$dat,$inptail,$inpperm	# align input block
	 li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	 lvx		$rndkey0,0,$key		# rewind key schedule
	vxor		$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat	# folds input xor into last round

	 lvx		$rndkey1,$idx,$key
	 addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	 mtctr		$rounds
	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	 vxor		$inout,$ivec,$rndkey0	# start next block early
	 lvx		$rndkey0,$idx,$key
	 addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc		# more blocks?

	addi		$out,$out,-1		# flush last partial store vector
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave		# restore caller's vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
1352 #########################################################################
1353 {{      # Optimized CTR procedure                                       #
my $key_="r11";				# walks the on-stack key-schedule copy
# 0x00-0x70 byte displacements for the eight parallel lanes.
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";	# v24-v25 rotating buffer for first round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
# Counter step constants; $two is built as $one+$one below
# ($three/$four are used further down, past this hunk).
my ($two,$three,$four)=($outhead,$outperm,$outmask);
1362
1363 $code.=<<___;
1364 .align  5
1365 _aesp8_ctr32_encrypt8x:
1366         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1367         li              r10,`$FRAME+8*16+15`
1368         li              r11,`$FRAME+8*16+31`
1369         stvx            v20,r10,$sp             # ABI says so
1370         addi            r10,r10,32
1371         stvx            v21,r11,$sp
1372         addi            r11,r11,32
1373         stvx            v22,r10,$sp
1374         addi            r10,r10,32
1375         stvx            v23,r11,$sp
1376         addi            r11,r11,32
1377         stvx            v24,r10,$sp
1378         addi            r10,r10,32
1379         stvx            v25,r11,$sp
1380         addi            r11,r11,32
1381         stvx            v26,r10,$sp
1382         addi            r10,r10,32
1383         stvx            v27,r11,$sp
1384         addi            r11,r11,32
1385         stvx            v28,r10,$sp
1386         addi            r10,r10,32
1387         stvx            v29,r11,$sp
1388         addi            r11,r11,32
1389         stvx            v30,r10,$sp
1390         stvx            v31,r11,$sp
1391         li              r0,-1
1392         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
1393         li              $x10,0x10
1394         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1395         li              $x20,0x20
1396         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1397         li              $x30,0x30
1398         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1399         li              $x40,0x40
1400         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1401         li              $x50,0x50
1402         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1403         li              $x60,0x60
1404         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1405         li              $x70,0x70
1406         mtspr           256,r0
1407
1408         subi            $rounds,$rounds,3       # -4 in total
1409
1410         lvx             $rndkey0,$x00,$key      # load key schedule
1411         lvx             v30,$x10,$key
1412         addi            $key,$key,0x20
1413         lvx             v31,$x00,$key
1414         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
1415         addi            $key_,$sp,$FRAME+15
1416         mtctr           $rounds
1417
1418 Load_ctr32_enc_key:
1419         ?vperm          v24,v30,v31,$keyperm
1420         lvx             v30,$x10,$key
1421         addi            $key,$key,0x20
1422         stvx            v24,$x00,$key_          # off-load round[1]
1423         ?vperm          v25,v31,v30,$keyperm
1424         lvx             v31,$x00,$key
1425         stvx            v25,$x10,$key_          # off-load round[2]
1426         addi            $key_,$key_,0x20
1427         bdnz            Load_ctr32_enc_key
1428
1429         lvx             v26,$x10,$key
1430         ?vperm          v24,v30,v31,$keyperm
1431         lvx             v27,$x20,$key
1432         stvx            v24,$x00,$key_          # off-load round[3]
1433         ?vperm          v25,v31,v26,$keyperm
1434         lvx             v28,$x30,$key
1435         stvx            v25,$x10,$key_          # off-load round[4]
1436         addi            $key_,$sp,$FRAME+15     # rewind $key_
1437         ?vperm          v26,v26,v27,$keyperm
1438         lvx             v29,$x40,$key
1439         ?vperm          v27,v27,v28,$keyperm
1440         lvx             v30,$x50,$key
1441         ?vperm          v28,v28,v29,$keyperm
1442         lvx             v31,$x60,$key
1443         ?vperm          v29,v29,v30,$keyperm
1444         lvx             $out0,$x70,$key         # borrow $out0
1445         ?vperm          v30,v30,v31,$keyperm
1446         lvx             v24,$x00,$key_          # pre-load round[1]
1447         ?vperm          v31,v31,$out0,$keyperm
1448         lvx             v25,$x10,$key_          # pre-load round[2]
1449
1450         vadduwm         $two,$one,$one
1451         subi            $inp,$inp,15            # undo "caller"
1452         $SHL            $len,$len,4
1453
1454         vadduwm         $out1,$ivec,$one        # counter values ...
1455         vadduwm         $out2,$ivec,$two
1456         vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1457          le?li          $idx,8
1458         vadduwm         $out3,$out1,$two
1459         vxor            $out1,$out1,$rndkey0
1460          le?lvsl        $inpperm,0,$idx
1461         vadduwm         $out4,$out2,$two
1462         vxor            $out2,$out2,$rndkey0
1463          le?vspltisb    $tmp,0x0f
1464         vadduwm         $out5,$out3,$two
1465         vxor            $out3,$out3,$rndkey0
1466          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
1467         vadduwm         $out6,$out4,$two
1468         vxor            $out4,$out4,$rndkey0
1469         vadduwm         $out7,$out5,$two
1470         vxor            $out5,$out5,$rndkey0
1471         vadduwm         $ivec,$out6,$two        # next counter value
1472         vxor            $out6,$out6,$rndkey0
1473         vxor            $out7,$out7,$rndkey0
1474
1475         mtctr           $rounds
1476         b               Loop_ctr32_enc8x
1477 .align  5
1478 Loop_ctr32_enc8x:
1479         vcipher         $out0,$out0,v24
1480         vcipher         $out1,$out1,v24
1481         vcipher         $out2,$out2,v24
1482         vcipher         $out3,$out3,v24
1483         vcipher         $out4,$out4,v24
1484         vcipher         $out5,$out5,v24
1485         vcipher         $out6,$out6,v24
1486         vcipher         $out7,$out7,v24
1487 Loop_ctr32_enc8x_middle:
1488         lvx             v24,$x20,$key_          # round[3]
1489         addi            $key_,$key_,0x20
1490
1491         vcipher         $out0,$out0,v25
1492         vcipher         $out1,$out1,v25
1493         vcipher         $out2,$out2,v25
1494         vcipher         $out3,$out3,v25
1495         vcipher         $out4,$out4,v25
1496         vcipher         $out5,$out5,v25
1497         vcipher         $out6,$out6,v25
1498         vcipher         $out7,$out7,v25
1499         lvx             v25,$x10,$key_          # round[4]
1500         bdnz            Loop_ctr32_enc8x
1501
1502         subic           r11,$len,256            # $len-256, borrow $key_
1503         vcipher         $out0,$out0,v24
1504         vcipher         $out1,$out1,v24
1505         vcipher         $out2,$out2,v24
1506         vcipher         $out3,$out3,v24
1507         vcipher         $out4,$out4,v24
1508         vcipher         $out5,$out5,v24
1509         vcipher         $out6,$out6,v24
1510         vcipher         $out7,$out7,v24
1511
1512         subfe           r0,r0,r0                # borrow?-1:0
1513         vcipher         $out0,$out0,v25
1514         vcipher         $out1,$out1,v25
1515         vcipher         $out2,$out2,v25
1516         vcipher         $out3,$out3,v25
1517         vcipher         $out4,$out4,v25
1518         vcipher         $out5,$out5,v25
1519         vcipher         $out6,$out6,v25
1520         vcipher         $out7,$out7,v25
1521
1522         and             r0,r0,r11
1523         addi            $key_,$sp,$FRAME+15     # rewind $key_
1524         vcipher         $out0,$out0,v26
1525         vcipher         $out1,$out1,v26
1526         vcipher         $out2,$out2,v26
1527         vcipher         $out3,$out3,v26
1528         vcipher         $out4,$out4,v26
1529         vcipher         $out5,$out5,v26
1530         vcipher         $out6,$out6,v26
1531         vcipher         $out7,$out7,v26
1532         lvx             v24,$x00,$key_          # re-pre-load round[1]
1533
1534         subic           $len,$len,129           # $len-=129
1535         vcipher         $out0,$out0,v27
1536         addi            $len,$len,1             # $len-=128 really
1537         vcipher         $out1,$out1,v27
1538         vcipher         $out2,$out2,v27
1539         vcipher         $out3,$out3,v27
1540         vcipher         $out4,$out4,v27
1541         vcipher         $out5,$out5,v27
1542         vcipher         $out6,$out6,v27
1543         vcipher         $out7,$out7,v27
1544         lvx             v25,$x10,$key_          # re-pre-load round[2]
1545
1546         vcipher         $out0,$out0,v28
1547          lvx_u          $in0,$x00,$inp          # load input
1548         vcipher         $out1,$out1,v28
1549          lvx_u          $in1,$x10,$inp
1550         vcipher         $out2,$out2,v28
1551          lvx_u          $in2,$x20,$inp
1552         vcipher         $out3,$out3,v28
1553          lvx_u          $in3,$x30,$inp
1554         vcipher         $out4,$out4,v28
1555          lvx_u          $in4,$x40,$inp
1556         vcipher         $out5,$out5,v28
1557          lvx_u          $in5,$x50,$inp
1558         vcipher         $out6,$out6,v28
1559          lvx_u          $in6,$x60,$inp
1560         vcipher         $out7,$out7,v28
1561          lvx_u          $in7,$x70,$inp
1562          addi           $inp,$inp,0x80
1563
1564         vcipher         $out0,$out0,v29
1565          le?vperm       $in0,$in0,$in0,$inpperm
1566         vcipher         $out1,$out1,v29
1567          le?vperm       $in1,$in1,$in1,$inpperm
1568         vcipher         $out2,$out2,v29
1569          le?vperm       $in2,$in2,$in2,$inpperm
1570         vcipher         $out3,$out3,v29
1571          le?vperm       $in3,$in3,$in3,$inpperm
1572         vcipher         $out4,$out4,v29
1573          le?vperm       $in4,$in4,$in4,$inpperm
1574         vcipher         $out5,$out5,v29
1575          le?vperm       $in5,$in5,$in5,$inpperm
1576         vcipher         $out6,$out6,v29
1577          le?vperm       $in6,$in6,$in6,$inpperm
1578         vcipher         $out7,$out7,v29
1579          le?vperm       $in7,$in7,$in7,$inpperm
1580
1581         add             $inp,$inp,r0            # $inp is adjusted in such
1582                                                 # way that at exit from the
1583                                                 # loop inX-in7 are loaded
1584                                                 # with last "words"
1585         subfe.          r0,r0,r0                # borrow?-1:0
1586         vcipher         $out0,$out0,v30
1587          vxor           $in0,$in0,v31           # xor with last round key
1588         vcipher         $out1,$out1,v30
1589          vxor           $in1,$in1,v31
1590         vcipher         $out2,$out2,v30
1591          vxor           $in2,$in2,v31
1592         vcipher         $out3,$out3,v30
1593          vxor           $in3,$in3,v31
1594         vcipher         $out4,$out4,v30
1595          vxor           $in4,$in4,v31
1596         vcipher         $out5,$out5,v30
1597          vxor           $in5,$in5,v31
1598         vcipher         $out6,$out6,v30
1599          vxor           $in6,$in6,v31
1600         vcipher         $out7,$out7,v30
1601          vxor           $in7,$in7,v31
1602
1603         bne             Lctr32_enc8x_break      # did $len-129 borrow?
1604
1605         vcipherlast     $in0,$out0,$in0
1606         vcipherlast     $in1,$out1,$in1
1607          vadduwm        $out1,$ivec,$one        # counter values ...
1608         vcipherlast     $in2,$out2,$in2
1609          vadduwm        $out2,$ivec,$two
1610          vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1611         vcipherlast     $in3,$out3,$in3
1612          vadduwm        $out3,$out1,$two
1613          vxor           $out1,$out1,$rndkey0
1614         vcipherlast     $in4,$out4,$in4
1615          vadduwm        $out4,$out2,$two
1616          vxor           $out2,$out2,$rndkey0
1617         vcipherlast     $in5,$out5,$in5
1618          vadduwm        $out5,$out3,$two
1619          vxor           $out3,$out3,$rndkey0
1620         vcipherlast     $in6,$out6,$in6
1621          vadduwm        $out6,$out4,$two
1622          vxor           $out4,$out4,$rndkey0
1623         vcipherlast     $in7,$out7,$in7
1624          vadduwm        $out7,$out5,$two
1625          vxor           $out5,$out5,$rndkey0
1626         le?vperm        $in0,$in0,$in0,$inpperm
1627          vadduwm        $ivec,$out6,$two        # next counter value
1628          vxor           $out6,$out6,$rndkey0
1629         le?vperm        $in1,$in1,$in1,$inpperm
1630          vxor           $out7,$out7,$rndkey0
1631         mtctr           $rounds
1632
1633          vcipher        $out0,$out0,v24
1634         stvx_u          $in0,$x00,$out
1635         le?vperm        $in2,$in2,$in2,$inpperm
1636          vcipher        $out1,$out1,v24
1637         stvx_u          $in1,$x10,$out
1638         le?vperm        $in3,$in3,$in3,$inpperm
1639          vcipher        $out2,$out2,v24
1640         stvx_u          $in2,$x20,$out
1641         le?vperm        $in4,$in4,$in4,$inpperm
1642          vcipher        $out3,$out3,v24
1643         stvx_u          $in3,$x30,$out
1644         le?vperm        $in5,$in5,$in5,$inpperm
1645          vcipher        $out4,$out4,v24
1646         stvx_u          $in4,$x40,$out
1647         le?vperm        $in6,$in6,$in6,$inpperm
1648          vcipher        $out5,$out5,v24
1649         stvx_u          $in5,$x50,$out
1650         le?vperm        $in7,$in7,$in7,$inpperm
1651          vcipher        $out6,$out6,v24
1652         stvx_u          $in6,$x60,$out
1653          vcipher        $out7,$out7,v24
1654         stvx_u          $in7,$x70,$out
1655         addi            $out,$out,0x80
1656
1657         b               Loop_ctr32_enc8x_middle
1658
1659 .align  5
1660 Lctr32_enc8x_break:
1661         cmpwi           $len,-0x60
1662         blt             Lctr32_enc8x_one
1663         nop
1664         beq             Lctr32_enc8x_two
1665         cmpwi           $len,-0x40
1666         blt             Lctr32_enc8x_three
1667         nop
1668         beq             Lctr32_enc8x_four
1669         cmpwi           $len,-0x20
1670         blt             Lctr32_enc8x_five
1671         nop
1672         beq             Lctr32_enc8x_six
1673         cmpwi           $len,0x00
1674         blt             Lctr32_enc8x_seven
1675
1676 Lctr32_enc8x_eight:
1677         vcipherlast     $out0,$out0,$in0
1678         vcipherlast     $out1,$out1,$in1
1679         vcipherlast     $out2,$out2,$in2
1680         vcipherlast     $out3,$out3,$in3
1681         vcipherlast     $out4,$out4,$in4
1682         vcipherlast     $out5,$out5,$in5
1683         vcipherlast     $out6,$out6,$in6
1684         vcipherlast     $out7,$out7,$in7
1685
1686         le?vperm        $out0,$out0,$out0,$inpperm
1687         le?vperm        $out1,$out1,$out1,$inpperm
1688         stvx_u          $out0,$x00,$out
1689         le?vperm        $out2,$out2,$out2,$inpperm
1690         stvx_u          $out1,$x10,$out
1691         le?vperm        $out3,$out3,$out3,$inpperm
1692         stvx_u          $out2,$x20,$out
1693         le?vperm        $out4,$out4,$out4,$inpperm
1694         stvx_u          $out3,$x30,$out
1695         le?vperm        $out5,$out5,$out5,$inpperm
1696         stvx_u          $out4,$x40,$out
1697         le?vperm        $out6,$out6,$out6,$inpperm
1698         stvx_u          $out5,$x50,$out
1699         le?vperm        $out7,$out7,$out7,$inpperm
1700         stvx_u          $out6,$x60,$out
1701         stvx_u          $out7,$x70,$out
1702         addi            $out,$out,0x80
1703         b               Lctr32_enc8x_done
1704
1705 .align  5
1706 Lctr32_enc8x_seven:
1707         vcipherlast     $out0,$out0,$in1
1708         vcipherlast     $out1,$out1,$in2
1709         vcipherlast     $out2,$out2,$in3
1710         vcipherlast     $out3,$out3,$in4
1711         vcipherlast     $out4,$out4,$in5
1712         vcipherlast     $out5,$out5,$in6
1713         vcipherlast     $out6,$out6,$in7
1714
1715         le?vperm        $out0,$out0,$out0,$inpperm
1716         le?vperm        $out1,$out1,$out1,$inpperm
1717         stvx_u          $out0,$x00,$out
1718         le?vperm        $out2,$out2,$out2,$inpperm
1719         stvx_u          $out1,$x10,$out
1720         le?vperm        $out3,$out3,$out3,$inpperm
1721         stvx_u          $out2,$x20,$out
1722         le?vperm        $out4,$out4,$out4,$inpperm
1723         stvx_u          $out3,$x30,$out
1724         le?vperm        $out5,$out5,$out5,$inpperm
1725         stvx_u          $out4,$x40,$out
1726         le?vperm        $out6,$out6,$out6,$inpperm
1727         stvx_u          $out5,$x50,$out
1728         stvx_u          $out6,$x60,$out
1729         addi            $out,$out,0x70
1730         b               Lctr32_enc8x_done
1731
1732 .align  5
1733 Lctr32_enc8x_six:
1734         vcipherlast     $out0,$out0,$in2
1735         vcipherlast     $out1,$out1,$in3
1736         vcipherlast     $out2,$out2,$in4
1737         vcipherlast     $out3,$out3,$in5
1738         vcipherlast     $out4,$out4,$in6
1739         vcipherlast     $out5,$out5,$in7
1740
1741         le?vperm        $out0,$out0,$out0,$inpperm
1742         le?vperm        $out1,$out1,$out1,$inpperm
1743         stvx_u          $out0,$x00,$out
1744         le?vperm        $out2,$out2,$out2,$inpperm
1745         stvx_u          $out1,$x10,$out
1746         le?vperm        $out3,$out3,$out3,$inpperm
1747         stvx_u          $out2,$x20,$out
1748         le?vperm        $out4,$out4,$out4,$inpperm
1749         stvx_u          $out3,$x30,$out
1750         le?vperm        $out5,$out5,$out5,$inpperm
1751         stvx_u          $out4,$x40,$out
1752         stvx_u          $out5,$x50,$out
1753         addi            $out,$out,0x60
1754         b               Lctr32_enc8x_done
1755
1756 .align  5
1757 Lctr32_enc8x_five:
1758         vcipherlast     $out0,$out0,$in3
1759         vcipherlast     $out1,$out1,$in4
1760         vcipherlast     $out2,$out2,$in5
1761         vcipherlast     $out3,$out3,$in6
1762         vcipherlast     $out4,$out4,$in7
1763
1764         le?vperm        $out0,$out0,$out0,$inpperm
1765         le?vperm        $out1,$out1,$out1,$inpperm
1766         stvx_u          $out0,$x00,$out
1767         le?vperm        $out2,$out2,$out2,$inpperm
1768         stvx_u          $out1,$x10,$out
1769         le?vperm        $out3,$out3,$out3,$inpperm
1770         stvx_u          $out2,$x20,$out
1771         le?vperm        $out4,$out4,$out4,$inpperm
1772         stvx_u          $out3,$x30,$out
1773         stvx_u          $out4,$x40,$out
1774         addi            $out,$out,0x50
1775         b               Lctr32_enc8x_done
1776
1777 .align  5
1778 Lctr32_enc8x_four:
1779         vcipherlast     $out0,$out0,$in4
1780         vcipherlast     $out1,$out1,$in5
1781         vcipherlast     $out2,$out2,$in6
1782         vcipherlast     $out3,$out3,$in7
1783
1784         le?vperm        $out0,$out0,$out0,$inpperm
1785         le?vperm        $out1,$out1,$out1,$inpperm
1786         stvx_u          $out0,$x00,$out
1787         le?vperm        $out2,$out2,$out2,$inpperm
1788         stvx_u          $out1,$x10,$out
1789         le?vperm        $out3,$out3,$out3,$inpperm
1790         stvx_u          $out2,$x20,$out
1791         stvx_u          $out3,$x30,$out
1792         addi            $out,$out,0x40
1793         b               Lctr32_enc8x_done
1794
1795 .align  5
1796 Lctr32_enc8x_three:
1797         vcipherlast     $out0,$out0,$in5
1798         vcipherlast     $out1,$out1,$in6
1799         vcipherlast     $out2,$out2,$in7
1800 
1801         le?vperm        $out0,$out0,$out0,$inpperm
1802         le?vperm        $out1,$out1,$out1,$inpperm
1803         stvx_u          $out0,$x00,$out
1804         le?vperm        $out2,$out2,$out2,$inpperm
1805         stvx_u          $out1,$x10,$out
1806         stvx_u          $out2,$x20,$out
1807         addi            $out,$out,0x30
1808         b               Lctr32_enc8x_done       # was Lcbc_dec8x_done, a CBC copy-paste leftover
1809
1810 .align  5
1811 Lctr32_enc8x_two:
1812         vcipherlast     $out0,$out0,$in6
1813         vcipherlast     $out1,$out1,$in7
1814 
1815         le?vperm        $out0,$out0,$out0,$inpperm
1816         le?vperm        $out1,$out1,$out1,$inpperm
1817         stvx_u          $out0,$x00,$out
1818         stvx_u          $out1,$x10,$out
1819         addi            $out,$out,0x20
1820         b               Lctr32_enc8x_done       # was Lcbc_dec8x_done, a CBC copy-paste leftover
1821
1822 .align  5
1823 Lctr32_enc8x_one:
1824         vcipherlast     $out0,$out0,$in7
1825
1826         le?vperm        $out0,$out0,$out0,$inpperm
1827         stvx_u          $out0,0,$out
1828         addi            $out,$out,0x10
1829
1830 Lctr32_enc8x_done:
1831         li              r10,`$FRAME+15`
1832         li              r11,`$FRAME+31`
1833         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1834         addi            r10,r10,32
1835         stvx            $inpperm,r11,$sp
1836         addi            r11,r11,32
1837         stvx            $inpperm,r10,$sp
1838         addi            r10,r10,32
1839         stvx            $inpperm,r11,$sp
1840         addi            r11,r11,32
1841         stvx            $inpperm,r10,$sp
1842         addi            r10,r10,32
1843         stvx            $inpperm,r11,$sp
1844         addi            r11,r11,32
1845         stvx            $inpperm,r10,$sp
1846         addi            r10,r10,32
1847         stvx            $inpperm,r11,$sp
1848         addi            r11,r11,32
1849
1850         mtspr           256,$vrsave
1851         lvx             v20,r10,$sp             # ABI says so
1852         addi            r10,r10,32
1853         lvx             v21,r11,$sp
1854         addi            r11,r11,32
1855         lvx             v22,r10,$sp
1856         addi            r10,r10,32
1857         lvx             v23,r11,$sp
1858         addi            r11,r11,32
1859         lvx             v24,r10,$sp
1860         addi            r10,r10,32
1861         lvx             v25,r11,$sp
1862         addi            r11,r11,32
1863         lvx             v26,r10,$sp
1864         addi            r10,r10,32
1865         lvx             v27,r11,$sp
1866         addi            r11,r11,32
1867         lvx             v28,r10,$sp
1868         addi            r10,r10,32
1869         lvx             v29,r11,$sp
1870         addi            r11,r11,32
1871         lvx             v30,r10,$sp
1872         lvx             v31,r11,$sp
1873         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1874         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1875         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1876         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1877         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1878         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1879         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1880         blr
1881         .long           0
1882         .byte           0,12,0x14,0,0x80,6,6,0
1883         .long           0
1884 .size   .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1885 ___
1886 }}      }}}
1887
# Post-process the generated text in $code line by line: evaluate
# backtick arithmetic, re-emit the constants table in endian-neutral
# .byte form, and resolve the '?'-prefixed endian-specific constructs
# for the flavour selected on the command line.
1888 my $consts=1;
1889 foreach(split("\n",$code)) {
        # Expand `...` expressions (frame offsets etc.) at generation time.
1890         s/\`([^\`]*)\`/eval($1)/geo;
1891 
1892         # constants table endian-specific conversion
        # Lines of the form ".long v,v,... ?conv" (or ".byte ... ?conv")
        # are table data; they are re-emitted as raw bytes so one source
        # serves both endiannesses.
1893         if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
1894             my $conv=$3;
1895             my @bytes=();
1896 
1897             # convert to endian-agnostic format
1898             if ($1 eq "long") {
                # Split each 32-bit word into four bytes, most significant
                # first; values starting with 0 go through oct() so octal
                # and hex literals parse correctly.
1899               foreach (split(/,\s*/,$2)) {
1900                 my $l = /^0/?oct:int;
1901                 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1902               }
1903             } else {
1904                 @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
1905             }
1906 
1907             # little-endian conversion
            # The trailing marker picks the LE adjustment: "?inv" XORs each
            # byte with 0xf, "?rev" reverses the byte order, anything else
            # leaves the data as-is. Big-endian flavours emit it unchanged.
1908             if ($flavour =~ /le$/o) {
1909                 SWITCH: for($conv)  {
1910                     /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
1911                     /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
1912                 }
1913             }
1914 
1915             #emit
1916             print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1917             next;
1918         }
        # Everything after the Lconsts: label is code, not table data.
1919         $consts=0 if (m/Lconsts:/o);    # end of table
1920 
1921         # instructions prefixed with '?' are endian-specific and need
1922         # to be adjusted accordingly...
1923         if ($flavour =~ /le$/o) {       # little-endian
            # Keep "le?" instructions (strip the tag), comment out "be?"
            # ones, swap lvsr/lvsl, swap the two middle vperm inputs, and
            # mirror the vsldoi shift (assembler expression 16-n) and the
            # vspltw lane (3-n). Exactly one substitution fires per line;
            # the low-precedence "or" chain stops at the first match.
1924             s/le\?//o           or
1925             s/be\?/#be#/o       or
1926             s/\?lvsr/lvsl/o     or
1927             s/\?lvsl/lvsr/o     or
1928             s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1929             s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1930             s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1931         } else {                        # big-endian
            # Converse: comment out "le?" lines, keep "be?" ones, and drop
            # the '?' marker from any remaining tagged mnemonic.
1932             s/le\?/#le#/o       or
1933             s/be\?//o           or
1934             s/\?([a-z]+)/$1/o;
1935         }
1936 
1937         print $_,"\n";
1938 }
1939
1940 close STDOUT or die "error closing STDOUT: $!";   # buffered write errors only surface at close