#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for AES instructions as per PowerISA
# specification version 2.07, first implemented by POWER8 processor.
# The module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. Data alignment in parallelizable modes is
# handled with VSX loads and stores, which implies MSR.VSX flag being
# set. It should also be noted that the ISA specification doesn't
# prohibit alignment exceptions for these instructions on page
# boundaries. Initially alignment was handled in the pure AltiVec/VMX
# way [when data is aligned programmatically, which in turn guarantees
# exception-free execution], but that turned out to hamper performance
# when vcipher instructions are interleaved. It's reckoned that the
# eventual misalignment penalties at page boundaries are on average
# lower than the additional overhead of the pure AltiVec approach.

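# Typical invocation (a sketch; the actual flavour strings come from
# the build system):
#
#	perl aesp8-ppc.pl linux64le aesp8-ppc.s
#
# i.e. the first argument selects word size, endianness and ABI
# (matched against /64/, /32/, /le$/ and /osx/ below), and the second
# names the output file handed to the ppc-xlate.pl post-processor.
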
$flavour = shift;

if ($flavour =~ /64/) {
	$SIZE_T	=8;
	$LRSAVE	=2*$SIZE_T;
	$STU	="stdu";
	$POP	="ld";
	$PUSH	="std";
	$UCMP	="cmpld";
	$SHL	="sldi";
} elsif ($flavour =~ /32/) {
	$SIZE_T	=4;
	$LRSAVE	=$SIZE_T;
	$STU	="stwu";
	$POP	="lwz";
	$PUSH	="stw";
	$UCMP	="cmplw";
	$SHL	="slwi";
} else { die "nonsense $flavour"; }

$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

$FRAME=8*$SIZE_T;
$prefix="aes_p8";

$sp="r1";
$vrsave="r12";

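# A note on notation (a sketch of the ppc-xlate.pl conventions assumed
# here): mnemonics prefixed with "?" or "le?" below are not raw
# instructions but directives for the post-processor -- "le?" lines are
# emitted only in little-endian builds, "?"-prefixed permutes/loads are
# adjusted for endianness, and the "?rev"/"?asis" tags on data control
# byte order. See ppc-xlate.pl for the exact rules.
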
#########################################################################
{{{	# Key setup procedures						#
my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));

$code.=<<___;
.machine	"any"

.text

.align	7
rcon:
.long	0x01000000, 0x01000000, 0x01000000, 0x01000000	?rev
.long	0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000	?rev
.long	0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c	?rev
.long	0,0,0,0						?asis
Lconsts:
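	# Position-independent lookup of rcon: "bcl 20,31" to the next
	# instruction loads the current address into LR without disturbing
	# the link-stack predictor; rcon sits 0x48 bytes back.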
	mflr	r0
	bcl	20,31,\$+4
	mflr	$ptr		# distance between . and rcon
	addi	$ptr,$ptr,-0x48
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"

.globl	.${prefix}_set_encrypt_key
.align	5
.${prefix}_set_encrypt_key:
Lset_encrypt_key:
	mflr		r11
	$PUSH		r11,$LRSAVE($sp)

	li		$ptr,-1
	${UCMP}i	$inp,0
	beq-		Lenc_key_abort		# if ($inp==0) return -1;
	${UCMP}i	$out,0
	beq-		Lenc_key_abort		# if ($out==0) return -1;
	li		$ptr,-2
	cmpwi		$bits,128
	blt-		Lenc_key_abort
	cmpwi		$bits,256
	bgt-		Lenc_key_abort
	andi.		r0,$bits,0x3f
	bne-		Lenc_key_abort

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	bl		Lconsts
	mtlr		r11

	neg		r9,$inp
	lvx		$in0,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	lvsr		$key,0,r9		# borrow $key
	li		r8,0x20
	cmpwi		$bits,192
	lvx		$in1,0,$inp
	le?vspltisb	$mask,0x0f		# borrow $mask
	lvx		$rcon,0,$ptr
	le?vxor		$key,$key,$mask		# adjust for byte swap
	lvx		$mask,r8,$ptr
	addi		$ptr,$ptr,0x10
	vperm		$in0,$in0,$in1,$key	# align [and byte swap in LE]
	li		$cnt,8
	vxor		$zero,$zero,$zero
	mtctr		$cnt

	?lvsr		$outperm,0,$out
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$zero,$outmask,$outperm

	blt		Loop128
	addi		$inp,$inp,8
	beq		L192
	addi		$inp,$inp,8
	b		L256

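	# Loop128 derives one round key per iteration. The rotate-n-splat
	# vperm replicates RotWord(w3) into all four words; on such uniform
	# input vcipherlast's ShiftRows is a no-op, so it yields
	# SubWord(RotWord(w3))^Rcon. The vsldoi/vxor ladder then turns the
	# previous key into its running prefix-xor (w0, w0^w1, w0^w1^w2,
	# w0^w1^w2^w3), which xored with the splatted value gives the next
	# round key, per the standard AES expansion.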
.align	4
Loop128:
	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	bdnz		Loop128

	lvx		$rcon,0,$ptr		# last two round keys

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key

	vperm		$key,$in0,$in0,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out

	addi		$inp,$out,15		# 15 is not typo
	addi		$out,$out,0x50

	li		$rounds,10
	b		Ldone

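	# 192-bit keys span one and a half vectors, so Loop192 runs four
	# times and emits three round keys per iteration, stitching in0
	# and in1 together with vsldoi before each store. The mask is
	# narrowed by 8 below so the rotate-n-splat picks the last key
	# word, which sits 8 bytes earlier in in1.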
.align	4
L192:
	lvx		$tmp,0,$inp
	li		$cnt,4
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	vspltisb	$key,8			# borrow $key
	mtctr		$cnt
	vsububm		$mask,$mask,$key	# adjust the mask

Loop192:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	vcipherlast	$key,$key,$rcon

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp

	 vsldoi		$stage,$zero,$in1,8
	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vsldoi		$stage,$stage,$in0,8

	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	 vsldoi		$stage,$in0,$in1,8
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	 vperm		$outtail,$stage,$stage,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vspltw		$tmp,$in0,3
	vxor		$tmp,$tmp,$in1
	vsldoi		$in1,$zero,$in1,12	# >>32
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in1,$in1,$tmp
	vxor		$in0,$in0,$key
	vxor		$in1,$in1,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdnz		Loop192

	li		$rounds,12
	addi		$out,$out,0x20
	b		Ldone

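	# 256-bit expansion alternates two derivations: the even steps use
	# vcipherlast for SubWord(RotWord())^Rcon as above, while the odd
	# steps need SubWord only, hence the bare vsbox on the splatted
	# word below (no rotation, no Rcon).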
.align	4
L256:
	lvx		$tmp,0,$inp
	li		$cnt,7
	li		$rounds,14
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$out,$out,16
	vperm		$in1,$in1,$tmp,$key	# align [and byte swap in LE]
	mtctr		$cnt

Loop256:
	vperm		$key,$in1,$in1,$mask	# rotate-n-splat
	vsldoi		$tmp,$zero,$in0,12	# >>32
	 vperm		$outtail,$in1,$in1,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	vcipherlast	$key,$key,$rcon
	 stvx		$stage,0,$out
	 addi		$out,$out,16

	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in0,$in0,$tmp
	 vadduwm	$rcon,$rcon,$rcon
	vxor		$in0,$in0,$key
	 vperm		$outtail,$in0,$in0,$outperm	# rotate
	 vsel		$stage,$outhead,$outtail,$outmask
	 vmr		$outhead,$outtail
	 stvx		$stage,0,$out
	 addi		$inp,$out,15		# 15 is not typo
	 addi		$out,$out,16
	bdz		Ldone

	vspltw		$key,$in0,3		# just splat
	vsldoi		$tmp,$zero,$in1,12	# >>32
	vsbox		$key,$key

	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp
	vsldoi		$tmp,$zero,$tmp,12	# >>32
	vxor		$in1,$in1,$tmp

	vxor		$in1,$in1,$key
	b		Loop256

.align	4
Ldone:
	lvx		$in1,0,$inp		# redundant in aligned case
	vsel		$in1,$outhead,$in1,$outmask
	stvx		$in1,0,$inp
	li		$ptr,0
	mtspr		256,$vrsave
	stw		$rounds,0($out)

Lenc_key_abort:
	mr		r3,$ptr
	blr
	.long		0
	.byte		0,12,0x14,1,0,0,3,0
	.long		0
.size	.${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key

.globl	.${prefix}_set_decrypt_key
.align	5
.${prefix}_set_decrypt_key:
	$STU		$sp,-$FRAME($sp)
	mflr		r10
	$PUSH		r10,$FRAME+$LRSAVE($sp)
	bl		Lset_encrypt_key
	mtlr		r10

	cmpwi		r3,0
	bne-		Ldec_key_abort

	slwi		$cnt,$rounds,4
	subi		$inp,$out,240		# first round key
	srwi		$rounds,$rounds,1
	add		$out,$inp,$cnt		# last round key
	mtctr		$rounds

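	# The decrypt schedule is simply the encrypt schedule with round
	# keys swapped end for end, 16 bytes at a time from both ends
	# toward the middle; no InvMixColumns pass over the keys is done
	# here, vncipher consumes the encryption-schedule values as-is.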
Ldeckey:
	lwz		r0, 0($inp)
	lwz		r6, 4($inp)
	lwz		r7, 8($inp)
	lwz		r8, 12($inp)
	addi		$inp,$inp,16
	lwz		r9, 0($out)
	lwz		r10,4($out)
	lwz		r11,8($out)
	lwz		r12,12($out)
	stw		r0, 0($out)
	stw		r6, 4($out)
	stw		r7, 8($out)
	stw		r8, 12($out)
	subi		$out,$out,16
	stw		r9, -16($inp)
	stw		r10,-12($inp)
	stw		r11,-8($inp)
	stw		r12,-4($inp)
	bdnz		Ldeckey

	xor		r3,r3,r3		# return value
Ldec_key_abort:
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,0,3,0
	.long		0
.size	.${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
___
}}}
#########################################################################
{{{	# Single block en- and decrypt procedures			#
sub gen_block () {
my $dir = shift;
my $n   = $dir eq "de" ? "n" : "";
my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));

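# In C terms the generated entry points look like this (a sketch,
# mirroring the classic AES_encrypt/AES_decrypt signatures):
#
#	void aes_p8_encrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#	void aes_p8_decrypt(const unsigned char *in, unsigned char *out,
#			    const AES_KEY *key);
#
# $rounds is halved below because each loop iteration performs two
# rounds, ping-ponging v1/v2 so round-key alignment overlaps the
# v(n)cipher latency.
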
$code.=<<___;
.globl	.${prefix}_${dir}crypt
.align	5
.${prefix}_${dir}crypt:
	lwz		$rounds,240($key)
	lis		r0,0xfc00
	mfspr		$vrsave,256
	li		$idx,15			# 15 is not typo
	mtspr		256,r0

	lvx		v0,0,$inp
	neg		r11,$out
	lvx		v1,$idx,$inp
	lvsl		v2,0,$inp		# inpperm
	le?vspltisb	v4,0x0f
	?lvsl		v3,0,r11		# outperm
	le?vxor		v2,v2,v4
	li		$idx,16
	vperm		v0,v0,v1,v2		# align [and byte swap in LE]
	lvx		v1,0,$key
	?lvsl		v5,0,$key		# keyperm
	srwi		$rounds,$rounds,1
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	subi		$rounds,$rounds,1
	?vperm		v1,v1,v2,v5		# align round key

	vxor		v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	mtctr		$rounds

Loop_${dir}c:
	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	addi		$idx,$idx,16
	?vperm		v1,v1,v2,v5
	v${n}cipher	v0,v0,v1
	lvx		v1,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_${dir}c

	?vperm		v2,v2,v1,v5
	v${n}cipher	v0,v0,v2
	lvx		v2,$idx,$key
	?vperm		v1,v1,v2,v5
	v${n}cipherlast	v0,v0,v1

	vspltisb	v2,-1
	vxor		v1,v1,v1
	li		$idx,15			# 15 is not typo
	?vperm		v2,v1,v2,v3		# outmask
	le?vxor		v3,v3,v4
	lvx		v1,0,$out		# outhead
	vperm		v0,v0,v0,v3		# rotate [and byte swap in LE]
	vsel		v1,v1,v0,v2
	lvx		v4,$idx,$out
	stvx		v1,0,$out
	vsel		v0,v0,v4,v2
	stvx		v0,$idx,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,3,0
	.long		0
.size	.${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
#########################################################################
{{{	# CBC en- and decrypt procedures				#
my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
						map("v$_",(4..10));
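# In C terms (a sketch, following the classic AES_cbc_encrypt signature
# implied by the register mapping above):
#
#	void aes_p8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#				size_t length, const AES_KEY *key,
#				unsigned char *ivec, const int enc);
#
# Encryption is inherently serial, each ciphertext block chaining into
# the next, so it runs one block at a time; decryption has no such
# dependency and diverts to the 8x interleaved path below once $len
# reaches 128 bytes.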
$code.=<<___;
.globl	.${prefix}_cbc_encrypt
.align	5
.${prefix}_cbc_encrypt:
	${UCMP}i	$len,16
	bltlr-

	cmpwi		$enc,0			# test direction
	lis		r0,0xffe0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1
	beq		Lcbc_dec

Lcbc_enc:
	vmr		$inout,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$inout,$inout,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	vxor		$inout,$inout,$ivec

Loop_cbc_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_enc

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipherlast	$ivec,$inout,$rndkey0
	${UCMP}i	$len,16

	vperm		$tmp,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_enc

	b		Lcbc_done

.align	4
Lcbc_dec:
	${UCMP}i	$len,128
	bge		_aesp8_cbc_decrypt8x
	vmr		$tmp,$inptail
	lvx		$inptail,0,$inp
	addi		$inp,$inp,16
	mtctr		$rounds
	subi		$len,$len,16		# len-=16

	lvx		$rndkey0,0,$key
	 vperm		$tmp,$tmp,$inptail,$inpperm
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$tmp,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16

Loop_cbc_dec:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipher	$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_cbc_dec

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vncipher	$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	li		$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vncipherlast	$inout,$inout,$rndkey0
	${UCMP}i	$len,16

	vxor		$inout,$inout,$ivec
	vmr		$ivec,$tmp
	vperm		$tmp,$inout,$inout,$outperm
	vsel		$inout,$outhead,$tmp,$outmask
	vmr		$outhead,$tmp
	stvx		$inout,0,$out
	addi		$out,$out,16
	bge		Lcbc_dec

Lcbc_done:
	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	neg		$enc,$ivp		# write [unaligned] iv
	li		$idx,15			# 15 is not typo
	vxor		$rndkey0,$rndkey0,$rndkey0
	vspltisb	$outmask,-1
	le?vspltisb	$tmp,0x0f
	?lvsl		$outperm,0,$enc
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp
	lvx		$outhead,0,$ivp
	vperm		$ivec,$ivec,$ivec,$outperm
	vsel		$inout,$outhead,$ivec,$outmask
	lvx		$inptail,$idx,$ivp
	stvx		$inout,0,$ivp
	vsel		$inout,$ivec,$inptail,$outmask
	stvx		$inout,$idx,$ivp

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CBC decrypt procedure				#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
my $rndkey0="v23";	# v24-v25 rotating buffer for round keys
			# v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4);	# aliases with "caller", redundant assignment

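# Eight blocks are kept in flight to cover v(n)cipher latency. The key
# schedule is aligned once up front: the early rounds are off-loaded to
# the stack frame and streamed back through the rotating v24/v25 pair,
# while the last six round keys stay resident in v26-v31.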
$code.=<<___;
.align	5
_aesp8_cbc_decrypt8x:
	$STU		$sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
	li		r10,`$FRAME+8*16+15`
	li		r11,`$FRAME+8*16+31`
	stvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	stvx		v21,r11,$sp
	addi		r11,r11,32
	stvx		v22,r10,$sp
	addi		r10,r10,32
	stvx		v23,r11,$sp
	addi		r11,r11,32
	stvx		v24,r10,$sp
	addi		r10,r10,32
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r0,-1
	stw		$vrsave,`$FRAME+21*16-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	li		$x70,0x70
	mtspr		256,r0

	subi		$rounds,$rounds,3	# -4 in total
	subi		$len,$len,128		# bias

	lvx		$rndkey0,$x00,$key	# load key schedule
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	lvx		v31,$x00,$key
	?vperm		$rndkey0,$rndkey0,v30,$keyperm
	addi		$key_,$sp,$FRAME+15
	mtctr		$rounds

Load_cbc_dec_key:
	?vperm		v24,v30,v31,$keyperm
	lvx		v30,$x10,$key
	addi		$key,$key,0x20
	stvx		v24,$x00,$key_		# off-load round[1]
	?vperm		v25,v31,v30,$keyperm
	lvx		v31,$x00,$key
	stvx		v25,$x10,$key_		# off-load round[2]
	addi		$key_,$key_,0x20
	bdnz		Load_cbc_dec_key

	lvx		v26,$x10,$key
	?vperm		v24,v30,v31,$keyperm
	lvx		v27,$x20,$key
	stvx		v24,$x00,$key_		# off-load round[3]
	?vperm		v25,v31,v26,$keyperm
	lvx		v28,$x30,$key
	stvx		v25,$x10,$key_		# off-load round[4]
	addi		$key_,$sp,$FRAME+15	# rewind $key_
	?vperm		v26,v26,v27,$keyperm
	lvx		v29,$x40,$key
	?vperm		v27,v27,v28,$keyperm
	lvx		v30,$x50,$key
	?vperm		v28,v28,v29,$keyperm
	lvx		v31,$x60,$key
	?vperm		v29,v29,v30,$keyperm
	lvx		$out0,$x70,$key		# borrow $out0
	?vperm		v30,v30,v31,$keyperm
	lvx		v24,$x00,$key_		# pre-load round[1]
	?vperm		v31,v31,$out0,$keyperm
	lvx		v25,$x10,$key_		# pre-load round[2]

	#lvx		$inptail,0,$inp		# "caller" already did this
	#addi		$inp,$inp,15		# 15 is not typo
	subi		$inp,$inp,15		# undo "caller"

	 le?li		$idx,8
	lvx_u		$in0,$x00,$inp		# load first 8 "words"
	 le?lvsl	$inpperm,0,$idx
	 le?vspltisb	$tmp,0x0f
	lvx_u		$in1,$x10,$inp
	 le?vxor	$inpperm,$inpperm,$tmp	# transform for lvx_u/stvx_u
	lvx_u		$in2,$x20,$inp
	 le?vperm	$in0,$in0,$in0,$inpperm
	lvx_u		$in3,$x30,$inp
	 le?vperm	$in1,$in1,$in1,$inpperm
	lvx_u		$in4,$x40,$inp
	 le?vperm	$in2,$in2,$in2,$inpperm
	vxor		$out0,$in0,$rndkey0
	lvx_u		$in5,$x50,$inp
	 le?vperm	$in3,$in3,$in3,$inpperm
	vxor		$out1,$in1,$rndkey0
	lvx_u		$in6,$x60,$inp
	 le?vperm	$in4,$in4,$in4,$inpperm
	vxor		$out2,$in2,$rndkey0
	lvx_u		$in7,$x70,$inp
	addi		$inp,$inp,0x80
	 le?vperm	$in5,$in5,$in5,$inpperm
	vxor		$out3,$in3,$rndkey0
	 le?vperm	$in6,$in6,$in6,$inpperm
	vxor		$out4,$in4,$rndkey0
	 le?vperm	$in7,$in7,$in7,$inpperm
	vxor		$out5,$in5,$rndkey0
	vxor		$out6,$in6,$rndkey0
	vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	b		Loop_cbc_dec8x
.align	5
Loop_cbc_dec8x:
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x

	subic		$len,$len,128		# $len-=128
	vncipher	$out0,$out0,v24
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	subfe.		r0,r0,r0		# borrow?-1:0
	vncipher	$out0,$out0,v25
	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	and		r0,r0,$len
	vncipher	$out0,$out0,v26
	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26
	add		$inp,$inp,r0		# $inp is adjusted in such a
						# way that at exit from the
						# loop inX-in7 are loaded
						# with the last "words"
	vncipher	$out0,$out0,v27
	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	addi		$key_,$sp,$FRAME+15	# rewind $key_
	vncipher	$out0,$out0,v28
	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28
	lvx		v24,$x00,$key_		# re-pre-load round[1]

	vncipher	$out0,$out0,v29
	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29
	lvx		v25,$x10,$key_		# re-pre-load round[2]

	vncipher	$out0,$out0,v30
	 vxor		$ivec,$ivec,v31		# xor with last round key
	vncipher	$out1,$out1,v30
	 vxor		$in0,$in0,v31
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	vncipherlast	$out0,$out0,$ivec
	vncipherlast	$out1,$out1,$in0
	 lvx_u		$in0,$x00,$inp		# load next input block
	vncipherlast	$out2,$out2,$in1
	 lvx_u		$in1,$x10,$inp
	vncipherlast	$out3,$out3,$in2
	 le?vperm	$in0,$in0,$in0,$inpperm
	 lvx_u		$in2,$x20,$inp
	vncipherlast	$out4,$out4,$in3
	 le?vperm	$in1,$in1,$in1,$inpperm
	 lvx_u		$in3,$x30,$inp
	vncipherlast	$out5,$out5,$in4
	 le?vperm	$in2,$in2,$in2,$inpperm
	 lvx_u		$in4,$x40,$inp
	vncipherlast	$out6,$out6,$in5
	 le?vperm	$in3,$in3,$in3,$inpperm
	 lvx_u		$in5,$x50,$inp
	vncipherlast	$out7,$out7,$in6
	 le?vperm	$in4,$in4,$in4,$inpperm
	 lvx_u		$in6,$x60,$inp
	vmr		$ivec,$in7
	 le?vperm	$in5,$in5,$in5,$inpperm
	 lvx_u		$in7,$x70,$inp
	 addi		$inp,$inp,0x80

	le?vperm	$out0,$out0,$out0,$inpperm
	le?vperm	$out1,$out1,$out1,$inpperm
	stvx_u		$out0,$x00,$out
	 le?vperm	$in6,$in6,$in6,$inpperm
	 vxor		$out0,$in0,$rndkey0
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x10,$out
	 le?vperm	$in7,$in7,$in7,$inpperm
	 vxor		$out1,$in1,$rndkey0
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x20,$out
	 vxor		$out2,$in2,$rndkey0
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x30,$out
	 vxor		$out3,$in3,$rndkey0
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x40,$out
	 vxor		$out4,$in4,$rndkey0
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x50,$out
	 vxor		$out5,$in5,$rndkey0
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x60,$out
	 vxor		$out6,$in6,$rndkey0
	stvx_u		$out7,$x70,$out
	addi		$out,$out,0x80
	 vxor		$out7,$in7,$rndkey0

	mtctr		$rounds
	beq		Loop_cbc_dec8x		# did $len-=128 borrow?

	addic.		$len,$len,128
	beq		Lcbc_dec8x_done
	nop
	nop

Loop_cbc_dec8x_tail:				# up to 7 "words" tail...
	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24
	lvx		v24,$x20,$key_		# round[3]
	addi		$key_,$key_,0x20

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25
	lvx		v25,$x10,$key_		# round[4]
	bdnz		Loop_cbc_dec8x_tail

	vncipher	$out1,$out1,v24
	vncipher	$out2,$out2,v24
	vncipher	$out3,$out3,v24
	vncipher	$out4,$out4,v24
	vncipher	$out5,$out5,v24
	vncipher	$out6,$out6,v24
	vncipher	$out7,$out7,v24

	vncipher	$out1,$out1,v25
	vncipher	$out2,$out2,v25
	vncipher	$out3,$out3,v25
	vncipher	$out4,$out4,v25
	vncipher	$out5,$out5,v25
	vncipher	$out6,$out6,v25
	vncipher	$out7,$out7,v25

	vncipher	$out1,$out1,v26
	vncipher	$out2,$out2,v26
	vncipher	$out3,$out3,v26
	vncipher	$out4,$out4,v26
	vncipher	$out5,$out5,v26
	vncipher	$out6,$out6,v26
	vncipher	$out7,$out7,v26

	vncipher	$out1,$out1,v27
	vncipher	$out2,$out2,v27
	vncipher	$out3,$out3,v27
	vncipher	$out4,$out4,v27
	vncipher	$out5,$out5,v27
	vncipher	$out6,$out6,v27
	vncipher	$out7,$out7,v27

	vncipher	$out1,$out1,v28
	vncipher	$out2,$out2,v28
	vncipher	$out3,$out3,v28
	vncipher	$out4,$out4,v28
	vncipher	$out5,$out5,v28
	vncipher	$out6,$out6,v28
	vncipher	$out7,$out7,v28

	vncipher	$out1,$out1,v29
	vncipher	$out2,$out2,v29
	vncipher	$out3,$out3,v29
	vncipher	$out4,$out4,v29
	vncipher	$out5,$out5,v29
	vncipher	$out6,$out6,v29
	vncipher	$out7,$out7,v29

	vncipher	$out1,$out1,v30
	 vxor		$ivec,$ivec,v31		# last round key
	vncipher	$out2,$out2,v30
	 vxor		$in1,$in1,v31
	vncipher	$out3,$out3,v30
	 vxor		$in2,$in2,v31
	vncipher	$out4,$out4,v30
	 vxor		$in3,$in3,v31
	vncipher	$out5,$out5,v30
	 vxor		$in4,$in4,v31
	vncipher	$out6,$out6,v30
	 vxor		$in5,$in5,v31
	vncipher	$out7,$out7,v30
	 vxor		$in6,$in6,v31

	cmplwi		$len,32			# switch($len)
	blt		Lcbc_dec8x_one
	nop
	beq		Lcbc_dec8x_two
	cmplwi		$len,64
	blt		Lcbc_dec8x_three
	nop
	beq		Lcbc_dec8x_four
	cmplwi		$len,96
	blt		Lcbc_dec8x_five
	nop
	beq		Lcbc_dec8x_six

Lcbc_dec8x_seven:
	vncipherlast	$out1,$out1,$ivec
	vncipherlast	$out2,$out2,$in1
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out1,$out1,$out1,$inpperm
	le?vperm	$out2,$out2,$out2,$inpperm
	stvx_u		$out1,$x00,$out
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x10,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x20,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x30,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x40,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x50,$out
	stvx_u		$out7,$x60,$out
	addi		$out,$out,0x70
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_six:
	vncipherlast	$out2,$out2,$ivec
	vncipherlast	$out3,$out3,$in2
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out2,$out2,$out2,$inpperm
	le?vperm	$out3,$out3,$out3,$inpperm
	stvx_u		$out2,$x00,$out
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x10,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x20,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x30,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x40,$out
	stvx_u		$out7,$x50,$out
	addi		$out,$out,0x60
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_five:
	vncipherlast	$out3,$out3,$ivec
	vncipherlast	$out4,$out4,$in3
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out3,$out3,$out3,$inpperm
	le?vperm	$out4,$out4,$out4,$inpperm
	stvx_u		$out3,$x00,$out
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x10,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x20,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x30,$out
	stvx_u		$out7,$x40,$out
	addi		$out,$out,0x50
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_four:
	vncipherlast	$out4,$out4,$ivec
	vncipherlast	$out5,$out5,$in4
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out4,$out4,$out4,$inpperm
	le?vperm	$out5,$out5,$out5,$inpperm
	stvx_u		$out4,$x00,$out
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x10,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x20,$out
	stvx_u		$out7,$x30,$out
	addi		$out,$out,0x40
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_three:
	vncipherlast	$out5,$out5,$ivec
	vncipherlast	$out6,$out6,$in5
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out5,$out5,$out5,$inpperm
	le?vperm	$out6,$out6,$out6,$inpperm
	stvx_u		$out5,$x00,$out
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x10,$out
	stvx_u		$out7,$x20,$out
	addi		$out,$out,0x30
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_two:
	vncipherlast	$out6,$out6,$ivec
	vncipherlast	$out7,$out7,$in6
	vmr		$ivec,$in7

	le?vperm	$out6,$out6,$out6,$inpperm
	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out6,$x00,$out
	stvx_u		$out7,$x10,$out
	addi		$out,$out,0x20
	b		Lcbc_dec8x_done

.align	5
Lcbc_dec8x_one:
	vncipherlast	$out7,$out7,$ivec
	vmr		$ivec,$in7

	le?vperm	$out7,$out7,$out7,$inpperm
	stvx_u		$out7,0,$out
	addi		$out,$out,0x10

Lcbc_dec8x_done:
	le?vperm	$ivec,$ivec,$ivec,$inpperm
	stvx_u		$ivec,0,$ivp		# write [unaligned] iv

	li		r10,`$FRAME+15`
	li		r11,`$FRAME+31`
	stvx		$inpperm,r10,$sp	# wipe copies of round keys
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32
	stvx		$inpperm,r10,$sp
	addi		r10,r10,32
	stvx		$inpperm,r11,$sp
	addi		r11,r11,32

	mtspr		256,$vrsave
	lvx		v20,r10,$sp		# ABI says so
	addi		r10,r10,32
	lvx		v21,r11,$sp
	addi		r11,r11,32
	lvx		v22,r10,$sp
	addi		r10,r10,32
	lvx		v23,r11,$sp
	addi		r11,r11,32
	lvx		v24,r10,$sp
	addi		r10,r10,32
	lvx		v25,r11,$sp
	addi		r11,r11,32
	lvx		v26,r10,$sp
	addi		r10,r10,32
	lvx		v27,r11,$sp
	addi		r11,r11,32
	lvx		v28,r10,$sp
	addi		r10,r10,32
	lvx		v29,r11,$sp
	addi		r11,r11,32
	lvx		v30,r10,$sp
	lvx		v31,r11,$sp
	$POP		r26,`$FRAME+21*16+0*$SIZE_T`($sp)
	$POP		r27,`$FRAME+21*16+1*$SIZE_T`($sp)
	$POP		r28,`$FRAME+21*16+2*$SIZE_T`($sp)
	$POP		r29,`$FRAME+21*16+3*$SIZE_T`($sp)
	$POP		r30,`$FRAME+21*16+4*$SIZE_T`($sp)
	$POP		r31,`$FRAME+21*16+5*$SIZE_T`($sp)
	addi		$sp,$sp,`$FRAME+21*16+6*$SIZE_T`
	blr
	.long		0
	.byte		0,12,0x04,0,0x80,6,6,0
	.long		0
.size	.${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
___
}}	}}}

#########################################################################
{{{	# CTR procedure[s]						#
my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
my ($rndkey0,$rndkey1,$inout,$tmp)=		map("v$_",(0..3));
my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
						map("v$_",(4..11));
my $dat=$tmp;

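# In C terms (a sketch, matching the register mapping above):
#
#	void aes_p8_ctr32_encrypt_blocks(const unsigned char *in,
#					 unsigned char *out, size_t blocks,
#					 const AES_KEY *key,
#					 const unsigned char *ivec);
#
# Only the low 32 bits of the counter are incremented: $one is built as
# 0...01 via vsldoi and added with vadduwm, which is word-wise modular
# addition, so carries never propagate past the last 32-bit word.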
$code.=<<___;
.globl	.${prefix}_ctr32_encrypt_blocks
.align	5
.${prefix}_ctr32_encrypt_blocks:
	${UCMP}i	$len,1
	bltlr-

	lis		r0,0xfff0
	mfspr		$vrsave,256
	mtspr		256,r0

	li		$idx,15
	vxor		$rndkey0,$rndkey0,$rndkey0
	le?vspltisb	$tmp,0x0f

	lvx		$ivec,0,$ivp		# load [unaligned] iv
	lvsl		$inpperm,0,$ivp
	lvx		$inptail,$idx,$ivp
	 vspltisb	$one,1
	le?vxor		$inpperm,$inpperm,$tmp
	vperm		$ivec,$ivec,$inptail,$inpperm
	 vsldoi		$one,$rndkey0,$one,1

	neg		r11,$inp
	?lvsl		$keyperm,0,$key		# prepare for unaligned key
	lwz		$rounds,240($key)

	lvsr		$inpperm,0,r11		# prepare for unaligned load
	lvx		$inptail,0,$inp
	addi		$inp,$inp,15		# 15 is not typo
	le?vxor		$inpperm,$inpperm,$tmp

	srwi		$rounds,$rounds,1
	li		$idx,16
	subi		$rounds,$rounds,1

	${UCMP}i	$len,8
	bge		_aesp8_ctr32_encrypt8x

	?lvsr		$outperm,0,$out		# prepare for unaligned store
	vspltisb	$outmask,-1
	lvx		$outhead,0,$out
	?vperm		$outmask,$rndkey0,$outmask,$outperm
	le?vxor		$outperm,$outperm,$tmp

	lvx		$rndkey0,0,$key
	mtctr		$rounds
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vxor		$inout,$ivec,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	b		Loop_ctr32_enc

.align	5
Loop_ctr32_enc:
	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	addi		$idx,$idx,16
	?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vcipher		$inout,$inout,$rndkey0
	lvx		$rndkey0,$idx,$key
	addi		$idx,$idx,16
	bdnz		Loop_ctr32_enc

	vadduwm		$ivec,$ivec,$one
	 vmr		$dat,$inptail
	 lvx		$inptail,0,$inp
	 addi		$inp,$inp,16
	 subic.		$len,$len,1		# blocks--

	?vperm		$rndkey1,$rndkey1,$rndkey0,$keyperm
	vcipher		$inout,$inout,$rndkey1
	lvx		$rndkey1,$idx,$key
	 vperm		$dat,$dat,$inptail,$inpperm
	 li		$idx,16
	?vperm		$rndkey1,$rndkey0,$rndkey1,$keyperm
	 lvx		$rndkey0,0,$key
	vxor		$dat,$dat,$rndkey1	# last round key
	vcipherlast	$inout,$inout,$dat

	 lvx		$rndkey1,$idx,$key
	 addi		$idx,$idx,16
	vperm		$inout,$inout,$inout,$outperm
	vsel		$dat,$outhead,$inout,$outmask
	 mtctr		$rounds
	 ?vperm		$rndkey0,$rndkey0,$rndkey1,$keyperm
	vmr		$outhead,$inout
	 vxor		$inout,$ivec,$rndkey0
	 lvx		$rndkey0,$idx,$key
	 addi		$idx,$idx,16
	stvx		$dat,0,$out
	addi		$out,$out,16
	bne		Loop_ctr32_enc

	addi		$out,$out,-1
	lvx		$inout,0,$out		# redundant in aligned case
	vsel		$inout,$outhead,$inout,$outmask
	stvx		$inout,0,$out

	mtspr		256,$vrsave
	blr
	.long		0
	.byte		0,12,0x14,0,0,0,6,0
	.long		0
___
#########################################################################
{{	# Optimized CTR procedure					#
my $key_="r11";
my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
    $x00=0 if ($flavour =~ /osx/);
my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
my $rndkey0="v23";      # v24-v25 rotating buffer for first round keys
                        # v26-v31 last 6 round keys
my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
my ($two,$three,$four)=($outhead,$outperm,$outmask);
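# Eight blocks are kept in flight to cover vcipher latency. v24-v25
# rotate through the middle round keys, which are re-read from an
# aligned stack copy (two lvx per iteration), while v26-v31 hold the
# last six round keys for the whole computation.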

$code.=<<___;
.align  5
_aesp8_ctr32_encrypt8x:
        $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
        li              r10,`$FRAME+8*16+15`
        li              r11,`$FRAME+8*16+31`
        stvx            v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        stvx            v21,r11,$sp
        addi            r11,r11,32
        stvx            v22,r10,$sp
        addi            r10,r10,32
        stvx            v23,r11,$sp
        addi            r11,r11,32
        stvx            v24,r10,$sp
        addi            r10,r10,32
        stvx            v25,r11,$sp
        addi            r11,r11,32
        stvx            v26,r10,$sp
        addi            r10,r10,32
        stvx            v27,r11,$sp
        addi            r11,r11,32
        stvx            v28,r10,$sp
        addi            r10,r10,32
        stvx            v29,r11,$sp
        addi            r11,r11,32
        stvx            v30,r10,$sp
        stvx            v31,r11,$sp
        li              r0,-1
        stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
        li              $x10,0x10
        $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        li              $x20,0x20
        $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        li              $x30,0x30
        $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        li              $x40,0x40
        $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        li              $x50,0x50
        $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        li              $x60,0x60
        $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        li              $x70,0x70
        mtspr           256,r0

        subi            $rounds,$rounds,3       # -4 in total

        lvx             $rndkey0,$x00,$key      # load key schedule
        lvx             v30,$x10,$key
        addi            $key,$key,0x20
        lvx             v31,$x00,$key
        ?vperm          $rndkey0,$rndkey0,v30,$keyperm
        addi            $key_,$sp,$FRAME+15
        mtctr           $rounds

Load_ctr32_enc_key:
        ?vperm          v24,v30,v31,$keyperm
        lvx             v30,$x10,$key
        addi            $key,$key,0x20
        stvx            v24,$x00,$key_          # off-load round[1]
        ?vperm          v25,v31,v30,$keyperm
        lvx             v31,$x00,$key
        stvx            v25,$x10,$key_          # off-load round[2]
        addi            $key_,$key_,0x20
        bdnz            Load_ctr32_enc_key

        lvx             v26,$x10,$key
        ?vperm          v24,v30,v31,$keyperm
        lvx             v27,$x20,$key
        stvx            v24,$x00,$key_          # off-load round[3]
        ?vperm          v25,v31,v26,$keyperm
        lvx             v28,$x30,$key
        stvx            v25,$x10,$key_          # off-load round[4]
        addi            $key_,$sp,$FRAME+15     # rewind $key_
        ?vperm          v26,v26,v27,$keyperm
        lvx             v29,$x40,$key
        ?vperm          v27,v27,v28,$keyperm
        lvx             v30,$x50,$key
        ?vperm          v28,v28,v29,$keyperm
        lvx             v31,$x60,$key
        ?vperm          v29,v29,v30,$keyperm
        lvx             $out0,$x70,$key         # borrow $out0
        ?vperm          v30,v30,v31,$keyperm
        lvx             v24,$x00,$key_          # pre-load round[1]
        ?vperm          v31,v31,$out0,$keyperm
        lvx             v25,$x10,$key_          # pre-load round[2]
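        # The caller's key schedule may be unaligned, so adjacent
        # quadwords are merged via vperm into properly aligned round
        # keys: the last six stay resident in v26-v31, the rest were
        # stored to an aligned stack area above and are streamed through
        # the v24/v25 pair inside the main loop.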

        vadduwm         $two,$one,$one
        subi            $inp,$inp,15            # undo "caller"
        $SHL            $len,$len,4

        vadduwm         $out1,$ivec,$one        # counter values ...
        vadduwm         $out2,$ivec,$two
        vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
         le?li          $idx,8
        vadduwm         $out3,$out1,$two
        vxor            $out1,$out1,$rndkey0
         le?lvsl        $inpperm,0,$idx
        vadduwm         $out4,$out2,$two
        vxor            $out2,$out2,$rndkey0
         le?vspltisb    $tmp,0x0f
        vadduwm         $out5,$out3,$two
        vxor            $out3,$out3,$rndkey0
         le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
        vadduwm         $out6,$out4,$two
        vxor            $out4,$out4,$rndkey0
        vadduwm         $out7,$out5,$two
        vxor            $out5,$out5,$rndkey0
        vadduwm         $ivec,$out6,$two        # next counter value
        vxor            $out6,$out6,$rndkey0
        vxor            $out7,$out7,$rndkey0
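        # out1-out7 are the counter blocks $ivec+1 ... +7 (word-wise
        # vadduwm with $one and $two), and $ivec advances by 8 for the
        # next batch; the vxor with rndkey[0] applies the first
        # AddRoundKey to all eight blocks up front.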

        mtctr           $rounds
        b               Loop_ctr32_enc8x
.align  5
Loop_ctr32_enc8x:
        vcipher         $out0,$out0,v24
        vcipher         $out1,$out1,v24
        vcipher         $out2,$out2,v24
        vcipher         $out3,$out3,v24
        vcipher         $out4,$out4,v24
        vcipher         $out5,$out5,v24
        vcipher         $out6,$out6,v24
        vcipher         $out7,$out7,v24
Loop_ctr32_enc8x_middle:
        lvx             v24,$x20,$key_          # round[3]
        addi            $key_,$key_,0x20

        vcipher         $out0,$out0,v25
        vcipher         $out1,$out1,v25
        vcipher         $out2,$out2,v25
        vcipher         $out3,$out3,v25
        vcipher         $out4,$out4,v25
        vcipher         $out5,$out5,v25
        vcipher         $out6,$out6,v25
        vcipher         $out7,$out7,v25
        lvx             v25,$x10,$key_          # round[4]
        bdnz            Loop_ctr32_enc8x

        subic           r11,$len,256            # $len-256, borrow $key_
        vcipher         $out0,$out0,v24
        vcipher         $out1,$out1,v24
        vcipher         $out2,$out2,v24
        vcipher         $out3,$out3,v24
        vcipher         $out4,$out4,v24
        vcipher         $out5,$out5,v24
        vcipher         $out6,$out6,v24
        vcipher         $out7,$out7,v24

        subfe           r0,r0,r0                # borrow?-1:0
        vcipher         $out0,$out0,v25
        vcipher         $out1,$out1,v25
        vcipher         $out2,$out2,v25
        vcipher         $out3,$out3,v25
        vcipher         $out4,$out4,v25
        vcipher         $out5,$out5,v25
        vcipher         $out6,$out6,v25
        vcipher         $out7,$out7,v25

        and             r0,r0,r11
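        # Branch-free guard against reading past the input: subic set CA
        # only if $len>=256, subfe expanded the borrow into an all-ones
        # or all-zero mask, so r0 is $len-256 when fewer than 256 bytes
        # remain and 0 otherwise. Adding it to $inp below makes the next
        # batch of eight loads cover the final 128 bytes of input, with
        # a short tail landing right-justified in in0-in7.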
        addi            $key_,$sp,$FRAME+15     # rewind $key_
        vcipher         $out0,$out0,v26
        vcipher         $out1,$out1,v26
        vcipher         $out2,$out2,v26
        vcipher         $out3,$out3,v26
        vcipher         $out4,$out4,v26
        vcipher         $out5,$out5,v26
        vcipher         $out6,$out6,v26
        vcipher         $out7,$out7,v26
        lvx             v24,$x00,$key_          # re-pre-load round[1]

        subic           $len,$len,129           # $len-=129
        vcipher         $out0,$out0,v27
        addi            $len,$len,1             # $len-=128 really
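        # Splitting the 128-byte decrement as -129 then +1 makes CA
        # record whether this was the final batch (fewer than 129 bytes,
        # i.e. at most eight blocks, remained); subfe. below turns that
        # borrow into the Lctr32_enc8x_break decision.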
        vcipher         $out1,$out1,v27
        vcipher         $out2,$out2,v27
        vcipher         $out3,$out3,v27
        vcipher         $out4,$out4,v27
        vcipher         $out5,$out5,v27
        vcipher         $out6,$out6,v27
        vcipher         $out7,$out7,v27
        lvx             v25,$x10,$key_          # re-pre-load round[2]

        vcipher         $out0,$out0,v28
         lvx_u          $in0,$x00,$inp          # load input
        vcipher         $out1,$out1,v28
         lvx_u          $in1,$x10,$inp
        vcipher         $out2,$out2,v28
         lvx_u          $in2,$x20,$inp
        vcipher         $out3,$out3,v28
         lvx_u          $in3,$x30,$inp
        vcipher         $out4,$out4,v28
         lvx_u          $in4,$x40,$inp
        vcipher         $out5,$out5,v28
         lvx_u          $in5,$x50,$inp
        vcipher         $out6,$out6,v28
         lvx_u          $in6,$x60,$inp
        vcipher         $out7,$out7,v28
         lvx_u          $in7,$x70,$inp
         addi           $inp,$inp,0x80

        vcipher         $out0,$out0,v29
         le?vperm       $in0,$in0,$in0,$inpperm
        vcipher         $out1,$out1,v29
         le?vperm       $in1,$in1,$in1,$inpperm
        vcipher         $out2,$out2,v29
         le?vperm       $in2,$in2,$in2,$inpperm
        vcipher         $out3,$out3,v29
         le?vperm       $in3,$in3,$in3,$inpperm
        vcipher         $out4,$out4,v29
         le?vperm       $in4,$in4,$in4,$inpperm
        vcipher         $out5,$out5,v29
         le?vperm       $in5,$in5,$in5,$inpperm
        vcipher         $out6,$out6,v29
         le?vperm       $in6,$in6,$in6,$inpperm
        vcipher         $out7,$out7,v29
         le?vperm       $in7,$in7,$in7,$inpperm
        add             $inp,$inp,r0            # $inp is adjusted such
                                                # that at exit from the
                                                # loop inX-in7 hold the
                                                # last "words" of input
        subfe.          r0,r0,r0                # borrow?-1:0
        vcipher         $out0,$out0,v30
         vxor           $in0,$in0,v31           # xor with last round key
        vcipher         $out1,$out1,v30
         vxor           $in1,$in1,v31
        vcipher         $out2,$out2,v30
         vxor           $in2,$in2,v31
        vcipher         $out3,$out3,v30
         vxor           $in3,$in3,v31
        vcipher         $out4,$out4,v30
         vxor           $in4,$in4,v31
        vcipher         $out5,$out5,v30
         vxor           $in5,$in5,v31
        vcipher         $out6,$out6,v30
         vxor           $in6,$in6,v31
        vcipher         $out7,$out7,v30
         vxor           $in7,$in7,v31

        bne             Lctr32_enc8x_break      # did $len-129 borrow?
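        # Fall-through: a full batch follows, so all eight vcipherlast
        # results below are final ciphertext; as in the one-block path,
        # the key-xored input doubles as the vcipherlast round key, and
        # the stores are interleaved with the first rounds of the next
        # eight counter blocks.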

        vcipherlast     $in0,$out0,$in0
        vcipherlast     $in1,$out1,$in1
         vadduwm        $out1,$ivec,$one        # counter values ...
        vcipherlast     $in2,$out2,$in2
         vadduwm        $out2,$ivec,$two
         vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
        vcipherlast     $in3,$out3,$in3
         vadduwm        $out3,$out1,$two
         vxor           $out1,$out1,$rndkey0
        vcipherlast     $in4,$out4,$in4
         vadduwm        $out4,$out2,$two
         vxor           $out2,$out2,$rndkey0
        vcipherlast     $in5,$out5,$in5
         vadduwm        $out5,$out3,$two
         vxor           $out3,$out3,$rndkey0
        vcipherlast     $in6,$out6,$in6
         vadduwm        $out6,$out4,$two
         vxor           $out4,$out4,$rndkey0
        vcipherlast     $in7,$out7,$in7
         vadduwm        $out7,$out5,$two
         vxor           $out5,$out5,$rndkey0
        le?vperm        $in0,$in0,$in0,$inpperm
         vadduwm        $ivec,$out6,$two        # next counter value
         vxor           $out6,$out6,$rndkey0
        le?vperm        $in1,$in1,$in1,$inpperm
         vxor           $out7,$out7,$rndkey0
        mtctr           $rounds

         vcipher        $out0,$out0,v24
        stvx_u          $in0,$x00,$out
        le?vperm        $in2,$in2,$in2,$inpperm
         vcipher        $out1,$out1,v24
        stvx_u          $in1,$x10,$out
        le?vperm        $in3,$in3,$in3,$inpperm
         vcipher        $out2,$out2,v24
        stvx_u          $in2,$x20,$out
        le?vperm        $in4,$in4,$in4,$inpperm
         vcipher        $out3,$out3,v24
        stvx_u          $in3,$x30,$out
        le?vperm        $in5,$in5,$in5,$inpperm
         vcipher        $out4,$out4,v24
        stvx_u          $in4,$x40,$out
        le?vperm        $in6,$in6,$in6,$inpperm
         vcipher        $out5,$out5,v24
        stvx_u          $in5,$x50,$out
        le?vperm        $in7,$in7,$in7,$inpperm
         vcipher        $out6,$out6,v24
        stvx_u          $in6,$x60,$out
         vcipher        $out7,$out7,v24
        stvx_u          $in7,$x70,$out
        addi            $out,$out,0x80

        b               Loop_ctr32_enc8x_middle

.align  5
Lctr32_enc8x_break:
        cmpwi           $len,-0x60
        blt             Lctr32_enc8x_one
        nop
        beq             Lctr32_enc8x_two
        cmpwi           $len,-0x40
        blt             Lctr32_enc8x_three
        nop
        beq             Lctr32_enc8x_four
        cmpwi           $len,-0x20
        blt             Lctr32_enc8x_five
        nop
        beq             Lctr32_enc8x_six
        cmpwi           $len,0x00
        blt             Lctr32_enc8x_seven
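        # Here $len is tail_bytes-128, a multiple of 16 in [-0x70,0]:
        # -0x70 means one block remains, 0 means a full eight. The tail
        # was loaded right-justified, so the N-block cases below take
        # their inputs from in(8-N) through in7.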

Lctr32_enc8x_eight:
        vcipherlast     $out0,$out0,$in0
        vcipherlast     $out1,$out1,$in1
        vcipherlast     $out2,$out2,$in2
        vcipherlast     $out3,$out3,$in3
        vcipherlast     $out4,$out4,$in4
        vcipherlast     $out5,$out5,$in5
        vcipherlast     $out6,$out6,$in6
        vcipherlast     $out7,$out7,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x30,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x40,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x50,$out
        le?vperm        $out7,$out7,$out7,$inpperm
        stvx_u          $out6,$x60,$out
        stvx_u          $out7,$x70,$out
        addi            $out,$out,0x80
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_seven:
        vcipherlast     $out0,$out0,$in1
        vcipherlast     $out1,$out1,$in2
        vcipherlast     $out2,$out2,$in3
        vcipherlast     $out3,$out3,$in4
        vcipherlast     $out4,$out4,$in5
        vcipherlast     $out5,$out5,$in6
        vcipherlast     $out6,$out6,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x30,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x40,$out
        le?vperm        $out6,$out6,$out6,$inpperm
        stvx_u          $out5,$x50,$out
        stvx_u          $out6,$x60,$out
        addi            $out,$out,0x70
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_six:
        vcipherlast     $out0,$out0,$in2
        vcipherlast     $out1,$out1,$in3
        vcipherlast     $out2,$out2,$in4
        vcipherlast     $out3,$out3,$in5
        vcipherlast     $out4,$out4,$in6
        vcipherlast     $out5,$out5,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x30,$out
        le?vperm        $out5,$out5,$out5,$inpperm
        stvx_u          $out4,$x40,$out
        stvx_u          $out5,$x50,$out
        addi            $out,$out,0x60
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_five:
        vcipherlast     $out0,$out0,$in3
        vcipherlast     $out1,$out1,$in4
        vcipherlast     $out2,$out2,$in5
        vcipherlast     $out3,$out3,$in6
        vcipherlast     $out4,$out4,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
        le?vperm        $out4,$out4,$out4,$inpperm
        stvx_u          $out3,$x30,$out
        stvx_u          $out4,$x40,$out
        addi            $out,$out,0x50
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_four:
        vcipherlast     $out0,$out0,$in4
        vcipherlast     $out1,$out1,$in5
        vcipherlast     $out2,$out2,$in6
        vcipherlast     $out3,$out3,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        le?vperm        $out3,$out3,$out3,$inpperm
        stvx_u          $out2,$x20,$out
        stvx_u          $out3,$x30,$out
        addi            $out,$out,0x40
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_three:
        vcipherlast     $out0,$out0,$in5
        vcipherlast     $out1,$out1,$in6
        vcipherlast     $out2,$out2,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        le?vperm        $out2,$out2,$out2,$inpperm
        stvx_u          $out1,$x10,$out
        stvx_u          $out2,$x20,$out
        addi            $out,$out,0x30
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_two:
        vcipherlast     $out0,$out0,$in6
        vcipherlast     $out1,$out1,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        le?vperm        $out1,$out1,$out1,$inpperm
        stvx_u          $out0,$x00,$out
        stvx_u          $out1,$x10,$out
        addi            $out,$out,0x20
        b               Lctr32_enc8x_done

.align  5
Lctr32_enc8x_one:
        vcipherlast     $out0,$out0,$in7

        le?vperm        $out0,$out0,$out0,$inpperm
        stvx_u          $out0,0,$out
        addi            $out,$out,0x10

Lctr32_enc8x_done:
        li              r10,`$FRAME+15`
        li              r11,`$FRAME+31`
        stvx            $inpperm,r10,$sp        # wipe copies of round keys
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32
        stvx            $inpperm,r10,$sp
        addi            r10,r10,32
        stvx            $inpperm,r11,$sp
        addi            r11,r11,32

        mtspr           256,$vrsave
        lvx             v20,r10,$sp             # ABI says so
        addi            r10,r10,32
        lvx             v21,r11,$sp
        addi            r11,r11,32
        lvx             v22,r10,$sp
        addi            r10,r10,32
        lvx             v23,r11,$sp
        addi            r11,r11,32
        lvx             v24,r10,$sp
        addi            r10,r10,32
        lvx             v25,r11,$sp
        addi            r11,r11,32
        lvx             v26,r10,$sp
        addi            r10,r10,32
        lvx             v27,r11,$sp
        addi            r11,r11,32
        lvx             v28,r10,$sp
        addi            r10,r10,32
        lvx             v29,r11,$sp
        addi            r11,r11,32
        lvx             v30,r10,$sp
        lvx             v31,r11,$sp
        $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
        $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
        $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
        $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
        $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
        $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
        addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
        blr
        .long           0
        .byte           0,12,0x04,0,0x80,6,6,0
        .long           0
.size   .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
___
}}      }}}

my $consts=1;
foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

        # constants table endian-specific conversion
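        # rows tagged "?rev" are emitted byte-reversed on little-endian
        # builds and rows tagged "?asis" unchanged, so the vector
        # constants match the module's load convention on either
        # endianness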
        if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
            my $conv=$3;
            my @bytes=();

            # convert to endian-agnostic format
            if ($1 eq "long") {
              foreach (split(/,\s*/,$2)) {
                my $l = /^0/?oct:int;
                push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
              }
            } else {
                @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
            }

            # little-endian conversion
            if ($flavour =~ /le$/o) {
                SWITCH: for($conv)  {
                    /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
                    /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
                }
            }

            # emit
            print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
            next;
        }
        $consts=0 if (m/Lconsts:/o);    # end of table

        # instructions prefixed with '?' are endian-specific and need
        # to be adjusted accordingly...
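        # e.g. on little-endian "?lvsr" assembles as lvsl, and the two
        # source-vector operands of "?vperm" are swapped, preserving
        # byte-lane semantics across endiannesses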
        if ($flavour =~ /le$/o) {       # little-endian
            s/le\?//o           or
            s/be\?/#be#/o       or
            s/\?lvsr/lvsl/o     or
            s/\?lvsl/lvsr/o     or
            s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
            s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
            s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
        } else {                        # big-endian
            s/le\?/#le#/o       or
            s/be\?//o           or
            s/\?([a-z]+)/$1/o;
        }

        print $_,"\n";
}

close STDOUT;