aesp8-ppc.pl: add CTR mode.
[openssl.git] / crypto / aes / asm / aesp8-ppc.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for AES instructions as per PowerISA
11 # specification version 2.07, first implemented by POWER8 processor.
12 # The module is endian-agnostic in sense that it supports both big-
13 # and little-endian cases. Data alignment in parallelizable modes is
14 # handled with VSX loads and stores, which implies MSR.VSX flag being
15 # set. It should also be noted that ISA specification doesn't prohibit
16 # alignment exceptions for these instructions on page boundaries.
17 # Initially alignment was handled in pure AltiVec/VMX way [when data
18 # is aligned programmatically, which in turn guarantees exception-
19 # free execution], but it turned to hamper performance when vcipher
20 # instructions are interleaved. It's reckoned that eventual
21 # misalignment penalties at page boundaries are in average lower
22 # than additional overhead in pure AltiVec approach.
23
24 $flavour = shift;
25
26 if ($flavour =~ /64/) {
27         $SIZE_T =8;
28         $LRSAVE =2*$SIZE_T;
29         $STU    ="stdu";
30         $POP    ="ld";
31         $PUSH   ="std";
32         $UCMP   ="cmpld";
33         $SHL    ="sldi";
34 } elsif ($flavour =~ /32/) {
35         $SIZE_T =4;
36         $LRSAVE =$SIZE_T;
37         $STU    ="stwu";
38         $POP    ="lwz";
39         $PUSH   ="stw";
40         $UCMP   ="cmplw";
41         $SHL    ="slwi";
42 } else { die "nonsense $flavour"; }
43
44 $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
45
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
48 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
49 die "can't locate ppc-xlate.pl";
50
51 open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
52
53 $FRAME=8*$SIZE_T;
54 $prefix="aes_p8";
55
56 $sp="r1";
57 $vrsave="r12";
58
59 #########################################################################
60 {{{     # Key setup procedures                                          #
61 my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
62 my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
63 my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
64
65 $code.=<<___;
66 .machine        "any"
67
68 .text
69
70 .align  7
71 rcon:
72 .long   0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
73 .long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
74 .long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
75 .long   0,0,0,0                                         ?asis
76 Lconsts:
77         mflr    r0
78         bcl     20,31,\$+4
79         mflr    $ptr     #vvvvv "distance between . and rcon
80         addi    $ptr,$ptr,-0x48
81         mtlr    r0
82         blr
83         .long   0
84         .byte   0,12,0x14,0,0,0,0,0
85 .asciz  "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
86
87 .globl  .${prefix}_set_encrypt_key
88 .align  5
89 .${prefix}_set_encrypt_key:
90 Lset_encrypt_key:
91         mflr            r11
92         lis             r0,0xfff0
93         $PUSH           r11,$LRSAVE($sp)
94         mfspr           $vrsave,256
95         mtspr           256,r0
96
97         bl              Lconsts
98         mtlr            r11
99
100         neg             r9,$inp
101         lvx             $in0,0,$inp
102         addi            $inp,$inp,15            # 15 is not typo
103         lvsr            $key,0,r9               # borrow $key
104         li              r8,0x20
105         cmpwi           $bits,192
106         lvx             $in1,0,$inp
107         le?vspltisb     $mask,0x0f              # borrow $mask
108         lvx             $rcon,0,$ptr
109         le?vxor         $key,$key,$mask         # adjust for byte swap
110         lvx             $mask,r8,$ptr
111         addi            $ptr,$ptr,0x10
112         vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
113         li              $cnt,8
114         vxor            $zero,$zero,$zero
115         mtctr           $cnt
116
117         ?lvsr           $outperm,0,$out
118         vspltisb        $outmask,-1
119         lvx             $outhead,0,$out
120         ?vperm          $outmask,$zero,$outmask,$outperm
121
122         blt             Loop128
123         addi            $inp,$inp,8
124         beq             L192
125         addi            $inp,$inp,8
126         b               L256
127
128 .align  4
129 Loop128:
130         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
131         vsldoi          $tmp,$zero,$in0,12      # >>32
132          vperm          $outtail,$in0,$in0,$outperm     # rotate
133          vsel           $stage,$outhead,$outtail,$outmask
134          vmr            $outhead,$outtail
135         vcipherlast     $key,$key,$rcon
136          stvx           $stage,0,$out
137          addi           $out,$out,16
138
139         vxor            $in0,$in0,$tmp
140         vsldoi          $tmp,$zero,$tmp,12      # >>32
141         vxor            $in0,$in0,$tmp
142         vsldoi          $tmp,$zero,$tmp,12      # >>32
143         vxor            $in0,$in0,$tmp
144          vadduwm        $rcon,$rcon,$rcon
145         vxor            $in0,$in0,$key
146         bdnz            Loop128
147
148         lvx             $rcon,0,$ptr            # last two round keys
149
150         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
151         vsldoi          $tmp,$zero,$in0,12      # >>32
152          vperm          $outtail,$in0,$in0,$outperm     # rotate
153          vsel           $stage,$outhead,$outtail,$outmask
154          vmr            $outhead,$outtail
155         vcipherlast     $key,$key,$rcon
156          stvx           $stage,0,$out
157          addi           $out,$out,16
158
159         vxor            $in0,$in0,$tmp
160         vsldoi          $tmp,$zero,$tmp,12      # >>32
161         vxor            $in0,$in0,$tmp
162         vsldoi          $tmp,$zero,$tmp,12      # >>32
163         vxor            $in0,$in0,$tmp
164          vadduwm        $rcon,$rcon,$rcon
165         vxor            $in0,$in0,$key
166
167         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
168         vsldoi          $tmp,$zero,$in0,12      # >>32
169          vperm          $outtail,$in0,$in0,$outperm     # rotate
170          vsel           $stage,$outhead,$outtail,$outmask
171          vmr            $outhead,$outtail
172         vcipherlast     $key,$key,$rcon
173          stvx           $stage,0,$out
174          addi           $out,$out,16
175
176         vxor            $in0,$in0,$tmp
177         vsldoi          $tmp,$zero,$tmp,12      # >>32
178         vxor            $in0,$in0,$tmp
179         vsldoi          $tmp,$zero,$tmp,12      # >>32
180         vxor            $in0,$in0,$tmp
181         vxor            $in0,$in0,$key
182          vperm          $outtail,$in0,$in0,$outperm     # rotate
183          vsel           $stage,$outhead,$outtail,$outmask
184          vmr            $outhead,$outtail
185          stvx           $stage,0,$out
186
187         addi            $inp,$out,15            # 15 is not typo
188         addi            $out,$out,0x50
189
190         li              $rounds,10
191         b               Ldone
192
193 .align  4
194 L192:
195         lvx             $tmp,0,$inp
196         li              $cnt,4
197          vperm          $outtail,$in0,$in0,$outperm     # rotate
198          vsel           $stage,$outhead,$outtail,$outmask
199          vmr            $outhead,$outtail
200          stvx           $stage,0,$out
201          addi           $out,$out,16
202         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
203         vspltisb        $key,8                  # borrow $key
204         mtctr           $cnt
205         vsububm         $mask,$mask,$key        # adjust the mask
206
207 Loop192:
208         vperm           $key,$in1,$in1,$mask    # roate-n-splat
209         vsldoi          $tmp,$zero,$in0,12      # >>32
210         vcipherlast     $key,$key,$rcon
211
212         vxor            $in0,$in0,$tmp
213         vsldoi          $tmp,$zero,$tmp,12      # >>32
214         vxor            $in0,$in0,$tmp
215         vsldoi          $tmp,$zero,$tmp,12      # >>32
216         vxor            $in0,$in0,$tmp
217
218          vsldoi         $stage,$zero,$in1,8
219         vspltw          $tmp,$in0,3
220         vxor            $tmp,$tmp,$in1
221         vsldoi          $in1,$zero,$in1,12      # >>32
222          vadduwm        $rcon,$rcon,$rcon
223         vxor            $in1,$in1,$tmp
224         vxor            $in0,$in0,$key
225         vxor            $in1,$in1,$key
226          vsldoi         $stage,$stage,$in0,8
227
228         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
229         vsldoi          $tmp,$zero,$in0,12      # >>32
230          vperm          $outtail,$stage,$stage,$outperm # rotate
231          vsel           $stage,$outhead,$outtail,$outmask
232          vmr            $outhead,$outtail
233         vcipherlast     $key,$key,$rcon
234          stvx           $stage,0,$out
235          addi           $out,$out,16
236
237          vsldoi         $stage,$in0,$in1,8
238         vxor            $in0,$in0,$tmp
239         vsldoi          $tmp,$zero,$tmp,12      # >>32
240          vperm          $outtail,$stage,$stage,$outperm # rotate
241          vsel           $stage,$outhead,$outtail,$outmask
242          vmr            $outhead,$outtail
243         vxor            $in0,$in0,$tmp
244         vsldoi          $tmp,$zero,$tmp,12      # >>32
245         vxor            $in0,$in0,$tmp
246          stvx           $stage,0,$out
247          addi           $out,$out,16
248
249         vspltw          $tmp,$in0,3
250         vxor            $tmp,$tmp,$in1
251         vsldoi          $in1,$zero,$in1,12      # >>32
252          vadduwm        $rcon,$rcon,$rcon
253         vxor            $in1,$in1,$tmp
254         vxor            $in0,$in0,$key
255         vxor            $in1,$in1,$key
256          vperm          $outtail,$in0,$in0,$outperm     # rotate
257          vsel           $stage,$outhead,$outtail,$outmask
258          vmr            $outhead,$outtail
259          stvx           $stage,0,$out
260          addi           $inp,$out,15            # 15 is not typo
261          addi           $out,$out,16
262         bdnz            Loop192
263
264         li              $rounds,12
265         addi            $out,$out,0x20
266         b               Ldone
267
268 .align  4
269 L256:
270         lvx             $tmp,0,$inp
271         li              $cnt,7
272         li              $rounds,14
273          vperm          $outtail,$in0,$in0,$outperm     # rotate
274          vsel           $stage,$outhead,$outtail,$outmask
275          vmr            $outhead,$outtail
276          stvx           $stage,0,$out
277          addi           $out,$out,16
278         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
279         mtctr           $cnt
280
281 Loop256:
282         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
283         vsldoi          $tmp,$zero,$in0,12      # >>32
284          vperm          $outtail,$in1,$in1,$outperm     # rotate
285          vsel           $stage,$outhead,$outtail,$outmask
286          vmr            $outhead,$outtail
287         vcipherlast     $key,$key,$rcon
288          stvx           $stage,0,$out
289          addi           $out,$out,16
290
291         vxor            $in0,$in0,$tmp
292         vsldoi          $tmp,$zero,$tmp,12      # >>32
293         vxor            $in0,$in0,$tmp
294         vsldoi          $tmp,$zero,$tmp,12      # >>32
295         vxor            $in0,$in0,$tmp
296          vadduwm        $rcon,$rcon,$rcon
297         vxor            $in0,$in0,$key
298          vperm          $outtail,$in0,$in0,$outperm     # rotate
299          vsel           $stage,$outhead,$outtail,$outmask
300          vmr            $outhead,$outtail
301          stvx           $stage,0,$out
302          addi           $inp,$out,15            # 15 is not typo
303          addi           $out,$out,16
304         bdz             Ldone
305
306         vspltw          $key,$in0,3             # just splat
307         vsldoi          $tmp,$zero,$in1,12      # >>32
308         vsbox           $key,$key
309
310         vxor            $in1,$in1,$tmp
311         vsldoi          $tmp,$zero,$tmp,12      # >>32
312         vxor            $in1,$in1,$tmp
313         vsldoi          $tmp,$zero,$tmp,12      # >>32
314         vxor            $in1,$in1,$tmp
315
316         vxor            $in1,$in1,$key
317         b               Loop256
318
319 .align  4
320 Ldone:
321         lvx             $in1,0,$inp             # redundant in aligned case
322         vsel            $in1,$outhead,$in1,$outmask
323         stvx            $in1,0,$inp
324         xor             r3,r3,r3                # return value
325         mtspr           256,$vrsave
326         stw             $rounds,0($out)
327
328         blr
329         .long           0
330         .byte           0,12,0x14,1,0,0,3,0
331         .long           0
332 .size   .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
333
334 .globl  .${prefix}_set_decrypt_key
335 .align  5
336 .${prefix}_set_decrypt_key:
337         $STU            $sp,-$FRAME($sp)
338         mflr            r10
339         $PUSH           r10,$FRAME+$LRSAVE($sp)
340         bl              Lset_encrypt_key
341         mtlr            r10
342
343         slwi            $cnt,$rounds,4
344         subi            $inp,$out,240           # first round key
345         srwi            $rounds,$rounds,1
346         add             $out,$inp,$cnt          # last round key
347         mtctr           $rounds
348
349 Ldeckey:
350         lwz             r0, 0($inp)
351         lwz             r6, 4($inp)
352         lwz             r7, 8($inp)
353         lwz             r8, 12($inp)
354         addi            $inp,$inp,16
355         lwz             r9, 0($out)
356         lwz             r10,4($out)
357         lwz             r11,8($out)
358         lwz             r12,12($out)
359         stw             r0, 0($out)
360         stw             r6, 4($out)
361         stw             r7, 8($out)
362         stw             r8, 12($out)
363         subi            $out,$out,16
364         stw             r9, -16($inp)
365         stw             r10,-12($inp)
366         stw             r11,-8($inp)
367         stw             r12,-4($inp)
368         bdnz            Ldeckey
369
370         xor             r3,r3,r3                # return value
371         addi            $sp,$sp,$FRAME
372         blr
373         .long           0
374         .byte           0,12,4,1,0x80,0,3,0
375         .long           0
376 .size   .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
377 ___
378 }}}
379 #########################################################################
380 {{{     # Single block en- and decrypt procedures                       #
381 sub gen_block () {
382 my $dir = shift;
383 my $n   = $dir eq "de" ? "n" : "";
384 my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
385
386 $code.=<<___;
387 .globl  .${prefix}_${dir}crypt
388 .align  5
389 .${prefix}_${dir}crypt:
390         lwz             $rounds,240($key)
391         lis             r0,0xfc00
392         mfspr           $vrsave,256
393         li              $idx,15                 # 15 is not typo
394         mtspr           256,r0
395
396         lvx             v0,0,$inp
397         neg             r11,$out
398         lvx             v1,$idx,$inp
399         lvsl            v2,0,$inp               # inpperm
400         le?vspltisb     v4,0x0f
401         ?lvsl           v3,0,r11                # outperm
402         le?vxor         v2,v2,v4
403         li              $idx,16
404         vperm           v0,v0,v1,v2             # align [and byte swap in LE]
405         lvx             v1,0,$key
406         ?lvsl           v5,0,$key               # keyperm
407         srwi            $rounds,$rounds,1
408         lvx             v2,$idx,$key
409         addi            $idx,$idx,16
410         subi            $rounds,$rounds,1
411         ?vperm          v1,v1,v2,v5             # align round key
412
413         vxor            v0,v0,v1
414         lvx             v1,$idx,$key
415         addi            $idx,$idx,16
416         mtctr           $rounds
417
418 Loop_${dir}c:
419         ?vperm          v2,v2,v1,v5
420         v${n}cipher     v0,v0,v2
421         lvx             v2,$idx,$key
422         addi            $idx,$idx,16
423         ?vperm          v1,v1,v2,v5
424         v${n}cipher     v0,v0,v1
425         lvx             v1,$idx,$key
426         addi            $idx,$idx,16
427         bdnz            Loop_${dir}c
428
429         ?vperm          v2,v2,v1,v5
430         v${n}cipher     v0,v0,v2
431         lvx             v2,$idx,$key
432         ?vperm          v1,v1,v2,v5
433         v${n}cipherlast v0,v0,v1
434
435         vspltisb        v2,-1
436         vxor            v1,v1,v1
437         li              $idx,15                 # 15 is not typo
438         ?vperm          v2,v1,v2,v3             # outmask
439         le?vxor         v3,v3,v4
440         lvx             v1,0,$out               # outhead
441         vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
442         vsel            v1,v1,v0,v2
443         lvx             v4,$idx,$out
444         stvx            v1,0,$out
445         vsel            v0,v0,v4,v2
446         stvx            v0,$idx,$out
447
448         mtspr           256,$vrsave
449         blr
450         .long           0
451         .byte           0,12,0x14,0,0,0,3,0
452         .long           0
453 .size   .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
454 ___
455 }
456 &gen_block("en");
457 &gen_block("de");
458 }}}
459 #########################################################################
460 {{{     # CBC en- and decrypt procedures                                #
461 my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
462 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
463 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
464                                                 map("v$_",(4..10));
465 $code.=<<___;
466 .globl  .${prefix}_cbc_encrypt
467 .align  5
468 .${prefix}_cbc_encrypt:
469         ${UCMP}i        $len,16
470         bltlr-
471
472         cmpwi           $enc,0                  # test direction
473         lis             r0,0xffe0
474         mfspr           $vrsave,256
475         mtspr           256,r0
476
477         li              $idx,15
478         vxor            $rndkey0,$rndkey0,$rndkey0
479         le?vspltisb     $tmp,0x0f
480
481         lvx             $ivec,0,$ivp            # load [unaligned] iv
482         lvsl            $inpperm,0,$ivp
483         lvx             $inptail,$idx,$ivp
484         le?vxor         $inpperm,$inpperm,$tmp
485         vperm           $ivec,$ivec,$inptail,$inpperm
486
487         neg             r11,$inp
488         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
489         lwz             $rounds,240($key)
490
491         lvsr            $inpperm,0,r11          # prepare for unaligned load
492         lvx             $inptail,0,$inp
493         addi            $inp,$inp,15            # 15 is not typo
494         le?vxor         $inpperm,$inpperm,$tmp
495
496         ?lvsr           $outperm,0,$out         # prepare for unaligned store
497         vspltisb        $outmask,-1
498         lvx             $outhead,0,$out
499         ?vperm          $outmask,$rndkey0,$outmask,$outperm
500         le?vxor         $outperm,$outperm,$tmp
501
502         srwi            $rounds,$rounds,1
503         li              $idx,16
504         subi            $rounds,$rounds,1
505         beq             Lcbc_dec
506
507 Lcbc_enc:
508         vmr             $inout,$inptail
509         lvx             $inptail,0,$inp
510         addi            $inp,$inp,16
511         mtctr           $rounds
512         subi            $len,$len,16            # len-=16
513
514         lvx             $rndkey0,0,$key
515          vperm          $inout,$inout,$inptail,$inpperm
516         lvx             $rndkey1,$idx,$key
517         addi            $idx,$idx,16
518         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
519         vxor            $inout,$inout,$rndkey0
520         lvx             $rndkey0,$idx,$key
521         addi            $idx,$idx,16
522         vxor            $inout,$inout,$ivec
523
524 Loop_cbc_enc:
525         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
526         vcipher         $inout,$inout,$rndkey1
527         lvx             $rndkey1,$idx,$key
528         addi            $idx,$idx,16
529         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
530         vcipher         $inout,$inout,$rndkey0
531         lvx             $rndkey0,$idx,$key
532         addi            $idx,$idx,16
533         bdnz            Loop_cbc_enc
534
535         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
536         vcipher         $inout,$inout,$rndkey1
537         lvx             $rndkey1,$idx,$key
538         li              $idx,16
539         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
540         vcipherlast     $ivec,$inout,$rndkey0
541         ${UCMP}i        $len,16
542
543         vperm           $tmp,$ivec,$ivec,$outperm
544         vsel            $inout,$outhead,$tmp,$outmask
545         vmr             $outhead,$tmp
546         stvx            $inout,0,$out
547         addi            $out,$out,16
548         bge             Lcbc_enc
549
550         b               Lcbc_done
551
552 .align  4
553 Lcbc_dec:
554         ${UCMP}i        $len,128
555         bge             _aesp8_cbc_decrypt8x
556         vmr             $tmp,$inptail
557         lvx             $inptail,0,$inp
558         addi            $inp,$inp,16
559         mtctr           $rounds
560         subi            $len,$len,16            # len-=16
561
562         lvx             $rndkey0,0,$key
563          vperm          $tmp,$tmp,$inptail,$inpperm
564         lvx             $rndkey1,$idx,$key
565         addi            $idx,$idx,16
566         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
567         vxor            $inout,$tmp,$rndkey0
568         lvx             $rndkey0,$idx,$key
569         addi            $idx,$idx,16
570
571 Loop_cbc_dec:
572         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
573         vncipher        $inout,$inout,$rndkey1
574         lvx             $rndkey1,$idx,$key
575         addi            $idx,$idx,16
576         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
577         vncipher        $inout,$inout,$rndkey0
578         lvx             $rndkey0,$idx,$key
579         addi            $idx,$idx,16
580         bdnz            Loop_cbc_dec
581
582         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
583         vncipher        $inout,$inout,$rndkey1
584         lvx             $rndkey1,$idx,$key
585         li              $idx,16
586         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
587         vncipherlast    $inout,$inout,$rndkey0
588         ${UCMP}i        $len,16
589
590         vxor            $inout,$inout,$ivec
591         vmr             $ivec,$tmp
592         vperm           $tmp,$inout,$inout,$outperm
593         vsel            $inout,$outhead,$tmp,$outmask
594         vmr             $outhead,$tmp
595         stvx            $inout,0,$out
596         addi            $out,$out,16
597         bge             Lcbc_dec
598
599 Lcbc_done:
600         addi            $out,$out,-1
601         lvx             $inout,0,$out           # redundant in aligned case
602         vsel            $inout,$outhead,$inout,$outmask
603         stvx            $inout,0,$out
604
605         neg             $enc,$ivp               # write [unaligned] iv
606         li              $idx,15                 # 15 is not typo
607         vxor            $rndkey0,$rndkey0,$rndkey0
608         vspltisb        $outmask,-1
609         le?vspltisb     $tmp,0x0f
610         ?lvsl           $outperm,0,$enc
611         ?vperm          $outmask,$rndkey0,$outmask,$outperm
612         le?vxor         $outperm,$outperm,$tmp
613         lvx             $outhead,0,$ivp
614         vperm           $ivec,$ivec,$ivec,$outperm
615         vsel            $inout,$outhead,$ivec,$outmask
616         lvx             $inptail,$idx,$ivp
617         stvx            $inout,0,$ivp
618         vsel            $inout,$ivec,$inptail,$outmask
619         stvx            $inout,$idx,$ivp
620
621         mtspr           256,$vrsave
622         blr
623         .long           0
624         .byte           0,12,0x14,0,0,0,6,0
625         .long           0
626 ___
627 #########################################################################
628 {{      # Optimized CBC decrypt procedure                               #
629 my $key_="r11";
630 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
631 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
632 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
633 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
634                         # v26-v31 last 6 round keys
635 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
636
637 $code.=<<___;
638 .align  5
639 _aesp8_cbc_decrypt8x:
640         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
641         li              r10,`$FRAME+8*16+15`
642         li              r11,`$FRAME+8*16+31`
643         stvx            v20,r10,$sp             # ABI says so
644         addi            r10,r10,32
645         stvx            v21,r11,$sp
646         addi            r11,r11,32
647         stvx            v22,r10,$sp
648         addi            r10,r10,32
649         stvx            v23,r11,$sp
650         addi            r11,r11,32
651         stvx            v24,r10,$sp
652         addi            r10,r10,32
653         stvx            v25,r11,$sp
654         addi            r11,r11,32
655         stvx            v26,r10,$sp
656         addi            r10,r10,32
657         stvx            v27,r11,$sp
658         addi            r11,r11,32
659         stvx            v28,r10,$sp
660         addi            r10,r10,32
661         stvx            v29,r11,$sp
662         addi            r11,r11,32
663         stvx            v30,r10,$sp
664         stvx            v31,r11,$sp
665         li              r0,-1
666         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
667         li              $x10,0x10
668         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
669         li              $x20,0x20
670         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
671         li              $x30,0x30
672         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
673         li              $x40,0x40
674         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
675         li              $x50,0x50
676         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
677         li              $x60,0x60
678         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
679         li              $x70,0x70
680         mtspr           256,r0
681
682         subi            $rounds,$rounds,3       # -4 in total
683         subi            $len,$len,128           # bias
684
685         lvx             $rndkey0,$x00,$key      # load key schedule
686         lvx             v30,$x10,$key
687         addi            $key,$key,0x20
688         lvx             v31,$x00,$key
689         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
690         addi            $key_,$sp,$FRAME+15
691         mtctr           $rounds
692
693 Load_cbc_dec_key:
694         ?vperm          v24,v30,v31,$keyperm
695         lvx             v30,$x10,$key
696         addi            $key,$key,0x20
697         stvx            v24,$x00,$key_          # off-load round[1]
698         ?vperm          v25,v31,v30,$keyperm
699         lvx             v31,$x00,$key
700         stvx            v25,$x10,$key_          # off-load round[2]
701         addi            $key_,$key_,0x20
702         bdnz            Load_cbc_dec_key
703
704         lvx             v26,$x10,$key
705         ?vperm          v24,v30,v31,$keyperm
706         lvx             v27,$x20,$key
707         stvx            v24,$x00,$key_          # off-load round[3]
708         ?vperm          v25,v31,v26,$keyperm
709         lvx             v28,$x30,$key
710         stvx            v25,$x10,$key_          # off-load round[4]
711         addi            $key_,$sp,$FRAME+15     # rewind $key_
712         ?vperm          v26,v26,v27,$keyperm
713         lvx             v29,$x40,$key
714         ?vperm          v27,v27,v28,$keyperm
715         lvx             v30,$x50,$key
716         ?vperm          v28,v28,v29,$keyperm
717         lvx             v31,$x60,$key
718         ?vperm          v29,v29,v30,$keyperm
719         lvx             $out0,$x70,$key         # borrow $out0
720         ?vperm          v30,v30,v31,$keyperm
721         lvx             v24,$x00,$key_          # pre-load round[1]
722         ?vperm          v31,v31,$out0,$keyperm
723         lvx             v25,$x10,$key_          # pre-load round[2]
724
725         #lvx            $inptail,0,$inp         # "caller" already did this
726         #addi           $inp,$inp,15            # 15 is not typo
727         subi            $inp,$inp,15            # undo "caller"
728
729          le?li          $idx,8
730         lvx_u           $in0,$x00,$inp          # load first 8 "words"
731          le?lvsl        $inpperm,0,$idx
732          le?vspltisb    $tmp,0x0f
733         lvx_u           $in1,$x10,$inp
734          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
735         lvx_u           $in2,$x20,$inp
736          le?vperm       $in0,$in0,$in0,$inpperm
737         lvx_u           $in3,$x30,$inp
738          le?vperm       $in1,$in1,$in1,$inpperm
739         lvx_u           $in4,$x40,$inp
740          le?vperm       $in2,$in2,$in2,$inpperm
741         vxor            $out0,$in0,$rndkey0
742         lvx_u           $in5,$x50,$inp
743          le?vperm       $in3,$in3,$in3,$inpperm
744         vxor            $out1,$in1,$rndkey0
745         lvx_u           $in6,$x60,$inp
746          le?vperm       $in4,$in4,$in4,$inpperm
747         vxor            $out2,$in2,$rndkey0
748         lvx_u           $in7,$x70,$inp
749         addi            $inp,$inp,0x80
750          le?vperm       $in5,$in5,$in5,$inpperm
751         vxor            $out3,$in3,$rndkey0
752          le?vperm       $in6,$in6,$in6,$inpperm
753         vxor            $out4,$in4,$rndkey0
754          le?vperm       $in7,$in7,$in7,$inpperm
755         vxor            $out5,$in5,$rndkey0
756         vxor            $out6,$in6,$rndkey0
757         vxor            $out7,$in7,$rndkey0
758
759         mtctr           $rounds
760         b               Loop_cbc_dec8x
761 .align  5
762 Loop_cbc_dec8x:
763         vncipher        $out0,$out0,v24
764         vncipher        $out1,$out1,v24
765         vncipher        $out2,$out2,v24
766         vncipher        $out3,$out3,v24
767         vncipher        $out4,$out4,v24
768         vncipher        $out5,$out5,v24
769         vncipher        $out6,$out6,v24
770         vncipher        $out7,$out7,v24
771         lvx             v24,$x20,$key_          # round[3]
772         addi            $key_,$key_,0x20
773
774         vncipher        $out0,$out0,v25
775         vncipher        $out1,$out1,v25
776         vncipher        $out2,$out2,v25
777         vncipher        $out3,$out3,v25
778         vncipher        $out4,$out4,v25
779         vncipher        $out5,$out5,v25
780         vncipher        $out6,$out6,v25
781         vncipher        $out7,$out7,v25
782         lvx             v25,$x10,$key_          # round[4]
783         bdnz            Loop_cbc_dec8x
784
785         subic           $len,$len,128           # $len-=128
786         vncipher        $out0,$out0,v24
787         vncipher        $out1,$out1,v24
788         vncipher        $out2,$out2,v24
789         vncipher        $out3,$out3,v24
790         vncipher        $out4,$out4,v24
791         vncipher        $out5,$out5,v24
792         vncipher        $out6,$out6,v24
793         vncipher        $out7,$out7,v24
794
795         subfe.          r0,r0,r0                # borrow?-1:0
796         vncipher        $out0,$out0,v25
797         vncipher        $out1,$out1,v25
798         vncipher        $out2,$out2,v25
799         vncipher        $out3,$out3,v25
800         vncipher        $out4,$out4,v25
801         vncipher        $out5,$out5,v25
802         vncipher        $out6,$out6,v25
803         vncipher        $out7,$out7,v25
804
805         and             r0,r0,$len
806         vncipher        $out0,$out0,v26
807         vncipher        $out1,$out1,v26
808         vncipher        $out2,$out2,v26
809         vncipher        $out3,$out3,v26
810         vncipher        $out4,$out4,v26
811         vncipher        $out5,$out5,v26
812         vncipher        $out6,$out6,v26
813         vncipher        $out7,$out7,v26
814
815         add             $inp,$inp,r0            # $inp is adjusted in such
816                                                 # way that at exit from the
817                                                 # loop inX-in7 are loaded
818                                                 # with last "words"
819         vncipher        $out0,$out0,v27
820         vncipher        $out1,$out1,v27
821         vncipher        $out2,$out2,v27
822         vncipher        $out3,$out3,v27
823         vncipher        $out4,$out4,v27
824         vncipher        $out5,$out5,v27
825         vncipher        $out6,$out6,v27
826         vncipher        $out7,$out7,v27
827
828         addi            $key_,$sp,$FRAME+15     # rewind $key_
829         vncipher        $out0,$out0,v28
830         vncipher        $out1,$out1,v28
831         vncipher        $out2,$out2,v28
832         vncipher        $out3,$out3,v28
833         vncipher        $out4,$out4,v28
834         vncipher        $out5,$out5,v28
835         vncipher        $out6,$out6,v28
836         vncipher        $out7,$out7,v28
837         lvx             v24,$x00,$key_          # re-pre-load round[1]
838
839         vncipher        $out0,$out0,v29
840         vncipher        $out1,$out1,v29
841         vncipher        $out2,$out2,v29
842         vncipher        $out3,$out3,v29
843         vncipher        $out4,$out4,v29
844         vncipher        $out5,$out5,v29
845         vncipher        $out6,$out6,v29
846         vncipher        $out7,$out7,v29
847         lvx             v25,$x10,$key_          # re-pre-load round[2]
848
849         vncipher        $out0,$out0,v30
850          vxor           $ivec,$ivec,v31         # xor with last round key
851         vncipher        $out1,$out1,v30
852          vxor           $in0,$in0,v31
853         vncipher        $out2,$out2,v30
854          vxor           $in1,$in1,v31
855         vncipher        $out3,$out3,v30
856          vxor           $in2,$in2,v31
857         vncipher        $out4,$out4,v30
858          vxor           $in3,$in3,v31
859         vncipher        $out5,$out5,v30
860          vxor           $in4,$in4,v31
861         vncipher        $out6,$out6,v30
862          vxor           $in5,$in5,v31
863         vncipher        $out7,$out7,v30
864          vxor           $in6,$in6,v31
865
866         vncipherlast    $out0,$out0,$ivec
867         vncipherlast    $out1,$out1,$in0
868          lvx_u          $in0,$x00,$inp          # load next input block
869         vncipherlast    $out2,$out2,$in1
870          lvx_u          $in1,$x10,$inp
871         vncipherlast    $out3,$out3,$in2
872          le?vperm       $in0,$in0,$in0,$inpperm
873          lvx_u          $in2,$x20,$inp
874         vncipherlast    $out4,$out4,$in3
875          le?vperm       $in1,$in1,$in1,$inpperm
876          lvx_u          $in3,$x30,$inp
877         vncipherlast    $out5,$out5,$in4
878          le?vperm       $in2,$in2,$in2,$inpperm
879          lvx_u          $in4,$x40,$inp
880         vncipherlast    $out6,$out6,$in5
881          le?vperm       $in3,$in3,$in3,$inpperm
882          lvx_u          $in5,$x50,$inp
883         vncipherlast    $out7,$out7,$in6
884          le?vperm       $in4,$in4,$in4,$inpperm
885          lvx_u          $in6,$x60,$inp
886         vmr             $ivec,$in7
887          le?vperm       $in5,$in5,$in5,$inpperm
888          lvx_u          $in7,$x70,$inp
889          addi           $inp,$inp,0x80
890
891         le?vperm        $out0,$out0,$out0,$inpperm
892         le?vperm        $out1,$out1,$out1,$inpperm
893         stvx_u          $out0,$x00,$out
894          le?vperm       $in6,$in6,$in6,$inpperm
895          vxor           $out0,$in0,$rndkey0
896         le?vperm        $out2,$out2,$out2,$inpperm
897         stvx_u          $out1,$x10,$out
898          le?vperm       $in7,$in7,$in7,$inpperm
899          vxor           $out1,$in1,$rndkey0
900         le?vperm        $out3,$out3,$out3,$inpperm
901         stvx_u          $out2,$x20,$out
902          vxor           $out2,$in2,$rndkey0
903         le?vperm        $out4,$out4,$out4,$inpperm
904         stvx_u          $out3,$x30,$out
905          vxor           $out3,$in3,$rndkey0
906         le?vperm        $out5,$out5,$out5,$inpperm
907         stvx_u          $out4,$x40,$out
908          vxor           $out4,$in4,$rndkey0
909         le?vperm        $out6,$out6,$out6,$inpperm
910         stvx_u          $out5,$x50,$out
911          vxor           $out5,$in5,$rndkey0
912         le?vperm        $out7,$out7,$out7,$inpperm
913         stvx_u          $out6,$x60,$out
914          vxor           $out6,$in6,$rndkey0
915         stvx_u          $out7,$x70,$out
916         addi            $out,$out,0x80
917          vxor           $out7,$in7,$rndkey0
918
919         mtctr           $rounds
920         beq             Loop_cbc_dec8x          # did $len-=128 borrow?
921
922         addic.          $len,$len,128
923         beq             Lcbc_dec8x_done
924         nop
925         nop
926
927 Loop_cbc_dec8x_tail:                            # up to 7 "words" tail...
928         vncipher        $out1,$out1,v24
929         vncipher        $out2,$out2,v24
930         vncipher        $out3,$out3,v24
931         vncipher        $out4,$out4,v24
932         vncipher        $out5,$out5,v24
933         vncipher        $out6,$out6,v24
934         vncipher        $out7,$out7,v24
935         lvx             v24,$x20,$key_          # round[3]
936         addi            $key_,$key_,0x20
937
938         vncipher        $out1,$out1,v25
939         vncipher        $out2,$out2,v25
940         vncipher        $out3,$out3,v25
941         vncipher        $out4,$out4,v25
942         vncipher        $out5,$out5,v25
943         vncipher        $out6,$out6,v25
944         vncipher        $out7,$out7,v25
945         lvx             v25,$x10,$key_          # round[4]
946         bdnz            Loop_cbc_dec8x_tail
947
948         vncipher        $out1,$out1,v24
949         vncipher        $out2,$out2,v24
950         vncipher        $out3,$out3,v24
951         vncipher        $out4,$out4,v24
952         vncipher        $out5,$out5,v24
953         vncipher        $out6,$out6,v24
954         vncipher        $out7,$out7,v24
955
956         vncipher        $out1,$out1,v25
957         vncipher        $out2,$out2,v25
958         vncipher        $out3,$out3,v25
959         vncipher        $out4,$out4,v25
960         vncipher        $out5,$out5,v25
961         vncipher        $out6,$out6,v25
962         vncipher        $out7,$out7,v25
963
964         vncipher        $out1,$out1,v26
965         vncipher        $out2,$out2,v26
966         vncipher        $out3,$out3,v26
967         vncipher        $out4,$out4,v26
968         vncipher        $out5,$out5,v26
969         vncipher        $out6,$out6,v26
970         vncipher        $out7,$out7,v26
971
972         vncipher        $out1,$out1,v27
973         vncipher        $out2,$out2,v27
974         vncipher        $out3,$out3,v27
975         vncipher        $out4,$out4,v27
976         vncipher        $out5,$out5,v27
977         vncipher        $out6,$out6,v27
978         vncipher        $out7,$out7,v27
979
980         vncipher        $out1,$out1,v28
981         vncipher        $out2,$out2,v28
982         vncipher        $out3,$out3,v28
983         vncipher        $out4,$out4,v28
984         vncipher        $out5,$out5,v28
985         vncipher        $out6,$out6,v28
986         vncipher        $out7,$out7,v28
987
988         vncipher        $out1,$out1,v29
989         vncipher        $out2,$out2,v29
990         vncipher        $out3,$out3,v29
991         vncipher        $out4,$out4,v29
992         vncipher        $out5,$out5,v29
993         vncipher        $out6,$out6,v29
994         vncipher        $out7,$out7,v29
995
996         vncipher        $out1,$out1,v30
997          vxor           $ivec,$ivec,v31         # last round key
998         vncipher        $out2,$out2,v30
999          vxor           $in1,$in1,v31
1000         vncipher        $out3,$out3,v30
1001          vxor           $in2,$in2,v31
1002         vncipher        $out4,$out4,v30
1003          vxor           $in3,$in3,v31
1004         vncipher        $out5,$out5,v30
1005          vxor           $in4,$in4,v31
1006         vncipher        $out6,$out6,v30
1007          vxor           $in5,$in5,v31
1008         vncipher        $out7,$out7,v30
1009          vxor           $in6,$in6,v31
1010
1011         cmplwi          $len,32                 # switch($len)
1012         blt             Lcbc_dec8x_one
1013         nop
1014         beq             Lcbc_dec8x_two
1015         cmplwi          $len,64
1016         blt             Lcbc_dec8x_three
1017         nop
1018         beq             Lcbc_dec8x_four
1019         cmplwi          $len,96
1020         blt             Lcbc_dec8x_five
1021         nop
1022         beq             Lcbc_dec8x_six
1023
1024 Lcbc_dec8x_seven:
1025         vncipherlast    $out1,$out1,$ivec
1026         vncipherlast    $out2,$out2,$in1
1027         vncipherlast    $out3,$out3,$in2
1028         vncipherlast    $out4,$out4,$in3
1029         vncipherlast    $out5,$out5,$in4
1030         vncipherlast    $out6,$out6,$in5
1031         vncipherlast    $out7,$out7,$in6
1032         vmr             $ivec,$in7
1033
1034         le?vperm        $out1,$out1,$out1,$inpperm
1035         le?vperm        $out2,$out2,$out2,$inpperm
1036         stvx_u          $out1,$x00,$out
1037         le?vperm        $out3,$out3,$out3,$inpperm
1038         stvx_u          $out2,$x10,$out
1039         le?vperm        $out4,$out4,$out4,$inpperm
1040         stvx_u          $out3,$x20,$out
1041         le?vperm        $out5,$out5,$out5,$inpperm
1042         stvx_u          $out4,$x30,$out
1043         le?vperm        $out6,$out6,$out6,$inpperm
1044         stvx_u          $out5,$x40,$out
1045         le?vperm        $out7,$out7,$out7,$inpperm
1046         stvx_u          $out6,$x50,$out
1047         stvx_u          $out7,$x60,$out
1048         addi            $out,$out,0x70
1049         b               Lcbc_dec8x_done
1050
1051 .align  5
1052 Lcbc_dec8x_six:
1053         vncipherlast    $out2,$out2,$ivec
1054         vncipherlast    $out3,$out3,$in2
1055         vncipherlast    $out4,$out4,$in3
1056         vncipherlast    $out5,$out5,$in4
1057         vncipherlast    $out6,$out6,$in5
1058         vncipherlast    $out7,$out7,$in6
1059         vmr             $ivec,$in7
1060
1061         le?vperm        $out2,$out2,$out2,$inpperm
1062         le?vperm        $out3,$out3,$out3,$inpperm
1063         stvx_u          $out2,$x00,$out
1064         le?vperm        $out4,$out4,$out4,$inpperm
1065         stvx_u          $out3,$x10,$out
1066         le?vperm        $out5,$out5,$out5,$inpperm
1067         stvx_u          $out4,$x20,$out
1068         le?vperm        $out6,$out6,$out6,$inpperm
1069         stvx_u          $out5,$x30,$out
1070         le?vperm        $out7,$out7,$out7,$inpperm
1071         stvx_u          $out6,$x40,$out
1072         stvx_u          $out7,$x50,$out
1073         addi            $out,$out,0x60
1074         b               Lcbc_dec8x_done
1075
1076 .align  5
1077 Lcbc_dec8x_five:
1078         vncipherlast    $out3,$out3,$ivec
1079         vncipherlast    $out4,$out4,$in3
1080         vncipherlast    $out5,$out5,$in4
1081         vncipherlast    $out6,$out6,$in5
1082         vncipherlast    $out7,$out7,$in6
1083         vmr             $ivec,$in7
1084
1085         le?vperm        $out3,$out3,$out3,$inpperm
1086         le?vperm        $out4,$out4,$out4,$inpperm
1087         stvx_u          $out3,$x00,$out
1088         le?vperm        $out5,$out5,$out5,$inpperm
1089         stvx_u          $out4,$x10,$out
1090         le?vperm        $out6,$out6,$out6,$inpperm
1091         stvx_u          $out5,$x20,$out
1092         le?vperm        $out7,$out7,$out7,$inpperm
1093         stvx_u          $out6,$x30,$out
1094         stvx_u          $out7,$x40,$out
1095         addi            $out,$out,0x50
1096         b               Lcbc_dec8x_done
1097
1098 .align  5
1099 Lcbc_dec8x_four:
1100         vncipherlast    $out4,$out4,$ivec
1101         vncipherlast    $out5,$out5,$in4
1102         vncipherlast    $out6,$out6,$in5
1103         vncipherlast    $out7,$out7,$in6
1104         vmr             $ivec,$in7
1105
1106         le?vperm        $out4,$out4,$out4,$inpperm
1107         le?vperm        $out5,$out5,$out5,$inpperm
1108         stvx_u          $out4,$x00,$out
1109         le?vperm        $out6,$out6,$out6,$inpperm
1110         stvx_u          $out5,$x10,$out
1111         le?vperm        $out7,$out7,$out7,$inpperm
1112         stvx_u          $out6,$x20,$out
1113         stvx_u          $out7,$x30,$out
1114         addi            $out,$out,0x40
1115         b               Lcbc_dec8x_done
1116
1117 .align  5
1118 Lcbc_dec8x_three:
1119         vncipherlast    $out5,$out5,$ivec
1120         vncipherlast    $out6,$out6,$in5
1121         vncipherlast    $out7,$out7,$in6
1122         vmr             $ivec,$in7
1123
1124         le?vperm        $out5,$out5,$out5,$inpperm
1125         le?vperm        $out6,$out6,$out6,$inpperm
1126         stvx_u          $out5,$x00,$out
1127         le?vperm        $out7,$out7,$out7,$inpperm
1128         stvx_u          $out6,$x10,$out
1129         stvx_u          $out7,$x20,$out
1130         addi            $out,$out,0x30
1131         b               Lcbc_dec8x_done
1132
1133 .align  5
1134 Lcbc_dec8x_two:
1135         vncipherlast    $out6,$out6,$ivec
1136         vncipherlast    $out7,$out7,$in6
1137         vmr             $ivec,$in7
1138
1139         le?vperm        $out6,$out6,$out6,$inpperm
1140         le?vperm        $out7,$out7,$out7,$inpperm
1141         stvx_u          $out6,$x00,$out
1142         stvx_u          $out7,$x10,$out
1143         addi            $out,$out,0x20
1144         b               Lcbc_dec8x_done
1145
1146 .align  5
1147 Lcbc_dec8x_one:
1148         vncipherlast    $out7,$out7,$ivec
1149         vmr             $ivec,$in7
1150
1151         le?vperm        $out7,$out7,$out7,$inpperm
1152         stvx_u          $out7,0,$out
1153         addi            $out,$out,0x10
1154
1155 Lcbc_dec8x_done:
1156         le?vperm        $ivec,$ivec,$ivec,$inpperm
1157         stvx_u          $ivec,0,$ivp            # write [unaligned] iv
1158
1159         li              r10,`$FRAME+15`
1160         li              r11,`$FRAME+31`
1161         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1162         addi            r10,r10,32
1163         stvx            $inpperm,r11,$sp
1164         addi            r11,r11,32
1165         stvx            $inpperm,r10,$sp
1166         addi            r10,r10,32
1167         stvx            $inpperm,r11,$sp
1168         addi            r11,r11,32
1169         stvx            $inpperm,r10,$sp
1170         addi            r10,r10,32
1171         stvx            $inpperm,r11,$sp
1172         addi            r11,r11,32
1173         stvx            $inpperm,r10,$sp
1174         addi            r10,r10,32
1175         stvx            $inpperm,r11,$sp
1176         addi            r11,r11,32
1177
1178         mtspr           256,$vrsave
1179         lvx             v20,r10,$sp             # ABI says so
1180         addi            r10,r10,32
1181         lvx             v21,r11,$sp
1182         addi            r11,r11,32
1183         lvx             v22,r10,$sp
1184         addi            r10,r10,32
1185         lvx             v23,r11,$sp
1186         addi            r11,r11,32
1187         lvx             v24,r10,$sp
1188         addi            r10,r10,32
1189         lvx             v25,r11,$sp
1190         addi            r11,r11,32
1191         lvx             v26,r10,$sp
1192         addi            r10,r10,32
1193         lvx             v27,r11,$sp
1194         addi            r11,r11,32
1195         lvx             v28,r10,$sp
1196         addi            r10,r10,32
1197         lvx             v29,r11,$sp
1198         addi            r11,r11,32
1199         lvx             v30,r10,$sp
1200         lvx             v31,r11,$sp
1201         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1202         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1203         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1204         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1205         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1206         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1207         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1208         blr
1209         .long           0
1210         .byte           0,12,0x14,0,0x80,6,6,0
1211         .long           0
1212 .size   .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1213 ___
1214 }}      }}}
1215
1216 #########################################################################
1217 {{{     # CTR procedure[s]                                              #
1218 my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1219 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
1220 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1221                                                 map("v$_",(4..11));
1222 my $dat=$tmp;
1223
1224 $code.=<<___;
1225 .globl  .${prefix}_ctr32_encrypt_blocks
1226 .align  5
1227 .${prefix}_ctr32_encrypt_blocks:
1228         ${UCMP}i        $len,1
1229         bltlr-
1230
1231         lis             r0,0xfff0
1232         mfspr           $vrsave,256
1233         mtspr           256,r0
1234
1235         li              $idx,15
1236         vxor            $rndkey0,$rndkey0,$rndkey0
1237         le?vspltisb     $tmp,0x0f
1238
1239         lvx             $ivec,0,$ivp            # load [unaligned] iv
1240         lvsl            $inpperm,0,$ivp
1241         lvx             $inptail,$idx,$ivp
1242          vspltisb       $one,1
1243         le?vxor         $inpperm,$inpperm,$tmp
1244         vperm           $ivec,$ivec,$inptail,$inpperm
1245          vsldoi         $one,$rndkey0,$one,1
1246
1247         neg             r11,$inp
1248         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
1249         lwz             $rounds,240($key)
1250
1251         lvsr            $inpperm,0,r11          # prepare for unaligned load
1252         lvx             $inptail,0,$inp
1253         addi            $inp,$inp,15            # 15 is not typo
1254         le?vxor         $inpperm,$inpperm,$tmp
1255
1256         srwi            $rounds,$rounds,1
1257         li              $idx,16
1258         subi            $rounds,$rounds,1
1259
1260         ${UCMP}i        $len,8
1261         bge             _aesp8_ctr32_encrypt8x
1262
1263         ?lvsr           $outperm,0,$out         # prepare for unaligned store
1264         vspltisb        $outmask,-1
1265         lvx             $outhead,0,$out
1266         ?vperm          $outmask,$rndkey0,$outmask,$outperm
1267         le?vxor         $outperm,$outperm,$tmp
1268
1269         lvx             $rndkey0,0,$key
1270         mtctr           $rounds
1271         lvx             $rndkey1,$idx,$key
1272         addi            $idx,$idx,16
1273         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1274         vxor            $inout,$ivec,$rndkey0
1275         lvx             $rndkey0,$idx,$key
1276         addi            $idx,$idx,16
1277         b               Loop_ctr32_enc
1278
1279 .align  5
1280 Loop_ctr32_enc:
1281         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1282         vcipher         $inout,$inout,$rndkey1
1283         lvx             $rndkey1,$idx,$key
1284         addi            $idx,$idx,16
1285         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1286         vcipher         $inout,$inout,$rndkey0
1287         lvx             $rndkey0,$idx,$key
1288         addi            $idx,$idx,16
1289         bdnz            Loop_ctr32_enc
1290
1291         vadduwm         $ivec,$ivec,$one
1292          vmr            $dat,$inptail
1293          lvx            $inptail,0,$inp
1294          addi           $inp,$inp,16
1295          subic.         $len,$len,1             # blocks--
1296
1297         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1298         vcipher         $inout,$inout,$rndkey1
1299         lvx             $rndkey1,$idx,$key
1300          vperm          $dat,$dat,$inptail,$inpperm
1301          li             $idx,16
1302         ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
1303          lvx            $rndkey0,0,$key
1304         vxor            $dat,$dat,$rndkey1      # last round key
1305         vcipherlast     $inout,$inout,$dat
1306
1307          lvx            $rndkey1,$idx,$key
1308          addi           $idx,$idx,16
1309         vperm           $inout,$inout,$inout,$outperm
1310         vsel            $dat,$outhead,$inout,$outmask
1311          mtctr          $rounds
1312          ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
1313         vmr             $outhead,$inout
1314          vxor           $inout,$ivec,$rndkey0
1315          lvx            $rndkey0,$idx,$key
1316          addi           $idx,$idx,16
1317         stvx            $dat,0,$out
1318         addi            $out,$out,16
1319         bne             Loop_ctr32_enc
1320
1321         addi            $out,$out,-1
1322         lvx             $inout,0,$out           # redundant in aligned case
1323         vsel            $inout,$outhead,$inout,$outmask
1324         stvx            $inout,0,$out
1325
1326         mtspr           256,$vrsave
1327         blr
1328         .long           0
1329         .byte           0,12,0x14,0,0,0,6,0
1330         .long           0
1331 ___
1332 #########################################################################
1333 {{      # Optimized CTR procedure                                       #
1334 my $key_="r11";
1335 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1336 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1337 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1338 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
1339                         # v26-v31 last 6 round keys
1340 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
1341 my ($two,$three,$four)=($outhead,$outperm,$outmask);
1342
1343 $code.=<<___;
1344 .align  5
1345 _aesp8_ctr32_encrypt8x:
1346         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1347         li              r10,`$FRAME+8*16+15`
1348         li              r11,`$FRAME+8*16+31`
1349         stvx            v20,r10,$sp             # ABI says so
1350         addi            r10,r10,32
1351         stvx            v21,r11,$sp
1352         addi            r11,r11,32
1353         stvx            v22,r10,$sp
1354         addi            r10,r10,32
1355         stvx            v23,r11,$sp
1356         addi            r11,r11,32
1357         stvx            v24,r10,$sp
1358         addi            r10,r10,32
1359         stvx            v25,r11,$sp
1360         addi            r11,r11,32
1361         stvx            v26,r10,$sp
1362         addi            r10,r10,32
1363         stvx            v27,r11,$sp
1364         addi            r11,r11,32
1365         stvx            v28,r10,$sp
1366         addi            r10,r10,32
1367         stvx            v29,r11,$sp
1368         addi            r11,r11,32
1369         stvx            v30,r10,$sp
1370         stvx            v31,r11,$sp
1371         li              r0,-1
1372         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
1373         li              $x10,0x10
1374         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1375         li              $x20,0x20
1376         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1377         li              $x30,0x30
1378         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1379         li              $x40,0x40
1380         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1381         li              $x50,0x50
1382         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1383         li              $x60,0x60
1384         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1385         li              $x70,0x70
1386         mtspr           256,r0
1387
1388         subi            $rounds,$rounds,3       # -4 in total
1389
1390         lvx             $rndkey0,$x00,$key      # load key schedule
1391         lvx             v30,$x10,$key
1392         addi            $key,$key,0x20
1393         lvx             v31,$x00,$key
1394         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
1395         addi            $key_,$sp,$FRAME+15
1396         mtctr           $rounds
1397
1398 Load_ctr32_enc_key:
1399         ?vperm          v24,v30,v31,$keyperm
1400         lvx             v30,$x10,$key
1401         addi            $key,$key,0x20
1402         stvx            v24,$x00,$key_          # off-load round[1]
1403         ?vperm          v25,v31,v30,$keyperm
1404         lvx             v31,$x00,$key
1405         stvx            v25,$x10,$key_          # off-load round[2]
1406         addi            $key_,$key_,0x20
1407         bdnz            Load_ctr32_enc_key
1408
1409         lvx             v26,$x10,$key
1410         ?vperm          v24,v30,v31,$keyperm
1411         lvx             v27,$x20,$key
1412         stvx            v24,$x00,$key_          # off-load round[3]
1413         ?vperm          v25,v31,v26,$keyperm
1414         lvx             v28,$x30,$key
1415         stvx            v25,$x10,$key_          # off-load round[4]
1416         addi            $key_,$sp,$FRAME+15     # rewind $key_
1417         ?vperm          v26,v26,v27,$keyperm
1418         lvx             v29,$x40,$key
1419         ?vperm          v27,v27,v28,$keyperm
1420         lvx             v30,$x50,$key
1421         ?vperm          v28,v28,v29,$keyperm
1422         lvx             v31,$x60,$key
1423         ?vperm          v29,v29,v30,$keyperm
1424         lvx             $out0,$x70,$key         # borrow $out0
1425         ?vperm          v30,v30,v31,$keyperm
1426         lvx             v24,$x00,$key_          # pre-load round[1]
1427         ?vperm          v31,v31,$out0,$keyperm
1428         lvx             v25,$x10,$key_          # pre-load round[2]
1429
1430         vadduwm         $two,$one,$one
1431         subi            $inp,$inp,15            # undo "caller"
1432         $SHL            $len,$len,4
1433
1434         vadduwm         $out1,$ivec,$one        # counter values ...
1435         vadduwm         $out2,$ivec,$two
1436         vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1437          le?li          $idx,8
1438         vadduwm         $out3,$out1,$two
1439         vxor            $out1,$out1,$rndkey0
1440          le?lvsl        $inpperm,0,$idx
1441         vadduwm         $out4,$out2,$two
1442         vxor            $out2,$out2,$rndkey0
1443          le?vspltisb    $tmp,0x0f
1444         vadduwm         $out5,$out3,$two
1445         vxor            $out3,$out3,$rndkey0
1446          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
1447         vadduwm         $out6,$out4,$two
1448         vxor            $out4,$out4,$rndkey0
1449         vadduwm         $out7,$out5,$two
1450         vxor            $out5,$out5,$rndkey0
1451         vadduwm         $ivec,$out6,$two        # next counter value
1452         vxor            $out6,$out6,$rndkey0
1453         vxor            $out7,$out7,$rndkey0
1454
1455         mtctr           $rounds
1456         b               Loop_ctr32_enc8x
1457 .align  5
1458 Loop_ctr32_enc8x:
1459         vcipher         $out0,$out0,v24
1460         vcipher         $out1,$out1,v24
1461         vcipher         $out2,$out2,v24
1462         vcipher         $out3,$out3,v24
1463         vcipher         $out4,$out4,v24
1464         vcipher         $out5,$out5,v24
1465         vcipher         $out6,$out6,v24
1466         vcipher         $out7,$out7,v24
1467 Loop_ctr32_enc8x_middle:
1468         lvx             v24,$x20,$key_          # round[3]
1469         addi            $key_,$key_,0x20
1470
1471         vcipher         $out0,$out0,v25
1472         vcipher         $out1,$out1,v25
1473         vcipher         $out2,$out2,v25
1474         vcipher         $out3,$out3,v25
1475         vcipher         $out4,$out4,v25
1476         vcipher         $out5,$out5,v25
1477         vcipher         $out6,$out6,v25
1478         vcipher         $out7,$out7,v25
1479         lvx             v25,$x10,$key_          # round[4]
1480         bdnz            Loop_ctr32_enc8x
1481
1482         subic           r11,$len,256            # $len-256, borrow $key_
1483         vcipher         $out0,$out0,v24
1484         vcipher         $out1,$out1,v24
1485         vcipher         $out2,$out2,v24
1486         vcipher         $out3,$out3,v24
1487         vcipher         $out4,$out4,v24
1488         vcipher         $out5,$out5,v24
1489         vcipher         $out6,$out6,v24
1490         vcipher         $out7,$out7,v24
1491
1492         subfe           r0,r0,r0                # borrow?-1:0
1493         vcipher         $out0,$out0,v25
1494         vcipher         $out1,$out1,v25
1495         vcipher         $out2,$out2,v25
1496         vcipher         $out3,$out3,v25
1497         vcipher         $out4,$out4,v25
1498         vcipher         $out5,$out5,v25
1499         vcipher         $out6,$out6,v25
1500         vcipher         $out7,$out7,v25
1501
1502         and             r0,r0,r11
1503         addi            $key_,$sp,$FRAME+15     # rewind $key_
1504         vcipher         $out0,$out0,v26
1505         vcipher         $out1,$out1,v26
1506         vcipher         $out2,$out2,v26
1507         vcipher         $out3,$out3,v26
1508         vcipher         $out4,$out4,v26
1509         vcipher         $out5,$out5,v26
1510         vcipher         $out6,$out6,v26
1511         vcipher         $out7,$out7,v26
1512         lvx             v24,$x00,$key_          # re-pre-load round[1]
1513
1514         subic           $len,$len,129           # $len-=129
1515         vcipher         $out0,$out0,v27
1516         addi            $len,$len,1             # $len-=128 really
1517         vcipher         $out1,$out1,v27
1518         vcipher         $out2,$out2,v27
1519         vcipher         $out3,$out3,v27
1520         vcipher         $out4,$out4,v27
1521         vcipher         $out5,$out5,v27
1522         vcipher         $out6,$out6,v27
1523         vcipher         $out7,$out7,v27
1524         lvx             v25,$x10,$key_          # re-pre-load round[2]
1525
1526         vcipher         $out0,$out0,v28
1527          lvx_u          $in0,$x00,$inp          # load input
1528         vcipher         $out1,$out1,v28
1529          lvx_u          $in1,$x10,$inp
1530         vcipher         $out2,$out2,v28
1531          lvx_u          $in2,$x20,$inp
1532         vcipher         $out3,$out3,v28
1533          lvx_u          $in3,$x30,$inp
1534         vcipher         $out4,$out4,v28
1535          lvx_u          $in4,$x40,$inp
1536         vcipher         $out5,$out5,v28
1537          lvx_u          $in5,$x50,$inp
1538         vcipher         $out6,$out6,v28
1539          lvx_u          $in6,$x60,$inp
1540         vcipher         $out7,$out7,v28
1541          lvx_u          $in7,$x70,$inp
1542          addi           $inp,$inp,0x80
1543
1544         vcipher         $out0,$out0,v29
1545          le?vperm       $in0,$in0,$in0,$inpperm
1546         vcipher         $out1,$out1,v29
1547          le?vperm       $in1,$in1,$in1,$inpperm
1548         vcipher         $out2,$out2,v29
1549          le?vperm       $in2,$in2,$in2,$inpperm
1550         vcipher         $out3,$out3,v29
1551          le?vperm       $in3,$in3,$in3,$inpperm
1552         vcipher         $out4,$out4,v29
1553          le?vperm       $in4,$in4,$in4,$inpperm
1554         vcipher         $out5,$out5,v29
1555          le?vperm       $in5,$in5,$in5,$inpperm
1556         vcipher         $out6,$out6,v29
1557          le?vperm       $in6,$in6,$in6,$inpperm
1558         vcipher         $out7,$out7,v29
1559          le?vperm       $in7,$in7,$in7,$inpperm
1560
1561         add             $inp,$inp,r0            # $inp is adjusted in such
1562                                                 # way that at exit from the
1563                                                 # loop inX-in7 are loaded
1564                                                 # with last "words"
1565         subfe.          r0,r0,r0                # borrow?-1:0
1566         vcipher         $out0,$out0,v30
1567          vxor           $in0,$in0,v31           # xor with last round key
1568         vcipher         $out1,$out1,v30
1569          vxor           $in1,$in1,v31
1570         vcipher         $out2,$out2,v30
1571          vxor           $in2,$in2,v31
1572         vcipher         $out3,$out3,v30
1573          vxor           $in3,$in3,v31
1574         vcipher         $out4,$out4,v30
1575          vxor           $in4,$in4,v31
1576         vcipher         $out5,$out5,v30
1577          vxor           $in5,$in5,v31
1578         vcipher         $out6,$out6,v30
1579          vxor           $in6,$in6,v31
1580         vcipher         $out7,$out7,v30
1581          vxor           $in7,$in7,v31
1582
1583         bne             Lctr32_enc8x_break      # did $len-129 borrow?
1584
1585         vcipherlast     $in0,$out0,$in0
1586         vcipherlast     $in1,$out1,$in1
1587          vadduwm        $out1,$ivec,$one        # counter values ...
1588         vcipherlast     $in2,$out2,$in2
1589          vadduwm        $out2,$ivec,$two
1590          vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1591         vcipherlast     $in3,$out3,$in3
1592          vadduwm        $out3,$out1,$two
1593          vxor           $out1,$out1,$rndkey0
1594         vcipherlast     $in4,$out4,$in4
1595          vadduwm        $out4,$out2,$two
1596          vxor           $out2,$out2,$rndkey0
1597         vcipherlast     $in5,$out5,$in5
1598          vadduwm        $out5,$out3,$two
1599          vxor           $out3,$out3,$rndkey0
1600         vcipherlast     $in6,$out6,$in6
1601          vadduwm        $out6,$out4,$two
1602          vxor           $out4,$out4,$rndkey0
1603         vcipherlast     $in7,$out7,$in7
1604          vadduwm        $out7,$out5,$two
1605          vxor           $out5,$out5,$rndkey0
1606         le?vperm        $in0,$in0,$in0,$inpperm
1607          vadduwm        $ivec,$out6,$two        # next counter value
1608          vxor           $out6,$out6,$rndkey0
1609         le?vperm        $in1,$in1,$in1,$inpperm
1610          vxor           $out7,$out7,$rndkey0
1611         mtctr           $rounds
1612
1613          vcipher        $out0,$out0,v24
1614         stvx_u          $in0,$x00,$out
1615         le?vperm        $in2,$in2,$in2,$inpperm
1616          vcipher        $out1,$out1,v24
1617         stvx_u          $in1,$x10,$out
1618         le?vperm        $in3,$in3,$in3,$inpperm
1619          vcipher        $out2,$out2,v24
1620         stvx_u          $in2,$x20,$out
1621         le?vperm        $in4,$in4,$in4,$inpperm
1622          vcipher        $out3,$out3,v24
1623         stvx_u          $in3,$x30,$out
1624         le?vperm        $in5,$in5,$in5,$inpperm
1625          vcipher        $out4,$out4,v24
1626         stvx_u          $in4,$x40,$out
1627         le?vperm        $in6,$in6,$in6,$inpperm
1628          vcipher        $out5,$out5,v24
1629         stvx_u          $in5,$x50,$out
1630         le?vperm        $in7,$in7,$in7,$inpperm
1631          vcipher        $out6,$out6,v24
1632         stvx_u          $in6,$x60,$out
1633          vcipher        $out7,$out7,v24
1634         stvx_u          $in7,$x70,$out
1635         addi            $out,$out,0x80
1636
1637         b               Loop_ctr32_enc8x_middle
1638
1639 .align  5
1640 Lctr32_enc8x_break:
1641         cmpwi           $len,-0x60
1642         blt             Lctr32_enc8x_one
1643         nop
1644         beq             Lctr32_enc8x_two
1645         cmpwi           $len,-0x40
1646         blt             Lctr32_enc8x_three
1647         nop
1648         beq             Lctr32_enc8x_four
1649         cmpwi           $len,-0x20
1650         blt             Lctr32_enc8x_five
1651         nop
1652         beq             Lctr32_enc8x_six
1653         cmpwi           $len,0x00
1654         blt             Lctr32_enc8x_seven
1655
1656 Lctr32_enc8x_eight:
1657         vcipherlast     $out0,$out0,$in0
1658         vcipherlast     $out1,$out1,$in1
1659         vcipherlast     $out2,$out2,$in2
1660         vcipherlast     $out3,$out3,$in3
1661         vcipherlast     $out4,$out4,$in4
1662         vcipherlast     $out5,$out5,$in5
1663         vcipherlast     $out6,$out6,$in6
1664         vcipherlast     $out7,$out7,$in7
1665
1666         le?vperm        $out0,$out0,$out0,$inpperm
1667         le?vperm        $out1,$out1,$out1,$inpperm
1668         stvx_u          $out0,$x00,$out
1669         le?vperm        $out2,$out2,$out2,$inpperm
1670         stvx_u          $out1,$x10,$out
1671         le?vperm        $out3,$out3,$out3,$inpperm
1672         stvx_u          $out2,$x20,$out
1673         le?vperm        $out4,$out4,$out4,$inpperm
1674         stvx_u          $out3,$x30,$out
1675         le?vperm        $out5,$out5,$out5,$inpperm
1676         stvx_u          $out4,$x40,$out
1677         le?vperm        $out6,$out6,$out6,$inpperm
1678         stvx_u          $out5,$x50,$out
1679         le?vperm        $out7,$out7,$out7,$inpperm
1680         stvx_u          $out6,$x60,$out
1681         stvx_u          $out7,$x70,$out
1682         addi            $out,$out,0x80
1683         b               Lctr32_enc8x_done
1684
1685 .align  5
1686 Lctr32_enc8x_seven:
1687         vcipherlast     $out0,$out0,$in1
1688         vcipherlast     $out1,$out1,$in2
1689         vcipherlast     $out2,$out2,$in3
1690         vcipherlast     $out3,$out3,$in4
1691         vcipherlast     $out4,$out4,$in5
1692         vcipherlast     $out5,$out5,$in6
1693         vcipherlast     $out6,$out6,$in7
1694
1695         le?vperm        $out0,$out0,$out0,$inpperm
1696         le?vperm        $out1,$out1,$out1,$inpperm
1697         stvx_u          $out0,$x00,$out
1698         le?vperm        $out2,$out2,$out2,$inpperm
1699         stvx_u          $out1,$x10,$out
1700         le?vperm        $out3,$out3,$out3,$inpperm
1701         stvx_u          $out2,$x20,$out
1702         le?vperm        $out4,$out4,$out4,$inpperm
1703         stvx_u          $out3,$x30,$out
1704         le?vperm        $out5,$out5,$out5,$inpperm
1705         stvx_u          $out4,$x40,$out
1706         le?vperm        $out6,$out6,$out6,$inpperm
1707         stvx_u          $out5,$x50,$out
1708         stvx_u          $out6,$x60,$out
1709         addi            $out,$out,0x70
1710         b               Lctr32_enc8x_done
1711
1712 .align  5
1713 Lctr32_enc8x_six:
1714         vcipherlast     $out0,$out0,$in2
1715         vcipherlast     $out1,$out1,$in3
1716         vcipherlast     $out2,$out2,$in4
1717         vcipherlast     $out3,$out3,$in5
1718         vcipherlast     $out4,$out4,$in6
1719         vcipherlast     $out5,$out5,$in7
1720
1721         le?vperm        $out0,$out0,$out0,$inpperm
1722         le?vperm        $out1,$out1,$out1,$inpperm
1723         stvx_u          $out0,$x00,$out
1724         le?vperm        $out2,$out2,$out2,$inpperm
1725         stvx_u          $out1,$x10,$out
1726         le?vperm        $out3,$out3,$out3,$inpperm
1727         stvx_u          $out2,$x20,$out
1728         le?vperm        $out4,$out4,$out4,$inpperm
1729         stvx_u          $out3,$x30,$out
1730         le?vperm        $out5,$out5,$out5,$inpperm
1731         stvx_u          $out4,$x40,$out
1732         stvx_u          $out5,$x50,$out
1733         addi            $out,$out,0x60
1734         b               Lctr32_enc8x_done
1735
1736 .align  5
1737 Lctr32_enc8x_five:
1738         vcipherlast     $out0,$out0,$in3
1739         vcipherlast     $out1,$out1,$in4
1740         vcipherlast     $out2,$out2,$in5
1741         vcipherlast     $out3,$out3,$in6
1742         vcipherlast     $out4,$out4,$in7
1743
1744         le?vperm        $out0,$out0,$out0,$inpperm
1745         le?vperm        $out1,$out1,$out1,$inpperm
1746         stvx_u          $out0,$x00,$out
1747         le?vperm        $out2,$out2,$out2,$inpperm
1748         stvx_u          $out1,$x10,$out
1749         le?vperm        $out3,$out3,$out3,$inpperm
1750         stvx_u          $out2,$x20,$out
1751         le?vperm        $out4,$out4,$out4,$inpperm
1752         stvx_u          $out3,$x30,$out
1753         stvx_u          $out4,$x40,$out
1754         addi            $out,$out,0x50
1755         b               Lctr32_enc8x_done
1756
1757 .align  5
1758 Lctr32_enc8x_four:
1759         vcipherlast     $out0,$out0,$in4
1760         vcipherlast     $out1,$out1,$in5
1761         vcipherlast     $out2,$out2,$in6
1762         vcipherlast     $out3,$out3,$in7
1763
1764         le?vperm        $out0,$out0,$out0,$inpperm
1765         le?vperm        $out1,$out1,$out1,$inpperm
1766         stvx_u          $out0,$x00,$out
1767         le?vperm        $out2,$out2,$out2,$inpperm
1768         stvx_u          $out1,$x10,$out
1769         le?vperm        $out3,$out3,$out3,$inpperm
1770         stvx_u          $out2,$x20,$out
1771         stvx_u          $out3,$x30,$out
1772         addi            $out,$out,0x40
1773         b               Lctr32_enc8x_done
1774
1775 .align  5
1776 Lctr32_enc8x_three:
1777         vcipherlast     $out0,$out0,$in5
1778         vcipherlast     $out1,$out1,$in6
1779         vcipherlast     $out2,$out2,$in7
1780
1781         le?vperm        $out0,$out0,$out0,$inpperm
1782         le?vperm        $out1,$out1,$out1,$inpperm
1783         stvx_u          $out0,$x00,$out
1784         le?vperm        $out2,$out2,$out2,$inpperm
1785         stvx_u          $out1,$x10,$out
1786         stvx_u          $out2,$x20,$out
1787         addi            $out,$out,0x30
1788         b               Lcbc_dec8x_done
1789
1790 .align  5
1791 Lctr32_enc8x_two:
1792         vcipherlast     $out0,$out0,$in6
1793         vcipherlast     $out1,$out1,$in7
1794
1795         le?vperm        $out0,$out0,$out0,$inpperm
1796         le?vperm        $out1,$out1,$out1,$inpperm
1797         stvx_u          $out0,$x00,$out
1798         stvx_u          $out1,$x10,$out
1799         addi            $out,$out,0x20
1800         b               Lcbc_dec8x_done
1801
1802 .align  5
1803 Lctr32_enc8x_one:
1804         vcipherlast     $out0,$out0,$in7
1805
1806         le?vperm        $out0,$out0,$out0,$inpperm
1807         stvx_u          $out0,0,$out
1808         addi            $out,$out,0x10
1809
1810 Lctr32_enc8x_done:
1811         li              r10,`$FRAME+15`
1812         li              r11,`$FRAME+31`
1813         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1814         addi            r10,r10,32
1815         stvx            $inpperm,r11,$sp
1816         addi            r11,r11,32
1817         stvx            $inpperm,r10,$sp
1818         addi            r10,r10,32
1819         stvx            $inpperm,r11,$sp
1820         addi            r11,r11,32
1821         stvx            $inpperm,r10,$sp
1822         addi            r10,r10,32
1823         stvx            $inpperm,r11,$sp
1824         addi            r11,r11,32
1825         stvx            $inpperm,r10,$sp
1826         addi            r10,r10,32
1827         stvx            $inpperm,r11,$sp
1828         addi            r11,r11,32
1829
1830         mtspr           256,$vrsave
1831         lvx             v20,r10,$sp             # ABI says so
1832         addi            r10,r10,32
1833         lvx             v21,r11,$sp
1834         addi            r11,r11,32
1835         lvx             v22,r10,$sp
1836         addi            r10,r10,32
1837         lvx             v23,r11,$sp
1838         addi            r11,r11,32
1839         lvx             v24,r10,$sp
1840         addi            r10,r10,32
1841         lvx             v25,r11,$sp
1842         addi            r11,r11,32
1843         lvx             v26,r10,$sp
1844         addi            r10,r10,32
1845         lvx             v27,r11,$sp
1846         addi            r11,r11,32
1847         lvx             v28,r10,$sp
1848         addi            r10,r10,32
1849         lvx             v29,r11,$sp
1850         addi            r11,r11,32
1851         lvx             v30,r10,$sp
1852         lvx             v31,r11,$sp
1853         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1854         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1855         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1856         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1857         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1858         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1859         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1860         blr
1861         .long           0
1862         .byte           0,12,0x14,0,0x80,6,6,0
1863         .long           0
1864 .size   .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1865 ___
1866 }}      }}}
1867
1868 my $consts=1;
1869 foreach(split("\n",$code)) {
1870         s/\`([^\`]*)\`/eval($1)/geo;
1871
1872         # constants table endian-specific conversion
1873         if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
1874             my $conv=$3;
1875             my @bytes=();
1876
1877             # convert to endian-agnostic format
1878             if ($1 eq "long") {
1879               foreach (split(/,\s*/,$2)) {
1880                 my $l = /^0/?oct:int;
1881                 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
1882               }
1883             } else {
1884                 @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
1885             }
1886
1887             # little-endian conversion
1888             if ($flavour =~ /le$/o) {
1889                 SWITCH: for($conv)  {
1890                     /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
1891                     /\?rev/ && do   { @bytes=reverse(@bytes);    last; }; 
1892                 }
1893             }
1894
1895             #emit
1896             print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
1897             next;
1898         }
1899         $consts=0 if (m/Lconsts:/o);    # end of table
1900
1901         # instructions prefixed with '?' are endian-specific and need
1902         # to be adjusted accordingly...
1903         if ($flavour =~ /le$/o) {       # little-endian
1904             s/le\?//o           or
1905             s/be\?/#be#/o       or
1906             s/\?lvsr/lvsl/o     or
1907             s/\?lvsl/lvsr/o     or
1908             s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
1909             s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
1910             s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
1911         } else {                        # big-endian
1912             s/le\?/#le#/o       or
1913             s/be\?//o           or
1914             s/\?([a-z]+)/$1/o;
1915         }
1916
1917         print $_,"\n";
1918 }
1919
1920 close STDOUT;