Added an explicit yield (OP_SLEEP) to QUIC testing for cooperative threading.
[openssl.git] / crypto / aes / asm / aesp8-ppc.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for AES instructions as per PowerISA
18 # specification version 2.07, first implemented by POWER8 processor.
19 # The module is endian-agnostic in sense that it supports both big-
20 # and little-endian cases. Data alignment in parallelizable modes is
21 # handled with VSX loads and stores, which implies MSR.VSX flag being
22 # set. It should also be noted that ISA specification doesn't prohibit
23 # alignment exceptions for these instructions on page boundaries.
24 # Initially alignment was handled in pure AltiVec/VMX way [when data
25 # is aligned programmatically, which in turn guarantees exception-
26 # free execution], but it turned to hamper performance when vcipher
27 # instructions are interleaved. It's reckoned that eventual
28 # misalignment penalties at page boundaries are in average lower
29 # than additional overhead in pure AltiVec approach.
30 #
31 # May 2016
32 #
33 # Add XTS subroutine, 9x on little- and 12x improvement on big-endian
34 # systems were measured.
35 #
36 ######################################################################
37 # Current large-block performance in cycles per byte processed with
38 # 128-bit key (less is better).
39 #
40 #               CBC en-/decrypt CTR     XTS
41 # POWER8[le]    3.96/0.72       0.74    1.1
42 # POWER8[be]    3.75/0.65       0.66    1.0
43 # POWER9[le]    4.02/0.86       0.84    1.05
44 # POWER9[be]    3.99/0.78       0.79    0.97
45
46 # $output is the last argument if it looks like a file (it has an extension)
47 # $flavour is the first argument if it doesn't look like a file
48 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
49 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
50
51 if ($flavour =~ /64/) {
52         $SIZE_T =8;
53         $LRSAVE =2*$SIZE_T;
54         $STU    ="stdu";
55         $POP    ="ld";
56         $PUSH   ="std";
57         $UCMP   ="cmpld";
58         $SHL    ="sldi";
59 } elsif ($flavour =~ /32/) {
60         $SIZE_T =4;
61         $LRSAVE =$SIZE_T;
62         $STU    ="stwu";
63         $POP    ="lwz";
64         $PUSH   ="stw";
65         $UCMP   ="cmplw";
66         $SHL    ="slwi";
67 } else { die "nonsense $flavour"; }
68
69 $LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
70
71 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
72 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
73 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
74 die "can't locate ppc-xlate.pl";
75
76 open STDOUT,"| $^X $xlate $flavour \"$output\""
77     or die "can't call $xlate: $!";
78
79 $FRAME=8*$SIZE_T;
80 $prefix="aes_p8";
81
82 $sp="r1";
83 $vrsave="r12";
84
85 #########################################################################
86 {{{     # Key setup procedures                                          #
87 my ($inp,$bits,$out,$ptr,$cnt,$rounds)=map("r$_",(3..8));
88 my ($zero,$in0,$in1,$key,$rcon,$mask,$tmp)=map("v$_",(0..6));
89 my ($stage,$outperm,$outmask,$outhead,$outtail)=map("v$_",(7..11));
90
91 $code.=<<___;
92 .machine        "any"
93
94 .text
95
96 .align  7
97 rcon:
98 .long   0x01000000, 0x01000000, 0x01000000, 0x01000000  ?rev
99 .long   0x1b000000, 0x1b000000, 0x1b000000, 0x1b000000  ?rev
100 .long   0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c, 0x0d0e0f0c  ?rev
101 .long   0,0,0,0                                         ?asis
102 .long   0x0f102132, 0x43546576, 0x8798a9ba, 0xcbdcedfe
103 Lconsts:
104         mflr    r0
105         bcl     20,31,\$+4
106         mflr    $ptr     #vvvvv "distance between . and rcon
107         addi    $ptr,$ptr,-0x58
108         mtlr    r0
109         blr
110         .long   0
111         .byte   0,12,0x14,0,0,0,0,0
112 .asciz  "AES for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
113
114 .globl  .${prefix}_set_encrypt_key
115 .align  5
116 .${prefix}_set_encrypt_key:
117 Lset_encrypt_key:
118         mflr            r11
119         $PUSH           r11,$LRSAVE($sp)
120
121         li              $ptr,-1
122         ${UCMP}i        $inp,0
123         beq-            Lenc_key_abort          # if ($inp==0) return -1;
124         ${UCMP}i        $out,0
125         beq-            Lenc_key_abort          # if ($out==0) return -1;
126         li              $ptr,-2
127         cmpwi           $bits,128
128         blt-            Lenc_key_abort
129         cmpwi           $bits,256
130         bgt-            Lenc_key_abort
131         andi.           r0,$bits,0x3f
132         bne-            Lenc_key_abort
133
134         lis             r0,0xfff0
135         mfspr           $vrsave,256
136         mtspr           256,r0
137
138         bl              Lconsts
139         mtlr            r11
140
141         neg             r9,$inp
142         lvx             $in0,0,$inp
143         addi            $inp,$inp,15            # 15 is not typo
144         lvsr            $key,0,r9               # borrow $key
145         li              r8,0x20
146         cmpwi           $bits,192
147         lvx             $in1,0,$inp
148         le?vspltisb     $mask,0x0f              # borrow $mask
149         lvx             $rcon,0,$ptr
150         le?vxor         $key,$key,$mask         # adjust for byte swap
151         lvx             $mask,r8,$ptr
152         addi            $ptr,$ptr,0x10
153         vperm           $in0,$in0,$in1,$key     # align [and byte swap in LE]
154         li              $cnt,8
155         vxor            $zero,$zero,$zero
156         mtctr           $cnt
157
158         ?lvsr           $outperm,0,$out
159         vspltisb        $outmask,-1
160         lvx             $outhead,0,$out
161         ?vperm          $outmask,$zero,$outmask,$outperm
162
163         blt             Loop128
164         addi            $inp,$inp,8
165         beq             L192
166         addi            $inp,$inp,8
167         b               L256
168
169 .align  4
170 Loop128:
171         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
172         vsldoi          $tmp,$zero,$in0,12      # >>32
173          vperm          $outtail,$in0,$in0,$outperm     # rotate
174          vsel           $stage,$outhead,$outtail,$outmask
175          vmr            $outhead,$outtail
176         vcipherlast     $key,$key,$rcon
177          stvx           $stage,0,$out
178          addi           $out,$out,16
179
180         vxor            $in0,$in0,$tmp
181         vsldoi          $tmp,$zero,$tmp,12      # >>32
182         vxor            $in0,$in0,$tmp
183         vsldoi          $tmp,$zero,$tmp,12      # >>32
184         vxor            $in0,$in0,$tmp
185          vadduwm        $rcon,$rcon,$rcon
186         vxor            $in0,$in0,$key
187         bdnz            Loop128
188
189         lvx             $rcon,0,$ptr            # last two round keys
190
191         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
192         vsldoi          $tmp,$zero,$in0,12      # >>32
193          vperm          $outtail,$in0,$in0,$outperm     # rotate
194          vsel           $stage,$outhead,$outtail,$outmask
195          vmr            $outhead,$outtail
196         vcipherlast     $key,$key,$rcon
197          stvx           $stage,0,$out
198          addi           $out,$out,16
199
200         vxor            $in0,$in0,$tmp
201         vsldoi          $tmp,$zero,$tmp,12      # >>32
202         vxor            $in0,$in0,$tmp
203         vsldoi          $tmp,$zero,$tmp,12      # >>32
204         vxor            $in0,$in0,$tmp
205          vadduwm        $rcon,$rcon,$rcon
206         vxor            $in0,$in0,$key
207
208         vperm           $key,$in0,$in0,$mask    # rotate-n-splat
209         vsldoi          $tmp,$zero,$in0,12      # >>32
210          vperm          $outtail,$in0,$in0,$outperm     # rotate
211          vsel           $stage,$outhead,$outtail,$outmask
212          vmr            $outhead,$outtail
213         vcipherlast     $key,$key,$rcon
214          stvx           $stage,0,$out
215          addi           $out,$out,16
216
217         vxor            $in0,$in0,$tmp
218         vsldoi          $tmp,$zero,$tmp,12      # >>32
219         vxor            $in0,$in0,$tmp
220         vsldoi          $tmp,$zero,$tmp,12      # >>32
221         vxor            $in0,$in0,$tmp
222         vxor            $in0,$in0,$key
223          vperm          $outtail,$in0,$in0,$outperm     # rotate
224          vsel           $stage,$outhead,$outtail,$outmask
225          vmr            $outhead,$outtail
226          stvx           $stage,0,$out
227
228         addi            $inp,$out,15            # 15 is not typo
229         addi            $out,$out,0x50
230
231         li              $rounds,10
232         b               Ldone
233
234 .align  4
235 L192:
236         lvx             $tmp,0,$inp
237         li              $cnt,4
238          vperm          $outtail,$in0,$in0,$outperm     # rotate
239          vsel           $stage,$outhead,$outtail,$outmask
240          vmr            $outhead,$outtail
241          stvx           $stage,0,$out
242          addi           $out,$out,16
243         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
244         vspltisb        $key,8                  # borrow $key
245         mtctr           $cnt
246         vsububm         $mask,$mask,$key        # adjust the mask
247
248 Loop192:
249         vperm           $key,$in1,$in1,$mask    # roate-n-splat
250         vsldoi          $tmp,$zero,$in0,12      # >>32
251         vcipherlast     $key,$key,$rcon
252
253         vxor            $in0,$in0,$tmp
254         vsldoi          $tmp,$zero,$tmp,12      # >>32
255         vxor            $in0,$in0,$tmp
256         vsldoi          $tmp,$zero,$tmp,12      # >>32
257         vxor            $in0,$in0,$tmp
258
259          vsldoi         $stage,$zero,$in1,8
260         vspltw          $tmp,$in0,3
261         vxor            $tmp,$tmp,$in1
262         vsldoi          $in1,$zero,$in1,12      # >>32
263          vadduwm        $rcon,$rcon,$rcon
264         vxor            $in1,$in1,$tmp
265         vxor            $in0,$in0,$key
266         vxor            $in1,$in1,$key
267          vsldoi         $stage,$stage,$in0,8
268
269         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
270         vsldoi          $tmp,$zero,$in0,12      # >>32
271          vperm          $outtail,$stage,$stage,$outperm # rotate
272          vsel           $stage,$outhead,$outtail,$outmask
273          vmr            $outhead,$outtail
274         vcipherlast     $key,$key,$rcon
275          stvx           $stage,0,$out
276          addi           $out,$out,16
277
278          vsldoi         $stage,$in0,$in1,8
279         vxor            $in0,$in0,$tmp
280         vsldoi          $tmp,$zero,$tmp,12      # >>32
281          vperm          $outtail,$stage,$stage,$outperm # rotate
282          vsel           $stage,$outhead,$outtail,$outmask
283          vmr            $outhead,$outtail
284         vxor            $in0,$in0,$tmp
285         vsldoi          $tmp,$zero,$tmp,12      # >>32
286         vxor            $in0,$in0,$tmp
287          stvx           $stage,0,$out
288          addi           $out,$out,16
289
290         vspltw          $tmp,$in0,3
291         vxor            $tmp,$tmp,$in1
292         vsldoi          $in1,$zero,$in1,12      # >>32
293          vadduwm        $rcon,$rcon,$rcon
294         vxor            $in1,$in1,$tmp
295         vxor            $in0,$in0,$key
296         vxor            $in1,$in1,$key
297          vperm          $outtail,$in0,$in0,$outperm     # rotate
298          vsel           $stage,$outhead,$outtail,$outmask
299          vmr            $outhead,$outtail
300          stvx           $stage,0,$out
301          addi           $inp,$out,15            # 15 is not typo
302          addi           $out,$out,16
303         bdnz            Loop192
304
305         li              $rounds,12
306         addi            $out,$out,0x20
307         b               Ldone
308
309 .align  4
310 L256:
311         lvx             $tmp,0,$inp
312         li              $cnt,7
313         li              $rounds,14
314          vperm          $outtail,$in0,$in0,$outperm     # rotate
315          vsel           $stage,$outhead,$outtail,$outmask
316          vmr            $outhead,$outtail
317          stvx           $stage,0,$out
318          addi           $out,$out,16
319         vperm           $in1,$in1,$tmp,$key     # align [and byte swap in LE]
320         mtctr           $cnt
321
322 Loop256:
323         vperm           $key,$in1,$in1,$mask    # rotate-n-splat
324         vsldoi          $tmp,$zero,$in0,12      # >>32
325          vperm          $outtail,$in1,$in1,$outperm     # rotate
326          vsel           $stage,$outhead,$outtail,$outmask
327          vmr            $outhead,$outtail
328         vcipherlast     $key,$key,$rcon
329          stvx           $stage,0,$out
330          addi           $out,$out,16
331
332         vxor            $in0,$in0,$tmp
333         vsldoi          $tmp,$zero,$tmp,12      # >>32
334         vxor            $in0,$in0,$tmp
335         vsldoi          $tmp,$zero,$tmp,12      # >>32
336         vxor            $in0,$in0,$tmp
337          vadduwm        $rcon,$rcon,$rcon
338         vxor            $in0,$in0,$key
339          vperm          $outtail,$in0,$in0,$outperm     # rotate
340          vsel           $stage,$outhead,$outtail,$outmask
341          vmr            $outhead,$outtail
342          stvx           $stage,0,$out
343          addi           $inp,$out,15            # 15 is not typo
344          addi           $out,$out,16
345         bdz             Ldone
346
347         vspltw          $key,$in0,3             # just splat
348         vsldoi          $tmp,$zero,$in1,12      # >>32
349         vsbox           $key,$key
350
351         vxor            $in1,$in1,$tmp
352         vsldoi          $tmp,$zero,$tmp,12      # >>32
353         vxor            $in1,$in1,$tmp
354         vsldoi          $tmp,$zero,$tmp,12      # >>32
355         vxor            $in1,$in1,$tmp
356
357         vxor            $in1,$in1,$key
358         b               Loop256
359
360 .align  4
361 Ldone:
362         lvx             $in1,0,$inp             # redundant in aligned case
363         vsel            $in1,$outhead,$in1,$outmask
364         stvx            $in1,0,$inp
365         li              $ptr,0
366         mtspr           256,$vrsave
367         stw             $rounds,0($out)
368
369 Lenc_key_abort:
370         mr              r3,$ptr
371         blr
372         .long           0
373         .byte           0,12,0x14,1,0,0,3,0
374         .long           0
375 .size   .${prefix}_set_encrypt_key,.-.${prefix}_set_encrypt_key
376
377 .globl  .${prefix}_set_decrypt_key
378 .align  5
379 .${prefix}_set_decrypt_key:
380         $STU            $sp,-$FRAME($sp)
381         mflr            r10
382         $PUSH           r10,$FRAME+$LRSAVE($sp)
383         bl              Lset_encrypt_key
384         mtlr            r10
385
386         cmpwi           r3,0
387         bne-            Ldec_key_abort
388
389         slwi            $cnt,$rounds,4
390         subi            $inp,$out,240           # first round key
391         srwi            $rounds,$rounds,1
392         add             $out,$inp,$cnt          # last round key
393         mtctr           $rounds
394
395 Ldeckey:
396         lwz             r0, 0($inp)
397         lwz             r6, 4($inp)
398         lwz             r7, 8($inp)
399         lwz             r8, 12($inp)
400         addi            $inp,$inp,16
401         lwz             r9, 0($out)
402         lwz             r10,4($out)
403         lwz             r11,8($out)
404         lwz             r12,12($out)
405         stw             r0, 0($out)
406         stw             r6, 4($out)
407         stw             r7, 8($out)
408         stw             r8, 12($out)
409         subi            $out,$out,16
410         stw             r9, -16($inp)
411         stw             r10,-12($inp)
412         stw             r11,-8($inp)
413         stw             r12,-4($inp)
414         bdnz            Ldeckey
415
416         xor             r3,r3,r3                # return value
417 Ldec_key_abort:
418         addi            $sp,$sp,$FRAME
419         blr
420         .long           0
421         .byte           0,12,4,1,0x80,0,3,0
422         .long           0
423 .size   .${prefix}_set_decrypt_key,.-.${prefix}_set_decrypt_key
424 ___
425 }}}
426 #########################################################################
427 {{{     # Single block en- and decrypt procedures                       #
428 sub gen_block () {
429 my $dir = shift;
430 my $n   = $dir eq "de" ? "n" : "";
431 my ($inp,$out,$key,$rounds,$idx)=map("r$_",(3..7));
432
433 $code.=<<___;
434 .globl  .${prefix}_${dir}crypt
435 .align  5
436 .${prefix}_${dir}crypt:
437         lwz             $rounds,240($key)
438         lis             r0,0xfc00
439         mfspr           $vrsave,256
440         li              $idx,15                 # 15 is not typo
441         mtspr           256,r0
442
443         lvx             v0,0,$inp
444         neg             r11,$out
445         lvx             v1,$idx,$inp
446         lvsl            v2,0,$inp               # inpperm
447         le?vspltisb     v4,0x0f
448         ?lvsl           v3,0,r11                # outperm
449         le?vxor         v2,v2,v4
450         li              $idx,16
451         vperm           v0,v0,v1,v2             # align [and byte swap in LE]
452         lvx             v1,0,$key
453         ?lvsl           v5,0,$key               # keyperm
454         srwi            $rounds,$rounds,1
455         lvx             v2,$idx,$key
456         addi            $idx,$idx,16
457         subi            $rounds,$rounds,1
458         ?vperm          v1,v1,v2,v5             # align round key
459
460         vxor            v0,v0,v1
461         lvx             v1,$idx,$key
462         addi            $idx,$idx,16
463         mtctr           $rounds
464
465 Loop_${dir}c:
466         ?vperm          v2,v2,v1,v5
467         v${n}cipher     v0,v0,v2
468         lvx             v2,$idx,$key
469         addi            $idx,$idx,16
470         ?vperm          v1,v1,v2,v5
471         v${n}cipher     v0,v0,v1
472         lvx             v1,$idx,$key
473         addi            $idx,$idx,16
474         bdnz            Loop_${dir}c
475
476         ?vperm          v2,v2,v1,v5
477         v${n}cipher     v0,v0,v2
478         lvx             v2,$idx,$key
479         ?vperm          v1,v1,v2,v5
480         v${n}cipherlast v0,v0,v1
481
482         vspltisb        v2,-1
483         vxor            v1,v1,v1
484         li              $idx,15                 # 15 is not typo
485         ?vperm          v2,v1,v2,v3             # outmask
486         le?vxor         v3,v3,v4
487         lvx             v1,0,$out               # outhead
488         vperm           v0,v0,v0,v3             # rotate [and byte swap in LE]
489         vsel            v1,v1,v0,v2
490         lvx             v4,$idx,$out
491         stvx            v1,0,$out
492         vsel            v0,v0,v4,v2
493         stvx            v0,$idx,$out
494
495         mtspr           256,$vrsave
496         blr
497         .long           0
498         .byte           0,12,0x14,0,0,0,3,0
499         .long           0
500 .size   .${prefix}_${dir}crypt,.-.${prefix}_${dir}crypt
501 ___
502 }
503 &gen_block("en");
504 &gen_block("de");
505 }}}
506 #########################################################################
507 {{{     # CBC en- and decrypt procedures                                #
508 my ($inp,$out,$len,$key,$ivp,$enc,$rounds,$idx)=map("r$_",(3..10));
509 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
510 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm)=
511                                                 map("v$_",(4..10));
512 $code.=<<___;
513 .globl  .${prefix}_cbc_encrypt
514 .align  5
515 .${prefix}_cbc_encrypt:
516         ${UCMP}i        $len,16
517         bltlr-
518
519         cmpwi           $enc,0                  # test direction
520         lis             r0,0xffe0
521         mfspr           $vrsave,256
522         mtspr           256,r0
523
524         li              $idx,15
525         vxor            $rndkey0,$rndkey0,$rndkey0
526         le?vspltisb     $tmp,0x0f
527
528         lvx             $ivec,0,$ivp            # load [unaligned] iv
529         lvsl            $inpperm,0,$ivp
530         lvx             $inptail,$idx,$ivp
531         le?vxor         $inpperm,$inpperm,$tmp
532         vperm           $ivec,$ivec,$inptail,$inpperm
533
534         neg             r11,$inp
535         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
536         lwz             $rounds,240($key)
537
538         lvsr            $inpperm,0,r11          # prepare for unaligned load
539         lvx             $inptail,0,$inp
540         addi            $inp,$inp,15            # 15 is not typo
541         le?vxor         $inpperm,$inpperm,$tmp
542
543         ?lvsr           $outperm,0,$out         # prepare for unaligned store
544         vspltisb        $outmask,-1
545         lvx             $outhead,0,$out
546         ?vperm          $outmask,$rndkey0,$outmask,$outperm
547         le?vxor         $outperm,$outperm,$tmp
548
549         srwi            $rounds,$rounds,1
550         li              $idx,16
551         subi            $rounds,$rounds,1
552         beq             Lcbc_dec
553
554 Lcbc_enc:
555         vmr             $inout,$inptail
556         lvx             $inptail,0,$inp
557         addi            $inp,$inp,16
558         mtctr           $rounds
559         subi            $len,$len,16            # len-=16
560
561         lvx             $rndkey0,0,$key
562          vperm          $inout,$inout,$inptail,$inpperm
563         lvx             $rndkey1,$idx,$key
564         addi            $idx,$idx,16
565         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
566         vxor            $inout,$inout,$rndkey0
567         lvx             $rndkey0,$idx,$key
568         addi            $idx,$idx,16
569         vxor            $inout,$inout,$ivec
570
571 Loop_cbc_enc:
572         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
573         vcipher         $inout,$inout,$rndkey1
574         lvx             $rndkey1,$idx,$key
575         addi            $idx,$idx,16
576         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
577         vcipher         $inout,$inout,$rndkey0
578         lvx             $rndkey0,$idx,$key
579         addi            $idx,$idx,16
580         bdnz            Loop_cbc_enc
581
582         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
583         vcipher         $inout,$inout,$rndkey1
584         lvx             $rndkey1,$idx,$key
585         li              $idx,16
586         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
587         vcipherlast     $ivec,$inout,$rndkey0
588         ${UCMP}i        $len,16
589
590         vperm           $tmp,$ivec,$ivec,$outperm
591         vsel            $inout,$outhead,$tmp,$outmask
592         vmr             $outhead,$tmp
593         stvx            $inout,0,$out
594         addi            $out,$out,16
595         bge             Lcbc_enc
596
597         b               Lcbc_done
598
599 .align  4
600 Lcbc_dec:
601         ${UCMP}i        $len,128
602         bge             _aesp8_cbc_decrypt8x
603         vmr             $tmp,$inptail
604         lvx             $inptail,0,$inp
605         addi            $inp,$inp,16
606         mtctr           $rounds
607         subi            $len,$len,16            # len-=16
608
609         lvx             $rndkey0,0,$key
610          vperm          $tmp,$tmp,$inptail,$inpperm
611         lvx             $rndkey1,$idx,$key
612         addi            $idx,$idx,16
613         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
614         vxor            $inout,$tmp,$rndkey0
615         lvx             $rndkey0,$idx,$key
616         addi            $idx,$idx,16
617
618 Loop_cbc_dec:
619         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
620         vncipher        $inout,$inout,$rndkey1
621         lvx             $rndkey1,$idx,$key
622         addi            $idx,$idx,16
623         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
624         vncipher        $inout,$inout,$rndkey0
625         lvx             $rndkey0,$idx,$key
626         addi            $idx,$idx,16
627         bdnz            Loop_cbc_dec
628
629         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
630         vncipher        $inout,$inout,$rndkey1
631         lvx             $rndkey1,$idx,$key
632         li              $idx,16
633         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
634         vncipherlast    $inout,$inout,$rndkey0
635         ${UCMP}i        $len,16
636
637         vxor            $inout,$inout,$ivec
638         vmr             $ivec,$tmp
639         vperm           $tmp,$inout,$inout,$outperm
640         vsel            $inout,$outhead,$tmp,$outmask
641         vmr             $outhead,$tmp
642         stvx            $inout,0,$out
643         addi            $out,$out,16
644         bge             Lcbc_dec
645
646 Lcbc_done:
647         addi            $out,$out,-1
648         lvx             $inout,0,$out           # redundant in aligned case
649         vsel            $inout,$outhead,$inout,$outmask
650         stvx            $inout,0,$out
651
652         neg             $enc,$ivp               # write [unaligned] iv
653         li              $idx,15                 # 15 is not typo
654         vxor            $rndkey0,$rndkey0,$rndkey0
655         vspltisb        $outmask,-1
656         le?vspltisb     $tmp,0x0f
657         ?lvsl           $outperm,0,$enc
658         ?vperm          $outmask,$rndkey0,$outmask,$outperm
659         le?vxor         $outperm,$outperm,$tmp
660         lvx             $outhead,0,$ivp
661         vperm           $ivec,$ivec,$ivec,$outperm
662         vsel            $inout,$outhead,$ivec,$outmask
663         lvx             $inptail,$idx,$ivp
664         stvx            $inout,0,$ivp
665         vsel            $inout,$ivec,$inptail,$outmask
666         stvx            $inout,$idx,$ivp
667
668         mtspr           256,$vrsave
669         blr
670         .long           0
671         .byte           0,12,0x14,0,0,0,6,0
672         .long           0
673 ___
674 #########################################################################
675 {{      # Optimized CBC decrypt procedure                               #
676 my $key_="r11";
677 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
678     $x00=0 if ($flavour =~ /osx/);
679 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10..13));
680 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(14..21));
681 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
682                         # v26-v31 last 6 round keys
683 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
684
685 $code.=<<___;
686 .align  5
687 _aesp8_cbc_decrypt8x:
688         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
689         li              r10,`$FRAME+8*16+15`
690         li              r11,`$FRAME+8*16+31`
691         stvx            v20,r10,$sp             # ABI says so
692         addi            r10,r10,32
693         stvx            v21,r11,$sp
694         addi            r11,r11,32
695         stvx            v22,r10,$sp
696         addi            r10,r10,32
697         stvx            v23,r11,$sp
698         addi            r11,r11,32
699         stvx            v24,r10,$sp
700         addi            r10,r10,32
701         stvx            v25,r11,$sp
702         addi            r11,r11,32
703         stvx            v26,r10,$sp
704         addi            r10,r10,32
705         stvx            v27,r11,$sp
706         addi            r11,r11,32
707         stvx            v28,r10,$sp
708         addi            r10,r10,32
709         stvx            v29,r11,$sp
710         addi            r11,r11,32
711         stvx            v30,r10,$sp
712         stvx            v31,r11,$sp
713         li              r0,-1
714         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
715         li              $x10,0x10
716         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
717         li              $x20,0x20
718         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
719         li              $x30,0x30
720         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
721         li              $x40,0x40
722         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
723         li              $x50,0x50
724         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
725         li              $x60,0x60
726         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
727         li              $x70,0x70
728         mtspr           256,r0
729
730         subi            $rounds,$rounds,3       # -4 in total
731         subi            $len,$len,128           # bias
732
733         lvx             $rndkey0,$x00,$key      # load key schedule
734         lvx             v30,$x10,$key
735         addi            $key,$key,0x20
736         lvx             v31,$x00,$key
737         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
738         addi            $key_,$sp,$FRAME+15
739         mtctr           $rounds
740
741 Load_cbc_dec_key:
742         ?vperm          v24,v30,v31,$keyperm
743         lvx             v30,$x10,$key
744         addi            $key,$key,0x20
745         stvx            v24,$x00,$key_          # off-load round[1]
746         ?vperm          v25,v31,v30,$keyperm
747         lvx             v31,$x00,$key
748         stvx            v25,$x10,$key_          # off-load round[2]
749         addi            $key_,$key_,0x20
750         bdnz            Load_cbc_dec_key
751
752         lvx             v26,$x10,$key
753         ?vperm          v24,v30,v31,$keyperm
754         lvx             v27,$x20,$key
755         stvx            v24,$x00,$key_          # off-load round[3]
756         ?vperm          v25,v31,v26,$keyperm
757         lvx             v28,$x30,$key
758         stvx            v25,$x10,$key_          # off-load round[4]
759         addi            $key_,$sp,$FRAME+15     # rewind $key_
760         ?vperm          v26,v26,v27,$keyperm
761         lvx             v29,$x40,$key
762         ?vperm          v27,v27,v28,$keyperm
763         lvx             v30,$x50,$key
764         ?vperm          v28,v28,v29,$keyperm
765         lvx             v31,$x60,$key
766         ?vperm          v29,v29,v30,$keyperm
767         lvx             $out0,$x70,$key         # borrow $out0
768         ?vperm          v30,v30,v31,$keyperm
769         lvx             v24,$x00,$key_          # pre-load round[1]
770         ?vperm          v31,v31,$out0,$keyperm
771         lvx             v25,$x10,$key_          # pre-load round[2]
772
773         #lvx            $inptail,0,$inp         # "caller" already did this
774         #addi           $inp,$inp,15            # 15 is not typo
775         subi            $inp,$inp,15            # undo "caller"
776
777          le?li          $idx,8
778         lvx_u           $in0,$x00,$inp          # load first 8 "words"
779          le?lvsl        $inpperm,0,$idx
780          le?vspltisb    $tmp,0x0f
781         lvx_u           $in1,$x10,$inp
782          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
783         lvx_u           $in2,$x20,$inp
784          le?vperm       $in0,$in0,$in0,$inpperm
785         lvx_u           $in3,$x30,$inp
786          le?vperm       $in1,$in1,$in1,$inpperm
787         lvx_u           $in4,$x40,$inp
788          le?vperm       $in2,$in2,$in2,$inpperm
789         vxor            $out0,$in0,$rndkey0
790         lvx_u           $in5,$x50,$inp
791          le?vperm       $in3,$in3,$in3,$inpperm
792         vxor            $out1,$in1,$rndkey0
793         lvx_u           $in6,$x60,$inp
794          le?vperm       $in4,$in4,$in4,$inpperm
795         vxor            $out2,$in2,$rndkey0
796         lvx_u           $in7,$x70,$inp
797         addi            $inp,$inp,0x80
798          le?vperm       $in5,$in5,$in5,$inpperm
799         vxor            $out3,$in3,$rndkey0
800          le?vperm       $in6,$in6,$in6,$inpperm
801         vxor            $out4,$in4,$rndkey0
802          le?vperm       $in7,$in7,$in7,$inpperm
803         vxor            $out5,$in5,$rndkey0
804         vxor            $out6,$in6,$rndkey0
805         vxor            $out7,$in7,$rndkey0
806
807         mtctr           $rounds
808         b               Loop_cbc_dec8x
809 .align  5
810 Loop_cbc_dec8x:
811         vncipher        $out0,$out0,v24
812         vncipher        $out1,$out1,v24
813         vncipher        $out2,$out2,v24
814         vncipher        $out3,$out3,v24
815         vncipher        $out4,$out4,v24
816         vncipher        $out5,$out5,v24
817         vncipher        $out6,$out6,v24
818         vncipher        $out7,$out7,v24
819         lvx             v24,$x20,$key_          # round[3]
820         addi            $key_,$key_,0x20
821
822         vncipher        $out0,$out0,v25
823         vncipher        $out1,$out1,v25
824         vncipher        $out2,$out2,v25
825         vncipher        $out3,$out3,v25
826         vncipher        $out4,$out4,v25
827         vncipher        $out5,$out5,v25
828         vncipher        $out6,$out6,v25
829         vncipher        $out7,$out7,v25
830         lvx             v25,$x10,$key_          # round[4]
831         bdnz            Loop_cbc_dec8x
832
833         subic           $len,$len,128           # $len-=128
834         vncipher        $out0,$out0,v24
835         vncipher        $out1,$out1,v24
836         vncipher        $out2,$out2,v24
837         vncipher        $out3,$out3,v24
838         vncipher        $out4,$out4,v24
839         vncipher        $out5,$out5,v24
840         vncipher        $out6,$out6,v24
841         vncipher        $out7,$out7,v24
842
843         subfe.          r0,r0,r0                # borrow?-1:0
844         vncipher        $out0,$out0,v25
845         vncipher        $out1,$out1,v25
846         vncipher        $out2,$out2,v25
847         vncipher        $out3,$out3,v25
848         vncipher        $out4,$out4,v25
849         vncipher        $out5,$out5,v25
850         vncipher        $out6,$out6,v25
851         vncipher        $out7,$out7,v25
852
853         and             r0,r0,$len
854         vncipher        $out0,$out0,v26
855         vncipher        $out1,$out1,v26
856         vncipher        $out2,$out2,v26
857         vncipher        $out3,$out3,v26
858         vncipher        $out4,$out4,v26
859         vncipher        $out5,$out5,v26
860         vncipher        $out6,$out6,v26
861         vncipher        $out7,$out7,v26
862
863         add             $inp,$inp,r0            # $inp is adjusted in such
864                                                 # way that at exit from the
865                                                 # loop inX-in7 are loaded
866                                                 # with last "words"
867         vncipher        $out0,$out0,v27
868         vncipher        $out1,$out1,v27
869         vncipher        $out2,$out2,v27
870         vncipher        $out3,$out3,v27
871         vncipher        $out4,$out4,v27
872         vncipher        $out5,$out5,v27
873         vncipher        $out6,$out6,v27
874         vncipher        $out7,$out7,v27
875
876         addi            $key_,$sp,$FRAME+15     # rewind $key_
877         vncipher        $out0,$out0,v28
878         vncipher        $out1,$out1,v28
879         vncipher        $out2,$out2,v28
880         vncipher        $out3,$out3,v28
881         vncipher        $out4,$out4,v28
882         vncipher        $out5,$out5,v28
883         vncipher        $out6,$out6,v28
884         vncipher        $out7,$out7,v28
885         lvx             v24,$x00,$key_          # re-pre-load round[1]
886
887         vncipher        $out0,$out0,v29
888         vncipher        $out1,$out1,v29
889         vncipher        $out2,$out2,v29
890         vncipher        $out3,$out3,v29
891         vncipher        $out4,$out4,v29
892         vncipher        $out5,$out5,v29
893         vncipher        $out6,$out6,v29
894         vncipher        $out7,$out7,v29
895         lvx             v25,$x10,$key_          # re-pre-load round[2]
896
897         vncipher        $out0,$out0,v30
898          vxor           $ivec,$ivec,v31         # xor with last round key
899         vncipher        $out1,$out1,v30
900          vxor           $in0,$in0,v31
901         vncipher        $out2,$out2,v30
902          vxor           $in1,$in1,v31
903         vncipher        $out3,$out3,v30
904          vxor           $in2,$in2,v31
905         vncipher        $out4,$out4,v30
906          vxor           $in3,$in3,v31
907         vncipher        $out5,$out5,v30
908          vxor           $in4,$in4,v31
909         vncipher        $out6,$out6,v30
910          vxor           $in5,$in5,v31
911         vncipher        $out7,$out7,v30
912          vxor           $in6,$in6,v31
913
914         vncipherlast    $out0,$out0,$ivec
915         vncipherlast    $out1,$out1,$in0
916          lvx_u          $in0,$x00,$inp          # load next input block
917         vncipherlast    $out2,$out2,$in1
918          lvx_u          $in1,$x10,$inp
919         vncipherlast    $out3,$out3,$in2
920          le?vperm       $in0,$in0,$in0,$inpperm
921          lvx_u          $in2,$x20,$inp
922         vncipherlast    $out4,$out4,$in3
923          le?vperm       $in1,$in1,$in1,$inpperm
924          lvx_u          $in3,$x30,$inp
925         vncipherlast    $out5,$out5,$in4
926          le?vperm       $in2,$in2,$in2,$inpperm
927          lvx_u          $in4,$x40,$inp
928         vncipherlast    $out6,$out6,$in5
929          le?vperm       $in3,$in3,$in3,$inpperm
930          lvx_u          $in5,$x50,$inp
931         vncipherlast    $out7,$out7,$in6
932          le?vperm       $in4,$in4,$in4,$inpperm
933          lvx_u          $in6,$x60,$inp
934         vmr             $ivec,$in7
935          le?vperm       $in5,$in5,$in5,$inpperm
936          lvx_u          $in7,$x70,$inp
937          addi           $inp,$inp,0x80
938
939         le?vperm        $out0,$out0,$out0,$inpperm
940         le?vperm        $out1,$out1,$out1,$inpperm
941         stvx_u          $out0,$x00,$out
942          le?vperm       $in6,$in6,$in6,$inpperm
943          vxor           $out0,$in0,$rndkey0
944         le?vperm        $out2,$out2,$out2,$inpperm
945         stvx_u          $out1,$x10,$out
946          le?vperm       $in7,$in7,$in7,$inpperm
947          vxor           $out1,$in1,$rndkey0
948         le?vperm        $out3,$out3,$out3,$inpperm
949         stvx_u          $out2,$x20,$out
950          vxor           $out2,$in2,$rndkey0
951         le?vperm        $out4,$out4,$out4,$inpperm
952         stvx_u          $out3,$x30,$out
953          vxor           $out3,$in3,$rndkey0
954         le?vperm        $out5,$out5,$out5,$inpperm
955         stvx_u          $out4,$x40,$out
956          vxor           $out4,$in4,$rndkey0
957         le?vperm        $out6,$out6,$out6,$inpperm
958         stvx_u          $out5,$x50,$out
959          vxor           $out5,$in5,$rndkey0
960         le?vperm        $out7,$out7,$out7,$inpperm
961         stvx_u          $out6,$x60,$out
962          vxor           $out6,$in6,$rndkey0
963         stvx_u          $out7,$x70,$out
964         addi            $out,$out,0x80
965          vxor           $out7,$in7,$rndkey0
966
967         mtctr           $rounds
968         beq             Loop_cbc_dec8x          # did $len-=128 borrow?
969
970         addic.          $len,$len,128
971         beq             Lcbc_dec8x_done
972         nop
973         nop
974
975 Loop_cbc_dec8x_tail:                            # up to 7 "words" tail...
976         vncipher        $out1,$out1,v24
977         vncipher        $out2,$out2,v24
978         vncipher        $out3,$out3,v24
979         vncipher        $out4,$out4,v24
980         vncipher        $out5,$out5,v24
981         vncipher        $out6,$out6,v24
982         vncipher        $out7,$out7,v24
983         lvx             v24,$x20,$key_          # round[3]
984         addi            $key_,$key_,0x20
985
986         vncipher        $out1,$out1,v25
987         vncipher        $out2,$out2,v25
988         vncipher        $out3,$out3,v25
989         vncipher        $out4,$out4,v25
990         vncipher        $out5,$out5,v25
991         vncipher        $out6,$out6,v25
992         vncipher        $out7,$out7,v25
993         lvx             v25,$x10,$key_          # round[4]
994         bdnz            Loop_cbc_dec8x_tail
995
996         vncipher        $out1,$out1,v24
997         vncipher        $out2,$out2,v24
998         vncipher        $out3,$out3,v24
999         vncipher        $out4,$out4,v24
1000         vncipher        $out5,$out5,v24
1001         vncipher        $out6,$out6,v24
1002         vncipher        $out7,$out7,v24
1003
1004         vncipher        $out1,$out1,v25
1005         vncipher        $out2,$out2,v25
1006         vncipher        $out3,$out3,v25
1007         vncipher        $out4,$out4,v25
1008         vncipher        $out5,$out5,v25
1009         vncipher        $out6,$out6,v25
1010         vncipher        $out7,$out7,v25
1011
1012         vncipher        $out1,$out1,v26
1013         vncipher        $out2,$out2,v26
1014         vncipher        $out3,$out3,v26
1015         vncipher        $out4,$out4,v26
1016         vncipher        $out5,$out5,v26
1017         vncipher        $out6,$out6,v26
1018         vncipher        $out7,$out7,v26
1019
1020         vncipher        $out1,$out1,v27
1021         vncipher        $out2,$out2,v27
1022         vncipher        $out3,$out3,v27
1023         vncipher        $out4,$out4,v27
1024         vncipher        $out5,$out5,v27
1025         vncipher        $out6,$out6,v27
1026         vncipher        $out7,$out7,v27
1027
1028         vncipher        $out1,$out1,v28
1029         vncipher        $out2,$out2,v28
1030         vncipher        $out3,$out3,v28
1031         vncipher        $out4,$out4,v28
1032         vncipher        $out5,$out5,v28
1033         vncipher        $out6,$out6,v28
1034         vncipher        $out7,$out7,v28
1035
1036         vncipher        $out1,$out1,v29
1037         vncipher        $out2,$out2,v29
1038         vncipher        $out3,$out3,v29
1039         vncipher        $out4,$out4,v29
1040         vncipher        $out5,$out5,v29
1041         vncipher        $out6,$out6,v29
1042         vncipher        $out7,$out7,v29
1043
1044         vncipher        $out1,$out1,v30
1045          vxor           $ivec,$ivec,v31         # last round key
1046         vncipher        $out2,$out2,v30
1047          vxor           $in1,$in1,v31
1048         vncipher        $out3,$out3,v30
1049          vxor           $in2,$in2,v31
1050         vncipher        $out4,$out4,v30
1051          vxor           $in3,$in3,v31
1052         vncipher        $out5,$out5,v30
1053          vxor           $in4,$in4,v31
1054         vncipher        $out6,$out6,v30
1055          vxor           $in5,$in5,v31
1056         vncipher        $out7,$out7,v30
1057          vxor           $in6,$in6,v31
1058
1059         cmplwi          $len,32                 # switch($len)
1060         blt             Lcbc_dec8x_one
1061         nop
1062         beq             Lcbc_dec8x_two
1063         cmplwi          $len,64
1064         blt             Lcbc_dec8x_three
1065         nop
1066         beq             Lcbc_dec8x_four
1067         cmplwi          $len,96
1068         blt             Lcbc_dec8x_five
1069         nop
1070         beq             Lcbc_dec8x_six
1071
1072 Lcbc_dec8x_seven:
1073         vncipherlast    $out1,$out1,$ivec
1074         vncipherlast    $out2,$out2,$in1
1075         vncipherlast    $out3,$out3,$in2
1076         vncipherlast    $out4,$out4,$in3
1077         vncipherlast    $out5,$out5,$in4
1078         vncipherlast    $out6,$out6,$in5
1079         vncipherlast    $out7,$out7,$in6
1080         vmr             $ivec,$in7
1081
1082         le?vperm        $out1,$out1,$out1,$inpperm
1083         le?vperm        $out2,$out2,$out2,$inpperm
1084         stvx_u          $out1,$x00,$out
1085         le?vperm        $out3,$out3,$out3,$inpperm
1086         stvx_u          $out2,$x10,$out
1087         le?vperm        $out4,$out4,$out4,$inpperm
1088         stvx_u          $out3,$x20,$out
1089         le?vperm        $out5,$out5,$out5,$inpperm
1090         stvx_u          $out4,$x30,$out
1091         le?vperm        $out6,$out6,$out6,$inpperm
1092         stvx_u          $out5,$x40,$out
1093         le?vperm        $out7,$out7,$out7,$inpperm
1094         stvx_u          $out6,$x50,$out
1095         stvx_u          $out7,$x60,$out
1096         addi            $out,$out,0x70
1097         b               Lcbc_dec8x_done
1098
1099 .align  5
1100 Lcbc_dec8x_six:
1101         vncipherlast    $out2,$out2,$ivec
1102         vncipherlast    $out3,$out3,$in2
1103         vncipherlast    $out4,$out4,$in3
1104         vncipherlast    $out5,$out5,$in4
1105         vncipherlast    $out6,$out6,$in5
1106         vncipherlast    $out7,$out7,$in6
1107         vmr             $ivec,$in7
1108
1109         le?vperm        $out2,$out2,$out2,$inpperm
1110         le?vperm        $out3,$out3,$out3,$inpperm
1111         stvx_u          $out2,$x00,$out
1112         le?vperm        $out4,$out4,$out4,$inpperm
1113         stvx_u          $out3,$x10,$out
1114         le?vperm        $out5,$out5,$out5,$inpperm
1115         stvx_u          $out4,$x20,$out
1116         le?vperm        $out6,$out6,$out6,$inpperm
1117         stvx_u          $out5,$x30,$out
1118         le?vperm        $out7,$out7,$out7,$inpperm
1119         stvx_u          $out6,$x40,$out
1120         stvx_u          $out7,$x50,$out
1121         addi            $out,$out,0x60
1122         b               Lcbc_dec8x_done
1123
1124 .align  5
1125 Lcbc_dec8x_five:
1126         vncipherlast    $out3,$out3,$ivec
1127         vncipherlast    $out4,$out4,$in3
1128         vncipherlast    $out5,$out5,$in4
1129         vncipherlast    $out6,$out6,$in5
1130         vncipherlast    $out7,$out7,$in6
1131         vmr             $ivec,$in7
1132
1133         le?vperm        $out3,$out3,$out3,$inpperm
1134         le?vperm        $out4,$out4,$out4,$inpperm
1135         stvx_u          $out3,$x00,$out
1136         le?vperm        $out5,$out5,$out5,$inpperm
1137         stvx_u          $out4,$x10,$out
1138         le?vperm        $out6,$out6,$out6,$inpperm
1139         stvx_u          $out5,$x20,$out
1140         le?vperm        $out7,$out7,$out7,$inpperm
1141         stvx_u          $out6,$x30,$out
1142         stvx_u          $out7,$x40,$out
1143         addi            $out,$out,0x50
1144         b               Lcbc_dec8x_done
1145
1146 .align  5
1147 Lcbc_dec8x_four:
1148         vncipherlast    $out4,$out4,$ivec
1149         vncipherlast    $out5,$out5,$in4
1150         vncipherlast    $out6,$out6,$in5
1151         vncipherlast    $out7,$out7,$in6
1152         vmr             $ivec,$in7
1153
1154         le?vperm        $out4,$out4,$out4,$inpperm
1155         le?vperm        $out5,$out5,$out5,$inpperm
1156         stvx_u          $out4,$x00,$out
1157         le?vperm        $out6,$out6,$out6,$inpperm
1158         stvx_u          $out5,$x10,$out
1159         le?vperm        $out7,$out7,$out7,$inpperm
1160         stvx_u          $out6,$x20,$out
1161         stvx_u          $out7,$x30,$out
1162         addi            $out,$out,0x40
1163         b               Lcbc_dec8x_done
1164
1165 .align  5
1166 Lcbc_dec8x_three:
1167         vncipherlast    $out5,$out5,$ivec
1168         vncipherlast    $out6,$out6,$in5
1169         vncipherlast    $out7,$out7,$in6
1170         vmr             $ivec,$in7
1171
1172         le?vperm        $out5,$out5,$out5,$inpperm
1173         le?vperm        $out6,$out6,$out6,$inpperm
1174         stvx_u          $out5,$x00,$out
1175         le?vperm        $out7,$out7,$out7,$inpperm
1176         stvx_u          $out6,$x10,$out
1177         stvx_u          $out7,$x20,$out
1178         addi            $out,$out,0x30
1179         b               Lcbc_dec8x_done
1180
1181 .align  5
1182 Lcbc_dec8x_two:
1183         vncipherlast    $out6,$out6,$ivec
1184         vncipherlast    $out7,$out7,$in6
1185         vmr             $ivec,$in7
1186
1187         le?vperm        $out6,$out6,$out6,$inpperm
1188         le?vperm        $out7,$out7,$out7,$inpperm
1189         stvx_u          $out6,$x00,$out
1190         stvx_u          $out7,$x10,$out
1191         addi            $out,$out,0x20
1192         b               Lcbc_dec8x_done
1193
1194 .align  5
1195 Lcbc_dec8x_one:
1196         vncipherlast    $out7,$out7,$ivec
1197         vmr             $ivec,$in7
1198
1199         le?vperm        $out7,$out7,$out7,$inpperm
1200         stvx_u          $out7,0,$out
1201         addi            $out,$out,0x10
1202
1203 Lcbc_dec8x_done:
1204         le?vperm        $ivec,$ivec,$ivec,$inpperm
1205         stvx_u          $ivec,0,$ivp            # write [unaligned] iv
1206
1207         li              r10,`$FRAME+15`
1208         li              r11,`$FRAME+31`
1209         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1210         addi            r10,r10,32
1211         stvx            $inpperm,r11,$sp
1212         addi            r11,r11,32
1213         stvx            $inpperm,r10,$sp
1214         addi            r10,r10,32
1215         stvx            $inpperm,r11,$sp
1216         addi            r11,r11,32
1217         stvx            $inpperm,r10,$sp
1218         addi            r10,r10,32
1219         stvx            $inpperm,r11,$sp
1220         addi            r11,r11,32
1221         stvx            $inpperm,r10,$sp
1222         addi            r10,r10,32
1223         stvx            $inpperm,r11,$sp
1224         addi            r11,r11,32
1225
1226         mtspr           256,$vrsave
1227         lvx             v20,r10,$sp             # ABI says so
1228         addi            r10,r10,32
1229         lvx             v21,r11,$sp
1230         addi            r11,r11,32
1231         lvx             v22,r10,$sp
1232         addi            r10,r10,32
1233         lvx             v23,r11,$sp
1234         addi            r11,r11,32
1235         lvx             v24,r10,$sp
1236         addi            r10,r10,32
1237         lvx             v25,r11,$sp
1238         addi            r11,r11,32
1239         lvx             v26,r10,$sp
1240         addi            r10,r10,32
1241         lvx             v27,r11,$sp
1242         addi            r11,r11,32
1243         lvx             v28,r10,$sp
1244         addi            r10,r10,32
1245         lvx             v29,r11,$sp
1246         addi            r11,r11,32
1247         lvx             v30,r10,$sp
1248         lvx             v31,r11,$sp
1249         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1250         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1251         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1252         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1253         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1254         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1255         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1256         blr
1257         .long           0
1258         .byte           0,12,0x04,0,0x80,6,6,0
1259         .long           0
1260 .size   .${prefix}_cbc_encrypt,.-.${prefix}_cbc_encrypt
1261 ___
1262 }}      }}}
1263
1264 #########################################################################
1265 {{{     # CTR procedure[s]                                              #
1266 my ($inp,$out,$len,$key,$ivp,$x10,$rounds,$idx)=map("r$_",(3..10));
1267 my ($rndkey0,$rndkey1,$inout,$tmp)=             map("v$_",(0..3));
1268 my ($ivec,$inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm,$one)=
1269                                                 map("v$_",(4..11));
1270 my $dat=$tmp;
1271
1272 $code.=<<___;
1273 .globl  .${prefix}_ctr32_encrypt_blocks
1274 .align  5
1275 .${prefix}_ctr32_encrypt_blocks:
1276         ${UCMP}i        $len,1
1277         bltlr-
1278
1279         lis             r0,0xfff0
1280         mfspr           $vrsave,256
1281         mtspr           256,r0
1282
1283         li              $idx,15
1284         vxor            $rndkey0,$rndkey0,$rndkey0
1285         le?vspltisb     $tmp,0x0f
1286
1287         lvx             $ivec,0,$ivp            # load [unaligned] iv
1288         lvsl            $inpperm,0,$ivp
1289         lvx             $inptail,$idx,$ivp
1290          vspltisb       $one,1
1291         le?vxor         $inpperm,$inpperm,$tmp
1292         vperm           $ivec,$ivec,$inptail,$inpperm
1293          vsldoi         $one,$rndkey0,$one,1
1294
1295         neg             r11,$inp
1296         ?lvsl           $keyperm,0,$key         # prepare for unaligned key
1297         lwz             $rounds,240($key)
1298
1299         lvsr            $inpperm,0,r11          # prepare for unaligned load
1300         lvx             $inptail,0,$inp
1301         addi            $inp,$inp,15            # 15 is not typo
1302         le?vxor         $inpperm,$inpperm,$tmp
1303
1304         srwi            $rounds,$rounds,1
1305         li              $idx,16
1306         subi            $rounds,$rounds,1
1307
1308         ${UCMP}i        $len,8
1309         bge             _aesp8_ctr32_encrypt8x
1310
1311         ?lvsr           $outperm,0,$out         # prepare for unaligned store
1312         vspltisb        $outmask,-1
1313         lvx             $outhead,0,$out
1314         ?vperm          $outmask,$rndkey0,$outmask,$outperm
1315         le?vxor         $outperm,$outperm,$tmp
1316
1317         lvx             $rndkey0,0,$key
1318         mtctr           $rounds
1319         lvx             $rndkey1,$idx,$key
1320         addi            $idx,$idx,16
1321         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1322         vxor            $inout,$ivec,$rndkey0
1323         lvx             $rndkey0,$idx,$key
1324         addi            $idx,$idx,16
1325         b               Loop_ctr32_enc
1326
1327 .align  5
1328 Loop_ctr32_enc:
1329         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1330         vcipher         $inout,$inout,$rndkey1
1331         lvx             $rndkey1,$idx,$key
1332         addi            $idx,$idx,16
1333         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1334         vcipher         $inout,$inout,$rndkey0
1335         lvx             $rndkey0,$idx,$key
1336         addi            $idx,$idx,16
1337         bdnz            Loop_ctr32_enc
1338
1339         vadduwm         $ivec,$ivec,$one
1340          vmr            $dat,$inptail
1341          lvx            $inptail,0,$inp
1342          addi           $inp,$inp,16
1343          subic.         $len,$len,1             # blocks--
1344
1345         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1346         vcipher         $inout,$inout,$rndkey1
1347         lvx             $rndkey1,$idx,$key
1348          vperm          $dat,$dat,$inptail,$inpperm
1349          li             $idx,16
1350         ?vperm          $rndkey1,$rndkey0,$rndkey1,$keyperm
1351          lvx            $rndkey0,0,$key
1352         vxor            $dat,$dat,$rndkey1      # last round key
1353         vcipherlast     $inout,$inout,$dat
1354
1355          lvx            $rndkey1,$idx,$key
1356          addi           $idx,$idx,16
1357         vperm           $inout,$inout,$inout,$outperm
1358         vsel            $dat,$outhead,$inout,$outmask
1359          mtctr          $rounds
1360          ?vperm         $rndkey0,$rndkey0,$rndkey1,$keyperm
1361         vmr             $outhead,$inout
1362          vxor           $inout,$ivec,$rndkey0
1363          lvx            $rndkey0,$idx,$key
1364          addi           $idx,$idx,16
1365         stvx            $dat,0,$out
1366         addi            $out,$out,16
1367         bne             Loop_ctr32_enc
1368
1369         addi            $out,$out,-1
1370         lvx             $inout,0,$out           # redundant in aligned case
1371         vsel            $inout,$outhead,$inout,$outmask
1372         stvx            $inout,0,$out
1373
1374         mtspr           256,$vrsave
1375         blr
1376         .long           0
1377         .byte           0,12,0x14,0,0,0,6,0
1378         .long           0
1379 ___
1380 #########################################################################
1381 {{      # Optimized CTR procedure                                       #
1382 my $key_="r11";
1383 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,8,26..31));
1384     $x00=0 if ($flavour =~ /osx/);
1385 my ($in0, $in1, $in2, $in3, $in4, $in5, $in6, $in7 )=map("v$_",(0..3,10,12..14));
1386 my ($out0,$out1,$out2,$out3,$out4,$out5,$out6,$out7)=map("v$_",(15..22));
1387 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
1388                         # v26-v31 last 6 round keys
1389 my ($tmp,$keyperm)=($in3,$in4); # aliases with "caller", redundant assignment
1390 my ($two,$three,$four)=($outhead,$outperm,$outmask);
1391
1392 $code.=<<___;
1393 .align  5
1394 _aesp8_ctr32_encrypt8x:
1395         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
1396         li              r10,`$FRAME+8*16+15`
1397         li              r11,`$FRAME+8*16+31`
1398         stvx            v20,r10,$sp             # ABI says so
1399         addi            r10,r10,32
1400         stvx            v21,r11,$sp
1401         addi            r11,r11,32
1402         stvx            v22,r10,$sp
1403         addi            r10,r10,32
1404         stvx            v23,r11,$sp
1405         addi            r11,r11,32
1406         stvx            v24,r10,$sp
1407         addi            r10,r10,32
1408         stvx            v25,r11,$sp
1409         addi            r11,r11,32
1410         stvx            v26,r10,$sp
1411         addi            r10,r10,32
1412         stvx            v27,r11,$sp
1413         addi            r11,r11,32
1414         stvx            v28,r10,$sp
1415         addi            r10,r10,32
1416         stvx            v29,r11,$sp
1417         addi            r11,r11,32
1418         stvx            v30,r10,$sp
1419         stvx            v31,r11,$sp
1420         li              r0,-1
1421         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
1422         li              $x10,0x10
1423         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1424         li              $x20,0x20
1425         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1426         li              $x30,0x30
1427         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1428         li              $x40,0x40
1429         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1430         li              $x50,0x50
1431         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1432         li              $x60,0x60
1433         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1434         li              $x70,0x70
1435         mtspr           256,r0
1436
1437         subi            $rounds,$rounds,3       # -4 in total
1438
1439         lvx             $rndkey0,$x00,$key      # load key schedule
1440         lvx             v30,$x10,$key
1441         addi            $key,$key,0x20
1442         lvx             v31,$x00,$key
1443         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
1444         addi            $key_,$sp,$FRAME+15
1445         mtctr           $rounds
1446
1447 Load_ctr32_enc_key:
1448         ?vperm          v24,v30,v31,$keyperm
1449         lvx             v30,$x10,$key
1450         addi            $key,$key,0x20
1451         stvx            v24,$x00,$key_          # off-load round[1]
1452         ?vperm          v25,v31,v30,$keyperm
1453         lvx             v31,$x00,$key
1454         stvx            v25,$x10,$key_          # off-load round[2]
1455         addi            $key_,$key_,0x20
1456         bdnz            Load_ctr32_enc_key
1457
1458         lvx             v26,$x10,$key
1459         ?vperm          v24,v30,v31,$keyperm
1460         lvx             v27,$x20,$key
1461         stvx            v24,$x00,$key_          # off-load round[3]
1462         ?vperm          v25,v31,v26,$keyperm
1463         lvx             v28,$x30,$key
1464         stvx            v25,$x10,$key_          # off-load round[4]
1465         addi            $key_,$sp,$FRAME+15     # rewind $key_
1466         ?vperm          v26,v26,v27,$keyperm
1467         lvx             v29,$x40,$key
1468         ?vperm          v27,v27,v28,$keyperm
1469         lvx             v30,$x50,$key
1470         ?vperm          v28,v28,v29,$keyperm
1471         lvx             v31,$x60,$key
1472         ?vperm          v29,v29,v30,$keyperm
1473         lvx             $out0,$x70,$key         # borrow $out0
1474         ?vperm          v30,v30,v31,$keyperm
1475         lvx             v24,$x00,$key_          # pre-load round[1]
1476         ?vperm          v31,v31,$out0,$keyperm
1477         lvx             v25,$x10,$key_          # pre-load round[2]
1478
1479         vadduwm         $two,$one,$one
1480         subi            $inp,$inp,15            # undo "caller"
1481         $SHL            $len,$len,4
1482
1483         vadduwm         $out1,$ivec,$one        # counter values ...
1484         vadduwm         $out2,$ivec,$two
1485         vxor            $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1486          le?li          $idx,8
1487         vadduwm         $out3,$out1,$two
1488         vxor            $out1,$out1,$rndkey0
1489          le?lvsl        $inpperm,0,$idx
1490         vadduwm         $out4,$out2,$two
1491         vxor            $out2,$out2,$rndkey0
1492          le?vspltisb    $tmp,0x0f
1493         vadduwm         $out5,$out3,$two
1494         vxor            $out3,$out3,$rndkey0
1495          le?vxor        $inpperm,$inpperm,$tmp  # transform for lvx_u/stvx_u
1496         vadduwm         $out6,$out4,$two
1497         vxor            $out4,$out4,$rndkey0
1498         vadduwm         $out7,$out5,$two
1499         vxor            $out5,$out5,$rndkey0
1500         vadduwm         $ivec,$out6,$two        # next counter value
1501         vxor            $out6,$out6,$rndkey0
1502         vxor            $out7,$out7,$rndkey0
1503
1504         mtctr           $rounds
1505         b               Loop_ctr32_enc8x
1506 .align  5
1507 Loop_ctr32_enc8x:
1508         vcipher         $out0,$out0,v24
1509         vcipher         $out1,$out1,v24
1510         vcipher         $out2,$out2,v24
1511         vcipher         $out3,$out3,v24
1512         vcipher         $out4,$out4,v24
1513         vcipher         $out5,$out5,v24
1514         vcipher         $out6,$out6,v24
1515         vcipher         $out7,$out7,v24
1516 Loop_ctr32_enc8x_middle:
1517         lvx             v24,$x20,$key_          # round[3]
1518         addi            $key_,$key_,0x20
1519
1520         vcipher         $out0,$out0,v25
1521         vcipher         $out1,$out1,v25
1522         vcipher         $out2,$out2,v25
1523         vcipher         $out3,$out3,v25
1524         vcipher         $out4,$out4,v25
1525         vcipher         $out5,$out5,v25
1526         vcipher         $out6,$out6,v25
1527         vcipher         $out7,$out7,v25
1528         lvx             v25,$x10,$key_          # round[4]
1529         bdnz            Loop_ctr32_enc8x
1530
1531         subic           r11,$len,256            # $len-256, borrow $key_
1532         vcipher         $out0,$out0,v24
1533         vcipher         $out1,$out1,v24
1534         vcipher         $out2,$out2,v24
1535         vcipher         $out3,$out3,v24
1536         vcipher         $out4,$out4,v24
1537         vcipher         $out5,$out5,v24
1538         vcipher         $out6,$out6,v24
1539         vcipher         $out7,$out7,v24
1540
1541         subfe           r0,r0,r0                # borrow?-1:0
1542         vcipher         $out0,$out0,v25
1543         vcipher         $out1,$out1,v25
1544         vcipher         $out2,$out2,v25
1545         vcipher         $out3,$out3,v25
1546         vcipher         $out4,$out4,v25
1547         vcipher         $out5,$out5,v25
1548         vcipher         $out6,$out6,v25
1549         vcipher         $out7,$out7,v25
1550
1551         and             r0,r0,r11
1552         addi            $key_,$sp,$FRAME+15     # rewind $key_
1553         vcipher         $out0,$out0,v26
1554         vcipher         $out1,$out1,v26
1555         vcipher         $out2,$out2,v26
1556         vcipher         $out3,$out3,v26
1557         vcipher         $out4,$out4,v26
1558         vcipher         $out5,$out5,v26
1559         vcipher         $out6,$out6,v26
1560         vcipher         $out7,$out7,v26
1561         lvx             v24,$x00,$key_          # re-pre-load round[1]
1562
1563         subic           $len,$len,129           # $len-=129
1564         vcipher         $out0,$out0,v27
1565         addi            $len,$len,1             # $len-=128 really
1566         vcipher         $out1,$out1,v27
1567         vcipher         $out2,$out2,v27
1568         vcipher         $out3,$out3,v27
1569         vcipher         $out4,$out4,v27
1570         vcipher         $out5,$out5,v27
1571         vcipher         $out6,$out6,v27
1572         vcipher         $out7,$out7,v27
1573         lvx             v25,$x10,$key_          # re-pre-load round[2]
1574
1575         vcipher         $out0,$out0,v28
1576          lvx_u          $in0,$x00,$inp          # load input
1577         vcipher         $out1,$out1,v28
1578          lvx_u          $in1,$x10,$inp
1579         vcipher         $out2,$out2,v28
1580          lvx_u          $in2,$x20,$inp
1581         vcipher         $out3,$out3,v28
1582          lvx_u          $in3,$x30,$inp
1583         vcipher         $out4,$out4,v28
1584          lvx_u          $in4,$x40,$inp
1585         vcipher         $out5,$out5,v28
1586          lvx_u          $in5,$x50,$inp
1587         vcipher         $out6,$out6,v28
1588          lvx_u          $in6,$x60,$inp
1589         vcipher         $out7,$out7,v28
1590          lvx_u          $in7,$x70,$inp
1591          addi           $inp,$inp,0x80
1592
1593         vcipher         $out0,$out0,v29
1594          le?vperm       $in0,$in0,$in0,$inpperm
1595         vcipher         $out1,$out1,v29
1596          le?vperm       $in1,$in1,$in1,$inpperm
1597         vcipher         $out2,$out2,v29
1598          le?vperm       $in2,$in2,$in2,$inpperm
1599         vcipher         $out3,$out3,v29
1600          le?vperm       $in3,$in3,$in3,$inpperm
1601         vcipher         $out4,$out4,v29
1602          le?vperm       $in4,$in4,$in4,$inpperm
1603         vcipher         $out5,$out5,v29
1604          le?vperm       $in5,$in5,$in5,$inpperm
1605         vcipher         $out6,$out6,v29
1606          le?vperm       $in6,$in6,$in6,$inpperm
1607         vcipher         $out7,$out7,v29
1608          le?vperm       $in7,$in7,$in7,$inpperm
1609
1610         add             $inp,$inp,r0            # $inp is adjusted in such
1611                                                 # way that at exit from the
1612                                                 # loop inX-in7 are loaded
1613                                                 # with last "words"
1614         subfe.          r0,r0,r0                # borrow?-1:0
1615         vcipher         $out0,$out0,v30
1616          vxor           $in0,$in0,v31           # xor with last round key
1617         vcipher         $out1,$out1,v30
1618          vxor           $in1,$in1,v31
1619         vcipher         $out2,$out2,v30
1620          vxor           $in2,$in2,v31
1621         vcipher         $out3,$out3,v30
1622          vxor           $in3,$in3,v31
1623         vcipher         $out4,$out4,v30
1624          vxor           $in4,$in4,v31
1625         vcipher         $out5,$out5,v30
1626          vxor           $in5,$in5,v31
1627         vcipher         $out6,$out6,v30
1628          vxor           $in6,$in6,v31
1629         vcipher         $out7,$out7,v30
1630          vxor           $in7,$in7,v31
1631
1632         bne             Lctr32_enc8x_break      # did $len-129 borrow?
1633
1634         vcipherlast     $in0,$out0,$in0
1635         vcipherlast     $in1,$out1,$in1
1636          vadduwm        $out1,$ivec,$one        # counter values ...
1637         vcipherlast     $in2,$out2,$in2
1638          vadduwm        $out2,$ivec,$two
1639          vxor           $out0,$ivec,$rndkey0    # ... xored with rndkey[0]
1640         vcipherlast     $in3,$out3,$in3
1641          vadduwm        $out3,$out1,$two
1642          vxor           $out1,$out1,$rndkey0
1643         vcipherlast     $in4,$out4,$in4
1644          vadduwm        $out4,$out2,$two
1645          vxor           $out2,$out2,$rndkey0
1646         vcipherlast     $in5,$out5,$in5
1647          vadduwm        $out5,$out3,$two
1648          vxor           $out3,$out3,$rndkey0
1649         vcipherlast     $in6,$out6,$in6
1650          vadduwm        $out6,$out4,$two
1651          vxor           $out4,$out4,$rndkey0
1652         vcipherlast     $in7,$out7,$in7
1653          vadduwm        $out7,$out5,$two
1654          vxor           $out5,$out5,$rndkey0
1655         le?vperm        $in0,$in0,$in0,$inpperm
1656          vadduwm        $ivec,$out6,$two        # next counter value
1657          vxor           $out6,$out6,$rndkey0
1658         le?vperm        $in1,$in1,$in1,$inpperm
1659          vxor           $out7,$out7,$rndkey0
1660         mtctr           $rounds
1661
1662          vcipher        $out0,$out0,v24
1663         stvx_u          $in0,$x00,$out
1664         le?vperm        $in2,$in2,$in2,$inpperm
1665          vcipher        $out1,$out1,v24
1666         stvx_u          $in1,$x10,$out
1667         le?vperm        $in3,$in3,$in3,$inpperm
1668          vcipher        $out2,$out2,v24
1669         stvx_u          $in2,$x20,$out
1670         le?vperm        $in4,$in4,$in4,$inpperm
1671          vcipher        $out3,$out3,v24
1672         stvx_u          $in3,$x30,$out
1673         le?vperm        $in5,$in5,$in5,$inpperm
1674          vcipher        $out4,$out4,v24
1675         stvx_u          $in4,$x40,$out
1676         le?vperm        $in6,$in6,$in6,$inpperm
1677          vcipher        $out5,$out5,v24
1678         stvx_u          $in5,$x50,$out
1679         le?vperm        $in7,$in7,$in7,$inpperm
1680          vcipher        $out6,$out6,v24
1681         stvx_u          $in6,$x60,$out
1682          vcipher        $out7,$out7,v24
1683         stvx_u          $in7,$x70,$out
1684         addi            $out,$out,0x80
1685
1686         b               Loop_ctr32_enc8x_middle
1687
1688 .align  5
1689 Lctr32_enc8x_break:
1690         cmpwi           $len,-0x60
1691         blt             Lctr32_enc8x_one
1692         nop
1693         beq             Lctr32_enc8x_two
1694         cmpwi           $len,-0x40
1695         blt             Lctr32_enc8x_three
1696         nop
1697         beq             Lctr32_enc8x_four
1698         cmpwi           $len,-0x20
1699         blt             Lctr32_enc8x_five
1700         nop
1701         beq             Lctr32_enc8x_six
1702         cmpwi           $len,0x00
1703         blt             Lctr32_enc8x_seven
1704
1705 Lctr32_enc8x_eight:
1706         vcipherlast     $out0,$out0,$in0
1707         vcipherlast     $out1,$out1,$in1
1708         vcipherlast     $out2,$out2,$in2
1709         vcipherlast     $out3,$out3,$in3
1710         vcipherlast     $out4,$out4,$in4
1711         vcipherlast     $out5,$out5,$in5
1712         vcipherlast     $out6,$out6,$in6
1713         vcipherlast     $out7,$out7,$in7
1714
1715         le?vperm        $out0,$out0,$out0,$inpperm
1716         le?vperm        $out1,$out1,$out1,$inpperm
1717         stvx_u          $out0,$x00,$out
1718         le?vperm        $out2,$out2,$out2,$inpperm
1719         stvx_u          $out1,$x10,$out
1720         le?vperm        $out3,$out3,$out3,$inpperm
1721         stvx_u          $out2,$x20,$out
1722         le?vperm        $out4,$out4,$out4,$inpperm
1723         stvx_u          $out3,$x30,$out
1724         le?vperm        $out5,$out5,$out5,$inpperm
1725         stvx_u          $out4,$x40,$out
1726         le?vperm        $out6,$out6,$out6,$inpperm
1727         stvx_u          $out5,$x50,$out
1728         le?vperm        $out7,$out7,$out7,$inpperm
1729         stvx_u          $out6,$x60,$out
1730         stvx_u          $out7,$x70,$out
1731         addi            $out,$out,0x80
1732         b               Lctr32_enc8x_done
1733
1734 .align  5
1735 Lctr32_enc8x_seven:
1736         vcipherlast     $out0,$out0,$in1
1737         vcipherlast     $out1,$out1,$in2
1738         vcipherlast     $out2,$out2,$in3
1739         vcipherlast     $out3,$out3,$in4
1740         vcipherlast     $out4,$out4,$in5
1741         vcipherlast     $out5,$out5,$in6
1742         vcipherlast     $out6,$out6,$in7
1743
1744         le?vperm        $out0,$out0,$out0,$inpperm
1745         le?vperm        $out1,$out1,$out1,$inpperm
1746         stvx_u          $out0,$x00,$out
1747         le?vperm        $out2,$out2,$out2,$inpperm
1748         stvx_u          $out1,$x10,$out
1749         le?vperm        $out3,$out3,$out3,$inpperm
1750         stvx_u          $out2,$x20,$out
1751         le?vperm        $out4,$out4,$out4,$inpperm
1752         stvx_u          $out3,$x30,$out
1753         le?vperm        $out5,$out5,$out5,$inpperm
1754         stvx_u          $out4,$x40,$out
1755         le?vperm        $out6,$out6,$out6,$inpperm
1756         stvx_u          $out5,$x50,$out
1757         stvx_u          $out6,$x60,$out
1758         addi            $out,$out,0x70
1759         b               Lctr32_enc8x_done
1760
1761 .align  5
1762 Lctr32_enc8x_six:
1763         vcipherlast     $out0,$out0,$in2
1764         vcipherlast     $out1,$out1,$in3
1765         vcipherlast     $out2,$out2,$in4
1766         vcipherlast     $out3,$out3,$in5
1767         vcipherlast     $out4,$out4,$in6
1768         vcipherlast     $out5,$out5,$in7
1769
1770         le?vperm        $out0,$out0,$out0,$inpperm
1771         le?vperm        $out1,$out1,$out1,$inpperm
1772         stvx_u          $out0,$x00,$out
1773         le?vperm        $out2,$out2,$out2,$inpperm
1774         stvx_u          $out1,$x10,$out
1775         le?vperm        $out3,$out3,$out3,$inpperm
1776         stvx_u          $out2,$x20,$out
1777         le?vperm        $out4,$out4,$out4,$inpperm
1778         stvx_u          $out3,$x30,$out
1779         le?vperm        $out5,$out5,$out5,$inpperm
1780         stvx_u          $out4,$x40,$out
1781         stvx_u          $out5,$x50,$out
1782         addi            $out,$out,0x60
1783         b               Lctr32_enc8x_done
1784
1785 .align  5
1786 Lctr32_enc8x_five:
1787         vcipherlast     $out0,$out0,$in3
1788         vcipherlast     $out1,$out1,$in4
1789         vcipherlast     $out2,$out2,$in5
1790         vcipherlast     $out3,$out3,$in6
1791         vcipherlast     $out4,$out4,$in7
1792
1793         le?vperm        $out0,$out0,$out0,$inpperm
1794         le?vperm        $out1,$out1,$out1,$inpperm
1795         stvx_u          $out0,$x00,$out
1796         le?vperm        $out2,$out2,$out2,$inpperm
1797         stvx_u          $out1,$x10,$out
1798         le?vperm        $out3,$out3,$out3,$inpperm
1799         stvx_u          $out2,$x20,$out
1800         le?vperm        $out4,$out4,$out4,$inpperm
1801         stvx_u          $out3,$x30,$out
1802         stvx_u          $out4,$x40,$out
1803         addi            $out,$out,0x50
1804         b               Lctr32_enc8x_done
1805
1806 .align  5
1807 Lctr32_enc8x_four:
1808         vcipherlast     $out0,$out0,$in4
1809         vcipherlast     $out1,$out1,$in5
1810         vcipherlast     $out2,$out2,$in6
1811         vcipherlast     $out3,$out3,$in7
1812
1813         le?vperm        $out0,$out0,$out0,$inpperm
1814         le?vperm        $out1,$out1,$out1,$inpperm
1815         stvx_u          $out0,$x00,$out
1816         le?vperm        $out2,$out2,$out2,$inpperm
1817         stvx_u          $out1,$x10,$out
1818         le?vperm        $out3,$out3,$out3,$inpperm
1819         stvx_u          $out2,$x20,$out
1820         stvx_u          $out3,$x30,$out
1821         addi            $out,$out,0x40
1822         b               Lctr32_enc8x_done
1823
1824 .align  5
1825 Lctr32_enc8x_three:
1826         vcipherlast     $out0,$out0,$in5
1827         vcipherlast     $out1,$out1,$in6
1828         vcipherlast     $out2,$out2,$in7
1829
1830         le?vperm        $out0,$out0,$out0,$inpperm
1831         le?vperm        $out1,$out1,$out1,$inpperm
1832         stvx_u          $out0,$x00,$out
1833         le?vperm        $out2,$out2,$out2,$inpperm
1834         stvx_u          $out1,$x10,$out
1835         stvx_u          $out2,$x20,$out
1836         addi            $out,$out,0x30
1837         b               Lctr32_enc8x_done
1838
1839 .align  5
1840 Lctr32_enc8x_two:
1841         vcipherlast     $out0,$out0,$in6
1842         vcipherlast     $out1,$out1,$in7
1843
1844         le?vperm        $out0,$out0,$out0,$inpperm
1845         le?vperm        $out1,$out1,$out1,$inpperm
1846         stvx_u          $out0,$x00,$out
1847         stvx_u          $out1,$x10,$out
1848         addi            $out,$out,0x20
1849         b               Lctr32_enc8x_done
1850
1851 .align  5
1852 Lctr32_enc8x_one:
1853         vcipherlast     $out0,$out0,$in7
1854
1855         le?vperm        $out0,$out0,$out0,$inpperm
1856         stvx_u          $out0,0,$out
1857         addi            $out,$out,0x10
1858
1859 Lctr32_enc8x_done:
1860         li              r10,`$FRAME+15`
1861         li              r11,`$FRAME+31`
1862         stvx            $inpperm,r10,$sp        # wipe copies of round keys
1863         addi            r10,r10,32
1864         stvx            $inpperm,r11,$sp
1865         addi            r11,r11,32
1866         stvx            $inpperm,r10,$sp
1867         addi            r10,r10,32
1868         stvx            $inpperm,r11,$sp
1869         addi            r11,r11,32
1870         stvx            $inpperm,r10,$sp
1871         addi            r10,r10,32
1872         stvx            $inpperm,r11,$sp
1873         addi            r11,r11,32
1874         stvx            $inpperm,r10,$sp
1875         addi            r10,r10,32
1876         stvx            $inpperm,r11,$sp
1877         addi            r11,r11,32
1878
1879         mtspr           256,$vrsave
1880         lvx             v20,r10,$sp             # ABI says so
1881         addi            r10,r10,32
1882         lvx             v21,r11,$sp
1883         addi            r11,r11,32
1884         lvx             v22,r10,$sp
1885         addi            r10,r10,32
1886         lvx             v23,r11,$sp
1887         addi            r11,r11,32
1888         lvx             v24,r10,$sp
1889         addi            r10,r10,32
1890         lvx             v25,r11,$sp
1891         addi            r11,r11,32
1892         lvx             v26,r10,$sp
1893         addi            r10,r10,32
1894         lvx             v27,r11,$sp
1895         addi            r11,r11,32
1896         lvx             v28,r10,$sp
1897         addi            r10,r10,32
1898         lvx             v29,r11,$sp
1899         addi            r11,r11,32
1900         lvx             v30,r10,$sp
1901         lvx             v31,r11,$sp
1902         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
1903         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
1904         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
1905         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
1906         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
1907         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
1908         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
1909         blr
1910         .long           0
1911         .byte           0,12,0x04,0,0x80,6,6,0
1912         .long           0
1913 .size   .${prefix}_ctr32_encrypt_blocks,.-.${prefix}_ctr32_encrypt_blocks
1914 ___
1915 }}      }}}
1916
1917 #########################################################################
1918 {{{     # XTS procedures                                                #
1919 # int aes_p8_xts_[en|de]crypt(const char *inp, char *out, size_t len,   #
1920 #                             const AES_KEY *key1, const AES_KEY *key2, #
1921 #                             [const] unsigned char iv[16]);            #
1922 # If $key2 is NULL, then a "tweak chaining" mode is engaged, in which   #
1923 # input tweak value is assumed to be encrypted already, and last tweak  #
1924 # value, one suitable for consecutive call on same chunk of data, is    #
1925 # written back to original buffer. In addition, in "tweak chaining"     #
1926 # mode only complete input blocks are processed.                        #
1927
1928 my ($inp,$out,$len,$key1,$key2,$ivp,$rounds,$idx) =     map("r$_",(3..10));
1929 my ($rndkey0,$rndkey1,$inout) =                         map("v$_",(0..2));
1930 my ($output,$inptail,$inpperm,$leperm,$keyperm) =       map("v$_",(3..7));
1931 my ($tweak,$seven,$eighty7,$tmp,$tweak1) =              map("v$_",(8..12));
1932 my $taillen = $key2;
1933
1934    ($inp,$idx) = ($idx,$inp);                           # reassign
1935
1936 $code.=<<___;
1937 .globl  .${prefix}_xts_encrypt
1938 .align  5
1939 .${prefix}_xts_encrypt:
1940         mr              $inp,r3                         # reassign
1941         li              r3,-1
1942         ${UCMP}i        $len,16
1943         bltlr-
1944
1945         lis             r0,0xfff0
1946         mfspr           r12,256                         # save vrsave
1947         li              r11,0
1948         mtspr           256,r0
1949
1950         vspltisb        $seven,0x07                     # 0x070707..07
1951         le?lvsl         $leperm,r11,r11
1952         le?vspltisb     $tmp,0x0f
1953         le?vxor         $leperm,$leperm,$seven
1954
1955         li              $idx,15
1956         lvx             $tweak,0,$ivp                   # load [unaligned] iv
1957         lvsl            $inpperm,0,$ivp
1958         lvx             $inptail,$idx,$ivp
1959         le?vxor         $inpperm,$inpperm,$tmp
1960         vperm           $tweak,$tweak,$inptail,$inpperm
1961
1962         neg             r11,$inp
1963         lvsr            $inpperm,0,r11                  # prepare for unaligned load
1964         lvx             $inout,0,$inp
1965         addi            $inp,$inp,15                    # 15 is not typo
1966         le?vxor         $inpperm,$inpperm,$tmp
1967
1968         ${UCMP}i        $key2,0                         # key2==NULL?
1969         beq             Lxts_enc_no_key2
1970
1971         ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
1972         lwz             $rounds,240($key2)
1973         srwi            $rounds,$rounds,1
1974         subi            $rounds,$rounds,1
1975         li              $idx,16
1976
1977         lvx             $rndkey0,0,$key2
1978         lvx             $rndkey1,$idx,$key2
1979         addi            $idx,$idx,16
1980         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1981         vxor            $tweak,$tweak,$rndkey0
1982         lvx             $rndkey0,$idx,$key2
1983         addi            $idx,$idx,16
1984         mtctr           $rounds
1985
1986 Ltweak_xts_enc:
1987         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1988         vcipher         $tweak,$tweak,$rndkey1
1989         lvx             $rndkey1,$idx,$key2
1990         addi            $idx,$idx,16
1991         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
1992         vcipher         $tweak,$tweak,$rndkey0
1993         lvx             $rndkey0,$idx,$key2
1994         addi            $idx,$idx,16
1995         bdnz            Ltweak_xts_enc
1996
1997         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
1998         vcipher         $tweak,$tweak,$rndkey1
1999         lvx             $rndkey1,$idx,$key2
2000         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2001         vcipherlast     $tweak,$tweak,$rndkey0
2002
2003         li              $ivp,0                          # don't chain the tweak
2004         b               Lxts_enc
2005
2006 Lxts_enc_no_key2:
2007         li              $idx,-16
2008         and             $len,$len,$idx                  # in "tweak chaining"
2009                                                         # mode only complete
2010                                                         # blocks are processed
2011 Lxts_enc:
2012         lvx             $inptail,0,$inp
2013         addi            $inp,$inp,16
2014
2015         ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
2016         lwz             $rounds,240($key1)
2017         srwi            $rounds,$rounds,1
2018         subi            $rounds,$rounds,1
2019         li              $idx,16
2020
2021         vslb            $eighty7,$seven,$seven          # 0x808080..80
2022         vor             $eighty7,$eighty7,$seven        # 0x878787..87
2023         vspltisb        $tmp,1                          # 0x010101..01
2024         vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
2025
2026         ${UCMP}i        $len,96
2027         bge             _aesp8_xts_encrypt6x
2028
2029         andi.           $taillen,$len,15
2030         subic           r0,$len,32
2031         subi            $taillen,$taillen,16
2032         subfe           r0,r0,r0
2033         and             r0,r0,$taillen
2034         add             $inp,$inp,r0
2035
2036         lvx             $rndkey0,0,$key1
2037         lvx             $rndkey1,$idx,$key1
2038         addi            $idx,$idx,16
2039         vperm           $inout,$inout,$inptail,$inpperm
2040         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2041         vxor            $inout,$inout,$tweak
2042         vxor            $inout,$inout,$rndkey0
2043         lvx             $rndkey0,$idx,$key1
2044         addi            $idx,$idx,16
2045         mtctr           $rounds
2046         b               Loop_xts_enc
2047
2048 .align  5
2049 Loop_xts_enc:
2050         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2051         vcipher         $inout,$inout,$rndkey1
2052         lvx             $rndkey1,$idx,$key1
2053         addi            $idx,$idx,16
2054         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2055         vcipher         $inout,$inout,$rndkey0
2056         lvx             $rndkey0,$idx,$key1
2057         addi            $idx,$idx,16
2058         bdnz            Loop_xts_enc
2059
2060         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2061         vcipher         $inout,$inout,$rndkey1
2062         lvx             $rndkey1,$idx,$key1
2063         li              $idx,16
2064         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2065         vxor            $rndkey0,$rndkey0,$tweak
2066         vcipherlast     $output,$inout,$rndkey0
2067
2068         le?vperm        $tmp,$output,$output,$leperm
2069         be?nop
2070         le?stvx_u       $tmp,0,$out
2071         be?stvx_u       $output,0,$out
2072         addi            $out,$out,16
2073
2074         subic.          $len,$len,16
2075         beq             Lxts_enc_done
2076
2077         vmr             $inout,$inptail
2078         lvx             $inptail,0,$inp
2079         addi            $inp,$inp,16
2080         lvx             $rndkey0,0,$key1
2081         lvx             $rndkey1,$idx,$key1
2082         addi            $idx,$idx,16
2083
2084         subic           r0,$len,32
2085         subfe           r0,r0,r0
2086         and             r0,r0,$taillen
2087         add             $inp,$inp,r0
2088
2089         vsrab           $tmp,$tweak,$seven              # next tweak value
2090         vaddubm         $tweak,$tweak,$tweak
2091         vsldoi          $tmp,$tmp,$tmp,15
2092         vand            $tmp,$tmp,$eighty7
2093         vxor            $tweak,$tweak,$tmp
2094
2095         vperm           $inout,$inout,$inptail,$inpperm
2096         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2097         vxor            $inout,$inout,$tweak
2098         vxor            $output,$output,$rndkey0        # just in case $len<16
2099         vxor            $inout,$inout,$rndkey0
2100         lvx             $rndkey0,$idx,$key1
2101         addi            $idx,$idx,16
2102
2103         mtctr           $rounds
2104         ${UCMP}i        $len,16
2105         bge             Loop_xts_enc
2106
2107         vxor            $output,$output,$tweak
2108         lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
2109         vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
2110         vspltisb        $tmp,-1
2111         vperm           $inptail,$inptail,$tmp,$inpperm
2112         vsel            $inout,$inout,$output,$inptail
2113
2114         subi            r11,$out,17
2115         subi            $out,$out,16
2116         mtctr           $len
2117         li              $len,16
2118 Loop_xts_enc_steal:
2119         lbzu            r0,1(r11)
2120         stb             r0,16(r11)
2121         bdnz            Loop_xts_enc_steal
2122
2123         mtctr           $rounds
2124         b               Loop_xts_enc                    # one more time...
2125
2126 Lxts_enc_done:
2127         ${UCMP}i        $ivp,0
2128         beq             Lxts_enc_ret
2129
2130         vsrab           $tmp,$tweak,$seven              # next tweak value
2131         vaddubm         $tweak,$tweak,$tweak
2132         vsldoi          $tmp,$tmp,$tmp,15
2133         vand            $tmp,$tmp,$eighty7
2134         vxor            $tweak,$tweak,$tmp
2135
2136         le?vperm        $tweak,$tweak,$tweak,$leperm
2137         stvx_u          $tweak,0,$ivp
2138
2139 Lxts_enc_ret:
2140         mtspr           256,r12                         # restore vrsave
2141         li              r3,0
2142         blr
2143         .long           0
2144         .byte           0,12,0x04,0,0x80,6,6,0
2145         .long           0
2146 .size   .${prefix}_xts_encrypt,.-.${prefix}_xts_encrypt
2147
2148 .globl  .${prefix}_xts_decrypt
2149 .align  5
2150 .${prefix}_xts_decrypt:
2151         mr              $inp,r3                         # reassign
2152         li              r3,-1
2153         ${UCMP}i        $len,16
2154         bltlr-
2155
2156         lis             r0,0xfff8
2157         mfspr           r12,256                         # save vrsave
2158         li              r11,0
2159         mtspr           256,r0
2160
2161         andi.           r0,$len,15
2162         neg             r0,r0
2163         andi.           r0,r0,16
2164         sub             $len,$len,r0
2165
2166         vspltisb        $seven,0x07                     # 0x070707..07
2167         le?lvsl         $leperm,r11,r11
2168         le?vspltisb     $tmp,0x0f
2169         le?vxor         $leperm,$leperm,$seven
2170
2171         li              $idx,15
2172         lvx             $tweak,0,$ivp                   # load [unaligned] iv
2173         lvsl            $inpperm,0,$ivp
2174         lvx             $inptail,$idx,$ivp
2175         le?vxor         $inpperm,$inpperm,$tmp
2176         vperm           $tweak,$tweak,$inptail,$inpperm
2177
2178         neg             r11,$inp
2179         lvsr            $inpperm,0,r11                  # prepare for unaligned load
2180         lvx             $inout,0,$inp
2181         addi            $inp,$inp,15                    # 15 is not typo
2182         le?vxor         $inpperm,$inpperm,$tmp
2183
2184         ${UCMP}i        $key2,0                         # key2==NULL?
2185         beq             Lxts_dec_no_key2
2186
2187         ?lvsl           $keyperm,0,$key2                # prepare for unaligned key
2188         lwz             $rounds,240($key2)
2189         srwi            $rounds,$rounds,1
2190         subi            $rounds,$rounds,1
2191         li              $idx,16
2192
2193         lvx             $rndkey0,0,$key2
2194         lvx             $rndkey1,$idx,$key2
2195         addi            $idx,$idx,16
2196         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2197         vxor            $tweak,$tweak,$rndkey0
2198         lvx             $rndkey0,$idx,$key2
2199         addi            $idx,$idx,16
2200         mtctr           $rounds
2201
2202 Ltweak_xts_dec:
2203         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2204         vcipher         $tweak,$tweak,$rndkey1
2205         lvx             $rndkey1,$idx,$key2
2206         addi            $idx,$idx,16
2207         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2208         vcipher         $tweak,$tweak,$rndkey0
2209         lvx             $rndkey0,$idx,$key2
2210         addi            $idx,$idx,16
2211         bdnz            Ltweak_xts_dec
2212
2213         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2214         vcipher         $tweak,$tweak,$rndkey1
2215         lvx             $rndkey1,$idx,$key2
2216         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2217         vcipherlast     $tweak,$tweak,$rndkey0
2218
2219         li              $ivp,0                          # don't chain the tweak
2220         b               Lxts_dec
2221
2222 Lxts_dec_no_key2:
2223         neg             $idx,$len
2224         andi.           $idx,$idx,15
2225         add             $len,$len,$idx                  # in "tweak chaining"
2226                                                         # mode only complete
2227                                                         # blocks are processed
2228 Lxts_dec:
2229         lvx             $inptail,0,$inp
2230         addi            $inp,$inp,16
2231
2232         ?lvsl           $keyperm,0,$key1                # prepare for unaligned key
2233         lwz             $rounds,240($key1)
2234         srwi            $rounds,$rounds,1
2235         subi            $rounds,$rounds,1
2236         li              $idx,16
2237
2238         vslb            $eighty7,$seven,$seven          # 0x808080..80
2239         vor             $eighty7,$eighty7,$seven        # 0x878787..87
2240         vspltisb        $tmp,1                          # 0x010101..01
2241         vsldoi          $eighty7,$eighty7,$tmp,15       # 0x870101..01
2242
2243         ${UCMP}i        $len,96
2244         bge             _aesp8_xts_decrypt6x
2245
2246         lvx             $rndkey0,0,$key1
2247         lvx             $rndkey1,$idx,$key1
2248         addi            $idx,$idx,16
2249         vperm           $inout,$inout,$inptail,$inpperm
2250         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2251         vxor            $inout,$inout,$tweak
2252         vxor            $inout,$inout,$rndkey0
2253         lvx             $rndkey0,$idx,$key1
2254         addi            $idx,$idx,16
2255         mtctr           $rounds
2256
2257         ${UCMP}i        $len,16
2258         blt             Ltail_xts_dec
2259         be?b            Loop_xts_dec
2260
2261 .align  5
2262 Loop_xts_dec:
2263         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2264         vncipher        $inout,$inout,$rndkey1
2265         lvx             $rndkey1,$idx,$key1
2266         addi            $idx,$idx,16
2267         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2268         vncipher        $inout,$inout,$rndkey0
2269         lvx             $rndkey0,$idx,$key1
2270         addi            $idx,$idx,16
2271         bdnz            Loop_xts_dec
2272
2273         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2274         vncipher        $inout,$inout,$rndkey1
2275         lvx             $rndkey1,$idx,$key1
2276         li              $idx,16
2277         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2278         vxor            $rndkey0,$rndkey0,$tweak
2279         vncipherlast    $output,$inout,$rndkey0
2280
2281         le?vperm        $tmp,$output,$output,$leperm
2282         be?nop
2283         le?stvx_u       $tmp,0,$out
2284         be?stvx_u       $output,0,$out
2285         addi            $out,$out,16
2286
2287         subic.          $len,$len,16
2288         beq             Lxts_dec_done
2289
2290         vmr             $inout,$inptail
2291         lvx             $inptail,0,$inp
2292         addi            $inp,$inp,16
2293         lvx             $rndkey0,0,$key1
2294         lvx             $rndkey1,$idx,$key1
2295         addi            $idx,$idx,16
2296
2297         vsrab           $tmp,$tweak,$seven              # next tweak value
2298         vaddubm         $tweak,$tweak,$tweak
2299         vsldoi          $tmp,$tmp,$tmp,15
2300         vand            $tmp,$tmp,$eighty7
2301         vxor            $tweak,$tweak,$tmp
2302
2303         vperm           $inout,$inout,$inptail,$inpperm
2304         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2305         vxor            $inout,$inout,$tweak
2306         vxor            $inout,$inout,$rndkey0
2307         lvx             $rndkey0,$idx,$key1
2308         addi            $idx,$idx,16
2309
2310         mtctr           $rounds
2311         ${UCMP}i        $len,16
2312         bge             Loop_xts_dec
2313
2314 Ltail_xts_dec:
2315         vsrab           $tmp,$tweak,$seven              # next tweak value
2316         vaddubm         $tweak1,$tweak,$tweak
2317         vsldoi          $tmp,$tmp,$tmp,15
2318         vand            $tmp,$tmp,$eighty7
2319         vxor            $tweak1,$tweak1,$tmp
2320
2321         subi            $inp,$inp,16
2322         add             $inp,$inp,$len
2323
2324         vxor            $inout,$inout,$tweak            # :-(
2325         vxor            $inout,$inout,$tweak1           # :-)
2326
2327 Loop_xts_dec_short:
2328         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2329         vncipher        $inout,$inout,$rndkey1
2330         lvx             $rndkey1,$idx,$key1
2331         addi            $idx,$idx,16
2332         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2333         vncipher        $inout,$inout,$rndkey0
2334         lvx             $rndkey0,$idx,$key1
2335         addi            $idx,$idx,16
2336         bdnz            Loop_xts_dec_short
2337
2338         ?vperm          $rndkey1,$rndkey1,$rndkey0,$keyperm
2339         vncipher        $inout,$inout,$rndkey1
2340         lvx             $rndkey1,$idx,$key1
2341         li              $idx,16
2342         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2343         vxor            $rndkey0,$rndkey0,$tweak1
2344         vncipherlast    $output,$inout,$rndkey0
2345
2346         le?vperm        $tmp,$output,$output,$leperm
2347         be?nop
2348         le?stvx_u       $tmp,0,$out
2349         be?stvx_u       $output,0,$out
2350
2351         vmr             $inout,$inptail
2352         lvx             $inptail,0,$inp
2353         #addi           $inp,$inp,16
2354         lvx             $rndkey0,0,$key1
2355         lvx             $rndkey1,$idx,$key1
2356         addi            $idx,$idx,16
2357         vperm           $inout,$inout,$inptail,$inpperm
2358         ?vperm          $rndkey0,$rndkey0,$rndkey1,$keyperm
2359
2360         lvsr            $inpperm,0,$len                 # $inpperm is no longer needed
2361         vxor            $inptail,$inptail,$inptail      # $inptail is no longer needed
2362         vspltisb        $tmp,-1
2363         vperm           $inptail,$inptail,$tmp,$inpperm
2364         vsel            $inout,$inout,$output,$inptail
2365
2366         vxor            $rndkey0,$rndkey0,$tweak
2367         vxor            $inout,$inout,$rndkey0
2368         lvx             $rndkey0,$idx,$key1
2369         addi            $idx,$idx,16
2370
2371         subi            r11,$out,1
2372         mtctr           $len
2373         li              $len,16
2374 Loop_xts_dec_steal:
2375         lbzu            r0,1(r11)
2376         stb             r0,16(r11)
2377         bdnz            Loop_xts_dec_steal
2378
2379         mtctr           $rounds
2380         b               Loop_xts_dec                    # one more time...
2381
2382 Lxts_dec_done:
2383         ${UCMP}i        $ivp,0
2384         beq             Lxts_dec_ret
2385
2386         vsrab           $tmp,$tweak,$seven              # next tweak value
2387         vaddubm         $tweak,$tweak,$tweak
2388         vsldoi          $tmp,$tmp,$tmp,15
2389         vand            $tmp,$tmp,$eighty7
2390         vxor            $tweak,$tweak,$tmp
2391
2392         le?vperm        $tweak,$tweak,$tweak,$leperm
2393         stvx_u          $tweak,0,$ivp
2394
2395 Lxts_dec_ret:
2396         mtspr           256,r12                         # restore vrsave
2397         li              r3,0
2398         blr
2399         .long           0
2400         .byte           0,12,0x04,0,0x80,6,6,0
2401         .long           0
2402 .size   .${prefix}_xts_decrypt,.-.${prefix}_xts_decrypt
2403 ___
2404 #########################################################################
2405 {{      # Optimized XTS procedures                                      #
2406 my $key_=$key2;
2407 my ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70)=map("r$_",(0,3,26..31));
2408     $x00=0 if ($flavour =~ /osx/);
2409 my ($in0,  $in1,  $in2,  $in3,  $in4,  $in5)=map("v$_",(0..5));
2410 my ($out0, $out1, $out2, $out3, $out4, $out5)=map("v$_",(7,12..16));
2411 my ($twk0, $twk1, $twk2, $twk3, $twk4, $twk5)=map("v$_",(17..22));
2412 my $rndkey0="v23";      # v24-v25 rotating buffer for first found keys
2413                         # v26-v31 last 6 round keys
2414 my ($keyperm)=($out0);  # aliases with "caller", redundant assignment
2415 my $taillen=$x70;
2416
2417 $code.=<<___;
2418 .align  5
2419 _aesp8_xts_encrypt6x:
2420         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
2421         mflr            r11
2422         li              r7,`$FRAME+8*16+15`
2423         li              r3,`$FRAME+8*16+31`
2424         $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
2425         stvx            v20,r7,$sp              # ABI says so
2426         addi            r7,r7,32
2427         stvx            v21,r3,$sp
2428         addi            r3,r3,32
2429         stvx            v22,r7,$sp
2430         addi            r7,r7,32
2431         stvx            v23,r3,$sp
2432         addi            r3,r3,32
2433         stvx            v24,r7,$sp
2434         addi            r7,r7,32
2435         stvx            v25,r3,$sp
2436         addi            r3,r3,32
2437         stvx            v26,r7,$sp
2438         addi            r7,r7,32
2439         stvx            v27,r3,$sp
2440         addi            r3,r3,32
2441         stvx            v28,r7,$sp
2442         addi            r7,r7,32
2443         stvx            v29,r3,$sp
2444         addi            r3,r3,32
2445         stvx            v30,r7,$sp
2446         stvx            v31,r3,$sp
2447         li              r0,-1
2448         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
2449         li              $x10,0x10
2450         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
2451         li              $x20,0x20
2452         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
2453         li              $x30,0x30
2454         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
2455         li              $x40,0x40
2456         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
2457         li              $x50,0x50
2458         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
2459         li              $x60,0x60
2460         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
2461         li              $x70,0x70
2462         mtspr           256,r0
2463
2464         # Reverse eighty7 to 0x010101..87
2465         xxlor           2, 32+$eighty7, 32+$eighty7
2466         vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
2467         xxlor           1, 32+$eighty7, 32+$eighty7
2468
2469         # Load XOR contents. 0xf102132435465768798a9bacbdcedfe
2470         mr              $x70, r6
2471         bl              Lconsts
2472         lxvw4x          0, $x40, r6             # load XOR contents
2473         mr              r6, $x70
2474         li              $x70,0x70
2475
2476         subi            $rounds,$rounds,3       # -4 in total
2477
2478         lvx             $rndkey0,$x00,$key1     # load key schedule
2479         lvx             v30,$x10,$key1
2480         addi            $key1,$key1,0x20
2481         lvx             v31,$x00,$key1
2482         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
2483         addi            $key_,$sp,$FRAME+15
2484         mtctr           $rounds
2485
2486 Load_xts_enc_key:
2487         ?vperm          v24,v30,v31,$keyperm
2488         lvx             v30,$x10,$key1
2489         addi            $key1,$key1,0x20
2490         stvx            v24,$x00,$key_          # off-load round[1]
2491         ?vperm          v25,v31,v30,$keyperm
2492         lvx             v31,$x00,$key1
2493         stvx            v25,$x10,$key_          # off-load round[2]
2494         addi            $key_,$key_,0x20
2495         bdnz            Load_xts_enc_key
2496
2497         lvx             v26,$x10,$key1
2498         ?vperm          v24,v30,v31,$keyperm
2499         lvx             v27,$x20,$key1
2500         stvx            v24,$x00,$key_          # off-load round[3]
2501         ?vperm          v25,v31,v26,$keyperm
2502         lvx             v28,$x30,$key1
2503         stvx            v25,$x10,$key_          # off-load round[4]
2504         addi            $key_,$sp,$FRAME+15     # rewind $key_
2505         ?vperm          v26,v26,v27,$keyperm
2506         lvx             v29,$x40,$key1
2507         ?vperm          v27,v27,v28,$keyperm
2508         lvx             v30,$x50,$key1
2509         ?vperm          v28,v28,v29,$keyperm
2510         lvx             v31,$x60,$key1
2511         ?vperm          v29,v29,v30,$keyperm
2512         lvx             $twk5,$x70,$key1        # borrow $twk5
2513         ?vperm          v30,v30,v31,$keyperm
2514         lvx             v24,$x00,$key_          # pre-load round[1]
2515         ?vperm          v31,v31,$twk5,$keyperm
2516         lvx             v25,$x10,$key_          # pre-load round[2]
2517
2518         # Switch to use the following codes with 0x010101..87 to generate tweak.
2519         #     eighty7 = 0x010101..87
2520         # vsrab         tmp, tweak, seven       # next tweak value, right shift 7 bits
2521         # vand          tmp, tmp, eighty7       # last byte with carry
2522         # vaddubm       tweak, tweak, tweak     # left shift 1 bit (x2)
2523         # xxlor         vsx, 0, 0
2524         # vpermxor      tweak, tweak, tmp, vsx
2525
2526          vperm          $in0,$inout,$inptail,$inpperm
2527          subi           $inp,$inp,31            # undo "caller"
2528         vxor            $twk0,$tweak,$rndkey0
2529         vsrab           $tmp,$tweak,$seven      # next tweak value
2530         vaddubm         $tweak,$tweak,$tweak
2531         vand            $tmp,$tmp,$eighty7
2532          vxor           $out0,$in0,$twk0
2533         xxlor           32+$in1, 0, 0
2534         vpermxor        $tweak, $tweak, $tmp, $in1
2535
2536          lvx_u          $in1,$x10,$inp
2537         vxor            $twk1,$tweak,$rndkey0
2538         vsrab           $tmp,$tweak,$seven      # next tweak value
2539         vaddubm         $tweak,$tweak,$tweak
2540          le?vperm       $in1,$in1,$in1,$leperm
2541         vand            $tmp,$tmp,$eighty7
2542          vxor           $out1,$in1,$twk1
2543         xxlor           32+$in2, 0, 0
2544         vpermxor        $tweak, $tweak, $tmp, $in2
2545
2546          lvx_u          $in2,$x20,$inp
2547          andi.          $taillen,$len,15
2548         vxor            $twk2,$tweak,$rndkey0
2549         vsrab           $tmp,$tweak,$seven      # next tweak value
2550         vaddubm         $tweak,$tweak,$tweak
2551          le?vperm       $in2,$in2,$in2,$leperm
2552         vand            $tmp,$tmp,$eighty7
2553          vxor           $out2,$in2,$twk2
2554         xxlor           32+$in3, 0, 0
2555         vpermxor        $tweak, $tweak, $tmp, $in3
2556
2557          lvx_u          $in3,$x30,$inp
2558          sub            $len,$len,$taillen
2559         vxor            $twk3,$tweak,$rndkey0
2560         vsrab           $tmp,$tweak,$seven      # next tweak value
2561         vaddubm         $tweak,$tweak,$tweak
2562          le?vperm       $in3,$in3,$in3,$leperm
2563         vand            $tmp,$tmp,$eighty7
2564          vxor           $out3,$in3,$twk3
2565         xxlor           32+$in4, 0, 0
2566         vpermxor        $tweak, $tweak, $tmp, $in4
2567
2568          lvx_u          $in4,$x40,$inp
2569          subi           $len,$len,0x60
2570         vxor            $twk4,$tweak,$rndkey0
2571         vsrab           $tmp,$tweak,$seven      # next tweak value
2572         vaddubm         $tweak,$tweak,$tweak
2573          le?vperm       $in4,$in4,$in4,$leperm
2574         vand            $tmp,$tmp,$eighty7
2575          vxor           $out4,$in4,$twk4
2576         xxlor           32+$in5, 0, 0
2577         vpermxor        $tweak, $tweak, $tmp, $in5
2578
2579          lvx_u          $in5,$x50,$inp
2580          addi           $inp,$inp,0x60
2581         vxor            $twk5,$tweak,$rndkey0
2582         vsrab           $tmp,$tweak,$seven      # next tweak value
2583         vaddubm         $tweak,$tweak,$tweak
2584          le?vperm       $in5,$in5,$in5,$leperm
2585         vand            $tmp,$tmp,$eighty7
2586          vxor           $out5,$in5,$twk5
2587         xxlor           32+$in0, 0, 0
2588         vpermxor        $tweak, $tweak, $tmp, $in0
2589
2590         vxor            v31,v31,$rndkey0
2591         mtctr           $rounds
2592         b               Loop_xts_enc6x
2593
2594 .align  5
2595 Loop_xts_enc6x:
2596         vcipher         $out0,$out0,v24
2597         vcipher         $out1,$out1,v24
2598         vcipher         $out2,$out2,v24
2599         vcipher         $out3,$out3,v24
2600         vcipher         $out4,$out4,v24
2601         vcipher         $out5,$out5,v24
2602         lvx             v24,$x20,$key_          # round[3]
2603         addi            $key_,$key_,0x20
2604
2605         vcipher         $out0,$out0,v25
2606         vcipher         $out1,$out1,v25
2607         vcipher         $out2,$out2,v25
2608         vcipher         $out3,$out3,v25
2609         vcipher         $out4,$out4,v25
2610         vcipher         $out5,$out5,v25
2611         lvx             v25,$x10,$key_          # round[4]
2612         bdnz            Loop_xts_enc6x
2613
2614         xxlor           32+$eighty7, 1, 1               # 0x010101..87
2615
2616         subic           $len,$len,96            # $len-=96
2617          vxor           $in0,$twk0,v31          # xor with last round key
2618         vcipher         $out0,$out0,v24
2619         vcipher         $out1,$out1,v24
2620          vsrab          $tmp,$tweak,$seven      # next tweak value
2621          vxor           $twk0,$tweak,$rndkey0
2622          vaddubm        $tweak,$tweak,$tweak
2623         vcipher         $out2,$out2,v24
2624         vcipher         $out3,$out3,v24
2625         vcipher         $out4,$out4,v24
2626         vcipher         $out5,$out5,v24
2627
2628         subfe.          r0,r0,r0                # borrow?-1:0
2629          vand           $tmp,$tmp,$eighty7
2630         vcipher         $out0,$out0,v25
2631         vcipher         $out1,$out1,v25
2632          xxlor          32+$in1, 0, 0
2633          vpermxor       $tweak, $tweak, $tmp, $in1
2634         vcipher         $out2,$out2,v25
2635         vcipher         $out3,$out3,v25
2636          vxor           $in1,$twk1,v31
2637          vsrab          $tmp,$tweak,$seven      # next tweak value
2638          vxor           $twk1,$tweak,$rndkey0
2639         vcipher         $out4,$out4,v25
2640         vcipher         $out5,$out5,v25
2641
2642         and             r0,r0,$len
2643          vaddubm        $tweak,$tweak,$tweak
2644         vcipher         $out0,$out0,v26
2645         vcipher         $out1,$out1,v26
2646          vand           $tmp,$tmp,$eighty7
2647         vcipher         $out2,$out2,v26
2648         vcipher         $out3,$out3,v26
2649          xxlor          32+$in2, 0, 0
2650          vpermxor       $tweak, $tweak, $tmp, $in2
2651         vcipher         $out4,$out4,v26
2652         vcipher         $out5,$out5,v26
2653
2654         add             $inp,$inp,r0            # $inp is adjusted in such
2655                                                 # way that at exit from the
2656                                                 # loop inX-in5 are loaded
2657                                                 # with last "words"
2658          vxor           $in2,$twk2,v31
2659          vsrab          $tmp,$tweak,$seven      # next tweak value
2660          vxor           $twk2,$tweak,$rndkey0
2661          vaddubm        $tweak,$tweak,$tweak
2662         vcipher         $out0,$out0,v27
2663         vcipher         $out1,$out1,v27
2664         vcipher         $out2,$out2,v27
2665         vcipher         $out3,$out3,v27
2666          vand           $tmp,$tmp,$eighty7
2667         vcipher         $out4,$out4,v27
2668         vcipher         $out5,$out5,v27
2669
2670         addi            $key_,$sp,$FRAME+15     # rewind $key_
2671          xxlor          32+$in3, 0, 0
2672          vpermxor       $tweak, $tweak, $tmp, $in3
2673         vcipher         $out0,$out0,v28
2674         vcipher         $out1,$out1,v28
2675          vxor           $in3,$twk3,v31
2676          vsrab          $tmp,$tweak,$seven      # next tweak value
2677          vxor           $twk3,$tweak,$rndkey0
2678         vcipher         $out2,$out2,v28
2679         vcipher         $out3,$out3,v28
2680          vaddubm        $tweak,$tweak,$tweak
2681         vcipher         $out4,$out4,v28
2682         vcipher         $out5,$out5,v28
2683         lvx             v24,$x00,$key_          # re-pre-load round[1]
2684          vand           $tmp,$tmp,$eighty7
2685
2686         vcipher         $out0,$out0,v29
2687         vcipher         $out1,$out1,v29
2688          xxlor          32+$in4, 0, 0
2689          vpermxor       $tweak, $tweak, $tmp, $in4
2690         vcipher         $out2,$out2,v29
2691         vcipher         $out3,$out3,v29
2692          vxor           $in4,$twk4,v31
2693          vsrab          $tmp,$tweak,$seven      # next tweak value
2694          vxor           $twk4,$tweak,$rndkey0
2695         vcipher         $out4,$out4,v29
2696         vcipher         $out5,$out5,v29
2697         lvx             v25,$x10,$key_          # re-pre-load round[2]
2698          vaddubm        $tweak,$tweak,$tweak
2699
2700         vcipher         $out0,$out0,v30
2701         vcipher         $out1,$out1,v30
2702          vand           $tmp,$tmp,$eighty7
2703         vcipher         $out2,$out2,v30
2704         vcipher         $out3,$out3,v30
2705          xxlor          32+$in5, 0, 0
2706          vpermxor       $tweak, $tweak, $tmp, $in5
2707         vcipher         $out4,$out4,v30
2708         vcipher         $out5,$out5,v30
2709          vxor           $in5,$twk5,v31
2710          vsrab          $tmp,$tweak,$seven      # next tweak value
2711          vxor           $twk5,$tweak,$rndkey0
2712
2713         vcipherlast     $out0,$out0,$in0
2714          lvx_u          $in0,$x00,$inp          # load next input block
2715          vaddubm        $tweak,$tweak,$tweak
2716         vcipherlast     $out1,$out1,$in1
2717          lvx_u          $in1,$x10,$inp
2718         vcipherlast     $out2,$out2,$in2
2719          le?vperm       $in0,$in0,$in0,$leperm
2720          lvx_u          $in2,$x20,$inp
2721          vand           $tmp,$tmp,$eighty7
2722         vcipherlast     $out3,$out3,$in3
2723          le?vperm       $in1,$in1,$in1,$leperm
2724          lvx_u          $in3,$x30,$inp
2725         vcipherlast     $out4,$out4,$in4
2726          le?vperm       $in2,$in2,$in2,$leperm
2727          lvx_u          $in4,$x40,$inp
2728          xxlor          10, 32+$in0, 32+$in0
2729          xxlor          32+$in0, 0, 0
2730          vpermxor       $tweak, $tweak, $tmp, $in0
2731          xxlor          32+$in0, 10, 10
2732         vcipherlast     $tmp,$out5,$in5         # last block might be needed
2733                                                 # in stealing mode
2734          le?vperm       $in3,$in3,$in3,$leperm
2735          lvx_u          $in5,$x50,$inp
2736          addi           $inp,$inp,0x60
2737          le?vperm       $in4,$in4,$in4,$leperm
2738          le?vperm       $in5,$in5,$in5,$leperm
2739
2740         le?vperm        $out0,$out0,$out0,$leperm
2741         le?vperm        $out1,$out1,$out1,$leperm
2742         stvx_u          $out0,$x00,$out         # store output
2743          vxor           $out0,$in0,$twk0
2744         le?vperm        $out2,$out2,$out2,$leperm
2745         stvx_u          $out1,$x10,$out
2746          vxor           $out1,$in1,$twk1
2747         le?vperm        $out3,$out3,$out3,$leperm
2748         stvx_u          $out2,$x20,$out
2749          vxor           $out2,$in2,$twk2
2750         le?vperm        $out4,$out4,$out4,$leperm
2751         stvx_u          $out3,$x30,$out
2752          vxor           $out3,$in3,$twk3
2753         le?vperm        $out5,$tmp,$tmp,$leperm
2754         stvx_u          $out4,$x40,$out
2755          vxor           $out4,$in4,$twk4
2756         le?stvx_u       $out5,$x50,$out
2757         be?stvx_u       $tmp, $x50,$out
2758          vxor           $out5,$in5,$twk5
2759         addi            $out,$out,0x60
2760
2761         mtctr           $rounds
2762         beq             Loop_xts_enc6x          # did $len-=96 borrow?
2763
2764         xxlor           32+$eighty7, 2, 2               # 0x870101..01
2765
2766         addic.          $len,$len,0x60
2767         beq             Lxts_enc6x_zero
2768         cmpwi           $len,0x20
2769         blt             Lxts_enc6x_one
2770         nop
2771         beq             Lxts_enc6x_two
2772         cmpwi           $len,0x40
2773         blt             Lxts_enc6x_three
2774         nop
2775         beq             Lxts_enc6x_four
2776
2777 Lxts_enc6x_five:
2778         vxor            $out0,$in1,$twk0
2779         vxor            $out1,$in2,$twk1
2780         vxor            $out2,$in3,$twk2
2781         vxor            $out3,$in4,$twk3
2782         vxor            $out4,$in5,$twk4
2783
2784         bl              _aesp8_xts_enc5x
2785
2786         le?vperm        $out0,$out0,$out0,$leperm
2787         vmr             $twk0,$twk5             # unused tweak
2788         le?vperm        $out1,$out1,$out1,$leperm
2789         stvx_u          $out0,$x00,$out         # store output
2790         le?vperm        $out2,$out2,$out2,$leperm
2791         stvx_u          $out1,$x10,$out
2792         le?vperm        $out3,$out3,$out3,$leperm
2793         stvx_u          $out2,$x20,$out
2794         vxor            $tmp,$out4,$twk5        # last block prep for stealing
2795         le?vperm        $out4,$out4,$out4,$leperm
2796         stvx_u          $out3,$x30,$out
2797         stvx_u          $out4,$x40,$out
2798         addi            $out,$out,0x50
2799         bne             Lxts_enc6x_steal
2800         b               Lxts_enc6x_done
2801
2802 .align  4
2803 Lxts_enc6x_four:
2804         vxor            $out0,$in2,$twk0
2805         vxor            $out1,$in3,$twk1
2806         vxor            $out2,$in4,$twk2
2807         vxor            $out3,$in5,$twk3
2808         vxor            $out4,$out4,$out4
2809
2810         bl              _aesp8_xts_enc5x
2811
2812         le?vperm        $out0,$out0,$out0,$leperm
2813         vmr             $twk0,$twk4             # unused tweak
2814         le?vperm        $out1,$out1,$out1,$leperm
2815         stvx_u          $out0,$x00,$out         # store output
2816         le?vperm        $out2,$out2,$out2,$leperm
2817         stvx_u          $out1,$x10,$out
2818         vxor            $tmp,$out3,$twk4        # last block prep for stealing
2819         le?vperm        $out3,$out3,$out3,$leperm
2820         stvx_u          $out2,$x20,$out
2821         stvx_u          $out3,$x30,$out
2822         addi            $out,$out,0x40
2823         bne             Lxts_enc6x_steal
2824         b               Lxts_enc6x_done
2825
2826 .align  4
2827 Lxts_enc6x_three:
2828         vxor            $out0,$in3,$twk0
2829         vxor            $out1,$in4,$twk1
2830         vxor            $out2,$in5,$twk2
2831         vxor            $out3,$out3,$out3
2832         vxor            $out4,$out4,$out4
2833
2834         bl              _aesp8_xts_enc5x
2835
2836         le?vperm        $out0,$out0,$out0,$leperm
2837         vmr             $twk0,$twk3             # unused tweak
2838         le?vperm        $out1,$out1,$out1,$leperm
2839         stvx_u          $out0,$x00,$out         # store output
2840         vxor            $tmp,$out2,$twk3        # last block prep for stealing
2841         le?vperm        $out2,$out2,$out2,$leperm
2842         stvx_u          $out1,$x10,$out
2843         stvx_u          $out2,$x20,$out
2844         addi            $out,$out,0x30
2845         bne             Lxts_enc6x_steal
2846         b               Lxts_enc6x_done
2847
2848 .align  4
2849 Lxts_enc6x_two:
2850         vxor            $out0,$in4,$twk0
2851         vxor            $out1,$in5,$twk1
2852         vxor            $out2,$out2,$out2
2853         vxor            $out3,$out3,$out3
2854         vxor            $out4,$out4,$out4
2855
2856         bl              _aesp8_xts_enc5x
2857
2858         le?vperm        $out0,$out0,$out0,$leperm
2859         vmr             $twk0,$twk2             # unused tweak
2860         vxor            $tmp,$out1,$twk2        # last block prep for stealing
2861         le?vperm        $out1,$out1,$out1,$leperm
2862         stvx_u          $out0,$x00,$out         # store output
2863         stvx_u          $out1,$x10,$out
2864         addi            $out,$out,0x20
2865         bne             Lxts_enc6x_steal
2866         b               Lxts_enc6x_done
2867
2868 .align  4
2869 Lxts_enc6x_one:
2870         vxor            $out0,$in5,$twk0
2871         nop
2872 Loop_xts_enc1x:
2873         vcipher         $out0,$out0,v24
2874         lvx             v24,$x20,$key_          # round[3]
2875         addi            $key_,$key_,0x20
2876
2877         vcipher         $out0,$out0,v25
2878         lvx             v25,$x10,$key_          # round[4]
2879         bdnz            Loop_xts_enc1x
2880
2881         add             $inp,$inp,$taillen
2882         cmpwi           $taillen,0
2883         vcipher         $out0,$out0,v24
2884
2885         subi            $inp,$inp,16
2886         vcipher         $out0,$out0,v25
2887
2888         lvsr            $inpperm,0,$taillen
2889         vcipher         $out0,$out0,v26
2890
2891         lvx_u           $in0,0,$inp
2892         vcipher         $out0,$out0,v27
2893
2894         addi            $key_,$sp,$FRAME+15     # rewind $key_
2895         vcipher         $out0,$out0,v28
2896         lvx             v24,$x00,$key_          # re-pre-load round[1]
2897
2898         vcipher         $out0,$out0,v29
2899         lvx             v25,$x10,$key_          # re-pre-load round[2]
2900          vxor           $twk0,$twk0,v31
2901
2902         le?vperm        $in0,$in0,$in0,$leperm
2903         vcipher         $out0,$out0,v30
2904
2905         vperm           $in0,$in0,$in0,$inpperm
2906         vcipherlast     $out0,$out0,$twk0
2907
2908         vmr             $twk0,$twk1             # unused tweak
2909         vxor            $tmp,$out0,$twk1        # last block prep for stealing
2910         le?vperm        $out0,$out0,$out0,$leperm
2911         stvx_u          $out0,$x00,$out         # store output
2912         addi            $out,$out,0x10
2913         bne             Lxts_enc6x_steal
2914         b               Lxts_enc6x_done
2915
2916 .align  4
2917 Lxts_enc6x_zero:
2918         cmpwi           $taillen,0
2919         beq             Lxts_enc6x_done
2920
2921         add             $inp,$inp,$taillen
2922         subi            $inp,$inp,16
2923         lvx_u           $in0,0,$inp
2924         lvsr            $inpperm,0,$taillen     # $in5 is no more
2925         le?vperm        $in0,$in0,$in0,$leperm
2926         vperm           $in0,$in0,$in0,$inpperm
2927         vxor            $tmp,$tmp,$twk0
2928 Lxts_enc6x_steal:
2929         vxor            $in0,$in0,$twk0
2930         vxor            $out0,$out0,$out0
2931         vspltisb        $out1,-1
2932         vperm           $out0,$out0,$out1,$inpperm
2933         vsel            $out0,$in0,$tmp,$out0   # $tmp is last block, remember?
2934
2935         subi            r30,$out,17
2936         subi            $out,$out,16
2937         mtctr           $taillen
2938 Loop_xts_enc6x_steal:
2939         lbzu            r0,1(r30)
2940         stb             r0,16(r30)
2941         bdnz            Loop_xts_enc6x_steal
2942
2943         li              $taillen,0
2944         mtctr           $rounds
2945         b               Loop_xts_enc1x          # one more time...
2946
2947 .align  4
2948 Lxts_enc6x_done:
2949         ${UCMP}i        $ivp,0
2950         beq             Lxts_enc6x_ret
2951
2952         vxor            $tweak,$twk0,$rndkey0
2953         le?vperm        $tweak,$tweak,$tweak,$leperm
2954         stvx_u          $tweak,0,$ivp
2955
2956 Lxts_enc6x_ret:
2957         mtlr            r11
2958         li              r10,`$FRAME+15`
2959         li              r11,`$FRAME+31`
2960         stvx            $seven,r10,$sp          # wipe copies of round keys
2961         addi            r10,r10,32
2962         stvx            $seven,r11,$sp
2963         addi            r11,r11,32
2964         stvx            $seven,r10,$sp
2965         addi            r10,r10,32
2966         stvx            $seven,r11,$sp
2967         addi            r11,r11,32
2968         stvx            $seven,r10,$sp
2969         addi            r10,r10,32
2970         stvx            $seven,r11,$sp
2971         addi            r11,r11,32
2972         stvx            $seven,r10,$sp
2973         addi            r10,r10,32
2974         stvx            $seven,r11,$sp
2975         addi            r11,r11,32
2976
2977         mtspr           256,$vrsave
2978         lvx             v20,r10,$sp             # ABI says so
2979         addi            r10,r10,32
2980         lvx             v21,r11,$sp
2981         addi            r11,r11,32
2982         lvx             v22,r10,$sp
2983         addi            r10,r10,32
2984         lvx             v23,r11,$sp
2985         addi            r11,r11,32
2986         lvx             v24,r10,$sp
2987         addi            r10,r10,32
2988         lvx             v25,r11,$sp
2989         addi            r11,r11,32
2990         lvx             v26,r10,$sp
2991         addi            r10,r10,32
2992         lvx             v27,r11,$sp
2993         addi            r11,r11,32
2994         lvx             v28,r10,$sp
2995         addi            r10,r10,32
2996         lvx             v29,r11,$sp
2997         addi            r11,r11,32
2998         lvx             v30,r10,$sp
2999         lvx             v31,r11,$sp
3000         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3001         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3002         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3003         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3004         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3005         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3006         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3007         blr
3008         .long           0
3009         .byte           0,12,0x04,1,0x80,6,6,0
3010         .long           0
3011
3012 .align  5
3013 _aesp8_xts_enc5x:
3014         vcipher         $out0,$out0,v24
3015         vcipher         $out1,$out1,v24
3016         vcipher         $out2,$out2,v24
3017         vcipher         $out3,$out3,v24
3018         vcipher         $out4,$out4,v24
3019         lvx             v24,$x20,$key_          # round[3]
3020         addi            $key_,$key_,0x20
3021
3022         vcipher         $out0,$out0,v25
3023         vcipher         $out1,$out1,v25
3024         vcipher         $out2,$out2,v25
3025         vcipher         $out3,$out3,v25
3026         vcipher         $out4,$out4,v25
3027         lvx             v25,$x10,$key_          # round[4]
3028         bdnz            _aesp8_xts_enc5x
3029
3030         add             $inp,$inp,$taillen
3031         cmpwi           $taillen,0
3032         vcipher         $out0,$out0,v24
3033         vcipher         $out1,$out1,v24
3034         vcipher         $out2,$out2,v24
3035         vcipher         $out3,$out3,v24
3036         vcipher         $out4,$out4,v24
3037
3038         subi            $inp,$inp,16
3039         vcipher         $out0,$out0,v25
3040         vcipher         $out1,$out1,v25
3041         vcipher         $out2,$out2,v25
3042         vcipher         $out3,$out3,v25
3043         vcipher         $out4,$out4,v25
3044          vxor           $twk0,$twk0,v31
3045
3046         vcipher         $out0,$out0,v26
3047         lvsr            $inpperm,0,$taillen     # $in5 is no more
3048         vcipher         $out1,$out1,v26
3049         vcipher         $out2,$out2,v26
3050         vcipher         $out3,$out3,v26
3051         vcipher         $out4,$out4,v26
3052          vxor           $in1,$twk1,v31
3053
3054         vcipher         $out0,$out0,v27
3055         lvx_u           $in0,0,$inp
3056         vcipher         $out1,$out1,v27
3057         vcipher         $out2,$out2,v27
3058         vcipher         $out3,$out3,v27
3059         vcipher         $out4,$out4,v27
3060          vxor           $in2,$twk2,v31
3061
3062         addi            $key_,$sp,$FRAME+15     # rewind $key_
3063         vcipher         $out0,$out0,v28
3064         vcipher         $out1,$out1,v28
3065         vcipher         $out2,$out2,v28
3066         vcipher         $out3,$out3,v28
3067         vcipher         $out4,$out4,v28
3068         lvx             v24,$x00,$key_          # re-pre-load round[1]
3069          vxor           $in3,$twk3,v31
3070
3071         vcipher         $out0,$out0,v29
3072         le?vperm        $in0,$in0,$in0,$leperm
3073         vcipher         $out1,$out1,v29
3074         vcipher         $out2,$out2,v29
3075         vcipher         $out3,$out3,v29
3076         vcipher         $out4,$out4,v29
3077         lvx             v25,$x10,$key_          # re-pre-load round[2]
3078          vxor           $in4,$twk4,v31
3079
3080         vcipher         $out0,$out0,v30
3081         vperm           $in0,$in0,$in0,$inpperm
3082         vcipher         $out1,$out1,v30
3083         vcipher         $out2,$out2,v30
3084         vcipher         $out3,$out3,v30
3085         vcipher         $out4,$out4,v30
3086
3087         vcipherlast     $out0,$out0,$twk0
3088         vcipherlast     $out1,$out1,$in1
3089         vcipherlast     $out2,$out2,$in2
3090         vcipherlast     $out3,$out3,$in3
3091         vcipherlast     $out4,$out4,$in4
3092         blr
3093         .long           0
3094         .byte           0,12,0x14,0,0,0,0,0
3095
3096 .align  5
3097 _aesp8_xts_decrypt6x:
3098         $STU            $sp,-`($FRAME+21*16+6*$SIZE_T)`($sp)
3099         mflr            r11
3100         li              r7,`$FRAME+8*16+15`
3101         li              r3,`$FRAME+8*16+31`
3102         $PUSH           r11,`$FRAME+21*16+6*$SIZE_T+$LRSAVE`($sp)
3103         stvx            v20,r7,$sp              # ABI says so
3104         addi            r7,r7,32
3105         stvx            v21,r3,$sp
3106         addi            r3,r3,32
3107         stvx            v22,r7,$sp
3108         addi            r7,r7,32
3109         stvx            v23,r3,$sp
3110         addi            r3,r3,32
3111         stvx            v24,r7,$sp
3112         addi            r7,r7,32
3113         stvx            v25,r3,$sp
3114         addi            r3,r3,32
3115         stvx            v26,r7,$sp
3116         addi            r7,r7,32
3117         stvx            v27,r3,$sp
3118         addi            r3,r3,32
3119         stvx            v28,r7,$sp
3120         addi            r7,r7,32
3121         stvx            v29,r3,$sp
3122         addi            r3,r3,32
3123         stvx            v30,r7,$sp
3124         stvx            v31,r3,$sp
3125         li              r0,-1
3126         stw             $vrsave,`$FRAME+21*16-4`($sp)   # save vrsave
3127         li              $x10,0x10
3128         $PUSH           r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3129         li              $x20,0x20
3130         $PUSH           r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3131         li              $x30,0x30
3132         $PUSH           r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3133         li              $x40,0x40
3134         $PUSH           r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3135         li              $x50,0x50
3136         $PUSH           r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3137         li              $x60,0x60
3138         $PUSH           r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3139         li              $x70,0x70
3140         mtspr           256,r0
3141
3142         # Reverse eighty7 to 0x010101..87
3143         xxlor           2, 32+$eighty7, 32+$eighty7
3144         vsldoi          $eighty7,$tmp,$eighty7,1        # 0x010101..87
3145         xxlor           1, 32+$eighty7, 32+$eighty7
3146
3147         # Load XOR contents. 0xf102132435465768798a9bacbdcedfe
3148         mr              $x70, r6
3149         bl              Lconsts
3150         lxvw4x          0, $x40, r6             # load XOR contents
3151         mr              r6, $x70
3152         li              $x70,0x70
3153
3154         subi            $rounds,$rounds,3       # -4 in total
3155
3156         lvx             $rndkey0,$x00,$key1     # load key schedule
3157         lvx             v30,$x10,$key1
3158         addi            $key1,$key1,0x20
3159         lvx             v31,$x00,$key1
3160         ?vperm          $rndkey0,$rndkey0,v30,$keyperm
3161         addi            $key_,$sp,$FRAME+15
3162         mtctr           $rounds
3163
3164 Load_xts_dec_key:
3165         ?vperm          v24,v30,v31,$keyperm
3166         lvx             v30,$x10,$key1
3167         addi            $key1,$key1,0x20
3168         stvx            v24,$x00,$key_          # off-load round[1]
3169         ?vperm          v25,v31,v30,$keyperm
3170         lvx             v31,$x00,$key1
3171         stvx            v25,$x10,$key_          # off-load round[2]
3172         addi            $key_,$key_,0x20
3173         bdnz            Load_xts_dec_key
3174
3175         lvx             v26,$x10,$key1
3176         ?vperm          v24,v30,v31,$keyperm
3177         lvx             v27,$x20,$key1
3178         stvx            v24,$x00,$key_          # off-load round[3]
3179         ?vperm          v25,v31,v26,$keyperm
3180         lvx             v28,$x30,$key1
3181         stvx            v25,$x10,$key_          # off-load round[4]
3182         addi            $key_,$sp,$FRAME+15     # rewind $key_
3183         ?vperm          v26,v26,v27,$keyperm
3184         lvx             v29,$x40,$key1
3185         ?vperm          v27,v27,v28,$keyperm
3186         lvx             v30,$x50,$key1
3187         ?vperm          v28,v28,v29,$keyperm
3188         lvx             v31,$x60,$key1
3189         ?vperm          v29,v29,v30,$keyperm
3190         lvx             $twk5,$x70,$key1        # borrow $twk5
3191         ?vperm          v30,v30,v31,$keyperm
3192         lvx             v24,$x00,$key_          # pre-load round[1]
3193         ?vperm          v31,v31,$twk5,$keyperm
3194         lvx             v25,$x10,$key_          # pre-load round[2]
3195
3196          vperm          $in0,$inout,$inptail,$inpperm
3197          subi           $inp,$inp,31            # undo "caller"
3198         vxor            $twk0,$tweak,$rndkey0
3199         vsrab           $tmp,$tweak,$seven      # next tweak value
3200         vaddubm         $tweak,$tweak,$tweak
3201         vand            $tmp,$tmp,$eighty7
3202          vxor           $out0,$in0,$twk0
3203         xxlor           32+$in1, 0, 0
3204         vpermxor        $tweak, $tweak, $tmp, $in1
3205
3206          lvx_u          $in1,$x10,$inp
3207         vxor            $twk1,$tweak,$rndkey0
3208         vsrab           $tmp,$tweak,$seven      # next tweak value
3209         vaddubm         $tweak,$tweak,$tweak
3210          le?vperm       $in1,$in1,$in1,$leperm
3211         vand            $tmp,$tmp,$eighty7
3212          vxor           $out1,$in1,$twk1
3213         xxlor           32+$in2, 0, 0
3214         vpermxor        $tweak, $tweak, $tmp, $in2
3215
3216          lvx_u          $in2,$x20,$inp
3217          andi.          $taillen,$len,15
3218         vxor            $twk2,$tweak,$rndkey0
3219         vsrab           $tmp,$tweak,$seven      # next tweak value
3220         vaddubm         $tweak,$tweak,$tweak
3221          le?vperm       $in2,$in2,$in2,$leperm
3222         vand            $tmp,$tmp,$eighty7
3223          vxor           $out2,$in2,$twk2
3224         xxlor           32+$in3, 0, 0
3225         vpermxor        $tweak, $tweak, $tmp, $in3
3226
3227          lvx_u          $in3,$x30,$inp
3228          sub            $len,$len,$taillen
3229         vxor            $twk3,$tweak,$rndkey0
3230         vsrab           $tmp,$tweak,$seven      # next tweak value
3231         vaddubm         $tweak,$tweak,$tweak
3232          le?vperm       $in3,$in3,$in3,$leperm
3233         vand            $tmp,$tmp,$eighty7
3234          vxor           $out3,$in3,$twk3
3235         xxlor           32+$in4, 0, 0
3236         vpermxor        $tweak, $tweak, $tmp, $in4
3237
3238          lvx_u          $in4,$x40,$inp
3239          subi           $len,$len,0x60
3240         vxor            $twk4,$tweak,$rndkey0
3241         vsrab           $tmp,$tweak,$seven      # next tweak value
3242         vaddubm         $tweak,$tweak,$tweak
3243          le?vperm       $in4,$in4,$in4,$leperm
3244         vand            $tmp,$tmp,$eighty7
3245          vxor           $out4,$in4,$twk4
3246         xxlor           32+$in5, 0, 0
3247         vpermxor        $tweak, $tweak, $tmp, $in5
3248
3249          lvx_u          $in5,$x50,$inp
3250          addi           $inp,$inp,0x60
3251         vxor            $twk5,$tweak,$rndkey0
3252         vsrab           $tmp,$tweak,$seven      # next tweak value
3253         vaddubm         $tweak,$tweak,$tweak
3254          le?vperm       $in5,$in5,$in5,$leperm
3255         vand            $tmp,$tmp,$eighty7
3256          vxor           $out5,$in5,$twk5
3257         xxlor           32+$in0, 0, 0
3258         vpermxor        $tweak, $tweak, $tmp, $in0
3259
3260         vxor            v31,v31,$rndkey0
3261         mtctr           $rounds
3262         b               Loop_xts_dec6x
3263
3264 .align  5
3265 Loop_xts_dec6x:
3266         vncipher        $out0,$out0,v24
3267         vncipher        $out1,$out1,v24
3268         vncipher        $out2,$out2,v24
3269         vncipher        $out3,$out3,v24
3270         vncipher        $out4,$out4,v24
3271         vncipher        $out5,$out5,v24
3272         lvx             v24,$x20,$key_          # round[3]
3273         addi            $key_,$key_,0x20
3274
3275         vncipher        $out0,$out0,v25
3276         vncipher        $out1,$out1,v25
3277         vncipher        $out2,$out2,v25
3278         vncipher        $out3,$out3,v25
3279         vncipher        $out4,$out4,v25
3280         vncipher        $out5,$out5,v25
3281         lvx             v25,$x10,$key_          # round[4]
3282         bdnz            Loop_xts_dec6x
3283
3284         xxlor           32+$eighty7, 1, 1
3285
3286         subic           $len,$len,96            # $len-=96
3287          vxor           $in0,$twk0,v31          # xor with last round key
3288         vncipher        $out0,$out0,v24
3289         vncipher        $out1,$out1,v24
3290          vsrab          $tmp,$tweak,$seven      # next tweak value
3291          vxor           $twk0,$tweak,$rndkey0
3292          vaddubm        $tweak,$tweak,$tweak
3293         vncipher        $out2,$out2,v24
3294         vncipher        $out3,$out3,v24
3295         vncipher        $out4,$out4,v24
3296         vncipher        $out5,$out5,v24
3297
3298         subfe.          r0,r0,r0                # borrow?-1:0
3299          vand           $tmp,$tmp,$eighty7
3300         vncipher        $out0,$out0,v25
3301         vncipher        $out1,$out1,v25
3302          xxlor          32+$in1, 0, 0
3303          vpermxor       $tweak, $tweak, $tmp, $in1
3304         vncipher        $out2,$out2,v25
3305         vncipher        $out3,$out3,v25
3306          vxor           $in1,$twk1,v31
3307          vsrab          $tmp,$tweak,$seven      # next tweak value
3308          vxor           $twk1,$tweak,$rndkey0
3309         vncipher        $out4,$out4,v25
3310         vncipher        $out5,$out5,v25
3311
3312         and             r0,r0,$len
3313          vaddubm        $tweak,$tweak,$tweak
3314         vncipher        $out0,$out0,v26
3315         vncipher        $out1,$out1,v26
3316          vand           $tmp,$tmp,$eighty7
3317         vncipher        $out2,$out2,v26
3318         vncipher        $out3,$out3,v26
3319          xxlor          32+$in2, 0, 0
3320          vpermxor       $tweak, $tweak, $tmp, $in2
3321         vncipher        $out4,$out4,v26
3322         vncipher        $out5,$out5,v26
3323
3324         add             $inp,$inp,r0            # $inp is adjusted in such
3325                                                 # way that at exit from the
3326                                                 # loop inX-in5 are loaded
3327                                                 # with last "words"
3328          vxor           $in2,$twk2,v31
3329          vsrab          $tmp,$tweak,$seven      # next tweak value
3330          vxor           $twk2,$tweak,$rndkey0
3331          vaddubm        $tweak,$tweak,$tweak
3332         vncipher        $out0,$out0,v27
3333         vncipher        $out1,$out1,v27
3334         vncipher        $out2,$out2,v27
3335         vncipher        $out3,$out3,v27
3336          vand           $tmp,$tmp,$eighty7
3337         vncipher        $out4,$out4,v27
3338         vncipher        $out5,$out5,v27
3339
3340         addi            $key_,$sp,$FRAME+15     # rewind $key_
3341          xxlor          32+$in3, 0, 0
3342          vpermxor       $tweak, $tweak, $tmp, $in3
3343         vncipher        $out0,$out0,v28
3344         vncipher        $out1,$out1,v28
3345          vxor           $in3,$twk3,v31
3346          vsrab          $tmp,$tweak,$seven      # next tweak value
3347          vxor           $twk3,$tweak,$rndkey0
3348         vncipher        $out2,$out2,v28
3349         vncipher        $out3,$out3,v28
3350          vaddubm        $tweak,$tweak,$tweak
3351         vncipher        $out4,$out4,v28
3352         vncipher        $out5,$out5,v28
3353         lvx             v24,$x00,$key_          # re-pre-load round[1]
3354          vand           $tmp,$tmp,$eighty7
3355
3356         vncipher        $out0,$out0,v29
3357         vncipher        $out1,$out1,v29
3358          xxlor          32+$in4, 0, 0
3359          vpermxor       $tweak, $tweak, $tmp, $in4
3360         vncipher        $out2,$out2,v29
3361         vncipher        $out3,$out3,v29
3362          vxor           $in4,$twk4,v31
3363          vsrab          $tmp,$tweak,$seven      # next tweak value
3364          vxor           $twk4,$tweak,$rndkey0
3365         vncipher        $out4,$out4,v29
3366         vncipher        $out5,$out5,v29
3367         lvx             v25,$x10,$key_          # re-pre-load round[2]
3368          vaddubm        $tweak,$tweak,$tweak
3369
3370         vncipher        $out0,$out0,v30
3371         vncipher        $out1,$out1,v30
3372          vand           $tmp,$tmp,$eighty7
3373         vncipher        $out2,$out2,v30
3374         vncipher        $out3,$out3,v30
3375          xxlor          32+$in5, 0, 0
3376          vpermxor       $tweak, $tweak, $tmp, $in5
3377         vncipher        $out4,$out4,v30
3378         vncipher        $out5,$out5,v30
3379          vxor           $in5,$twk5,v31
3380          vsrab          $tmp,$tweak,$seven      # next tweak value
3381          vxor           $twk5,$tweak,$rndkey0
3382
3383         vncipherlast    $out0,$out0,$in0
3384          lvx_u          $in0,$x00,$inp          # load next input block
3385          vaddubm        $tweak,$tweak,$tweak
3386         vncipherlast    $out1,$out1,$in1
3387          lvx_u          $in1,$x10,$inp
3388         vncipherlast    $out2,$out2,$in2
3389          le?vperm       $in0,$in0,$in0,$leperm
3390          lvx_u          $in2,$x20,$inp
3391          vand           $tmp,$tmp,$eighty7
3392         vncipherlast    $out3,$out3,$in3
3393          le?vperm       $in1,$in1,$in1,$leperm
3394          lvx_u          $in3,$x30,$inp
3395         vncipherlast    $out4,$out4,$in4
3396          le?vperm       $in2,$in2,$in2,$leperm
3397          lvx_u          $in4,$x40,$inp
3398          xxlor          10, 32+$in0, 32+$in0
3399          xxlor          32+$in0, 0, 0
3400          vpermxor       $tweak, $tweak, $tmp, $in0
3401          xxlor          32+$in0, 10, 10
3402         vncipherlast    $out5,$out5,$in5
3403          le?vperm       $in3,$in3,$in3,$leperm
3404          lvx_u          $in5,$x50,$inp
3405          addi           $inp,$inp,0x60
3406          le?vperm       $in4,$in4,$in4,$leperm
3407          le?vperm       $in5,$in5,$in5,$leperm
3408
3409         le?vperm        $out0,$out0,$out0,$leperm
3410         le?vperm        $out1,$out1,$out1,$leperm
3411         stvx_u          $out0,$x00,$out         # store output
3412          vxor           $out0,$in0,$twk0
3413         le?vperm        $out2,$out2,$out2,$leperm
3414         stvx_u          $out1,$x10,$out
3415          vxor           $out1,$in1,$twk1
3416         le?vperm        $out3,$out3,$out3,$leperm
3417         stvx_u          $out2,$x20,$out
3418          vxor           $out2,$in2,$twk2
3419         le?vperm        $out4,$out4,$out4,$leperm
3420         stvx_u          $out3,$x30,$out
3421          vxor           $out3,$in3,$twk3
3422         le?vperm        $out5,$out5,$out5,$leperm
3423         stvx_u          $out4,$x40,$out
3424          vxor           $out4,$in4,$twk4
3425         stvx_u          $out5,$x50,$out
3426          vxor           $out5,$in5,$twk5
3427         addi            $out,$out,0x60
3428
3429         mtctr           $rounds
3430         beq             Loop_xts_dec6x          # did $len-=96 borrow?
3431
3432         xxlor           32+$eighty7, 2, 2
3433
3434         addic.          $len,$len,0x60
3435         beq             Lxts_dec6x_zero
3436         cmpwi           $len,0x20
3437         blt             Lxts_dec6x_one
3438         nop
3439         beq             Lxts_dec6x_two
3440         cmpwi           $len,0x40
3441         blt             Lxts_dec6x_three
3442         nop
3443         beq             Lxts_dec6x_four
3444
3445 Lxts_dec6x_five:
3446         vxor            $out0,$in1,$twk0
3447         vxor            $out1,$in2,$twk1
3448         vxor            $out2,$in3,$twk2
3449         vxor            $out3,$in4,$twk3
3450         vxor            $out4,$in5,$twk4
3451
3452         bl              _aesp8_xts_dec5x
3453
3454         le?vperm        $out0,$out0,$out0,$leperm
3455         vmr             $twk0,$twk5             # unused tweak
3456         vxor            $twk1,$tweak,$rndkey0
3457         le?vperm        $out1,$out1,$out1,$leperm
3458         stvx_u          $out0,$x00,$out         # store output
3459         vxor            $out0,$in0,$twk1
3460         le?vperm        $out2,$out2,$out2,$leperm
3461         stvx_u          $out1,$x10,$out
3462         le?vperm        $out3,$out3,$out3,$leperm
3463         stvx_u          $out2,$x20,$out
3464         le?vperm        $out4,$out4,$out4,$leperm
3465         stvx_u          $out3,$x30,$out
3466         stvx_u          $out4,$x40,$out
3467         addi            $out,$out,0x50
3468         bne             Lxts_dec6x_steal
3469         b               Lxts_dec6x_done
3470
3471 .align  4
3472 Lxts_dec6x_four:
3473         vxor            $out0,$in2,$twk0
3474         vxor            $out1,$in3,$twk1
3475         vxor            $out2,$in4,$twk2
3476         vxor            $out3,$in5,$twk3
3477         vxor            $out4,$out4,$out4
3478
3479         bl              _aesp8_xts_dec5x
3480
3481         le?vperm        $out0,$out0,$out0,$leperm
3482         vmr             $twk0,$twk4             # unused tweak
3483         vmr             $twk1,$twk5
3484         le?vperm        $out1,$out1,$out1,$leperm
3485         stvx_u          $out0,$x00,$out         # store output
3486         vxor            $out0,$in0,$twk5
3487         le?vperm        $out2,$out2,$out2,$leperm
3488         stvx_u          $out1,$x10,$out
3489         le?vperm        $out3,$out3,$out3,$leperm
3490         stvx_u          $out2,$x20,$out
3491         stvx_u          $out3,$x30,$out
3492         addi            $out,$out,0x40
3493         bne             Lxts_dec6x_steal
3494         b               Lxts_dec6x_done
3495
3496 .align  4
3497 Lxts_dec6x_three:
3498         vxor            $out0,$in3,$twk0
3499         vxor            $out1,$in4,$twk1
3500         vxor            $out2,$in5,$twk2
3501         vxor            $out3,$out3,$out3
3502         vxor            $out4,$out4,$out4
3503
3504         bl              _aesp8_xts_dec5x
3505
3506         le?vperm        $out0,$out0,$out0,$leperm
3507         vmr             $twk0,$twk3             # unused tweak
3508         vmr             $twk1,$twk4
3509         le?vperm        $out1,$out1,$out1,$leperm
3510         stvx_u          $out0,$x00,$out         # store output
3511         vxor            $out0,$in0,$twk4
3512         le?vperm        $out2,$out2,$out2,$leperm
3513         stvx_u          $out1,$x10,$out
3514         stvx_u          $out2,$x20,$out
3515         addi            $out,$out,0x30
3516         bne             Lxts_dec6x_steal
3517         b               Lxts_dec6x_done
3518
3519 .align  4
3520 Lxts_dec6x_two:
3521         vxor            $out0,$in4,$twk0
3522         vxor            $out1,$in5,$twk1
3523         vxor            $out2,$out2,$out2
3524         vxor            $out3,$out3,$out3
3525         vxor            $out4,$out4,$out4
3526
3527         bl              _aesp8_xts_dec5x
3528
3529         le?vperm        $out0,$out0,$out0,$leperm
3530         vmr             $twk0,$twk2             # unused tweak
3531         vmr             $twk1,$twk3
3532         le?vperm        $out1,$out1,$out1,$leperm
3533         stvx_u          $out0,$x00,$out         # store output
3534         vxor            $out0,$in0,$twk3
3535         stvx_u          $out1,$x10,$out
3536         addi            $out,$out,0x20
3537         bne             Lxts_dec6x_steal
3538         b               Lxts_dec6x_done
3539
3540 .align  4
3541 Lxts_dec6x_one:
3542         vxor            $out0,$in5,$twk0
3543         nop
3544 Loop_xts_dec1x:
3545         vncipher        $out0,$out0,v24
3546         lvx             v24,$x20,$key_          # round[3]
3547         addi            $key_,$key_,0x20
3548
3549         vncipher        $out0,$out0,v25
3550         lvx             v25,$x10,$key_          # round[4]
3551         bdnz            Loop_xts_dec1x
3552
3553         subi            r0,$taillen,1
3554         vncipher        $out0,$out0,v24
3555
3556         andi.           r0,r0,16
3557         cmpwi           $taillen,0
3558         vncipher        $out0,$out0,v25
3559
3560         sub             $inp,$inp,r0
3561         vncipher        $out0,$out0,v26
3562
3563         lvx_u           $in0,0,$inp
3564         vncipher        $out0,$out0,v27
3565
3566         addi            $key_,$sp,$FRAME+15     # rewind $key_
3567         vncipher        $out0,$out0,v28
3568         lvx             v24,$x00,$key_          # re-pre-load round[1]
3569
3570         vncipher        $out0,$out0,v29
3571         lvx             v25,$x10,$key_          # re-pre-load round[2]
3572          vxor           $twk0,$twk0,v31
3573
3574         le?vperm        $in0,$in0,$in0,$leperm
3575         vncipher        $out0,$out0,v30
3576
3577         mtctr           $rounds
3578         vncipherlast    $out0,$out0,$twk0
3579
3580         vmr             $twk0,$twk1             # unused tweak
3581         vmr             $twk1,$twk2
3582         le?vperm        $out0,$out0,$out0,$leperm
3583         stvx_u          $out0,$x00,$out         # store output
3584         addi            $out,$out,0x10
3585         vxor            $out0,$in0,$twk2
3586         bne             Lxts_dec6x_steal
3587         b               Lxts_dec6x_done
3588
3589 .align  4
3590 Lxts_dec6x_zero:
3591         cmpwi           $taillen,0
3592         beq             Lxts_dec6x_done
3593
3594         lvx_u           $in0,0,$inp
3595         le?vperm        $in0,$in0,$in0,$leperm
3596         vxor            $out0,$in0,$twk1
3597 Lxts_dec6x_steal:
3598         vncipher        $out0,$out0,v24
3599         lvx             v24,$x20,$key_          # round[3]
3600         addi            $key_,$key_,0x20
3601
3602         vncipher        $out0,$out0,v25
3603         lvx             v25,$x10,$key_          # round[4]
3604         bdnz            Lxts_dec6x_steal
3605
3606         add             $inp,$inp,$taillen
3607         vncipher        $out0,$out0,v24
3608
3609         cmpwi           $taillen,0
3610         vncipher        $out0,$out0,v25
3611
3612         lvx_u           $in0,0,$inp
3613         vncipher        $out0,$out0,v26
3614
3615         lvsr            $inpperm,0,$taillen     # $in5 is no more
3616         vncipher        $out0,$out0,v27
3617
3618         addi            $key_,$sp,$FRAME+15     # rewind $key_
3619         vncipher        $out0,$out0,v28
3620         lvx             v24,$x00,$key_          # re-pre-load round[1]
3621
3622         vncipher        $out0,$out0,v29
3623         lvx             v25,$x10,$key_          # re-pre-load round[2]
3624          vxor           $twk1,$twk1,v31
3625
3626         le?vperm        $in0,$in0,$in0,$leperm
3627         vncipher        $out0,$out0,v30
3628
3629         vperm           $in0,$in0,$in0,$inpperm
3630         vncipherlast    $tmp,$out0,$twk1
3631
3632         le?vperm        $out0,$tmp,$tmp,$leperm
3633         le?stvx_u       $out0,0,$out
3634         be?stvx_u       $tmp,0,$out
3635
3636         vxor            $out0,$out0,$out0
3637         vspltisb        $out1,-1
3638         vperm           $out0,$out0,$out1,$inpperm
3639         vsel            $out0,$in0,$tmp,$out0
3640         vxor            $out0,$out0,$twk0
3641
3642         subi            r30,$out,1
3643         mtctr           $taillen
3644 Loop_xts_dec6x_steal:
3645         lbzu            r0,1(r30)
3646         stb             r0,16(r30)
3647         bdnz            Loop_xts_dec6x_steal
3648
3649         li              $taillen,0
3650         mtctr           $rounds
3651         b               Loop_xts_dec1x          # one more time...
3652
3653 .align  4
3654 Lxts_dec6x_done:
3655         ${UCMP}i        $ivp,0
3656         beq             Lxts_dec6x_ret
3657
3658         vxor            $tweak,$twk0,$rndkey0
3659         le?vperm        $tweak,$tweak,$tweak,$leperm
3660         stvx_u          $tweak,0,$ivp
3661
3662 Lxts_dec6x_ret:
3663         mtlr            r11
3664         li              r10,`$FRAME+15`
3665         li              r11,`$FRAME+31`
3666         stvx            $seven,r10,$sp          # wipe copies of round keys
3667         addi            r10,r10,32
3668         stvx            $seven,r11,$sp
3669         addi            r11,r11,32
3670         stvx            $seven,r10,$sp
3671         addi            r10,r10,32
3672         stvx            $seven,r11,$sp
3673         addi            r11,r11,32
3674         stvx            $seven,r10,$sp
3675         addi            r10,r10,32
3676         stvx            $seven,r11,$sp
3677         addi            r11,r11,32
3678         stvx            $seven,r10,$sp
3679         addi            r10,r10,32
3680         stvx            $seven,r11,$sp
3681         addi            r11,r11,32
3682
3683         mtspr           256,$vrsave
3684         lvx             v20,r10,$sp             # ABI says so
3685         addi            r10,r10,32
3686         lvx             v21,r11,$sp
3687         addi            r11,r11,32
3688         lvx             v22,r10,$sp
3689         addi            r10,r10,32
3690         lvx             v23,r11,$sp
3691         addi            r11,r11,32
3692         lvx             v24,r10,$sp
3693         addi            r10,r10,32
3694         lvx             v25,r11,$sp
3695         addi            r11,r11,32
3696         lvx             v26,r10,$sp
3697         addi            r10,r10,32
3698         lvx             v27,r11,$sp
3699         addi            r11,r11,32
3700         lvx             v28,r10,$sp
3701         addi            r10,r10,32
3702         lvx             v29,r11,$sp
3703         addi            r11,r11,32
3704         lvx             v30,r10,$sp
3705         lvx             v31,r11,$sp
3706         $POP            r26,`$FRAME+21*16+0*$SIZE_T`($sp)
3707         $POP            r27,`$FRAME+21*16+1*$SIZE_T`($sp)
3708         $POP            r28,`$FRAME+21*16+2*$SIZE_T`($sp)
3709         $POP            r29,`$FRAME+21*16+3*$SIZE_T`($sp)
3710         $POP            r30,`$FRAME+21*16+4*$SIZE_T`($sp)
3711         $POP            r31,`$FRAME+21*16+5*$SIZE_T`($sp)
3712         addi            $sp,$sp,`$FRAME+21*16+6*$SIZE_T`
3713         blr
3714         .long           0
3715         .byte           0,12,0x04,1,0x80,6,6,0
3716         .long           0
3717
3718 .align  5
3719 _aesp8_xts_dec5x:
3720         vncipher        $out0,$out0,v24
3721         vncipher        $out1,$out1,v24
3722         vncipher        $out2,$out2,v24
3723         vncipher        $out3,$out3,v24
3724         vncipher        $out4,$out4,v24
3725         lvx             v24,$x20,$key_          # round[3]
3726         addi            $key_,$key_,0x20
3727
3728         vncipher        $out0,$out0,v25
3729         vncipher        $out1,$out1,v25
3730         vncipher        $out2,$out2,v25
3731         vncipher        $out3,$out3,v25
3732         vncipher        $out4,$out4,v25
3733         lvx             v25,$x10,$key_          # round[4]
3734         bdnz            _aesp8_xts_dec5x
3735
3736         subi            r0,$taillen,1
3737         vncipher        $out0,$out0,v24
3738         vncipher        $out1,$out1,v24
3739         vncipher        $out2,$out2,v24
3740         vncipher        $out3,$out3,v24
3741         vncipher        $out4,$out4,v24
3742
3743         andi.           r0,r0,16
3744         cmpwi           $taillen,0
3745         vncipher        $out0,$out0,v25
3746         vncipher        $out1,$out1,v25
3747         vncipher        $out2,$out2,v25
3748         vncipher        $out3,$out3,v25
3749         vncipher        $out4,$out4,v25
3750          vxor           $twk0,$twk0,v31
3751
3752         sub             $inp,$inp,r0
3753         vncipher        $out0,$out0,v26
3754         vncipher        $out1,$out1,v26
3755         vncipher        $out2,$out2,v26
3756         vncipher        $out3,$out3,v26
3757         vncipher        $out4,$out4,v26
3758          vxor           $in1,$twk1,v31
3759
3760         vncipher        $out0,$out0,v27
3761         lvx_u           $in0,0,$inp
3762         vncipher        $out1,$out1,v27
3763         vncipher        $out2,$out2,v27
3764         vncipher        $out3,$out3,v27
3765         vncipher        $out4,$out4,v27
3766          vxor           $in2,$twk2,v31
3767
3768         addi            $key_,$sp,$FRAME+15     # rewind $key_
3769         vncipher        $out0,$out0,v28
3770         vncipher        $out1,$out1,v28
3771         vncipher        $out2,$out2,v28
3772         vncipher        $out3,$out3,v28
3773         vncipher        $out4,$out4,v28
3774         lvx             v24,$x00,$key_          # re-pre-load round[1]
3775          vxor           $in3,$twk3,v31
3776
3777         vncipher        $out0,$out0,v29
3778         le?vperm        $in0,$in0,$in0,$leperm
3779         vncipher        $out1,$out1,v29
3780         vncipher        $out2,$out2,v29
3781         vncipher        $out3,$out3,v29
3782         vncipher        $out4,$out4,v29
3783         lvx             v25,$x10,$key_          # re-pre-load round[2]
3784          vxor           $in4,$twk4,v31
3785
3786         vncipher        $out0,$out0,v30
3787         vncipher        $out1,$out1,v30
3788         vncipher        $out2,$out2,v30
3789         vncipher        $out3,$out3,v30
3790         vncipher        $out4,$out4,v30
3791
3792         vncipherlast    $out0,$out0,$twk0
3793         vncipherlast    $out1,$out1,$in1
3794         vncipherlast    $out2,$out2,$in2
3795         vncipherlast    $out3,$out3,$in3
3796         vncipherlast    $out4,$out4,$in4
3797         mtctr           $rounds
3798         blr
3799         .long           0
3800         .byte           0,12,0x14,0,0,0,0,0
3801 ___
3802 }}      }}}
3803
3804 my $consts=1;
3805 foreach(split("\n",$code)) {
3806         s/\`([^\`]*)\`/eval($1)/geo;
3807
3808         # constants table endian-specific conversion
3809         if ($consts && m/\.(long|byte)\s+(.+)\s+(\?[a-z]*)$/o) {
3810             my $conv=$3;
3811             my @bytes=();
3812
3813             # convert to endian-agnostic format
3814             if ($1 eq "long") {
3815               foreach (split(/,\s*/,$2)) {
3816                 my $l = /^0/?oct:int;
3817                 push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
3818               }
3819             } else {
3820                 @bytes = map(/^0/?oct:int,split(/,\s*/,$2));
3821             }
3822
3823             # little-endian conversion
3824             if ($flavour =~ /le$/o) {
3825                 SWITCH: for($conv)  {
3826                     /\?inv/ && do   { @bytes=map($_^0xf,@bytes); last; };
3827                     /\?rev/ && do   { @bytes=reverse(@bytes);    last; };
3828                 }
3829             }
3830
3831             #emit
3832             print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
3833             next;
3834         }
3835         $consts=0 if (m/Lconsts:/o);    # end of table
3836
3837         # instructions prefixed with '?' are endian-specific and need
3838         # to be adjusted accordingly...
3839         if ($flavour =~ /le$/o) {       # little-endian
3840             s/le\?//o           or
3841             s/be\?/#be#/o       or
3842             s/\?lvsr/lvsl/o     or
3843             s/\?lvsl/lvsr/o     or
3844             s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
3845             s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
3846             s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
3847         } else {                        # big-endian
3848             s/le\?/#le#/o       or
3849             s/be\?//o           or
3850             s/\?([a-z]+)/$1/o;
3851         }
3852
3853         print $_,"\n";
3854 }
3855
3856 close STDOUT or die "error closing STDOUT: $!";