# Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
# CBC encrypt/decrypt performance in cycles per byte processed with
# 128-bit key:
#
#		aes-ppc.pl		this
# PPC74x0/G4e	35.5/52.1/(23.8)	11.9(*)/15.4
# PPC970/G5	37.9/55.0/(28.5)	22.2/28.5
# POWER6	42.7/54.3/(28.2)	63.0/92.8(**)
# POWER7	32.3/42.9/(18.4)	18.5/23.3
#
# (*)	This is ~10% worse than reported in the paper. The reason is
#	twofold. First, this module doesn't make any assumption about
#	key schedule (or data, for that matter) alignment and handles
#	it in-line. Second, being transliterated from vpaes-x86_64.pl,
#	it relies on "nested inversion", which is better suited to
#	Intel CPUs.
# (**)	Inadequate POWER6 performance is due to astronomical AltiVec
#	latency: 9 cycles per simple logical operation.
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour =~ /64/) {
} elsif ($flavour =~ /32/) {
} else { die "nonsense $flavour"; }

$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
open STDOUT,"| $^X $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
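# Typical invocation (a sketch; the output file name is illustrative):
#
#	perl vpaes-ppc.pl linux64le vpaes-ppc.s
#
# $flavour selects the ABI/endianness handled above, and the generated
# code is piped through ppc-xlate.pl on its way to $output.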
.align	7	# totally strategic alignment
Lk_mc_forward:	# mc_forward
	.long	0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c	?inv
	.long	0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300	?inv
	.long	0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704	?inv
	.long	0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08	?inv
Lk_mc_backward:	# mc_backward
	.long	0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e	?inv
	.long	0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a	?inv
	.long	0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506	?inv
	.long	0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102	?inv
Lk_sr:		# shiftrows constants
	.long	0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f	?inv
	.long	0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b	?inv
	.long	0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07	?inv
	.long	0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603	?inv
Lk_inv:		# inv, inva
	.long	0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704	?rev
	.long	0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03	?rev
Lk_ipt:		# input transform (lo, hi)
	.long	0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca	?rev
	.long	0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd	?rev
Lk_sb1:		# sb1u, sb1t
	.long	0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15	?rev
	.long	0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e	?rev
Lk_sb2:		# sb2u, sb2t
	.long	0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b	?rev
	.long	0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5	?rev
Lk_sbo:		# sbou, sbot
	.long	0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2	?rev
	.long	0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e	?rev
Lk_dipt:	# decryption input transform
	.long	0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15	?rev
	.long	0x00650560, 0xe683e386, 0x94f191f4, 0x72177712	?rev
Lk_dsbo:	# decryption sbox final output
	.long	0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7	?rev
	.long	0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca	?rev
Lk_dsb9:	# decryption sbox output *9*u, *9*t
	.long	0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca	?rev
	.long	0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72	?rev
Lk_dsbd:	# decryption sbox output *D*u, *D*t
	.long	0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5	?rev
	.long	0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129	?rev
Lk_dsbb:	# decryption sbox output *B*u, *B*t
	.long	0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660	?rev
	.long	0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3	?rev
Lk_dsbe:	# decryption sbox output *E*u, *E*t
	.long	0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222	?rev
	.long	0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794	?rev

##
## Key schedule constants
##
Lk_dksd:	# decryption key schedule: invskew x*D
	.long	0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007	?rev
	.long	0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f	?rev
Lk_dksb:	# decryption key schedule: invskew x*B
	.long	0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603	?rev
	.long	0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9	?rev
Lk_dkse:	# decryption key schedule: invskew x*E + 0x63
	.long	0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553	?rev
	.long	0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd	?rev
Lk_dks9:	# decryption key schedule: invskew x*9
	.long	0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a	?rev
	.long	0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b	?rev
Lk_rcon:	# rcon
	.long	0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70	?asis
Lk_s63:
	.long	0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b	?asis
Lk_opt:		# output transform
	.long	0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7	?rev
	.long	0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1	?rev
Lk_deskew:	# deskew tables: inverts the sbox's "skew"
	.long	0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d	?rev
	.long	0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128	?rev
	mflr	r12		# distance between . and _vpaes_consts
	.byte	0,12,0x14,0,0,0,0,0
.asciz	"Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"

my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));

my ($inp,$out,$key) = map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));

##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
_vpaes_encrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
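	# The splats above set up the recurring working constants: every
	# S-box access in the cores first splits each byte x into two
	# 4-bit indices,
	#
	#	lo = x & 0x0f		(mask with v9)
	#	hi = x >> 4		(vsrb by v8)
	#
	# so a pair of 16-entry tables, each reachable with a single
	# vperm, stands in for the cache-timing-prone 256-entry AES
	# tables.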
	.byte	0,12,0x14,0,0,0,0,0

##
##  AES-encrypt %xmm0.
##
##  %xmm9-%xmm15 as in _vpaes_preheat
##  (%rdx) = scheduled keys
##
##  Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
##
_vpaes_encrypt_core:
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu	(%r9), %xmm5	# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1, %xmm2, %xmm1
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0, %xmm3, %xmm2
	vxor	v0, v0, v5		# vpxor	%xmm5, %xmm1, %xmm0
	vxor	v0, v0, v1		# vpxor	%xmm2, %xmm0, %xmm0
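	# In equation form, the input transform just applied is, per byte,
	#
	#	x = ipt_lo[x & 0x0f] ^ ipt_hi[x >> 4] ^ round0_key
	#
	# with ipt_lo/ipt_hi the two halves of Lk_ipt, and both lookups
	# done on all 16 bytes at once by the vperm instructions.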
	# middle of middle round
	vperm	v4, $sb1t, v7, v2	# vpshufb	%xmm2, %xmm13, %xmm4	# 4 = sb1u
	lvx	v1, r12, r11		# vmovdqa	-0x40(%r11,%r10), %xmm1	# .Lk_mc_forward[]
	vperm	v0, $sb1u, v7, v3	# vpshufb	%xmm3, %xmm12, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	andi.	r11, r11, 0x30		# and	\$0x30, %r11	# ... mod 4
	vperm	v5, $sb2t, v7, v2	# vpshufb	%xmm2, %xmm15, %xmm5	# 4 = sb2u
	vxor	v0, v0, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v2, $sb2u, v7, v3	# vpshufb	%xmm3, %xmm14, %xmm2	# 2 = sb2t
	lvx	v4, r12, r10		# vmovdqa	(%r11,%r10), %xmm4	# .Lk_mc_backward[]
	vperm	v3, v0, v7, v1		# vpshufb	%xmm1, %xmm0, %xmm3	# 0 = B
	vxor	v2, v2, v5		# vpxor	%xmm5, %xmm2, %xmm2	# 2 = 2A
	vperm	v0, v0, v7, v4		# vpshufb	%xmm4, %xmm0, %xmm0	# 3 = D
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3	# 0 = 2A+B
	vperm	v4, v3, v7, v1		# vpshufb	%xmm1, %xmm3, %xmm4	# 0 = 2B+C
	vxor	v0, v0, v3		# vpxor	%xmm3, %xmm0, %xmm0	# 3 = 2A+B+D
	vxor	v0, v0, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 0 = 2A+3B+C+D

	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	vperm	v5, $invhi, $invhi, v0	# vpshufb	%xmm1, %xmm11, %xmm5	# 2 = a/k
	vxor	v0, v0, v1		# vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v5		# vpxor	%xmm5, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v5		# vpxor	%xmm5, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm5
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo
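	# The ladder above is the "nested inversion" noted in the header:
	# the GF(2^8) inversion inside the AES S-box is decomposed into
	# chained 4-bit lookups (the a/k, 1/i, 1/j, iak, jak, io, jo
	# values tracked in the comments), each one a vperm into a
	# 16-entry table, so no secret-dependent load address is ever
	# formed.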
	# middle of last round
					# vmovdqa	-0x60(%r10), %xmm4	# 3 : sbou	.Lk_sbo
					# vmovdqa	-0x50(%r10), %xmm0	# 0 : sbot	.Lk_sbo+16
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
	lvx	v1, r12, r10		# vmovdqa	0x40(%r11,%r10), %xmm1	# .Lk_sr[]
	vperm	v0, $sbot, v7, v3	# vpshufb	%xmm3, %xmm0, %xmm0	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm5, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v0, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 0 = A
	vperm	v0, v0, v7, v1		# vpshufb	%xmm1, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_encrypt
.vpaes_encrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_encrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key	# prepare for unaligned access
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_encrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	bdnz	Lenc_out_unaligned

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_encrypt,.-.vpaes_encrypt
_vpaes_decrypt_preheat:
	li	r11, 0xc0		# Lk_inv
	vxor	v7, v7, v7		# 0x00..00
	vspltisb	v8,4		# 0x04..04
	vspltisb	v9,0x0f		# 0x0f..0f
	.byte	0,12,0x14,0,0,0,0,0
##
##  Same API as encryption core.
##
_vpaes_decrypt_core:
	lwz	r8, 240($key)		# pull rounds
	lvx	v5, 0, $key		# vmovdqu	(%r9), %xmm4	# round0 key
	?vperm	v5, v5, v6, $keyperm	# align round key
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1, %xmm2, %xmm2
	vperm	v1, $ipthi, $ipthi, v1	# vpshufb	%xmm0, %xmm1, %xmm0
	vxor	v0, v0, v5		# vpxor	%xmm4, %xmm2, %xmm2
	vxor	v0, v0, v1		# vpxor	%xmm2, %xmm0, %xmm0

	# Inverse mix columns
	lvx	v0, r12, r11		# v5 and v0 are flipped
					# vmovdqa	-0x20(%r10),%xmm4	# 4 : sb9u
					# vmovdqa	-0x10(%r10),%xmm1	# 0 : sb9t
	vperm	v4, $sb9u, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sb9u
	vperm	v1, $sb9t, v7, v3	# vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb9t
	vxor	v5, v5, v4		# vpxor	%xmm4, %xmm0, %xmm0
					# vmovdqa	0x00(%r10),%xmm4	# 4 : sbdu
	vxor	v5, v5, v1		# vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa	0x10(%r10),%xmm1	# 0 : sbdt

	vperm	v4, $sbdu, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbdu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbdt, v7, v3	# vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbdt
	vxor	v5, v5, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa	0x20(%r10), %xmm4	# 4 : sbbu
	vxor	v5, v5, v1		# vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa	0x30(%r10), %xmm1	# 0 : sbbt

	vperm	v4, $sbbu, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbbu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbbt, v7, v3	# vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbbt
	vxor	v5, v5, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
					# vmovdqa	0x40(%r10), %xmm4	# 4 : sbeu
	vxor	v5, v5, v1		# vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
					# vmovdqa	0x50(%r10), %xmm1	# 0 : sbet

	vperm	v4, $sbeu, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbeu
	vperm	v5, v5, v7, v0		# vpshufb	%xmm5, %xmm0, %xmm0	# MC ch
	vperm	v1, $sbet, v7, v3	# vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sbet
	vxor	v0, v5, v4		# vpxor	%xmm4, %xmm0, %xmm0	# 4 = ch
	vxor	v0, v0, v1		# vpxor	%xmm1, %xmm0, %xmm0	# 0 = ch
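	# Writing MC for the interleaved vperm rotation above, the chain
	# evaluates inverse MixColumns Horner-style:
	#
	#	out = MC(MC(MC(sb9) ^ sbd) ^ sbb) ^ sbe
	#
	# where each table already holds the S-box output premultiplied
	# by the GF(2^8) constant 9, D, B or E (see the Lk_dsb* labels).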
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	vperm	v2, $invhi, $invhi, v0	# vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v0, v0, v1		# vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, $invlo, v1	# vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	vperm	v4, $invlo, $invlo, v0	# vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vxor	v4, v4, v2		# vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v7, v3	# vpshufb	%xmm3, %xmm10, %xmm2	# 2 = 1/iak
	lvx	v6, r9, $key		# vmovdqu	(%r9), %xmm0
	vperm	v3, $invlo, v7, v4	# vpshufb	%xmm4, %xmm10, %xmm3	# 3 = 1/jak
	vxor	v2, v2, v0		# vpxor	%xmm1, %xmm2, %xmm2	# 2 = io
	?vperm	v5, v5, v6, $keyperm	# align round key
	vxor	v3, v3, v1		# vpxor	%xmm0, %xmm3, %xmm3	# 3 = jo

	# middle of last round
					# vmovdqa	0x60(%r10), %xmm4	# 3 : sbou
	vperm	v4, $sbou, v7, v2	# vpshufb	%xmm2, %xmm4, %xmm4	# 4 = sbou
					# vmovdqa	0x70(%r10), %xmm1	# 0 : sbot
	lvx	v2, r12, r10		# vmovdqa	-0x160(%r11), %xmm2	# .Lk_sr-.Lk_dsbd=-0x160
	vperm	v1, $sbot, v7, v3	# vpshufb	%xmm3, %xmm1, %xmm1	# 0 = sb1t
	vxor	v4, v4, v5		# vpxor	%xmm0, %xmm4, %xmm4	# 4 = sb1u + k
	vxor	v0, v1, v4		# vpxor	%xmm4, %xmm1, %xmm0	# 0 = A
	vperm	v0, v0, v7, v2		# vpshufb	%xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0
.globl	.vpaes_decrypt
.vpaes_decrypt:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r7, 256			# save vrsave
	stw	r7,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r6,`$FRAME+$LRSAVE`($sp)
	mtspr	256, r0			# preserve all AltiVec registers

	bl	_vpaes_decrypt_preheat

	?lvsl	$inpperm, 0, $inp	# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$outperm, 0, $out
	?lvsl	$keyperm, 0, $key
	lvx	$inptail, 0, $inp	# redundant in aligned case
	?vperm	v0, v0, $inptail, $inpperm

	bl	_vpaes_decrypt_core

	vperm	v0, v0, v0, $outperm	# rotate right/left
	bdnz	Ldec_out_unaligned

	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r7			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_decrypt,.-.vpaes_decrypt
.globl	.vpaes_cbc_encrypt
.vpaes_cbc_encrypt:
	$STU	$sp,-`($FRAME+2*$SIZE_T)`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	stw	r12,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r30,`$FRAME+$SIZE_T*0`($sp)
	$PUSH	r31,`$FRAME+$SIZE_T*1`($sp)
	$PUSH	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)

	and	r30, r5, r9		# copy length&-16
	andi.	r9, $out, 15		# is $out aligned?
	mr	r5, r6			# copy pointer to key
	mr	r31, r7			# copy pointer to iv
	mcrf	cr1, cr0		# put aside $out alignment flag
	mr	r7, r12			# copy vrsave
	mtspr	256, r6			# preserve all AltiVec registers

	lvx	v24, 0, r31		# load [potentially unaligned] iv
	?lvsl	$inpperm, 0, r31
	?vperm	v24, v24, v25, $inpperm

	cmpwi	r8, 0			# test direction
	neg	r8, $inp		# prepare for unaligned access
	?lvsl	$keyperm, 0, $key
	?lvsr	$outperm, 0, $out
	?lvsr	$inpperm, 0, r8		# -$inp
	vnor	$outmask, v7, v7	# 0xff..ff
	lvx	$inptail, 0, $inp
	?vperm	$outmask, v7, $outmask, $outperm
	addi	$inp, $inp, 15		# 15 is not a typo

	bl	_vpaes_encrypt_preheat

	beq	cr1, Lcbc_enc_loop	# $out is aligned

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvebx	$outhead, r8, r9
	sub.	r30, r30, r0		# len -= 16
	beq	Lcbc_unaligned_done

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vxor	v0, v0, v24		# ^= iv

	bl	_vpaes_encrypt_core

	vmr	v24, v0			# put aside iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
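	# This is the standard CBC recurrence,
	#
	#	C[i] = Encrypt(P[i] ^ C[i-1]),	C[-1] = iv,
	#
	# with v24 carrying C[i-1] ("put aside iv") between blocks.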
	bl	_vpaes_decrypt_preheat

	beq	cr1, Lcbc_dec_loop	# $out is aligned

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvebx	$outhead, r8, r9
	sub.	r30, r30, r0		# len -= 16
	beq	Lcbc_unaligned_done

	lvx	$inptail, 0, $inp
	?vperm	v0, v0, $inptail, $inpperm
	vmr	v25, v0			# put aside input

	bl	_vpaes_decrypt_core

	vxor	v0, v0, v24		# ^= iv
	sub.	r30, r30, r0		# len -= 16
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v1, $outhead, v0, $outmask
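	# Decryption inverts the chain,
	#
	#	P[i] = Decrypt(C[i]) ^ C[i-1],
	#
	# which is why the raw ciphertext block is put aside in v25
	# before the core call: it is the C[i-1] needed by the next block.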
	beq	cr1, Lcbc_write_iv	# $out is aligned

	stvebx	$outhead, r9, $out

	neg	r8, r31			# write [potentially unaligned] iv
	?lvsl	$outperm, 0, r8
	vperm	v24, v24, v24, $outperm	# rotate right/left
	stvewx	v24, 0, r31		# ivp is at least 32-bit aligned

	mtspr	256, r7			# restore vrsave
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	$POP	r0, `$FRAME+$SIZE_T*2+$LRSAVE`($sp)
	$POP	r30,`$FRAME+$SIZE_T*0`($sp)
	$POP	r31,`$FRAME+$SIZE_T*1`($sp)
	addi	$sp,$sp,`$FRAME+$SIZE_T*2`
	.byte	0,12,0x04,1,0x80,2,6,0
.size	.vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
my ($inp,$bits,$out)=map("r$_",(3..5));

my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
_vpaes_key_preheat:
	li	r11, 0xc0		# Lk_inv
	vspltisb	v8,4		# 0x04..04
	vxor	v9,v9,v9		# 0x00..00
	lvx	$invlo, r12, r11	# Lk_inv
	lvx	$iptlo, r12, r9		# Lk_ipt
	lvx	v14, r12, r11		# Lk_sb1
	lvx	v16, r12, r9		# Lk_dksd
	lvx	v18, r12, r11		# Lk_dksb
	lvx	v20, r12, r9		# Lk_dkse
	lvx	v22, r12, r11		# Lk_dks9
	lvx	v24, r12, r9		# Lk_rcon
	lvx	v25, 0, r12		# Lk_mc_forward[0]
	lvx	v26, r12, r8		# Lk_s63
	.byte	0,12,0x14,0,0,0,0,0

_vpaes_schedule_core:
	bl	_vpaes_key_preheat	# load the tables

	#lvx	v0, 0, $inp		# vmovdqu	(%rdi), %xmm0	# load key (unaligned)
	neg	r8, $inp		# prepare for unaligned access
	addi	$inp, $inp, 15		# 15 is not a typo
	?lvsr	$inpperm, 0, r8		# -$inp
	lvx	v6, 0, $inp		# v6 serves as inptail
	?vperm	v0, v0, v6, $inpperm

	vmr	v3, v0			# vmovdqa	%xmm0, %xmm3
	bl	_vpaes_schedule_transform
	vmr	v7, v0			# vmovdqa	%xmm0, %xmm7

	bne	$dir, Lschedule_am_decrypting

	# encrypting, output zeroth round key after transform
	li	r8, 0x30		# mov	\$0x30,%r8d

	?lvsr	$outperm, 0, $out	# prepare for unaligned access
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, v9, $outmask, $outperm

	#stvx	v0, 0, $out		# vmovdqu	%xmm0, (%rdx)
	vperm	$outhead, v0, v0, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out

Lschedule_am_decrypting:
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10

	# decrypting, output zeroth round key after shiftrows
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10), %xmm1
	vperm	v4, v3, v3, v1		# vpshufb	%xmm1, %xmm3, %xmm3

	neg	r0, $out		# prepare for unaligned access
	?lvsl	$outperm, 0, r0
	vnor	$outmask, v9, v9	# 0xff..ff
	?vperm	$outmask, $outmask, v9, $outperm

	#stvx	v4, 0, $out		# vmovdqu	%xmm3, (%rdx)
	vperm	$outhead, v4, v4, $outperm	# rotate right/left
	stvewx	$outhead, 0, $out	# some are superfluous
	stvewx	$outhead, r9, $out
	stvewx	$outhead, r10, $out
	addi	r10, r12, 0x80		# lea	.Lk_sr(%rip),%r10
	stvewx	$outhead, r11, $out
	addi	$out, $out, 15		# 15 is not a typo
	xori	r8, r8, 0x30		# xor	\$0x30, %r8

	cmplwi	$bits, 192		# cmp	\$192, %esi

##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
	li	r0, 10			# mov	\$10, %esi

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# write output

##
##  .aes_schedule_192
##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing. The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round keys.
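##
##  (A sketch of the arithmetic: 4 cycles x 3 round keys per cycle,
##  plus the zeroth key already written above, covers the 13 round
##  keys an AES-192 schedule needs; hence the loop counter of 4 below.)
##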
	li	r0, 4			# mov	\$4, %esi

	?vperm	v0, v6, v0, $inpperm
	?vsldoi	v0, v3, v0, 8		# vmovdqu	8(%rdi),%xmm0	# load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	# input transform
	?vsldoi	v6, v0, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber "low" side with zeros

	bl	_vpaes_schedule_round
	?vsldoi	v0, v6, v0, 8		# vpalignr	\$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	# save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	# save key n+1
	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle	# save key n+2
	bl	_vpaes_schedule_192_smear

##
##  .aes_schedule_256
##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6. The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
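##
##  (Loop-count check: each trip below emits two round keys, one per
##  side, so 7 trips plus the initial key give the 15 round keys of
##  AES-256's 14 rounds.)
##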
	li	r0, 7			# mov	\$7, %esi

	lvx	v0, 0, $inp		# vmovdqu	16(%rdi),%xmm0	# load key part 2 (unaligned)
	?vperm	v0, v6, v0, $inpperm
	bl	_vpaes_schedule_transform	# input transform

	bl	_vpaes_schedule_mangle	# output low result
	vmr	v6, v0			# vmovdqa	%xmm0, %xmm6	# save cur_lo in xmm6

	bl	_vpaes_schedule_round
	bdz	Lschedule_mangle_last	# dec	%esi
	bl	_vpaes_schedule_mangle

	# low round. swap xmm7 and xmm6
	?vspltw	v0, v0, 3		# vpshufd	\$0xFF, %xmm0, %xmm0
	vmr	v5, v7			# vmovdqa	%xmm7, %xmm5
	vmr	v7, v6			# vmovdqa	%xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	vmr	v7, v5			# vmovdqa	%xmm5, %xmm7

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##
##  when encrypting, outputs out(%xmm0) ^ 63
##  when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
Lschedule_mangle_last:
	# schedule last round key from xmm0
	li	r11, 0x2e0		# lea	.Lk_deskew(%rip),%r11
	bne	$dir, Lschedule_mangle_last_dec

	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10),%xmm1
	li	r11, 0x2c0		# lea	.Lk_opt(%rip), %r11	# prepare to output transform
	li	r9, 0x2d0		# prepare to output transform
	vperm	v0, v0, v0, v1		# vpshufb	%xmm1, %xmm0, %xmm0	# output permute

	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, 16		# add	\$16, %rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0, (%rdx)	# save last key
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	stvewx	v0, 0, $out		# some (or all) are redundant
	stvewx	v0, r10, $out
	stvewx	v0, r11, $out
	stvewx	v0, r12, $out
	b	Lschedule_mangle_done

Lschedule_mangle_last_dec:
	lvx	$iptlo, r11, r12	# reload $ipt
	addi	$out, $out, -16		# add	\$-16, %rdx
	vxor	v0, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	# output transform

	#stvx	v0, r0, $out		# vmovdqu	%xmm0, (%rdx)	# save last key
	addi	r9, $out, -15		# -15 is not a typo
	vperm	v0, v0, v0, $outperm	# rotate right/left
	vsel	v2, $outhead, v0, $outmask
	stvewx	v0, 0, r9		# some (or all) are redundant

Lschedule_mangle_done:
	vxor	v0, v0, v0		# vpxor	%xmm0, %xmm0, %xmm0
	vxor	v1, v1, v1		# vpxor	%xmm1, %xmm1, %xmm1
	vxor	v2, v2, v2		# vpxor	%xmm2, %xmm2, %xmm2
	vxor	v3, v3, v3		# vpxor	%xmm3, %xmm3, %xmm3
	vxor	v4, v4, v4		# vpxor	%xmm4, %xmm4, %xmm4
	vxor	v5, v5, v5		# vpxor	%xmm5, %xmm5, %xmm5
	vxor	v6, v6, v6		# vpxor	%xmm6, %xmm6, %xmm6
	vxor	v7, v7, v7		# vpxor	%xmm7, %xmm7, %xmm7
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b a x y
##    %xmm6: low side, d c 0 0
##
##  Outputs:
##    %xmm6: b+c+d b+c 0 0
##    %xmm0: b+c+d b+c b a
##
_vpaes_schedule_192_smear:
	?vsldoi	v1, v9, v6, 12		# vpshufd	\$0x80, %xmm6, %xmm1	# d c 0 0 -> c 0 0 0
	?vsldoi	v0, v7, v0, 8		# vpshufd	\$0xFE, %xmm7, %xmm0	# b a _ _ -> b b b a
	vxor	v6, v6, v1		# vpxor	%xmm1, %xmm6, %xmm6	# -> c+d c 0 0
	vxor	v6, v6, v0		# vpxor	%xmm0, %xmm6, %xmm6	# -> b+c+d b+c b a
	?vsldoi	v6, v6, v9, 8
	?vsldoi	v6, v9, v6, 8		# clobber low side with zeros
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
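##  (In textbook terms this is the expansion recurrence
##
##	w[i] = w[i-Nk] ^ SubWord(RotWord(w[i-1])) ^ rcon
##
##  done 128 bits at a time: the splat/rotate lines below form
##  SubWord(RotWord(.)), rcon is injected from v24 (%xmm8), and the
##  "smear" accumulates the running xor of the preceding words.)
##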
_vpaes_schedule_round:
	# extract rcon from xmm8
	#vxor	v4, v4, v4		# vpxor	%xmm4, %xmm4, %xmm4
	?vsldoi	v1, $rcon, v9, 15	# vpalignr	\$15, %xmm8, %xmm4, %xmm1
	?vsldoi	$rcon, $rcon, $rcon, 15	# vpalignr	\$15, %xmm8, %xmm8, %xmm8
	vxor	v7, v7, v1		# vpxor	%xmm1, %xmm7, %xmm7

	?vspltw	v0, v0, 3		# vpshufd	\$0xFF, %xmm0, %xmm0
	?vsldoi	v0, v0, v0, 1		# vpalignr	\$1, %xmm0, %xmm0, %xmm0

	# low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	?vsldoi	v1, v9, v7, 12		# vpslldq	\$4, %xmm7, %xmm1
	vxor	v7, v7, v1		# vpxor	%xmm1, %xmm7, %xmm7
	vspltisb	v1, 0x0f	# 0x0f..0f
	?vsldoi	v4, v9, v7, 8		# vpslldq	\$8, %xmm7, %xmm4

	vand	v1, v1, v0		# vpand	%xmm9, %xmm0, %xmm1	# 0 = k
	vsrb	v0, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0	# 1 = i
	vxor	v7, v7, v4		# vpxor	%xmm4, %xmm7, %xmm7
	vperm	v2, $invhi, v9, v1	# vpshufb	%xmm1, %xmm11, %xmm2	# 2 = a/k
	vxor	v1, v1, v0		# vpxor	%xmm0, %xmm1, %xmm1	# 0 = j
	vperm	v3, $invlo, v9, v0	# vpshufb	%xmm0, %xmm10, %xmm3	# 3 = 1/i
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3	# 3 = iak = 1/i + a/k
	vperm	v4, $invlo, v9, v1	# vpshufb	%xmm1, %xmm10, %xmm4	# 4 = 1/j
	vxor	v7, v7, v26		# vpxor	.Lk_s63(%rip), %xmm7, %xmm7
	vperm	v3, $invlo, v9, v3	# vpshufb	%xmm3, %xmm10, %xmm3	# 2 = 1/iak
	vxor	v4, v4, v2		# vpxor	%xmm2, %xmm4, %xmm4	# 4 = jak = 1/j + a/k
	vperm	v2, $invlo, v9, v4	# vpshufb	%xmm4, %xmm10, %xmm2	# 3 = 1/jak
	vxor	v3, v3, v1		# vpxor	%xmm1, %xmm3, %xmm3	# 2 = io
	vxor	v2, v2, v0		# vpxor	%xmm0, %xmm2, %xmm2	# 3 = jo
	vperm	v4, v15, v9, v3		# vpshufb	%xmm3, %xmm13, %xmm4	# 4 = sbou
	vperm	v1, v14, v9, v2		# vpshufb	%xmm2, %xmm12, %xmm1	# 0 = sb1t
	vxor	v1, v1, v4		# vpxor	%xmm4, %xmm1, %xmm1	# 0 = sbox output

	# add in smeared stuff
	vxor	v0, v1, v7		# vpxor	%xmm7, %xmm1, %xmm0
	vxor	v7, v1, v7		# vmovdqa	%xmm0, %xmm7
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##
_vpaes_schedule_transform:
	#vand	v1, v0, v9		# vpand	%xmm9, %xmm0, %xmm1
	vsrb	v2, v0, v8		# vpsrlb	\$4, %xmm0, %xmm0
					# vmovdqa	(%r11), %xmm2	# lo
	vperm	v0, $iptlo, $iptlo, v0	# vpshufb	%xmm1, %xmm2, %xmm2
					# vmovdqa	16(%r11), %xmm1	# hi
	vperm	v2, $ipthi, $ipthi, v2	# vpshufb	%xmm0, %xmm1, %xmm0
	vxor	v0, v0, v2		# vpxor	%xmm2, %xmm0, %xmm0
	.byte	0,12,0x14,0,0,0,0,0

##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    apply shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##
##  Clobbers xmm1-xmm5
##
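##  (The circulant 0,1,1,1 needs no multiplier: it is the xor of three
##  successive byte-rotations of the same vector,
##
##	out = rot1(k) ^ rot2(k) ^ rot3(k)
##
##  with Lk_mc_forward, preloaded in v25, supplying the rotation
##  permutation.)
##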
_vpaes_schedule_mangle:
	#vmr	v4, v0			# vmovdqa	%xmm0, %xmm4	# save xmm0 for later
					# vmovdqa	.Lk_mc_forward(%rip),%xmm5
	bne	$dir, Lschedule_mangle_dec

	vxor	v4, v0, v26		# vpxor	.Lk_s63(%rip), %xmm0, %xmm4
	addi	$out, $out, 16		# add	\$16, %rdx
	vperm	v4, v4, v4, v25		# vpshufb	%xmm5, %xmm4, %xmm4
	vperm	v1, v4, v4, v25		# vpshufb	%xmm5, %xmm4, %xmm1
	vperm	v3, v1, v1, v25		# vpshufb	%xmm5, %xmm1, %xmm3
	vxor	v4, v4, v1		# vpxor	%xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10), %xmm1
	vxor	v3, v3, v4		# vpxor	%xmm4, %xmm3, %xmm3

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add	\$-16, %r8
	andi.	r8, r8, 0x30		# and	\$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask

Lschedule_mangle_dec:
	# inverse mix columns
					# lea	.Lk_dksd(%rip),%r11
	vsrb	v1, v0, v8		# vpsrlb	\$4, %xmm4, %xmm1	# 1 = hi
	#and	v4, v0, v9		# vpand	%xmm9, %xmm4, %xmm4	# 4 = lo

					# vmovdqa	0x00(%r11), %xmm2
	vperm	v2, v16, v16, v0	# vpshufb	%xmm4, %xmm2, %xmm2
					# vmovdqa	0x10(%r11), %xmm3
	vperm	v3, v17, v17, v1	# vpshufb	%xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5, %xmm3, %xmm3

					# vmovdqa	0x20(%r11), %xmm2
	vperm	v2, v18, v18, v0	# vpshufb	%xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
					# vmovdqa	0x30(%r11), %xmm3
	vperm	v3, v19, v19, v1	# vpshufb	%xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5, %xmm3, %xmm3

					# vmovdqa	0x40(%r11), %xmm2
	vperm	v2, v20, v20, v0	# vpshufb	%xmm4, %xmm2, %xmm2
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
					# vmovdqa	0x50(%r11), %xmm3
	vperm	v3, v21, v21, v1	# vpshufb	%xmm1, %xmm3, %xmm3
	vxor	v3, v3, v2		# vpxor	%xmm2, %xmm3, %xmm3

					# vmovdqa	0x60(%r11), %xmm2
	vperm	v2, v22, v22, v0	# vpshufb	%xmm4, %xmm2, %xmm2
	vperm	v3, v3, v9, v25		# vpshufb	%xmm5, %xmm3, %xmm3
					# vmovdqa	0x70(%r11), %xmm4
	vperm	v4, v23, v23, v1	# vpshufb	%xmm1, %xmm4, %xmm4
	lvx	v1, r8, r10		# vmovdqa	(%r8,%r10), %xmm1
	vxor	v2, v2, v3		# vpxor	%xmm3, %xmm2, %xmm2
	vxor	v3, v4, v2		# vpxor	%xmm2, %xmm4, %xmm3

	addi	$out, $out, -16		# add	\$-16, %rdx

	vperm	v3, v3, v3, v1		# vpshufb	%xmm1, %xmm3, %xmm3
	addi	r8, r8, -16		# add	\$-16, %r8
	andi.	r8, r8, 0x30		# and	\$0x30, %r8

	#stvx	v3, 0, $out		# vmovdqu	%xmm3, (%rdx)
	vperm	v1, v3, v3, $outperm	# rotate right/left
	vsel	v2, $outhead, v1, $outmask
	.byte	0,12,0x14,0,0,0,0,0

.globl	.vpaes_set_encrypt_key
.vpaes_set_encrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;
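	# e.g. 128 >> 5 = 4 and 4 + 6 = 10; likewise 192 -> 12 and
	# 256 -> 14, the standard AES round counts.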
	cmplw	$dir, $bits, $bits	# set encrypt direction
	li	r8, 0x30		# mov	\$0x30,%r8d
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key

.globl	.vpaes_set_decrypt_key
.vpaes_set_decrypt_key:
	$STU	$sp,-$FRAME($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mfspr	r6, 256			# save vrsave
	stw	r6,`$FRAME-4`($sp)	# save vrsave
	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
	mtspr	256, r7			# preserve all AltiVec registers

	srwi	r9, $bits, 5		# shr	\$5,%eax
	addi	r9, r9, 6		# add	\$5,%eax
	stw	r9, 240($out)		# mov	%eax,240(%rdx)	# AES_KEY->rounds = nbits/32+6;
	slwi	r9, r9, 4		# shl	\$4,%eax
	add	$out, $out, r9		# lea	(%rdx,%rax),%rdx

	cmplwi	$dir, $bits, 0		# set decrypt direction
	srwi	r8, $bits, 1		# shr	\$1,%r8d
	andi.	r8, r8, 32		# and	\$32,%r8d
	xori	r8, r8, 32		# xor	\$32,%r8d	# nbits==192?0:32
	bl	_vpaes_schedule_core

	$POP	r0, `$FRAME+$LRSAVE`($sp)
	li	r10,`15+6*$SIZE_T`
	li	r11,`31+6*$SIZE_T`
	mtspr	256, r6			# restore vrsave
	.byte	0,12,0x04,1,0x80,0,3,0
.size	.vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key

foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/geo;

    # constants table endian-specific conversion
    if ($consts && m/\.long\s+(.+)\s+(\?[a-z]*)$/o) {
        # convert to endian-agnostic format
        foreach (split(/,\s+/,$1)) {
            my $l = /^0/?oct:int;
            push @bytes,($l>>24)&0xff,($l>>16)&0xff,($l>>8)&0xff,$l&0xff;
        }

        # little-endian conversion
        if ($flavour =~ /le$/o) {
            SWITCH: for($conv) {
                /\?inv/ && do { @bytes=map($_^0xf,@bytes); last; };
                /\?rev/ && do { @bytes=reverse(@bytes); last; };
            }
        }

        print ".byte\t",join(',',map (sprintf("0x%02x",$_),@bytes)),"\n";
    }
    $consts=0 if (m/Lconsts:/o);	# end of table

    # instructions prefixed with '?' are endian-specific and need
    # to be adjusted accordingly...
    if ($flavour =~ /le$/o) {		# little-endian
        s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/o or
        s/\?(vsldoi\s+v[0-9]+,\s*)(v[0-9]+,)\s*(v[0-9]+,\s*)([0-9]+)/$1$3$2 16-$4/o or
        s/\?(vspltw\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9])/$1$2 3-$3/o;
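        # E.g. "?vperm v0, v1, v2, v3" is emitted as
        # "vperm v0, v2, v1, v3" (middle operands swapped), and
        # "?vsldoi v0, v1, v2, 13" as "vsldoi v0, v2, v1, 16-13",
        # compensating for the reversed lane order on little-endian.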
    } else {				# big-endian

close STDOUT or die "error closing STDOUT: $!";