3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
7 ## By Mike Hamburg (Stanford University), 2009
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
13 # CBC encrypt/decrypt performance in cycles per byte processed with
14 # 128-bit key (aes-ppc.pl vs. this module):
17 # G4e      35.5/52.1/(23.8)    11.9(*)/15.4
18 # POWER6   42.7/54.3/(28.2)    63.0/92.8(**)
19 # POWER7   32.3/42.9/(18.4)    18.5/23.3
21 # (*) This is ~10% worse than reported in the paper. The reason is
22 # twofold. First, this module doesn't make any assumptions about
23 # key schedule (or data, for that matter) alignment and handles
24 # misalignment in-line. Second, being transliterated from
25 # vpaes-x86_64.pl, it relies on "nested inversion", which is better
26 # suited for Intel CPUs.
27 # (**) Inadequate POWER6 performance is due to the astronomical AltiVec
28 # latency of 9 cycles per simple logical operation.
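#
# Editorial note (a sketch, not part of the original comments): SSSE3's
# pshufb and AltiVec's vperm look up 16 bytes in parallel from a 16-byte
# table held in a register, so no secret-dependent memory address is ever
# formed.  A scalar C model of one such lookup, with a placeholder table:
#
#   /* illustrative only: touches every entry, selects one with a mask */
#   static unsigned char lookup16(const unsigned char tbl[16], unsigned char idx)
#   {
#       unsigned char r = 0;
#       for (int i = 0; i < 16; i++) {
#           unsigned char m = (unsigned char)(0 - (unsigned char)(i == (idx & 0x0f)));
#           r |= (unsigned char)(tbl[i] & m);
#       }
#       return r;
#   }
#
# The 256-entry AES S-box does not fit in one such table, so each byte is
# split into 4-bit halves and its inverse is rebuilt from a few GF(2^4)
# lookups -- the "nested inversion" mentioned above.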
32 if ($flavour =~ /64/) {
38 } elsif ($flavour =~ /32/) {
44 } else { die "nonsense $flavour"; }
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
52 die "can't locate ppc-xlate.pl";
54 open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
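#
# Typical invocation (an assumption, following how other OpenSSL perlasm
# scripts are driven; the flavour picks the 32- vs 64-bit conventions and
# the final argument is handed to ppc-xlate.pl as the output file):
#
#   perl vpaes-ppc.pl linux64 vpaes-ppc.s
#   perl vpaes-ppc.pl linux32 vpaes-ppc.s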
61 .align 7 # totally strategic alignment
63 Lk_mc_forward: # mc_forward
64 .long 0x01020300, 0x05060704, 0x090a0b08, 0x0d0e0f0c
65 .long 0x05060704, 0x090a0b08, 0x0d0e0f0c, 0x01020300
66 .long 0x090a0b08, 0x0d0e0f0c, 0x01020300, 0x05060704
67 .long 0x0d0e0f0c, 0x01020300, 0x05060704, 0x090a0b08
68 Lk_mc_backward: # mc_backward
69 .long 0x03000102, 0x07040506, 0x0b08090a, 0x0f0c0d0e
70 .long 0x0f0c0d0e, 0x03000102, 0x07040506, 0x0b08090a
71 .long 0x0b08090a, 0x0f0c0d0e, 0x03000102, 0x07040506
72 .long 0x07040506, 0x0b08090a, 0x0f0c0d0e, 0x03000102
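#
## Editorial note: each 16-byte row above is a vperm index vector.  Read
## byte-wise on a big-endian machine, the first Lk_mc_forward row is
##   { 1, 2, 3, 0,  5, 6, 7, 4,  9,10,11, 8, 13,14,15,12 }
## i.e. it rotates every 4-byte column by one position (Lk_mc_backward
## rotates the opposite way).  A scalar C model of applying one row:
#
#   static void mc_rotate(unsigned char out[16], const unsigned char in[16])
#   {
#       static const unsigned char idx[16] =
#           { 1, 2, 3, 0,  5, 6, 7, 4,  9, 10, 11, 8,  13, 14, 15, 12 };
#       for (int i = 0; i < 16; i++)
#           out[i] = in[idx[i]];
#   }
#
## The four rows differ only by a whole-column rotation, which lets the
## encrypt core step its table offsets from round to round and defer the
## ShiftRows permutation to Lk_sr at the end.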
74 .long 0x00010203, 0x04050607, 0x08090a0b, 0x0c0d0e0f
75 .long 0x00050a0f, 0x04090e03, 0x080d0207, 0x0c01060b
76 .long 0x0009020b, 0x040d060f, 0x08010a03, 0x0c050e07
77 .long 0x000d0a07, 0x04010e0b, 0x0805020f, 0x0c090603
83 .long 0xf001080d, 0x0f06050e, 0x020c0b0a, 0x09030704
84 .long 0xf0070b0f, 0x060a0401, 0x09080502, 0x0c0e0d03
85 Lk_ipt: # input transform (lo, hi)
86 .long 0x00702a5a, 0x98e8b2c2, 0x08782252, 0x90e0baca
87 .long 0x004d7c31, 0x7d30014c, 0x81ccfdb0, 0xfcb180cd
89 .long 0x00c7bd6f, 0x176dd2d0, 0x78a802c5, 0x7abfaa15
90 .long 0x006abb5f, 0xa574e4cf, 0xfa352b41, 0xd1901e8e
92 .long 0x0023e2fa, 0x15d41836, 0xefd92e0d, 0xc1ccf73b
93 .long 0x003e50cb, 0x8fe19bb1, 0x44f52a14, 0x6e7adfa5
95 .long 0x0029e10a, 0x4088eb69, 0x4a2382ab, 0xc863a1c2
96 .long 0x0024710b, 0xc6937ae2, 0xcd2f98bc, 0x55e9b75e
101 Lk_dipt: # decryption input transform
102 .long 0x005f540b, 0x045b500f, 0x1a454e11, 0x1e414a15
103 .long 0x00650560, 0xe683e386, 0x94f191f4, 0x72177712
104 Lk_dsbo: # decryption sbox final output
105 .long 0x0040f97e, 0x53ea8713, 0x2d3e94d4, 0xb96daac7
106 .long 0x001d4493, 0x0f56d712, 0x9c8ec5d8, 0x59814bca
107 Lk_dsb9: # decryption sbox output *9*u, *9*t
108 .long 0x00d6869a, 0x53031c85, 0xc94c994f, 0x501fd5ca
109 .long 0x0049d7ec, 0x89173bc0, 0x65a5fbb2, 0x9e2c5e72
110 Lk_dsbd: # decryption sbox output *D*u, *D*t
111 .long 0x00a2b1e6, 0xdfcc577d, 0x39442a88, 0x139b6ef5
112 .long 0x00cbc624, 0xf7fae23c, 0xd3efde15, 0x0d183129
113 Lk_dsbb: # decryption sbox output *B*u, *B*t
114 .long 0x0042b496, 0x926422d0, 0x04d4f2b0, 0xf6462660
115 .long 0x006759cd, 0xa69894c1, 0x6baa5532, 0x3e0cfff3
116 Lk_dsbe: # decryption sbox output *E*u, *E*t
117 .long 0x00d0d426, 0x9692f246, 0xb0f6b464, 0x04604222
118 .long 0x00c1aaff, 0xcda6550c, 0x323e5998, 0x6bf36794
121 ## Key schedule constants
123 Lk_dksd: # decryption key schedule: invskew x*D
124 .long 0x0047e4a3, 0x5d1ab9fe, 0xf9be1d5a, 0xa4e34007
125 .long 0x008336b5, 0xf477c241, 0x1e9d28ab, 0xea69dc5f
126 Lk_dksb: # decryption key schedule: invskew x*B
127 .long 0x00d55085, 0x1fca4f9a, 0x994cc91c, 0x8653d603
128 .long 0x004afcb6, 0xa7ed5b11, 0xc882347e, 0x6f2593d9
129 Lk_dkse: # decryption key schedule: invskew x*E + 0x63
130 .long 0x00d6c91f, 0xca1c03d5, 0x86504f99, 0x4c9a8553
131 .long 0xe87bdc4f, 0x059631a2, 0x8714b320, 0x6af95ecd
132 Lk_dks9: # decryption key schedule: invskew x*9
133 .long 0x00a7d97e, 0xc86f11b6, 0xfc5b2582, 0x3493ed4a
134 .long 0x00331427, 0x62517645, 0xcefddae9, 0xac9fb88b
137 .long 0xb6ee9daf, 0xb991831f, 0x817d7c4d, 0x08982a70
139 .long 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b, 0x5b5b5b5b
141 Lk_opt: # output transform
142 .long 0x0060b6d6, 0x29499fff, 0x0868bede, 0x214197f7
143 .long 0x00ecbc50, 0x51bded01, 0xe00c5cb0, 0xb15d0de1
144 Lk_deskew: # deskew tables: inverts the sbox's "skew"
145 .long 0x00e3a447, 0x40a3e407, 0x1af9be5d, 0x5ab9fe1d
146 .long 0x0069ea83, 0xdcb5365f, 0x771e9df4, 0xabc24128
151 mflr r12 # current address, used to locate _vpaes_consts PC-relatively
156 .byte 0,12,0x14,0,0,0,0,0
157 .asciz "Vector Permutation AES for AltiVec, Mike Hamburg (Stanford University)"
161 my ($inptail,$inpperm,$outhead,$outperm,$outmask,$keyperm) = map("v$_",(26..31));
163 my ($inp,$out,$key) = map("r$_",(3..5));
165 my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_",(10..15));
166 my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_",(16..19));
167 my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_",(16..23));
173 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
174 ## and %xmm9-%xmm15 as specified below.
177 _vpaes_encrypt_preheat:
181 li r11, 0xc0 # Lk_inv
185 vxor v7, v7, v7 # 0x00..00
186 vspltisb v8,4 # 0x04..04
187 vspltisb v9,0x0f # 0x0f..0f
206 .byte 0,12,0x14,0,0,0,0,0
211 ## AES-encrypt %xmm0.
215 ## %xmm9-%xmm15 as in _vpaes_preheat
216 ## (%rdx) = scheduled keys
219 ## Clobbers %xmm1-%xmm6, %r9, %r10, %r11, %rax
224 lwz r8, 240($key) # pull rounds
226 lvx v5, 0, $key # vmovdqu (%r9), %xmm5 # round0 key
230 vperm v5, v5, v6, $keyperm # align round key
232 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
233 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm1
234 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm3, %xmm2
235 vxor v0, v0, v5 # vpxor %xmm5, %xmm1, %xmm0
236 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
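#
## What the instructions above compute, per byte, is (an editorial scalar
## model; iptlo/ipthi stand for the two 16-byte halves of Lk_ipt and rk
## for the corresponding round-key byte):
#
#   static unsigned char transform_byte(unsigned char x,
#                                       const unsigned char iptlo[16],
#                                       const unsigned char ipthi[16],
#                                       unsigned char rk)
#   {
#       return (unsigned char)(iptlo[x & 0x0f] ^ ipthi[x >> 4] ^ rk);
#   }
#
## The same lo-nibble/hi-nibble split drives the other (lo, hi) table
## pairs in this module, including _vpaes_schedule_transform further down.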
242 # middle of middle round
243 vperm v4, $sb1t, v7, v2 # vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
244 lvx v1, r12, r11 # vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
246 vperm v0, $sb1u, v7, v3 # vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
247 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
248 andi. r11, r11, 0x30 # and \$0x30, %r11 # ... mod 4
249 vperm v5, $sb2t, v7, v2 # vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
250 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
251 vperm v2, $sb2u, v7, v3 # vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
252 lvx v4, r12, r10 # vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
254 vperm v3, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
255 vxor v2, v2, v5 # vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
256 vperm v0, v0, v7, v4 # vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
257 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
258 vperm v4, v3, v7, v1 # vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
259 vxor v0, v0, v3 # vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
260 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
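#
## Editorial note on the 2A+3B+C+D bookkeeping above: it is ordinary AES
## MixColumns.  Per output byte of a column (a, b, c, d), the reference
## computation is:
#
#   /* multiplication by 2 in GF(2^8) modulo the AES polynomial 0x11b */
#   static unsigned char xtime(unsigned char x)
#   {
#       return (unsigned char)((x << 1) ^ ((x >> 7) * 0x1b));
#   }
#
#   /* out0 = 2a + 3b + c + d; the other bytes are rotations of this */
#   static unsigned char mixcol0(unsigned char a, unsigned char b,
#                                unsigned char c, unsigned char d)
#   {
#       return (unsigned char)(xtime(a) ^ xtime(b) ^ b ^ c ^ d);
#   }
#
## Here the sb1 tables deliver the S-box output (the A/B/C/D terms), the
## sb2 tables deliver it already multiplied by 2, and the rotations come
## from Lk_mc_forward/Lk_mc_backward, so only vperm and vxor are needed.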
264 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
265 vperm v5, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
266 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
267 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
268 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
270 vxor v3, v3, v5 # vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
271 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
272 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
274 lvx v6, r9, $key # vmovdqu (%r9), %xmm5
275 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
277 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
278 vperm v5, v5, v6, $keyperm # align round key
279 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
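#
## Editorial note: i, j, 1/i, 1/j, iak and jak above are the intermediate
## values of the GF(2^8) inversion done via GF(2^4) arithmetic ("nested
## inversion").  Each sub-step is again a 16-entry lookup, conceptually
## (placeholder table, illustrative only):
#
#   static unsigned char gf16_op(const unsigned char tbl16[16], unsigned char x)
#   {
#       return tbl16[x & 0x0f];   /* one vperm in the code above */
#   }
#
## so the whole S-box is a short, fixed sequence of vperm/vxor with no
## data-dependent branches or memory accesses.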
282 # middle of last round
284 # vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
285 # vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
286 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
287 lvx v1, r12, r10 # vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
288 vperm v0, $sbot, v7, v3 # vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
289 vxor v4, v4, v5 # vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
290 vxor v0, v0, v4 # vpxor %xmm4, %xmm0, %xmm0 # 0 = A
291 vperm v0, v0, v7, v1 # vpshufb %xmm1, %xmm0, %xmm0
294 .byte 0,12,0x14,0,0,0,0,0
296 .globl .vpaes_encrypt
300 mfspr r7, 256 # save vrsave
302 $PUSH r6,$LRSAVE($sp)
303 mtspr 256, r0 # preserve all AltiVec registers
305 bl _vpaes_encrypt_preheat
307 neg r8, $inp # prepare for unaligned access
308 lvsl $keyperm, 0, $key
309 lvsr $outperm, 0, $out
310 lvsr $inpperm, 0, r8 # -$inp
311 vnor $outmask, v7, v7 # 0xff..ff
312 lvx $inptail, 0, $inp
313 vperm $outmask, v7, $outmask, $outperm
314 addi $inp, $inp, 15 # 15 is not a typo
315 lvx $outhead, 0, $out
319 lvx $inptail, 0, $inp # redundant in aligned case
321 vperm v0, v0, $inptail, $inpperm
323 bl _vpaes_encrypt_core
325 vperm v0, v0, v0, $outperm # rotate left
326 vsel v1, $outhead, v0, $outmask
329 addi $out, $out, 15 # 15 is not a typo
332 lvx v1, 0, $out # redundant in aligned case
333 vsel v1, $outhead, v1, $outmask
337 mtspr 256, r7 # restore vrsave
340 .byte 0,12,0x14,1,0,0,3,0
342 .size .vpaes_encrypt,.-.vpaes_encrypt
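#
## Editorial note on the lvsl/lvsr + vperm + vsel idiom used above: AltiVec
## loads and stores ignore the low 4 address bits, so an unaligned 16-byte
## block is fetched as two aligned loads merged by a permute derived from
## the address offset.  A rough C model (not this module's code; like the
## assembly, it may read up to 15 bytes past the block):
#
#   /* needs stdint.h for uintptr_t */
#   static void load16_unaligned(unsigned char dst[16], const unsigned char *p)
#   {
#       const unsigned char *base = (const unsigned char *)((uintptr_t)p & ~(uintptr_t)15);
#       unsigned int sh = (unsigned int)((uintptr_t)p & 15);
#       for (unsigned int i = 0; i < 16; i++)
#           dst[i] = base[sh + i];   /* byte sh+i of the two aligned lines */
#   }
#
## Stores work in reverse: the result is rotated with vperm and vsel keeps
## the bytes outside the destination range unchanged (the outmask register).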
345 _vpaes_decrypt_preheat:
349 li r11, 0xc0 # Lk_inv
353 vxor v7, v7, v7 # 0x00..00
354 vspltisb v8,4 # 0x04..04
355 vspltisb v9,0x0f # 0x0f..0f
382 .byte 0,12,0x14,0,0,0,0,0
387 ## Same API as encryption core.
391 lwz r8, 240($key) # pull rounds
393 lvx v5, 0, $key # vmovdqu (%r9), %xmm4 # round0 key
397 vperm v5, v5, v6, $keyperm # align round key
398 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
399 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
400 vperm v1, $ipthi, $ipthi, v1 # vpshufb %xmm0, %xmm1, %xmm0
401 vxor v0, v0, v5 # vpxor %xmm4, %xmm2, %xmm2
402 vxor v0, v0, v1 # vpxor %xmm2, %xmm0, %xmm0
409 # Inverse mix columns
411 lvx v0, r12, r11 # v5 and v0 are flipped
412 # vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
413 # vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
414 vperm v4, $sb9u, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
416 vperm v1, $sb9t, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
418 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0
419 # vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
420 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
421 # vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt
423 vperm v4, $sbdu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
424 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
425 vperm v1, $sbdt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
426 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
427 # vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
428 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
429 # vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt
431 vperm v4, $sbbu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
432 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
433 vperm v1, $sbbt, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
434 vxor v5, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
435 # vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
436 vxor v5, v5, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
437 # vmovdqa 0x50(%r10), %xmm1 # 0 : sbet
439 vperm v4, $sbeu, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
440 vperm v5, v5, v7, v0 # vpshufb %xmm5, %xmm0, %xmm0 # MC ch
441 vperm v1, $sbet, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
442 vxor v0, v5, v4 # vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
443 vxor v0, v0, v1 # vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
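#
## Editorial note: the sb9/sbd/sbb/sbe accumulation above, with the "MC ch"
## rotations in between, evaluates InvMixColumns, whose per-byte reference
## definition for a column (a, b, c, d) is
##   out0 = 0e*a + 0b*b + 0d*c + 09*d   in GF(2^8)
## A plain (non-constant-time) C reference for the field multiplication:
#
#   static unsigned char gfmul(unsigned char a, unsigned char b)
#   {
#       unsigned char r = 0;
#       for (int i = 0; i < 8; i++) {
#           if (b & 1) r ^= a;
#           b >>= 1;
#           a = (unsigned char)((a << 1) ^ ((a >> 7) * 0x1b));
#       }
#       return r;
#   }
#
## The vector code never multiplies at run time: the products by 9, D, B
## and E are baked into the Lk_dsb9/dsbd/dsbb/dsbe tables and combined with
## vxor and the column rotation from Lk_mc_forward.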
447 vsrb v1, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
448 vperm v2, $invhi, $invhi, v0 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
449 vxor v0, v0, v1 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
450 vperm v3, $invlo, $invlo, v1 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
451 vperm v4, $invlo, $invlo, v0 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
453 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
454 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
455 vperm v2, $invlo, v7, v3 # vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
457 lvx v6, r9, $key # vmovdqu (%r9), %xmm0
458 vperm v3, $invlo, v7, v4 # vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
460 vxor v2, v2, v0 # vpxor %xmm1, %xmm2, %xmm2 # 2 = io
461 vperm v5, v5, v6, $keyperm # align round key
462 vxor v3, v3, v1 # vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
465 # middle of last round
467 # vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
468 vperm v4, $sbou, v7, v2 # vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
469 # vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
470 lvx v2, r12, r10 # vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
471 vperm v1, $sbot, v7, v3 # vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
472 vxor v4, v4, v5 # vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
473 vxor v0, v1, v4 # vpxor %xmm4, %xmm1, %xmm0 # 0 = A
474 vperm v0, v0, v7, v2 # vpshufb %xmm2, %xmm0, %xmm0
477 .byte 0,12,0x14,0,0,0,0,0
479 .globl .vpaes_decrypt
483 mfspr r7, 256 # save vrsave
485 $PUSH r6,$LRSAVE($sp)
486 mtspr 256, r0 # preserve all AltiVec registers
488 bl _vpaes_decrypt_preheat
490 neg r8, $inp # prepare for unaligned access
491 lvsl $keyperm, 0, $key
492 lvsr $outperm, 0, $out
493 lvsr $inpperm, 0, r8 # -$inp
494 vnor $outmask, v7, v7 # 0xff..ff
495 lvx $inptail, 0, $inp
496 vperm $outmask, v7, $outmask, $outperm
497 addi $inp, $inp, 15 # 15 is not a typo
498 lvx $outhead, 0, $out
502 lvx $inptail, 0, $inp # redundant in aligned case
504 vperm v0, v0, $inptail, $inpperm
506 bl _vpaes_decrypt_core
508 vperm v0, v0, v0, $outperm # rotate left
509 vsel v1, $outhead, v0, $outmask
512 addi $out, $out, 15 # 15 is not a typo
515 lvx v1, 0, $out # redundant in aligned case
516 vsel v1, $outhead, v1, $outmask
520 mtspr 256, r7 # restore vrsave
523 .byte 0,12,0x14,1,0,0,3,0
525 .size .vpaes_decrypt,.-.vpaes_decrypt
527 .globl .vpaes_cbc_encrypt
530 $STU $sp,-$FRAME($sp)
532 $PUSH r30,$FRAME-$SIZE_T*2($sp)
534 $PUSH r31,$FRAME-$SIZE_T*1($sp)
535 $PUSH r0, $FRAME+$LRSAVE($sp)
537 sub. r30, r5, r9 # copy length-16
538 mr r5, r6 # copy pointer to key
539 mr r31, r7 # copy pointer to iv
541 cmpwi r8, 0 # test direction
544 mtspr 256, r6 # preserve all AltiVec registers
546 lvx v24, 0, r31 # load [potentially unaligned] iv
548 lvsl $inpperm, 0, r31
550 vperm v24, v24, v25, $inpperm
552 neg r8, $inp # prepare for unaligned access
554 lvsl $keyperm, 0, $key
555 lvsr $outperm, 0, $out
556 lvsr $inpperm, 0, r8 # -$inp
557 vnor $outmask, v7, v7 # 0xff..ff
558 lvx $inptail, 0, $inp
559 vperm $outmask, v7, $outmask, $outperm
560 addi $inp, $inp, 15 # 15 is not a typo
561 lvx $outhead, 0, $out
565 bl _vpaes_encrypt_preheat
570 lvx $inptail, 0, $inp
572 vperm v0, v0, $inptail, $inpperm
573 vxor v0, v0, v24 # ^= iv
575 bl _vpaes_encrypt_core
577 vmr v24, v0 # put aside iv
578 sub. r30, r30, r0 # len -= 16
579 vperm v0, v0, v0, $outperm # rotate left
580 vsel v1, $outhead, v0, $outmask
590 bl _vpaes_decrypt_preheat
595 lvx $inptail, 0, $inp
597 vperm v0, v0, $inptail, $inpperm
598 vmr v25, v0 # put aside input
600 bl _vpaes_decrypt_core
602 vxor v0, v0, v24 # ^= iv
604 sub. r30, r30, r0 # len -= 16
605 vperm v0, v0, v0, $outperm # rotate left
606 vsel v1, $outhead, v0, $outmask
614 lvx v1, 0, $out # redundant in aligned case
615 vsel v1, $outhead, v1, $outmask
618 neg r8, r31 # write [potentially unaligned] iv
621 vnor $outmask, v7, v7 # 0xff..ff
622 vperm $outmask, v7, $outmask, $outperm
624 vperm v24, v24, v24, $outperm # rotate
625 vsel v0, $outhead, v24, $outmask
628 vsel v1, v24, v1, $outmask
631 mtspr 256, r7 # restore vrsave
633 $POP r0, $FRAME+$LRSAVE($sp)
634 $POP r30,$FRAME-$SIZE_T*2($sp)
635 $POP r31,$FRAME-$SIZE_T*1($sp)
640 .byte 0,12,0x04,1,0x80,2,6,0
642 .size .vpaes_cbc_encrypt,.-.vpaes_cbc_encrypt
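#
## Editorial note: the two loops above are standard CBC chaining.  In C
## terms (a sketch with a hypothetical 16-byte block-cipher callback, not
## this module's interface):
#
#   /* needs stddef.h and string.h */
#   typedef void (*block128_f)(unsigned char out[16],
#                              const unsigned char in[16], const void *key);
#
#   static void cbc_encrypt16(unsigned char *out, const unsigned char *in,
#                             size_t len, const void *key,
#                             unsigned char iv[16], block128_f enc)
#   {
#       unsigned char x[16];
#       for (size_t off = 0; off + 16 <= len; off += 16) {
#           for (int i = 0; i < 16; i++) x[i] = in[off + i] ^ iv[i];
#           enc(iv, x, key);              /* ciphertext becomes the next iv */
#           memcpy(out + off, iv, 16);
#       }
#   }
#
## Decryption runs the inverse cipher first and xors the previous
## ciphertext afterwards, which is why the code above puts the raw input
## block aside (v25) before calling _vpaes_decrypt_core.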
646 my ($inp,$bits,$out)=map("r$_",(3..5));
648 my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_",(10..13,24));
651 ########################################################
653 ## AES key schedule ##
655 ########################################################
661 li r11, 0xc0 # Lk_inv
666 vspltisb v8,4 # 0x04..04
667 vxor v9,v9,v9 # 0x00..00
668 lvx $invlo, r12, r11 # Lk_inv
672 lvx $iptlo, r12, r9 # Lk_ipt
677 lvx v14, r12, r11 # Lk_sb1
682 lvx v16, r12, r9 # Lk_dksd
686 lvx v18, r12, r11 # Lk_dksb
690 lvx v20, r12, r9 # Lk_dkse
694 lvx v22, r12, r11 # Lk_dks9
697 lvx v24, r12, r9 # Lk_rcon
698 lvx v25, 0, r12 # Lk_mc_forward[0]
699 lvx v26, r12, r8 # Lks63
702 .byte 0,12,0x14,0,0,0,0,0
705 _vpaes_schedule_core:
708 bl _vpaes_key_preheat # load the tables
710 #lvx v0, 0, $inp # vmovdqu (%rdi), %xmm0 # load key (unaligned)
711 neg r8, $inp # prepare for unaligned access
713 addi $inp, $inp, 15 # 15 is not a typo
714 lvsr $inpperm, 0, r8 # -$inp
715 lvx v6, 0, $inp # v6 serves as inptail
717 vperm v0, v0, v6, $inpperm
720 vmr v3, v0 # vmovdqa %xmm0, %xmm3
721 bl _vpaes_schedule_transform
722 vmr v7, v0 # vmovdqa %xmm0, %xmm7
724 bne $dir, Lschedule_am_decrypting
726 # encrypting, output zeroth round key after transform
727 li r8, 0x30 # mov \$0x30,%r8d
728 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
730 lvsr $outperm, 0, $out # prepare for unaligned access
731 vspltisb $outmask, -1 # 0xff..ff
732 lvx $outhead, 0, $out
733 vperm $outmask, v9, $outmask, $outperm
735 #stvx v0, 0, $out # vmovdqu %xmm0, (%rdx)
736 vperm v1, v0, v0, $outperm # rotate left
737 vsel v2, $outhead, v1, $outmask
742 Lschedule_am_decrypting:
743 srwi r8, $bits, 1 # shr \$1,%r8d
744 andi. r8, r8, 32 # and \$32,%r8d
745 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
746 addi r10, r12, 0x80 # lea .Lk_sr(%rip),%r10
747 # decrypting, output zeroth round key after shiftrows
748 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
749 vperm v4, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
751 neg r0, $out # prepare for unaligned access
753 addi $out, $out, 15 # 15 is not a typo
754 vspltisb $outmask, -1 # 0xff..ff
755 lvx $outhead, 0, $out
756 vperm $outmask, $outmask, v9, $outperm
758 #stvx v4, 0, $out # vmovdqu %xmm3, (%rdx)
759 vperm v4, v4, v4, $outperm # rotate left
760 vsel v2, $outhead, v4, $outmask
763 xori r8, r8, 0x30 # xor \$0x30, %r8
766 cmplwi $bits, 192 # cmp \$192, %esi
774 ## 128-bit specific part of key schedule.
776 ## This schedule is really simple, because all its parts
777 ## are accomplished by the subroutines.
780 li r0, 10 # mov \$10, %esi
784 bl _vpaes_schedule_round
785 bdz Lschedule_mangle_last # dec %esi
786 bl _vpaes_schedule_mangle # write output
792 ## 192-bit specific part of key schedule.
794 ## The main body of this schedule is the same as the 128-bit
795 ## schedule, but with more smearing. The long, high side is
796 ## stored in %xmm7 as before, and the short, low side is in
797 ## the high bits of %xmm6.
799 ## This schedule is somewhat nastier, however, because each
800 ## round produces 192 bits of key material, or 1.5 round keys.
801 ## Therefore, on each cycle we do 2 rounds and produce 3 round keys.
806 li r0, 4 # mov \$4, %esi
808 vperm v0, v6, v0, $inpperm
809 vsldoi v0, v3, v0, 8 # vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
810 bl _vpaes_schedule_transform # input transform
812 vsldoi v6, v9, v6, 8 # clobber "low" side with zeros
816 bl _vpaes_schedule_round
817 vsldoi v0, v6, v0, 8 # vpalignr \$8,%xmm6,%xmm0,%xmm0
818 bl _vpaes_schedule_mangle # save key n
819 bl _vpaes_schedule_192_smear
820 bl _vpaes_schedule_mangle # save key n+1
821 bl _vpaes_schedule_round
822 bdz Lschedule_mangle_last # dec %esi
823 bl _vpaes_schedule_mangle # save key n+2
824 bl _vpaes_schedule_192_smear
830 ## 256-bit specific part of key schedule.
832 ## The structure here is very similar to the 128-bit
833 ## schedule, but with an additional "low side" in
834 ## %xmm6. The low side's rounds are the same as the
835 ## high side's, except no rcon and no rotation.
839 li r0, 7 # mov \$7, %esi
841 lvx v0, 0, $inp # vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
842 vperm v0, v6, v0, $inpperm
843 bl _vpaes_schedule_transform # input transform
847 bl _vpaes_schedule_mangle # output low result
848 vmr v6, v0 # vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6
851 bl _vpaes_schedule_round
852 bdz Lschedule_mangle_last # dec %esi
853 bl _vpaes_schedule_mangle
855 # low round. swap xmm7 and xmm6
856 vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
857 vmr v5, v7 # vmovdqa %xmm7, %xmm5
858 vmr v7, v6 # vmovdqa %xmm6, %xmm7
859 bl _vpaes_schedule_low_round
860 vmr v7, v5 # vmovdqa %xmm5, %xmm7
864 ## .aes_schedule_mangle_last
866 ## Mangler for last round of key schedule
868 ## when encrypting, outputs out(%xmm0) ^ 63
869 ## when decrypting, outputs unskew(%xmm0)
871 ## Always called right before return... jumps to cleanup and exits
874 Lschedule_mangle_last:
875 # schedule last round key from xmm0
876 li r11, 0x2e0 # lea .Lk_deskew(%rip),%r11
878 bne $dir, Lschedule_mangle_last_dec
881 lvx v1, r8, r10 # vmovdqa (%r8,%r10),%xmm1
882 li r11, 0x2c0 # lea .Lk_opt(%rip), %r11 # prepare to output transform
883 li r9, 0x2d0 # prepare to output transform
884 vperm v0, v0, v0, v1 # vpshufb %xmm1, %xmm0, %xmm0 # output permute
886 lvx $iptlo, r11, r12 # reload $ipt
888 addi $out, $out, 16 # add \$16, %rdx
889 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
890 bl _vpaes_schedule_transform # output transform
892 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
893 vperm v0, v0, v0, $outperm # rotate left
894 vsel v2, $outhead, v0, $outmask
898 addi $out, $out, 15 # 15 is not a typo
899 lvx v1, 0, $out # redundant in aligned case
900 vsel v1, $outhead, v1, $outmask
902 b Lschedule_mangle_done
905 Lschedule_mangle_last_dec:
906 lvx $iptlo, r11, r12 # reload $ipt
908 addi $out, $out, -16 # add \$-16, %rdx
909 vxor v0, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm0
910 bl _vpaes_schedule_transform # output transform
912 #stvx v0, r0, $out # vmovdqu %xmm0, (%rdx) # save last key
913 vperm v0, v0, v0, $outperm # rotate left
914 vsel v2, $outhead, v0, $outmask
918 addi $out, $out, -15 # -15 is not a typo
919 lvx v1, 0, $out # redundant in aligned case
920 vsel v1, $outhead, v1, $outmask
923 Lschedule_mangle_done:
926 vxor v0, v0, v0 # vpxor %xmm0, %xmm0, %xmm0
927 vxor v1, v1, v1 # vpxor %xmm1, %xmm1, %xmm1
928 vxor v2, v2, v2 # vpxor %xmm2, %xmm2, %xmm2
929 vxor v3, v3, v3 # vpxor %xmm3, %xmm3, %xmm3
930 vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
931 vxor v5, v5, v5 # vpxor %xmm5, %xmm5, %xmm5
932 vxor v6, v6, v6 # vpxor %xmm6, %xmm6, %xmm6
933 vxor v7, v7, v7 # vpxor %xmm7, %xmm7, %xmm7
937 .byte 0,12,0x14,0,0,0,0,0
940 ## .aes_schedule_192_smear
942 ## Smear the short, low side in the 192-bit key schedule.
945 ## %xmm7: high side, b a x y
946 ## %xmm6: low side, d c 0 0
950 ## %xmm6: b+c+d b+c 0 0
951 ## %xmm0: b+c+d b+c b a
954 _vpaes_schedule_192_smear:
956 vsldoi v1, v9, v6, 12 # vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
957 vsldoi v0, v7, v0, 8 # vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
958 vxor v6, v6, v1 # vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
959 vxor v6, v6, v0 # vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
962 vsldoi v6, v9, v6, 8 # clobber low side with zeros
965 .byte 0,12,0x14,0,0,0,0,0
968 ## .aes_schedule_round
970 ## Runs one main round of the key schedule on %xmm0, %xmm7
972 ## Specifically, runs subbytes on the high dword of %xmm0,
973 ## then rotates it by one byte and xors it into the low dword of %xmm7.
976 ## Adds rcon from the low byte of %xmm8, then rotates %xmm8 for the next rcon.
979 ## Smears the dwords of %xmm7 by xoring the low into the second low,
980 ## the result into the third, and that result into the highest.
982 ## Returns results in %xmm7 = %xmm0.
983 ## Clobbers %xmm1-%xmm4, %r11.
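#
## For reference, the round described above written against the FIPS-197
## word recurrence (an editorial sketch: w[] holds the previous 128-bit
## round key on entry and the next one on exit, sub_rot stands in for
## RotWord+SubWord, and endianness details are glossed over):
#
#   static void expand_round(unsigned int w[4], unsigned char rcon,
#                            unsigned int (*sub_rot)(unsigned int))
#   {
#       unsigned int t = sub_rot(w[3]) ^ ((unsigned int)rcon << 24);
#       w[0] ^= t;      /* then the "smear": each word xors its neighbour */
#       w[1] ^= w[0];
#       w[2] ^= w[1];
#       w[3] ^= w[2];
#   }
#
## _vpaes_schedule_low_round below is the same thing minus the rotation
## and the rcon, as used for the extra half-rounds of the 256-bit schedule.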
986 _vpaes_schedule_round:
987 # extract rcon from xmm8
988 #vxor v4, v4, v4 # vpxor %xmm4, %xmm4, %xmm4
989 vsldoi v1, $rcon, v9, 15 # vpalignr \$15, %xmm8, %xmm4, %xmm1
990 vsldoi $rcon, $rcon, $rcon, 15 # vpalignr \$15, %xmm8, %xmm8, %xmm8
991 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
994 vspltw v0, v0, 3 # vpshufd \$0xFF, %xmm0, %xmm0
995 vsldoi v0, v0, v0, 1 # vpalignr \$1, %xmm0, %xmm0, %xmm0
999 # low round: same as high round, but no rotation and no rcon.
1000 _vpaes_schedule_low_round:
1002 vsldoi v1, v9, v7, 12 # vpslldq \$4, %xmm7, %xmm1
1003 vxor v7, v7, v1 # vpxor %xmm1, %xmm7, %xmm7
1004 vspltisb v1, 0x0f # 0x0f..0f
1005 vsldoi v4, v9, v7, 8 # vpslldq \$8, %xmm7, %xmm4
1008 vand v1, v1, v0 # vpand %xmm9, %xmm0, %xmm1 # 0 = k
1009 vsrb v0, v0, v8 # vpsrlb \$4, %xmm0, %xmm0 # 1 = i
1010 vxor v7, v7, v4 # vpxor %xmm4, %xmm7, %xmm7
1011 vperm v2, $invhi, v9, v1 # vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
1012 vxor v1, v1, v0 # vpxor %xmm0, %xmm1, %xmm1 # 0 = j
1013 vperm v3, $invlo, v9, v0 # vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
1014 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
1015 vperm v4, $invlo, v9, v1 # vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
1016 vxor v7, v7, v26 # vpxor .Lk_s63(%rip), %xmm7, %xmm7
1017 vperm v3, $invlo, v9, v3 # vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
1018 vxor v4, v4, v2 # vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
1019 vperm v2, $invlo, v9, v4 # vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
1020 vxor v3, v3, v1 # vpxor %xmm1, %xmm3, %xmm3 # 2 = io
1021 vxor v2, v2, v0 # vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
1022 vperm v4, v15, v9, v3 # vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
1023 vperm v1, v14, v9, v2 # vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
1024 vxor v1, v1, v4 # vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output
1026 # add in smeared stuff
1027 vxor v0, v1, v7 # vpxor %xmm7, %xmm1, %xmm0
1028 vxor v7, v1, v7 # vmovdqa %xmm0, %xmm7
1031 .byte 0,12,0x14,0,0,0,0,0
1034 ## .aes_schedule_transform
1036 ## Linear-transform %xmm0 according to tables at (%r11)
1038 ## Requires that %xmm9 = 0x0F0F... as in preheat
1043 _vpaes_schedule_transform:
1044 #vand v1, v0, v9 # vpand %xmm9, %xmm0, %xmm1
1045 vsrb v2, v0, v8 # vpsrlb \$4, %xmm0, %xmm0
1046 # vmovdqa (%r11), %xmm2 # lo
1047 vperm v0, $iptlo, $iptlo, v0 # vpshufb %xmm1, %xmm2, %xmm2
1048 # vmovdqa 16(%r11), %xmm1 # hi
1049 vperm v2, $ipthi, $ipthi, v2 # vpshufb %xmm0, %xmm1, %xmm0
1050 vxor v0, v0, v2 # vpxor %xmm2, %xmm0, %xmm0
1053 .byte 0,12,0x14,0,0,0,0,0
1056 ## .aes_schedule_mangle
1058 ## Mangle xmm0 from (basis-transformed) standard version to our version.
1061 ## On encrypt:
1062 ## xor with Lk_s63
1063 ## multiply by circulant 0,1,1,1
1064 ## apply shiftrows transform
1067 ## On decrypt:
1068 ## multiply by "inverse mixcolumns" circulant E,B,D,9
1070 ## apply shiftrows transform
1073 ## Writes out to (%rdx), and increments or decrements it
1074 ## Keeps track of round number mod 4 in %r8
1076 ## Clobbers xmm1-xmm5
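#
## Editorial note: "multiply by circulant 0,1,1,1" means each byte of a
## column becomes the xor of the other three (that is the r + r^2 + r^3
## combination of rotations assembled below).  A scalar model:
#
#   static void circulant_0111(unsigned char col[4])
#   {
#       unsigned char a = col[0], b = col[1], c = col[2], d = col[3];
#       col[0] = (unsigned char)(b ^ c ^ d);
#       col[1] = (unsigned char)(a ^ c ^ d);
#       col[2] = (unsigned char)(a ^ b ^ d);
#       col[3] = (unsigned char)(a ^ b ^ c);
#   }
#
## The decrypt branch applies the E,B,D,9 inverse-MixColumns circulant via
## the Lk_dks* tables instead, so the stored decryption round keys already
## absorb InvMixColumns.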
1079 _vpaes_schedule_mangle:
1080 #vmr v4, v0 # vmovdqa %xmm0, %xmm4 # save xmm0 for later
1081 # vmovdqa .Lk_mc_forward(%rip),%xmm5
1082 bne $dir, Lschedule_mangle_dec
1085 vxor v4, v0, v26 # vpxor .Lk_s63(%rip), %xmm0, %xmm4
1086 addi $out, $out, 16 # add \$16, %rdx
1087 vperm v4, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm4
1088 vperm v1, v4, v4, v25 # vpshufb %xmm5, %xmm4, %xmm1
1089 vperm v3, v1, v1, v25 # vpshufb %xmm5, %xmm1, %xmm3
1090 vxor v4, v4, v1 # vpxor %xmm1, %xmm4, %xmm4
1091 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1092 vxor v3, v3, v4 # vpxor %xmm4, %xmm3, %xmm3
1094 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1095 addi r8, r8, -16 # add \$-16, %r8
1096 andi. r8, r8, 0x30 # and \$0x30, %r8
1098 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1099 vperm v1, v3, v3, $outperm # rotate left
1100 vsel v2, $outhead, v1, $outmask
1106 Lschedule_mangle_dec:
1107 # inverse mix columns
1108 # lea .Lk_dksd(%rip),%r11
1109 vsrb v1, v0, v8 # vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
1110 #and v4, v0, v9 # vpand %xmm9, %xmm4, %xmm4 # 4 = lo
1112 # vmovdqa 0x00(%r11), %xmm2
1113 vperm v2, v16, v16, v0 # vpshufb %xmm4, %xmm2, %xmm2
1114 # vmovdqa 0x10(%r11), %xmm3
1115 vperm v3, v17, v17, v1 # vpshufb %xmm1, %xmm3, %xmm3
1116 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1117 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1119 # vmovdqa 0x20(%r11), %xmm2
1120 vperm v2, v18, v18, v0 # vpshufb %xmm4, %xmm2, %xmm2
1121 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1122 # vmovdqa 0x30(%r11), %xmm3
1123 vperm v3, v19, v19, v1 # vpshufb %xmm1, %xmm3, %xmm3
1124 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1125 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1127 # vmovdqa 0x40(%r11), %xmm2
1128 vperm v2, v20, v20, v0 # vpshufb %xmm4, %xmm2, %xmm2
1129 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1130 # vmovdqa 0x50(%r11), %xmm3
1131 vperm v3, v21, v21, v1 # vpshufb %xmm1, %xmm3, %xmm3
1132 vxor v3, v3, v2 # vpxor %xmm2, %xmm3, %xmm3
1134 # vmovdqa 0x60(%r11), %xmm2
1135 vperm v2, v22, v22, v0 # vpshufb %xmm4, %xmm2, %xmm2
1136 vperm v3, v3, v9, v25 # vpshufb %xmm5, %xmm3, %xmm3
1137 # vmovdqa 0x70(%r11), %xmm4
1138 vperm v4, v23, v23, v1 # vpshufb %xmm1, %xmm4, %xmm4
1139 lvx v1, r8, r10 # vmovdqa (%r8,%r10), %xmm1
1140 vxor v2, v2, v3 # vpxor %xmm3, %xmm2, %xmm2
1141 vxor v3, v4, v2 # vpxor %xmm2, %xmm4, %xmm3
1143 addi $out, $out, -16 # add \$-16, %rdx
1145 vperm v3, v3, v3, v1 # vpshufb %xmm1, %xmm3, %xmm3
1146 addi r8, r8, -16 # add \$-16, %r8
1147 andi. r8, r8, 0x30 # and \$0x30, %r8
1149 #stvx v3, 0, $out # vmovdqu %xmm3, (%rdx)
1150 vperm v1, v3, v3, $outperm # rotate left
1151 vsel v2, $outhead, v1, $outmask
1156 .byte 0,12,0x14,0,0,0,0,0
1158 .globl .vpaes_set_encrypt_key
1160 .vpaes_set_encrypt_key:
1162 mfspr r6, 256 # save vrsave
1164 $PUSH r0, $LRSAVE($sp)
1165 mtspr 256, r7 # preserve all AltiVec registers
1167 srwi r9, $bits, 5 # shr \$5,%eax
1168 addi r9, r9, 6 # add \$5,%eax
1169 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+6;
1171 cmplw $dir, $bits, $bits # always "equal": flag direction as encrypting
1172 li r8, 0x30 # mov \$0x30,%r8d
1173 bl _vpaes_schedule_core
1175 $POP r0, $LRSAVE($sp)
1176 mtspr 256, r6 # restore vrsave
1181 .byte 0,12,0x14,1,0,0,3,0
1183 .size .vpaes_set_encrypt_key,.-.vpaes_set_encrypt_key
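#
## For reference (editorial note): the value stored at 240(key) above is
## the standard AES round count, nbits/32 + 6, i.e. 10 for 128-bit, 12 for
## 192-bit and 14 for 256-bit keys; the encrypt/decrypt cores read it back
## ("pull rounds") to drive their round loops.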
1185 .globl .vpaes_set_decrypt_key
1187 .vpaes_set_decrypt_key:
1189 mfspr r6, 256 # save vrsave
1191 $PUSH r0, $LRSAVE($sp)
1192 mtspr 256, r7 # preserve all AltiVec registers
1194 srwi r9, $bits, 5 # shr \$5,%eax
1195 addi r9, r9, 6 # add \$5,%eax
1196 stw r9, 240($out) # mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+6;
1198 slwi r9, r9, 4 # shl \$4,%eax
1199 add $out, $out, r9 # lea (%rdx,%rax),%rdx
1201 cmplwi $dir, $bits, 0 # never "equal": flag direction as decrypting
1202 srwi r8, $bits, 1 # shr \$1,%r8d
1203 andi. r8, r8, 32 # and \$32,%r8d
1204 xori r8, r8, 32 # xor \$32,%r8d # nbits==192?0:32
1205 bl _vpaes_schedule_core
1207 $POP r0, $LRSAVE($sp)
1208 mtspr 256, r6 # restore vrsave
1213 .byte 0,12,0x14,1,0,0,3,0
1215 .size .vpaes_set_decrypt_key,.-.vpaes_set_decrypt_key