######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################

# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
#
#		CBC enc		ECB enc/dec(*)	[bit-sliced enc/dec]
# Cortex-A53	21.5		18.1/20.6	[17.5/19.8	 ]
# Cortex-A57	36.0(**)	20.4/24.9(**)	[14.4/16.6	 ]
# X-Gene	45.9(**)	45.8/57.7(**)	[33.1/37.6(**)	 ]
#
# (*)	ECB denotes approximate result for parallelizable modes
#	such as CBC decrypt, CTR, etc.;
# (**)	these results are worse than scalar compiler-generated
#	code, but it's constant-time and therefore preferred;
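#
# Editor's note (a sketch of the technique; see the paper above for the
# authoritative description): every AES operation below is expressed as
# 16-way lookups into 16-byte tables held in vector registers, so there
# are no secret-dependent loads or branches. The recurring NEON idiom,
# with v17 holding 0x0f in every lane and Tlo/Thi standing for whichever
# table pair applies, is:
#
#	and	v1.16b, v0.16b, v17.16b		// low nibbles
#	ushr	v0.16b, v0.16b, #4		// high nibbles
#	tbl	v1.16b, {Tlo}, v1.16b		// 16-entry lookup, low half
#	tbl	v0.16b, {Thi}, v0.16b		// 16-entry lookup, high half
#	eor	v0.16b, v0.16b, v1.16b		// combine
#
# which is the AArch64 counterpart of the SSSE3 pshufb construction.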
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

// Key schedule constants
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE

.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
.asciz	"Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
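// Editor's note: the `.align 7` above puts _vpaes_consts on a 128-byte
// (2^7) boundary, presumably so the hot tables span as few cache lines
// as possible (the "totally strategic alignment").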
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

##
##  Fills register %r10 -> .aes_consts (so you can -fPIC)
##  and %xmm9-%xmm15 as specified below.
##
.type	_vpaes_encrypt_preheat,%function
_vpaes_encrypt_preheat:
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
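##
##  Editor's note: the preheat routines load every table the cores need
##  into high vector registers (v17-v31) once up front, so the per-round
##  loops below touch memory only to fetch round keys.
##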
##
##  AES-encrypt %xmm0.
##
##  %xmm9-%xmm15 as in _vpaes_preheat
##  (%rdx) = scheduled keys
##
##  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
##  Preserves %xmm6 - %xmm8 so you get some local vectors
##
.type	_vpaes_encrypt_core,%function
_vpaes_encrypt_core:
	ldr w8, [$key,#240]			// pull rounds
	adr x11, .Lk_mc_forward+16
						// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1 {v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5 # round0 key
	and v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr v0.16b, v7.16b, #4			// vpsrlb \$4, %xmm0, %xmm0
	tbl v1.16b, {$iptlo}, v1.16b		// vpshufb %xmm1, %xmm2, %xmm1
						// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl v2.16b, {$ipthi}, v0.16b		// vpshufb %xmm0, %xmm3, %xmm2
	eor v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0

	// middle of middle round
	tbl v4.16b, {$sb1t}, v2.16b		// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	ld1 {v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl v0.16b, {$sb1u}, v3.16b		// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	tbl v5.16b, {$sb2t}, v2.16b		// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl v2.16b, {$sb2u}, v3.16b		// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	ld1 {v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl v3.16b, {v0.16b}, v1.16b		// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	eor v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	tbl v0.16b, {v0.16b}, v4.16b		// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	tbl v4.16b, {v3.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	eor v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	bic x11, x11, #1<<6			// and \$0x30, %r11 # ... mod 4
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	sub w8, w8, #1				// nr--

	and v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl v5.16b, {$invhi}, v1.16b		// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	eor v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl v3.16b, {$invlo}, v0.16b		// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl v4.16b, {$invlo}, v1.16b		// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl v2.16b, {$invlo}, v3.16b		// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl v3.16b, {$invlo}, v4.16b		// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm5

	// middle of last round
						// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
						// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	ld1 {v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl v0.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl v0.16b, {v0.16b}, v1.16b		// vpshufb %xmm1, %xmm0, %xmm0
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
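##
##  Editor's sketch of how the core above maps onto AES: the .Lk_inv
##  lookups in the per-round "entry" phase compute the S-box as a
##  GF(2^4) inversion (the io/jo values), the sb1/sb2 lookups apply the
##  output affine transform, and the shuffles through .Lk_mc_forward/
##  .Lk_mc_backward accumulate 2A+3B+C+D, i.e. MixColumns; ShiftRows is
##  folded into the final .Lk_sr permutation.
##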
.type	vpaes_encrypt,%function
vpaes_encrypt:
	stp x29,x30,[sp,#-16]!

	bl _vpaes_encrypt_preheat
	bl _vpaes_encrypt_core
.size	vpaes_encrypt,.-vpaes_encrypt
.type	_vpaes_encrypt_2x,%function
_vpaes_encrypt_2x:
	ldr w8, [$key,#240]			// pull rounds
	adr x11, .Lk_mc_forward+16
						// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1 {v16.2d}, [x9], #16		// vmovdqu (%r9), %xmm5 # round0 key
	and v1.16b, v14.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr v0.16b, v14.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	and v9.16b, v15.16b, v17.16b
	ushr v8.16b, v15.16b, #4
	tbl v1.16b, {$iptlo}, v1.16b		// vpshufb %xmm1, %xmm2, %xmm1
	tbl v9.16b, {$iptlo}, v9.16b
						// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl v2.16b, {$ipthi}, v0.16b		// vpshufb %xmm0, %xmm3, %xmm2
	tbl v10.16b, {$ipthi}, v8.16b
	eor v0.16b, v1.16b, v16.16b		// vpxor %xmm5, %xmm1, %xmm0
	eor v8.16b, v9.16b, v16.16b
	eor v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor v8.16b, v8.16b, v10.16b

.Lenc_2x_loop:
	// middle of middle round
	tbl v4.16b, {$sb1t}, v2.16b		// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	tbl v12.16b, {$sb1t}, v10.16b
	ld1 {v1.2d}, [x11], #16		// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl v0.16b, {$sb1u}, v3.16b		// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	tbl v8.16b, {$sb1u}, v11.16b
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor v12.16b, v12.16b, v16.16b
	tbl v5.16b, {$sb2t}, v2.16b		// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	tbl v13.16b, {$sb2t}, v10.16b
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor v8.16b, v8.16b, v12.16b
	tbl v2.16b, {$sb2u}, v3.16b		// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	tbl v10.16b, {$sb2u}, v11.16b
	ld1 {v4.2d}, [x10]			// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl v3.16b, {v0.16b}, v1.16b		// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	tbl v11.16b, {v8.16b}, v1.16b
	eor v2.16b, v2.16b, v5.16b		// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	eor v10.16b, v10.16b, v13.16b
	tbl v0.16b, {v0.16b}, v4.16b		// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	tbl v8.16b, {v8.16b}, v4.16b
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	eor v11.16b, v11.16b, v10.16b
	tbl v4.16b, {v3.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	tbl v12.16b, {v11.16b},v1.16b
	eor v0.16b, v0.16b, v3.16b		// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	eor v8.16b, v8.16b, v11.16b
	bic x11, x11, #1<<6			// and \$0x30, %r11 # ... mod 4
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	eor v8.16b, v8.16b, v12.16b
	sub w8, w8, #1				// nr--

	and v1.16b, v0.16b, v17.16b		// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and v9.16b, v8.16b, v17.16b
	ushr v8.16b, v8.16b, #4
	tbl v5.16b, {$invhi},v1.16b		// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	tbl v13.16b, {$invhi},v9.16b
	eor v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor v9.16b, v9.16b, v8.16b
	tbl v3.16b, {$invlo},v0.16b		// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl v11.16b, {$invlo},v8.16b
	tbl v4.16b, {$invlo},v1.16b		// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl v12.16b, {$invlo},v9.16b
	eor v3.16b, v3.16b, v5.16b		// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor v11.16b, v11.16b, v13.16b
	eor v4.16b, v4.16b, v5.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor v12.16b, v12.16b, v13.16b
	tbl v2.16b, {$invlo},v3.16b		// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl v10.16b, {$invlo},v11.16b
	tbl v3.16b, {$invlo},v4.16b		// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl v11.16b, {$invlo},v12.16b
	eor v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor v10.16b, v10.16b, v9.16b
	eor v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor v11.16b, v11.16b, v8.16b
	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm5
	cbnz w8, .Lenc_2x_loop

	// middle of last round
						// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
						// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl v12.16b, {$sbou}, v10.16b
	ld1 {v1.2d}, [x10]			// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl v0.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	tbl v8.16b, {$sbot}, v11.16b
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor v12.16b, v12.16b, v16.16b
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor v8.16b, v8.16b, v12.16b
	tbl v0.16b, {v0.16b},v1.16b		// vpshufb %xmm1, %xmm0, %xmm0
	tbl v1.16b, {v8.16b},v1.16b
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
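##
##  Editor's note: _vpaes_encrypt_2x runs the same dataflow as
##  _vpaes_encrypt_core twice over, carrying a second block (v15 in,
##  v1 out) through v8-v13 alongside the first (v14 in, v0 out);
##  interleaving two independent tbl/eor chains helps hide instruction
##  latency on in-order cores such as Cortex-A53.
##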
.type	_vpaes_decrypt_preheat,%function
_vpaes_decrypt_preheat:
	ld1 {v18.2d-v19.2d}, [x10],#32		// .Lk_inv
	ld1 {v20.2d-v23.2d}, [x11],#64		// .Lk_dipt, .Lk_dsbo
	ld1 {v24.2d-v27.2d}, [x11],#64		// .Lk_dsb9, .Lk_dsbd
	ld1 {v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
##
##  Same API as encryption core.
##
.type	_vpaes_decrypt_core,%function
_vpaes_decrypt_core:
	ldr w8, [$key,#240]			// pull rounds

						// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl x11, x8, #4				// mov %rax, %r11; shl \$4, %r11
	eor x11, x11, #0x30			// xor \$0x30, %r11
	and x11, x11, #0x30			// and \$0x30, %r11
	adr x10, .Lk_mc_forward+48

	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm4 # round0 key
	and v1.16b, v7.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr v0.16b, v7.16b, #4			// vpsrlb \$4, %xmm0, %xmm0
	tbl v2.16b, {$iptlo}, v1.16b		// vpshufb %xmm1, %xmm2, %xmm2
	ld1 {v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl v0.16b, {$ipthi}, v0.16b		// vpshufb %xmm0, %xmm1, %xmm0
	eor v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0

	// Inverse mix columns
						// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
						// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl v4.16b, {$sb9u}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl v1.16b, {$sb9t}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	eor v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
						// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
						// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl v4.16b, {$sbdu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl v0.16b, {v0.16b}, v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v1.16b, {$sbdt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
						// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
						// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl v4.16b, {$sbbu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl v0.16b, {v0.16b}, v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v1.16b, {$sbbt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
						// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
						// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl v4.16b, {$sbeu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl v0.16b, {v0.16b}, v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v1.16b, {$sbet}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	ext v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	sub w8, w8, #1				// sub \$1,%rax # nr--

	and v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl v2.16b, {$invhi}, v1.16b		// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl v3.16b, {$invlo}, v0.16b		// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl v4.16b, {$invlo}, v1.16b		// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl v2.16b, {$invlo}, v3.16b		// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl v3.16b, {$invlo}, v4.16b		// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm0

	// middle of last round
						// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
						// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	ld1 {v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	tbl v1.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	tbl v0.16b, {v0.16b}, v2.16b		// vpshufb %xmm2, %xmm0, %xmm0
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
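##
##  Editor's note: the decrypt loop above builds InvMixColumns
##  incrementally: each sb9/sbd/sbb/sbe table pair folds one coefficient
##  of the 9,D,B,E circulant into the running "ch" value, which is
##  re-shuffled between steps through v5 (the ext #12 rotates that
##  shuffle for the next round).
##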
.type	vpaes_decrypt,%function
vpaes_decrypt:
	stp x29,x30,[sp,#-16]!

	bl _vpaes_decrypt_preheat
	bl _vpaes_decrypt_core
.size	vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
_vpaes_decrypt_2x:
	ldr w8, [$key,#240]			// pull rounds

						// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl x11, x8, #4				// mov %rax, %r11; shl \$4, %r11
	eor x11, x11, #0x30			// xor \$0x30, %r11
	and x11, x11, #0x30			// and \$0x30, %r11
	adr x10, .Lk_mc_forward+48

	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm4 # round0 key
	and v1.16b, v14.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr v0.16b, v14.16b, #4		// vpsrlb \$4, %xmm0, %xmm0
	and v9.16b, v15.16b, v17.16b
	ushr v8.16b, v15.16b, #4
	tbl v2.16b, {$iptlo},v1.16b		// vpshufb %xmm1, %xmm2, %xmm2
	tbl v10.16b, {$iptlo},v9.16b
	ld1 {v5.2d}, [x10]			// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
						// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl v0.16b, {$ipthi},v0.16b		// vpshufb %xmm0, %xmm1, %xmm0
	tbl v8.16b, {$ipthi},v8.16b
	eor v2.16b, v2.16b, v16.16b		// vpxor %xmm4, %xmm2, %xmm2
	eor v10.16b, v10.16b, v16.16b
	eor v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
	eor v8.16b, v8.16b, v10.16b

.Ldec_2x_loop:
	// Inverse mix columns
						// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
						// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl v4.16b, {$sb9u}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl v12.16b, {$sb9u}, v10.16b
	tbl v1.16b, {$sb9t}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	tbl v9.16b, {$sb9t}, v11.16b
	eor v0.16b, v4.16b, v16.16b		// vpxor %xmm4, %xmm0, %xmm0
	eor v8.16b, v12.16b, v16.16b
						// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor v8.16b, v8.16b, v9.16b
						// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl v4.16b, {$sbdu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl v12.16b, {$sbdu}, v10.16b
	tbl v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v8.16b, {v8.16b},v5.16b
	tbl v1.16b, {$sbdt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	tbl v9.16b, {$sbdt}, v11.16b
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor v8.16b, v8.16b, v12.16b
						// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor v8.16b, v8.16b, v9.16b
						// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl v4.16b, {$sbbu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl v12.16b, {$sbbu}, v10.16b
	tbl v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v8.16b, {v8.16b},v5.16b
	tbl v1.16b, {$sbbt}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	tbl v9.16b, {$sbbt}, v11.16b
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor v8.16b, v8.16b, v12.16b
						// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor v8.16b, v8.16b, v9.16b
						// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl v4.16b, {$sbeu}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl v12.16b, {$sbeu}, v10.16b
	tbl v0.16b, {v0.16b},v5.16b		// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl v8.16b, {v8.16b},v5.16b
	tbl v1.16b, {$sbet}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	tbl v9.16b, {$sbet}, v11.16b
	eor v0.16b, v0.16b, v4.16b		// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor v8.16b, v8.16b, v12.16b
	ext v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor v0.16b, v0.16b, v1.16b		// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor v8.16b, v8.16b, v9.16b
	sub w8, w8, #1				// sub \$1,%rax # nr--

	and v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and v9.16b, v8.16b, v17.16b
	ushr v8.16b, v8.16b, #4
	tbl v2.16b, {$invhi},v1.16b		// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	tbl v10.16b, {$invhi},v9.16b
	eor v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor v9.16b, v9.16b, v8.16b
	tbl v3.16b, {$invlo},v0.16b		// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl v11.16b, {$invlo},v8.16b
	tbl v4.16b, {$invlo},v1.16b		// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl v12.16b, {$invlo},v9.16b
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor v11.16b, v11.16b, v10.16b
	eor v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor v12.16b, v12.16b, v10.16b
	tbl v2.16b, {$invlo},v3.16b		// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl v10.16b, {$invlo},v11.16b
	tbl v3.16b, {$invlo},v4.16b		// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl v11.16b, {$invlo},v12.16b
	eor v2.16b, v2.16b, v1.16b		// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor v10.16b, v10.16b, v9.16b
	eor v3.16b, v3.16b, v0.16b		// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor v11.16b, v11.16b, v8.16b
	ld1 {v16.2d}, [x9],#16			// vmovdqu (%r9), %xmm0
	cbnz w8, .Ldec_2x_loop

	// middle of last round
						// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl v4.16b, {$sbou}, v2.16b		// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl v12.16b, {$sbou}, v10.16b
						// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	tbl v1.16b, {$sbot}, v3.16b		// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	tbl v9.16b, {$sbot}, v11.16b
	ld1 {v2.2d}, [x11]			// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	eor v4.16b, v4.16b, v16.16b		// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor v12.16b, v12.16b, v16.16b
	eor v0.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	eor v8.16b, v9.16b, v12.16b
	tbl v0.16b, {v0.16b},v2.16b		// vpshufb %xmm2, %xmm0, %xmm0
	tbl v1.16b, {v8.16b},v2.16b
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

########################################################
##                                                    ##
##                  AES key schedule                  ##
##                                                    ##
########################################################
.type	_vpaes_key_preheat,%function
_vpaes_key_preheat:
	movi v16.16b, #0x5b			// .Lk_s63
	movi v17.16b, #0x0f			// .Lk_s0F
	ld1 {v18.2d-v21.2d}, [x10]		// .Lk_inv, .Lk_ipt
	ld1 {v22.2d-v23.2d}, [x11]		// .Lk_sb1
	adr x11, .Lk_mc_forward
	ld1 {v24.2d-v27.2d}, [x10],#64		// .Lk_dksd, .Lk_dksb
	ld1 {v28.2d-v31.2d}, [x10],#64		// .Lk_dkse, .Lk_dks9
	ld1 {v8.2d}, [x10]			// .Lk_rcon
	ld1 {v9.2d}, [x11]			// .Lk_mc_forward[0]
.size	_vpaes_key_preheat,.-_vpaes_key_preheat
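##
##  Editor's note: the key schedule likewise keeps all of its tables in
##  registers (v8/v9 and v16-v31), so key setup is tbl/eor only and is
##  expected to share the constant-time property of the cipher cores.
##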
.type	_vpaes_schedule_core,%function
_vpaes_schedule_core:
	stp x29, x30, [sp,#-16]!

	bl _vpaes_key_preheat			// load the tables

	ld1 {v0.16b}, [$inp],#16		// vmovdqu (%rdi), %xmm0 # load key (unaligned)

	mov v3.16b, v0.16b			// vmovdqa %xmm0, %xmm3
	bl _vpaes_schedule_transform
	mov v7.16b, v0.16b			// vmovdqa %xmm0, %xmm7

	adr x10, .Lk_sr				// lea .Lk_sr(%rip),%r10
	cbnz $dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1 {v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx)

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1 {v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	tbl v3.16b, {v3.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm3
	st1 {v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
	eor x8, x8, #0x30			// xor \$0x30, %r8

	cmp $bits, #192				// cmp \$192, %esi

##
##  128-bit specific part of key schedule.
##
##  This schedule is really simple, because all its parts
##  are accomplished by the subroutines.
##
	mov $inp, #10				// mov \$10, %esi

	sub $inp, $inp, #1			// dec %esi
	bl _vpaes_schedule_round
	cbz $inp, .Lschedule_mangle_last
	bl _vpaes_schedule_mangle		// write output

##
##  192-bit specific part of key schedule.
##
##  The main body of this schedule is the same as the 128-bit
##  schedule, but with more smearing. The long, high side is
##  stored in %xmm7 as before, and the short, low side is in
##  the high bits of %xmm6.
##
##  This schedule is somewhat nastier, however, because each
##  round produces 192 bits of key material, or 1.5 round keys.
##  Therefore, on each cycle we do 2 rounds and produce 3 round
##  keys.
##
	ld1 {v0.16b}, [$inp]			// vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl _vpaes_schedule_transform		// input transform
	mov v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6 # save short part
	eor v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4 # clear 4
	ins v6.d[0], v4.d[0]			// vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov $inp, #4				// mov \$4, %esi

	sub $inp, $inp, #1			// dec %esi
	bl _vpaes_schedule_round
	ext v0.16b, v6.16b, v0.16b, #8		// vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl _vpaes_schedule_mangle		// save key n
	bl _vpaes_schedule_192_smear
	bl _vpaes_schedule_mangle		// save key n+1
	bl _vpaes_schedule_round
	cbz $inp, .Lschedule_mangle_last
	bl _vpaes_schedule_mangle		// save key n+2
	bl _vpaes_schedule_192_smear

##
##  256-bit specific part of key schedule.
##
##  The structure here is very similar to the 128-bit
##  schedule, but with an additional "low side" in
##  %xmm6. The low side's rounds are the same as the
##  high side's, except no rcon and no rotation.
##
	ld1 {v0.16b}, [$inp]			// vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl _vpaes_schedule_transform		// input transform
	mov $inp, #7				// mov \$7, %esi

	sub $inp, $inp, #1			// dec %esi
	bl _vpaes_schedule_mangle		// output low result
	mov v6.16b, v0.16b			// vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	bl _vpaes_schedule_round
	cbz $inp, .Lschedule_mangle_last
	bl _vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
	mov v5.16b, v7.16b			// vmovdqa %xmm7, %xmm5
	mov v7.16b, v6.16b			// vmovdqa %xmm6, %xmm7
	bl _vpaes_schedule_low_round
	mov v7.16b, v5.16b			// vmovdqa %xmm5, %xmm7

##
##  .aes_schedule_mangle_last
##
##  Mangler for last round of key schedule
##
##  when encrypting, outputs out(%xmm0) ^ 63
##  when decrypting, outputs unskew(%xmm0)
##
##  Always called right before return... jumps to cleanup and exits
##
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr x11, .Lk_deskew			// lea .Lk_deskew(%rip),%r11 # prepare to deskew
	cbnz $dir, .Lschedule_mangle_last_dec

	ld1 {v1.2d}, [x8]			// vmovdqa (%r8,%r10),%xmm1
	adr x11, .Lk_opt			// lea .Lk_opt(%rip), %r11 # prepare to output transform
	add $out, $out, #32			// add \$32, %rdx
	tbl v0.16b, {v0.16b}, v1.16b		// vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
	ld1 {v20.2d-v21.2d}, [x11]		// reload constants
	sub $out, $out, #16			// add \$-16, %rdx
	eor v0.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl _vpaes_schedule_transform		// output transform
	st1 {v0.2d}, [$out]			// vmovdqu %xmm0, (%rdx) # save last key

	eor v0.16b, v0.16b, v0.16b		// vpxor %xmm0, %xmm0, %xmm0
	eor v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor v2.16b, v2.16b, v2.16b		// vpxor %xmm2, %xmm2, %xmm2
	eor v3.16b, v3.16b, v3.16b		// vpxor %xmm3, %xmm3, %xmm3
	eor v4.16b, v4.16b, v4.16b		// vpxor %xmm4, %xmm4, %xmm4
	eor v5.16b, v5.16b, v5.16b		// vpxor %xmm5, %xmm5, %xmm5
	eor v6.16b, v6.16b, v6.16b		// vpxor %xmm6, %xmm6, %xmm6
	eor v7.16b, v7.16b, v7.16b		// vpxor %xmm7, %xmm7, %xmm7
	ldp x29, x30, [sp],#16
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
##
##  .aes_schedule_192_smear
##
##  Smear the short, low side in the 192-bit key schedule.
##
##  Inputs:
##    %xmm7: high side, b a x y
##    %xmm6: low side, d c 0 0
##
##  Outputs:
##    %xmm6: b+c+d b+c 0 0
##    %xmm0: b+c+d b+c b a
##
.type	_vpaes_schedule_192_smear,%function
_vpaes_schedule_192_smear:
	ins v1.s[3], v6.s[2]			// vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	ins v0.s[0], v7.s[2]			// vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	eor v6.16b, v6.16b, v1.16b		// vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	eor v1.16b, v1.16b, v1.16b		// vpxor %xmm1, %xmm1, %xmm1
	eor v6.16b, v6.16b, v0.16b		// vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	mov v0.16b, v6.16b			// vmovdqa %xmm6, %xmm0
	ins v6.d[0], v1.d[0]			// vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
##
##  .aes_schedule_round
##
##  Runs one main round of the key schedule on %xmm0, %xmm7
##
##  Specifically, runs subbytes on the high dword of %xmm0
##  then rotates it by one byte and xors into the low dword of
##  %xmm7.
##
##  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
##  the next rcon.
##
##  Smears the dwords of %xmm7 by xoring the low into the
##  second low, result into third, result into highest.
##
##  Returns results in %xmm7 = %xmm0.
##  Clobbers %xmm1-%xmm4, %r11.
##
.type	_vpaes_schedule_round,%function
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi v4.16b, #0				// vpxor %xmm4, %xmm4, %xmm4
	ext v1.16b, $rcon, v4.16b, #15		// vpalignr \$15, %xmm8, %xmm4, %xmm1
	ext $rcon, $rcon, $rcon, #15		// vpalignr \$15, %xmm8, %xmm8, %xmm8
	eor v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7

	dup v0.4s, v0.s[3]			// vpshufd \$0xFF, %xmm0, %xmm0
	ext v0.16b, v0.16b, v0.16b, #1		// vpalignr \$1, %xmm0, %xmm0, %xmm0

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	ext v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
	eor v7.16b, v7.16b, v1.16b		// vpxor %xmm1, %xmm7, %xmm7
	ext v4.16b, v4.16b, v7.16b, #8		// vpslldq \$8, %xmm7, %xmm4

	and v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	eor v7.16b, v7.16b, v4.16b		// vpxor %xmm4, %xmm7, %xmm7
	tbl v2.16b, {$invhi}, v1.16b		// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor v1.16b, v1.16b, v0.16b		// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl v3.16b, {$invlo}, v0.16b		// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	tbl v4.16b, {$invlo}, v1.16b		// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor v7.16b, v7.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl v3.16b, {$invlo}, v3.16b		// vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	eor v4.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl v2.16b, {$invlo}, v4.16b		// vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	eor v3.16b, v3.16b, v1.16b		// vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	eor v2.16b, v2.16b, v0.16b		// vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	tbl v4.16b, {v23.16b}, v3.16b		// vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	tbl v1.16b, {v22.16b}, v2.16b		// vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	eor v1.16b, v1.16b, v4.16b		// vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	// add in smeared stuff
	eor v0.16b, v1.16b, v7.16b		// vpxor %xmm7, %xmm1, %xmm0
	eor v7.16b, v1.16b, v7.16b		// vmovdqa %xmm0, %xmm7
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
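##
##  Editor's sketch: this is the standard AES key-schedule round in
##  disguise. The dup/ext pair on v0 is RotWord, the .Lk_inv and sb1
##  lookups perform SubWord via the same GF(2^4) inversion trick as the
##  cipher, the byte-rotated v8 ($rcon) supplies the round constant,
##  and the two ext/eor steps "smear" the previous key across the other
##  words of %xmm7.
##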
##
##  .aes_schedule_transform
##
##  Linear-transform %xmm0 according to tables at (%r11)
##
##  Requires that %xmm9 = 0x0F0F... as in preheat
##  Clobbers %xmm1, %xmm2
##
.type	_vpaes_schedule_transform,%function
_vpaes_schedule_transform:
	and v1.16b, v0.16b, v17.16b		// vpand %xmm9, %xmm0, %xmm1
	ushr v0.16b, v0.16b, #4			// vpsrlb \$4, %xmm0, %xmm0
						// vmovdqa (%r11), %xmm2 # lo
	tbl v2.16b, {$iptlo}, v1.16b		// vpshufb %xmm1, %xmm2, %xmm2
						// vmovdqa 16(%r11), %xmm1 # hi
	tbl v0.16b, {$ipthi}, v0.16b		// vpshufb %xmm0, %xmm1, %xmm0
	eor v0.16b, v0.16b, v2.16b		// vpxor %xmm2, %xmm0, %xmm0
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
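##
##  Editor's model of the transform above (a sketch, not part of the
##  build): for each byte x of %xmm0 it computes
##
##	out = hi_table[x >> 4] ^ lo_table[x & 0x0f]
##
##  with lo_table/hi_table being the 16-byte tables at (%r11) and
##  16(%r11); the and/ushr/tbl/tbl/eor sequence evaluates this for all
##  16 bytes at once.
##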
##
##  .aes_schedule_mangle
##
##  Mangle xmm0 from (basis-transformed) standard version
##  to our version.
##
##  On encrypt,
##    multiply by circulant 0,1,1,1
##    apply shiftrows transform
##
##  On decrypt,
##    multiply by "inverse mixcolumns" circulant E,B,D,9
##    undo shiftrows transform
##
##  Writes out to (%rdx), and increments or decrements it
##  Keeps track of round number mod 4 in %r8
##  Clobbers xmm1-xmm5
##
.type	_vpaes_schedule_mangle,%function
_vpaes_schedule_mangle:
	mov v4.16b, v0.16b			// vmovdqa %xmm0, %xmm4 # save xmm0 for later
						// vmovdqa .Lk_mc_forward(%rip),%xmm5
	cbnz $dir, .Lschedule_mangle_dec

	eor v4.16b, v0.16b, v16.16b		// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add $out, $out, #16			// add \$16, %rdx
	tbl v4.16b, {v4.16b}, v9.16b		// vpshufb %xmm5, %xmm4, %xmm4
	tbl v1.16b, {v4.16b}, v9.16b		// vpshufb %xmm5, %xmm4, %xmm1
	tbl v3.16b, {v1.16b}, v9.16b		// vpshufb %xmm5, %xmm1, %xmm3
	eor v4.16b, v4.16b, v1.16b		// vpxor %xmm1, %xmm4, %xmm4
	ld1 {v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor v3.16b, v3.16b, v4.16b		// vpxor %xmm4, %xmm3, %xmm3

	b .Lschedule_mangle_both

.Lschedule_mangle_dec:
	// inverse mix columns
						// lea .Lk_dksd(%rip),%r11
	ushr v1.16b, v4.16b, #4			// vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	and v4.16b, v4.16b, v17.16b		// vpand %xmm9, %xmm4, %xmm4 # 4 = lo

						// vmovdqa 0x00(%r11), %xmm2
	tbl v2.16b, {v24.16b}, v4.16b		// vpshufb %xmm4, %xmm2, %xmm2
						// vmovdqa 0x10(%r11), %xmm3
	tbl v3.16b, {v25.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm3
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl v3.16b, {v3.16b}, v9.16b		// vpshufb %xmm5, %xmm3, %xmm3

						// vmovdqa 0x20(%r11), %xmm2
	tbl v2.16b, {v26.16b}, v4.16b		// vpshufb %xmm4, %xmm2, %xmm2
	eor v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
						// vmovdqa 0x30(%r11), %xmm3
	tbl v3.16b, {v27.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm3
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3
	tbl v3.16b, {v3.16b}, v9.16b		// vpshufb %xmm5, %xmm3, %xmm3

						// vmovdqa 0x40(%r11), %xmm2
	tbl v2.16b, {v28.16b}, v4.16b		// vpshufb %xmm4, %xmm2, %xmm2
	eor v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
						// vmovdqa 0x50(%r11), %xmm3
	tbl v3.16b, {v29.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm3
	eor v3.16b, v3.16b, v2.16b		// vpxor %xmm2, %xmm3, %xmm3

						// vmovdqa 0x60(%r11), %xmm2
	tbl v2.16b, {v30.16b}, v4.16b		// vpshufb %xmm4, %xmm2, %xmm2
	tbl v3.16b, {v3.16b}, v9.16b		// vpshufb %xmm5, %xmm3, %xmm3
						// vmovdqa 0x70(%r11), %xmm4
	tbl v4.16b, {v31.16b}, v1.16b		// vpshufb %xmm1, %xmm4, %xmm4
	ld1 {v1.2d}, [x8]			// vmovdqa (%r8,%r10), %xmm1
	eor v2.16b, v2.16b, v3.16b		// vpxor %xmm3, %xmm2, %xmm2
	eor v3.16b, v4.16b, v2.16b		// vpxor %xmm2, %xmm4, %xmm3

	sub $out, $out, #16			// add \$-16, %rdx

.Lschedule_mangle_both:
	tbl v3.16b, {v3.16b}, v1.16b		// vpshufb %xmm1, %xmm3, %xmm3
	add x8, x8, #64-16			// add \$-16, %r8
	bic x8, x8, #1<<6			// and \$0x30, %r8
	st1 {v3.2d}, [$out]			// vmovdqu %xmm3, (%rdx)
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
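##
##  Editor's note: x8 indexes the four rows of .Lk_sr to track the
##  round number mod 4; the add #64-16 / bic #1<<6 pair above is a
##  wrapping decrement by 16 inside the 64-byte table, matching the
##  x86 "add \$-16; and \$0x30" in the comments.
##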
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
vpaes_set_encrypt_key:
	stp x29,x30,[sp,#-16]!
	stp d8,d9,[sp,#-16]!			// ABI spec says so

	lsr w9, $bits, #5			// shr \$5,%eax
	add w9, w9, #5				// add \$5,%eax
	str w9, [$out,#240]			// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	mov $dir, #0				// mov \$0,%ecx
	mov x8, #0x30				// mov \$0x30,%r8d
	bl _vpaes_schedule_core

	ldp x29,x30,[sp],#16
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
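##
##  Editor's note: the value stored in AES_KEY->rounds above is
##  nbits/32+5, i.e. 9, 11 or 13 for 128-, 192- or 256-bit keys; that
##  is the number of middle rounds the cores iterate, with the final
##  round handled separately after their loops.
##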
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
vpaes_set_decrypt_key:
	stp x29,x30,[sp,#-16]!
	stp d8,d9,[sp,#-16]!			// ABI spec says so

	lsr w9, $bits, #5			// shr \$5,%eax
	add w9, w9, #5				// add \$5,%eax
	str w9, [$out,#240]			// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	lsl w9, w9, #4				// shl \$4,%eax
	add $out, $out, #16			// lea 16(%rdx,%rax),%rdx

	mov $dir, #1				// mov \$1,%ecx
	lsr w8, $bits, #1			// shr \$1,%r8d
	and x8, x8, #32				// and \$32,%r8d
	eor x8, x8, #32				// xor \$32,%r8d # nbits==192?0:32
	bl _vpaes_schedule_core

	ldp x29,x30,[sp],#16
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
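##
##  Editor's note: for decryption the schedule is written back to
##  front: $out starts just past the end of the key buffer and
##  _vpaes_schedule_mangle decrements it, so the decrypt cores can then
##  read round keys in natural forward order.
##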
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
vpaes_cbc_encrypt:
	cbz $len, .Lcbc_abort
	cmp w5, #0				// check direction
	b.eq vpaes_cbc_decrypt

	stp x29,x30,[sp,#-16]!

	mov x17, $len				// reassign
	mov x2, $key				// reassign

	ld1 {v0.16b}, [$ivec]			// load ivec
	bl _vpaes_encrypt_preheat

	ld1 {v7.16b}, [$inp],#16		// load input
	eor v7.16b, v7.16b, v0.16b		// xor with ivec
	bl _vpaes_encrypt_core
	st1 {v0.16b}, [$out],#16		// save output

	st1 {v0.16b}, [$ivec]			// write ivec

	ldp x29,x30,[sp],#16
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
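##
##  Editor's note: CBC encryption is inherently serial (each block must
##  be xored with the previous ciphertext before it enters the core),
##  hence the 1x core above; CBC decryption below is parallelizable and
##  feeds pairs of blocks to the 2x core.
##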
.type	vpaes_cbc_decrypt,%function
vpaes_cbc_decrypt:
	stp x29,x30,[sp,#-16]!

	stp d8,d9,[sp,#-16]!			// ABI spec says so
	stp d10,d11,[sp,#-16]!
	stp d12,d13,[sp,#-16]!
	stp d14,d15,[sp,#-16]!

	mov x17, $len				// reassign
	mov x2, $key				// reassign
	ld1 {v6.16b}, [$ivec]			// load ivec
	bl _vpaes_decrypt_preheat
	b.eq .Lcbc_dec_loop2x

	ld1 {v7.16b}, [$inp], #16		// load input
	bl _vpaes_decrypt_core
	eor v0.16b, v0.16b, v6.16b		// xor with ivec
	orr v6.16b, v7.16b, v7.16b		// next ivec value
	st1 {v0.16b}, [$out], #16

.Lcbc_dec_loop2x:
	ld1 {v14.16b,v15.16b}, [$inp], #32
	bl _vpaes_decrypt_2x
	eor v0.16b, v0.16b, v6.16b		// xor with ivec
	eor v1.16b, v1.16b, v14.16b
	orr v6.16b, v15.16b, v15.16b
	st1 {v0.16b,v1.16b}, [$out], #32
	b.hi .Lcbc_dec_loop2x

	st1 {v6.16b}, [$ivec]

	ldp d14,d15,[sp],#16
	ldp d12,d13,[sp],#16
	ldp d10,d11,[sp],#16

	ldp x29,x30,[sp],#16
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
vpaes_ecb_encrypt:
	stp x29,x30,[sp,#-16]!

	stp d8,d9,[sp,#-16]!			// ABI spec says so
	stp d10,d11,[sp,#-16]!
	stp d12,d13,[sp,#-16]!
	stp d14,d15,[sp,#-16]!

	bl _vpaes_encrypt_preheat

	ld1 {v7.16b}, [$inp],#16
	bl _vpaes_encrypt_core
	st1 {v0.16b}, [$out],#16

	ld1 {v14.16b,v15.16b}, [$inp], #32
	bl _vpaes_encrypt_2x
	st1 {v0.16b,v1.16b}, [$out], #32

	ldp d14,d15,[sp],#16
	ldp d12,d13,[sp],#16
	ldp d10,d11,[sp],#16

	ldp x29,x30,[sp],#16
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
vpaes_ecb_decrypt:
	stp x29,x30,[sp,#-16]!

	stp d8,d9,[sp,#-16]!			// ABI spec says so
	stp d10,d11,[sp,#-16]!
	stp d12,d13,[sp,#-16]!
	stp d14,d15,[sp,#-16]!

	bl _vpaes_decrypt_preheat

	ld1 {v7.16b}, [$inp],#16
	bl _vpaes_decrypt_core
	st1 {v0.16b}, [$out],#16

	ld1 {v14.16b,v15.16b}, [$inp], #32
	bl _vpaes_decrypt_2x
	st1 {v0.16b,v1.16b}, [$out], #32

	ldp d14,d15,[sp],#16
	ldp d12,d13,[sp],#16
	ldp d10,d11,[sp],#16

	ldp x29,x30,[sp],#16
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
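##
##  Editor's sketch of the C-level entry points implied by the register
##  maps above (hypothetical prototypes following the usual OpenSSL
##  AES_KEY conventions; not part of this file):
##
##	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
##	                           AES_KEY *key);
##	int  vpaes_set_decrypt_key(const unsigned char *userKey, int bits,
##	                           AES_KEY *key);
##	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
##	                   const AES_KEY *key);
##	void vpaes_decrypt(const unsigned char *in, unsigned char *out,
##	                   const AES_KEY *key);
##	void vpaes_cbc_encrypt(const unsigned char *in, unsigned char *out,
##	                       size_t length, const AES_KEY *key,
##	                       unsigned char *ivec, int enc);
##
##  vpaes_ecb_encrypt/vpaes_ecb_decrypt appear to take (in, out, length,
##  key) analogously.
##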