# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
##
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The effort was undertaken because at least one popular SoC is based
# on Cortex-A53 cores that lack the crypto extensions.
#
# Results below are in cycles per byte processed:
#
#                 CBC enc      ECB enc/dec(*)   [bit-sliced enc/dec]
# Cortex-A53      21.5         18.1/20.6        [17.5/19.8         ]
# Cortex-A57      36.0(**)     20.4/24.9(**)    [14.4/16.6         ]
# X-Gene          45.9(**)     45.8/57.7(**)    [33.1/37.6(**)     ]
# Denver(***)     16.6(**)     15.1/17.8(**)    [8.80/9.93         ]
# Apple A7(***)   22.7(**)     10.9/14.3        [8.45/10.0         ]
# Mongoose(***)   26.3(**)     21.0/25.0(**)    [13.3/16.8         ]
# ThunderX2(***)  39.4(**)     33.8/48.6(**)
#
# (*)   ECB denotes approximate result for parallelizable modes
#       such as CBC decrypt, CTR, etc.;
# (**)  these results are worse than scalar compiler-generated
#       code, but they are constant-time and therefore preferred;
# (***) presented for reference/comparison purposes;
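#
# A quick illustration of why constant time matters here: a
# straightforward C AES indexes a 256-byte S-box with secret data, so
# its memory access pattern leaks through caches and timing, e.g.
# (sketch, with sbox[] the usual AES S-box):
#
#	uint8_t sub_byte(uint8_t x) { return sbox[x]; }	/* not constant-time */
#
# vpaes avoids this by splitting every byte into 4-bit nibbles and
# resolving them with 16-byte tbl (vector table lookup) instructions,
# which read all 16 register bytes regardless of the secret index, so
# the access pattern is data-independent.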

$flavour = shift;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
.text

.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:		// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508

//
//  "Hot" constants
//
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD

//
//  Decryption stuff
//
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32

//
//  Key schedule constants
//
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81

.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz	"Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
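
//
//  Sanity-checking these tables from the host side is straightforward:
//  each .Lk_mc_forward row is a tbl byte-permutation.  Row 0 decodes
//  (little-endian) to a rotate-by-one within every 4-byte lane, which is
//  what the encrypt core uses in place of the pshufb-driven MixColumns
//  rotations.  A standalone C check (an illustration, not part of this
//  module):
//
//	#include <assert.h>
//	#include <stdint.h>
//	int main(void) {
//	    /* bytes of .quad 0x0407060500030201, 0x0C0F0E0D080B0A09 */
//	    static const uint8_t mcf0[16] =
//	        {1,2,3,0, 5,6,7,4, 9,10,11,8, 13,14,15,12};
//	    for (int i = 0; i < 16; i++)	/* rotate within each 4-byte lane */
//	        assert(mcf0[i] == (i & ~3) + ((i + 1) & 3));
//	    return 0;
//	}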
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet)=map("v$_.16b",(24..31));

//
//  _aes_preheat
//
//  Fills register %r10 -> .aes_consts (so you can -fPIC)
//  and %xmm9-%xmm15 as specified below.
//
.type	_vpaes_encrypt_preheat,%function
.align	4
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat

//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
.type	_vpaes_encrypt_core,%function
.align	4
_vpaes_encrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds
	adr	x11, .Lk_mc_forward+16
					// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
					// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b	// vpxor %xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	b	.Lenc_entry

.align	4
.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	ld1	{v1.2d}, [x11], #16	// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	tbl	v5.16b, {$sb2t}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v2.16b, {$sb2u}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	ld1	{v4.2d}, [x10]		// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	eor	v2.16b, v2.16b, v5.16b	// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b	// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	and	x11, x11, #~(1<<6)	// and \$0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	sub	w8, w8, #1		// nr--

.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b	// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_loop

	// middle of last round
	add	x10, x11, #0x80
					// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
					// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	ld1	{v1.2d}, [x10]		// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core

.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
.align	4
vpaes_encrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
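
//
//  Typical single-block use from C looks like the sketch below; the
//  prototypes mirror the usual AES_* conventions (in/out/key, matching
//  x0/x1/x2 above), but check the project's private header for the
//  authoritative declarations:
//
//	#include <openssl/aes.h>
//	int  vpaes_set_encrypt_key(const unsigned char *userKey, int bits,
//	                           AES_KEY *key);
//	void vpaes_encrypt(const unsigned char *in, unsigned char *out,
//	                   const AES_KEY *key);
//
//	void one_block(unsigned char out[16], const unsigned char in[16],
//	               const unsigned char userKey[16])
//	{
//	    AES_KEY key;
//	    vpaes_set_encrypt_key(userKey, 128, &key);
//	    vpaes_encrypt(in, out, &key);
//	}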

// v14-v15 input, v0-v1 output
.type	_vpaes_encrypt_2x,%function
.align	4
_vpaes_encrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds
	adr	x11, .Lk_mc_forward+16
					// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	tbl	v9.16b, {$iptlo}, v9.16b
					// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	tbl	v10.16b, {$ipthi}, v8.16b
	eor	v0.16b, v1.16b, v16.16b	// vpxor %xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Lenc_2x_entry

.align	4
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	tbl	v12.16b, {$sb1t}, v10.16b
	ld1	{v1.2d}, [x11], #16	// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	tbl	v8.16b, {$sb1u}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {$sb2t}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	tbl	v13.16b, {$sb2t}, v10.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {$sb2u}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	tbl	v10.16b, {$sb2u}, v11.16b
	ld1	{v4.2d}, [x10]		// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b	// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b	// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)	// and \$0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1		// nr--

.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {$invhi},v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	tbl	v13.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v5.16b	// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {$invlo},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop

	// middle of last round
	add	x10, x11, #0x80
					// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
					// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	ld1	{v1.2d}, [x10]		// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	tbl	v8.16b, {$sbot}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x

.type	_vpaes_decrypt_preheat,%function
.align	4
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat

//
//  Decryption core
//
//  Same API as encryption core.
//
.type	_vpaes_decrypt_core,%function
.align	4
_vpaes_decrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds

					// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl	x11, x8, #4		// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30		// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30		// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm4 # round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]		// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
					// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b	// vpxor %xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	b	.Ldec_entry

.align	4
.Ldec_loop:
//
//  Inverse mix columns
//
					// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
					// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl	v1.16b, {$sb9t}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b	// vpxor %xmm4, %xmm0, %xmm0
					// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
					// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbdt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
					// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
					// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbbt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
					// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
					// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbet}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	sub	w8, w8, #1		// sub \$1,%rax # nr--

.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_loop

	// middle of last round
					// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
					// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	ld1	{v2.2d}, [x11]		// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core

.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
.align	4
vpaes_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_decrypt,.-vpaes_decrypt

// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
.align	4
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds

					// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl	x11, x8, #4		// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30		// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30		// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm4 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {$iptlo},v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]		// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
					// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi},v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	tbl	v8.16b, {$ipthi},v8.16b
	eor	v2.16b, v2.16b, v16.16b	// vpxor %xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b
	b	.Ldec_2x_entry

.align	4
.Ldec_2x_loop:
//
//  Inverse mix columns
//
					// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
					// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b, {$sb9t}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	tbl	v9.16b, {$sb9t}, v11.16b
	eor	v0.16b, v4.16b, v16.16b	// vpxor %xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
					// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
					// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl	v12.16b, {$sbdu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbdt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	tbl	v9.16b, {$sbdt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
					// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
					// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbbt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	tbl	v9.16b, {$sbbt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
					// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
					// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbet}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	tbl	v9.16b, {$sbet}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1		// sub \$1,%rax # nr--

.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {$invhi},v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {$invlo},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop

	// middle of last round
					// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
					// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	tbl	v1.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	tbl	v9.16b, {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]		// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
my ($inp,$bits,$out,$dir)=("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
.align	4
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b		// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f		// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]	// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]		// .Lk_rcon
	ld1	{v9.2d}, [x11]		// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat

.type	_vpaes_schedule_core,%function
.align	4
_vpaes_schedule_core:
	.inst	0xd503233f		// paciasp
	stp	x29, x30, [sp,#-16]!
	add	x29,sp,#0

	bl	_vpaes_key_preheat	// load the tables

	ld1	{v0.16b}, [$inp],#16	// vmovdqu (%rdi), %xmm0 # load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b		// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b		// vmovdqa %xmm0, %xmm7

	adr	x10, .Lk_sr		// lea .Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]		// vmovdqu %xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [$out]		// vmovdqu %xmm3, (%rdx)
	eor	x8, x8, #0x30		// xor \$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192		// cmp \$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall through

//
//  .schedule_128
//
//  128-bit specific part of key schedule.
//
//  This schedule is really simple, because all its parts
//  are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10		// mov \$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// write output
	b	.Loop_schedule_128

//
//  .aes_schedule_192
//
//  192-bit specific part of key schedule.
//
//  The main body of this schedule is the same as the 128-bit
//  schedule, but with more smearing.  The long, high side is
//  stored in %xmm7 as before, and the short, low side is in
//  the high bits of %xmm6.
//
//  This schedule is somewhat nastier, however, because each
//  round produces 192 bits of key material, or 1.5 round keys.
//  Therefore, on each cycle we do 2 rounds and produce 3 round
//  keys.
//
.Lschedule_192:
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]	// vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b		// vmovdqa %xmm0, %xmm6 # save short part
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4 # clear 4
	ins	v6.d[0], v4.d[0]	// vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov	$inp, #4		// mov \$4, %esi

.Loop_schedule_192:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	// save key n+1
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192

//
//  .aes_schedule_256
//
//  256-bit specific part of key schedule.
//
//  The structure here is very similar to the 128-bit
//  schedule, but with an additional "low side" in
//  %xmm6.  The low side's rounds are the same as the
//  high side's, except no rcon and no rotation.
//
.Lschedule_256:
	ld1	{v0.16b}, [$inp]	// vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7		// mov \$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_mangle	// output low result
	mov	v6.16b, v0.16b		// vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]		// vpshufd \$0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b		// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b		// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b		// vmovdqa %xmm5, %xmm7

	b	.Loop_schedule_256

//
//  .aes_schedule_mangle_last
//
//  Mangler for last round of key schedule
//  Mangles %xmm0
//    when encrypting, outputs out(%xmm0) ^ 63
//    when decrypting, outputs unskew(%xmm0)
//
//  Always called right before return... jumps to cleanup and exits
//
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew		// lea .Lk_deskew(%rip),%r11 # prepare to deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10),%xmm1
	adr	x11, .Lk_opt		// lea .Lk_opt(%rip), %r11 # prepare to output transform
	add	$out, $out, #32		// add \$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0 # output permute

.Lschedule_mangle_last_dec:
	ld1	{v20.2d-v21.2d}, [x11]	// reload constants
	sub	$out, $out, #16		// add \$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]		// vmovdqu %xmm0, (%rdx) # save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b	// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b	// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b	// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b	// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b	// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b	// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core

//
//  .aes_schedule_192_smear
//
//  Smear the short, low side in the 192-bit key schedule.
//
//  Inputs:
//    %xmm7: high side, b a x y
//    %xmm6:  low side, d c 0 0
//
//  Outputs:
//    %xmm6: b+c+d b+c 0 0
//    %xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
.align	4
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
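
//
//  A host-side model of the register diagram above, with each vector
//  register as four 32-bit words and w[3] the high dword (a sketch,
//  not part of this module):
//
//	#include <stdint.h>
//	/* x7 = {y,x,a,b}, x6 = {0,0,c,d}, stored as w[0]..w[3] */
//	static void smear192(uint32_t x6[4], uint32_t x0[4],
//	                     const uint32_t x7[4])
//	{
//	    x6[3] ^= x6[2];		/* d c 0 0 -> c+d c 0 0 */
//	    x6[3] ^= x7[3];		/* -> b+c+d */
//	    x6[2] ^= x7[3];		/* c -> b+c */
//	    x0[0] = x7[2];		/* a */
//	    x0[1] = x7[3];		/* b */
//	    x0[2] = x6[2];		/* b+c */
//	    x0[3] = x6[3];		/* b+c+d */
//	    x6[0] = x6[1] = 0;		/* clobber low side with zeros */
//	}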

//
//  .aes_schedule_round
//
//  Runs one main round of the key schedule on %xmm0, %xmm7
//
//  Specifically, runs subbytes on the high dword of %xmm0
//  then rotates it by one byte and xors into the low dword of
//  %xmm7.
//
//  Adds rcon from low byte of %xmm8, then rotates %xmm8 for
//  next rcon.
//
//  Smears the dwords of %xmm7 by xoring the low into the
//  second low, result into third, result into highest.
//
//  Returns results in %xmm7 = %xmm0.
//  Clobbers %xmm1-%xmm4, %r11.
//
.type	_vpaes_schedule_round,%function
.align	4
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0		// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]		// vpshufd \$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0

	// fall through...
	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	eor	v7.16b, v7.16b, v4.16b	// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b	// vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	eor	v2.16b, v2.16b, v0.16b	// vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b	// vpxor %xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b	// vmovdqa %xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
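
//
//  In the untransformed basis this is one step of the textbook AES-128
//  key expansion; for reference, a plain (non-constant-time) C model
//  with sbox[] the standard AES S-box (a sketch, not part of this
//  module):
//
//	#include <stdint.h>
//	extern const uint8_t sbox[256];
//	static void expand_round(uint32_t next[4], const uint32_t prev[4],
//	                         uint8_t rcon)
//	{
//	    uint32_t t = prev[3];
//	    t = (t >> 8) | (t << 24);			/* RotWord */
//	    t = sbox[t & 0xff] | sbox[(t >> 8) & 0xff] << 8 |
//	        sbox[(t >> 16) & 0xff] << 16 |
//	        (uint32_t)sbox[t >> 24] << 24;		/* SubWord */
//	    next[0] = prev[0] ^ t ^ rcon;		/* add round constant */
//	    next[1] = prev[1] ^ next[0];		/* smear */
//	    next[2] = prev[2] ^ next[1];
//	    next[3] = prev[3] ^ next[2];
//	}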

//
//  .aes_schedule_transform
//
//  Linear-transform %xmm0 according to tables at (%r11)
//
//  Requires that %xmm9 = 0x0F0F... as in preheat
//  Output in %xmm0
//  Clobbers %xmm1, %xmm2
//
.type	_vpaes_schedule_transform,%function
.align	4
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
					// vmovdqa (%r11), %xmm2 # lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
					// vmovdqa 16(%r11), %xmm1 # hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
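
//
//  This is the usual vpaes nibble trick: two 16-entry lookups replace
//  one 256-entry lookup.  Per byte, in C (the asm does all 16 lanes at
//  once with tbl; lo[]/hi[] stand for the 16-byte rows at (%r11)):
//
//	#include <stdint.h>
//	static uint8_t transform_byte(uint8_t x, const uint8_t lo[16],
//	                              const uint8_t hi[16])
//	{
//	    return lo[x & 0x0f] ^ hi[x >> 4];
//	}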

//
//  .aes_schedule_mangle
//
//  Mangle xmm0 from (basis-transformed) standard version
//  to our version.
//
//  On encrypt,
//    xor with 0x63
//    multiply by circulant 0,1,1,1
//    apply shiftrows transform
//
//  On decrypt,
//    xor with 0x63
//    multiply by "inverse mixcolumns" circulant E,B,D,9
//    deskew
//    apply shiftrows transform
//
//  Writes out to (%rdx), and increments or decrements it
//  Keeps track of round number mod 4 in %r8
//
//  Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
.align	4
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b		// vmovdqa %xmm0, %xmm4 # save xmm0 for later
					// vmovdqa .Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16		// add \$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b	// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b	// vpxor %xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both

.Lschedule_mangle_dec:
	// inverse mix columns
					// lea .Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4	// vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	and	v4.16b, v4.16b, v17.16b	// vpand %xmm9, %xmm4, %xmm4 # 4 = lo

					// vmovdqa 0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
					// vmovdqa 0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

					// vmovdqa 0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
					// vmovdqa 0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

					// vmovdqa 0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
					// vmovdqa 0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3

					// vmovdqa 0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
					// vmovdqa 0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm3

	sub	$out, $out, #16		// add \$-16, %rdx

.Lschedule_mangle_both:
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16		// add \$-16, %r8
	and	x8, x8, #~(1<<6)	// and \$0x30, %r8
	st1	{v3.2d}, [$out]		// vmovdqu %xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
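
//
//  "Multiply by circulant 0,1,1,1" above xors the three byte-rotations
//  of each 4-byte column, i.e. every output byte becomes the xor of the
//  other three bytes of its column.  One-column C model (sketch):
//
//	#include <stdint.h>
//	static void circulant_0111(uint8_t col[4])
//	{
//	    uint8_t t = col[0] ^ col[1] ^ col[2] ^ col[3];
//	    for (int i = 0; i < 4; i++)
//	        col[i] ^= t;		/* col[i] = xor of the other three */
//	}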

.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
.align	4
vpaes_set_encrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// add \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;

	mov	$dir, #0		// mov \$0,%ecx
	mov	x8, #0x30		// mov \$0x30,%r8d
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	eor	x0, x0, x0
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
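
//
//  The rounds field stored above follows the standard AES schedule,
//  nbits/32 + 5, i.e. 10/12/14 rounds for 128/192/256-bit keys:
//
//	static int aes_rounds(int bits) { return bits/32 + 5; }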

.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
.align	4
vpaes_set_decrypt_key:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// add \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl \$4,%eax
	add	$out, $out, #16		// lea 16(%rdx,%rax),%rdx
	add	$out, $out, x9

	mov	$dir, #1		// mov \$1,%ecx
	lsr	w8, $bits, #1		// shr \$1,%r8d
	and	x8, x8, #32		// and \$32,%r8d
	eor	x8, x8, #32		// xor \$32,%r8d # nbits==192?0:32
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key

my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
.align	4
vpaes_cbc_encrypt:
	cbz	$len, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat
	b	.Lcbc_enc_loop

.align	4
.Lcbc_enc_loop:
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	sub	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
.Lcbc_abort:
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
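
//
//  The loop above is plain CBC chaining.  A byte-wise C model of the
//  encrypt path (sketch; E() stands in for _vpaes_encrypt_core):
//
//	#include <stddef.h>
//	#include <string.h>
//	typedef void (*block128_f)(const unsigned char in[16],
//	                           unsigned char out[16], const void *key);
//	static void cbc_encrypt(const unsigned char *in, unsigned char *out,
//	                        size_t len, const void *key,
//	                        unsigned char ivec[16], block128_f E)
//	{
//	    unsigned char tmp[16];
//	    for (; len >= 16; len -= 16, in += 16, out += 16) {
//	        for (int i = 0; i < 16; i++)
//	            tmp[i] = in[i] ^ ivec[i];	/* xor with ivec */
//	        E(tmp, out, key);
//	        memcpy(ivec, out, 16);		/* chain */
//	    }
//	}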

.type	vpaes_cbc_decrypt,%function
.align	4
vpaes_cbc_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	sub	x17, x17, #16
	b.ls	.Lcbc_dec_done

.align	4
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [$out], #32
	sub	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
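
//
//  CBC decryption chains the other way: the next ivec is the saved
//  ciphertext, so blocks are independent of each other and the core can
//  run two at a time, as above.  C model (sketch; D() stands in for
//  _vpaes_decrypt_core):
//
//	#include <stddef.h>
//	#include <string.h>
//	typedef void (*block128_f)(const unsigned char in[16],
//	                           unsigned char out[16], const void *key);
//	static void cbc_decrypt(const unsigned char *in, unsigned char *out,
//	                        size_t len, const void *key,
//	                        unsigned char ivec[16], block128_f D)
//	{
//	    unsigned char next_iv[16];
//	    for (; len >= 16; len -= 16, in += 16, out += 16) {
//	        memcpy(next_iv, in, 16);	/* save ciphertext */
//	        D(in, out, key);
//	        for (int i = 0; i < 16; i++)
//	            out[i] ^= ivec[i];		/* xor with previous block */
//	        memcpy(ivec, next_iv, 16);
//	    }
//	}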

.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
.align	4
vpaes_ecb_encrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	sub	x17, x17, #16
	b.ls	.Lecb_enc_done

.align	4
.Lecb_enc_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	sub	x17, x17, #32
	b.hi	.Lecb_enc_loop

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt

.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
.align	4
vpaes_ecb_decrypt:
	.inst	0xd503233f		// paciasp
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len
	mov	x2,  $key
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out],#16
	sub	x17, x17, #16
	b.ls	.Lecb_dec_done

.align	4
.Lecb_dec_loop:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	sub	x17, x17, #32
	b.hi	.Lecb_dec_loop

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	.inst	0xd50323bf		// autiasp
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt