# Copyright 2015-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
######################################################################
## Constant-time SSSE3 AES core implementation.
##
## By Mike Hamburg (Stanford University), 2009
##
## For details see http://shiftleft.org/papers/vector_aes/ and
## http://crypto.stanford.edu/vpaes/.
######################################################################
# ARMv8 NEON adaptation by <appro@openssl.org>
#
# The reason for undertaking this effort is that there is at least one
# popular SoC based on Cortex-A53 that doesn't have crypto extensions.
# Performance (numbers are cycles per byte, lower is better):
#
#			CBC enc		ECB enc/dec(*)	[bit-sliced enc/dec]
# Cortex-A53		21.5		18.1/20.6	[17.5/19.8	]
# Cortex-A57		36.0(**)	20.4/24.9(**)	[14.4/16.6	]
# X-Gene		45.9(**)	45.8/57.7(**)	[33.1/37.6(**)	]
# Denver(***)		16.6(**)	15.1/17.8(**)	[8.80/9.93	]
# Apple A7(***)		22.7(**)	10.9/14.3	[8.45/10.0	]
# Mongoose(***)		26.3(**)	21.0/25.0(**)	[13.3/16.8	]
# ThunderX2(***)	39.4(**)	33.8/48.6(**)
#
# (*)	ECB denotes approximate result for parallelizable modes
#	such as CBC decrypt, CTR, etc.;
# (**)	these results are worse than scalar compiler-generated
#	code, but the code is constant-time and therefore preferred;
# (***)	presented for reference/comparison purposes;
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
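# A typical invocation, for illustration only (the actual flavour and
# output names come from the build system):
#
#   perl vpaes-armv8.pl linux64 vpaes-armv8.S
#
# which leaves $flavour = "linux64" (the target dialect handed to
# arm-xlate.pl) and $output = "vpaes-armv8.S".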
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
.type	_vpaes_consts,%object
.align	7	// totally strategic alignment
_vpaes_consts:
.Lk_mc_forward:	// mc_forward
	.quad	0x0407060500030201, 0x0C0F0E0D080B0A09
	.quad	0x080B0A0904070605, 0x000302010C0F0E0D
	.quad	0x0C0F0E0D080B0A09, 0x0407060500030201
	.quad	0x000302010C0F0E0D, 0x080B0A0904070605
.Lk_mc_backward:	// mc_backward
	.quad	0x0605040702010003, 0x0E0D0C0F0A09080B
	.quad	0x020100030E0D0C0F, 0x0A09080B06050407
	.quad	0x0E0D0C0F0A09080B, 0x0605040702010003
	.quad	0x0A09080B06050407, 0x020100030E0D0C0F
.Lk_sr:	// sr
	.quad	0x0706050403020100, 0x0F0E0D0C0B0A0908
	.quad	0x030E09040F0A0500, 0x0B06010C07020D08
	.quad	0x0F060D040B020900, 0x070E050C030A0108
	.quad	0x0B0E0104070A0D00, 0x0306090C0F020508
.Lk_inv:	// inv, inva
	.quad	0x0E05060F0D080180, 0x040703090A0B0C02
	.quad	0x01040A060F0B0780, 0x030D0E0C02050809
.Lk_ipt:	// input transform (lo, hi)
	.quad	0xC2B2E8985A2A7000, 0xCABAE09052227808
	.quad	0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
.Lk_sbo:	// sbou, sbot
	.quad	0xD0D26D176FBDC700, 0x15AABF7AC502A878
	.quad	0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
.Lk_sb1:	// sb1u, sb1t
	.quad	0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
	.quad	0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
.Lk_sb2:	// sb2u, sb2t
	.quad	0x69EB88400AE12900, 0xC2A163C8AB82234A
	.quad	0xE27A93C60B712400, 0x5EB7E955BC982FCD
.Lk_dipt:	// decryption input transform
	.quad	0x0F505B040B545F00, 0x154A411E114E451A
	.quad	0x86E383E660056500, 0x12771772F491F194
.Lk_dsbo:	// decryption sbox final output
	.quad	0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
	.quad	0x12D7560F93441D00, 0xCA4B8159D8C58E9C
.Lk_dsb9:	// decryption sbox output *9*u, *9*t
	.quad	0x851C03539A86D600, 0xCAD51F504F994CC9
	.quad	0xC03B1789ECD74900, 0x725E2C9EB2FBA565
.Lk_dsbd:	// decryption sbox output *D*u, *D*t
	.quad	0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
	.quad	0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
.Lk_dsbb:	// decryption sbox output *B*u, *B*t
	.quad	0xD022649296B44200, 0x602646F6B0F2D404
	.quad	0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
.Lk_dsbe:	// decryption sbox output *E*u, *E*t
	.quad	0x46F2929626D4D000, 0x2242600464B4F6B0
	.quad	0x0C55A6CDFFAAC100, 0x9467F36B98593E32
// Key schedule constants
.Lk_dksd:	// decryption key schedule: invskew x*D
	.quad	0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
	.quad	0x41C277F4B5368300, 0x5FDC69EAAB289D1E
.Lk_dksb:	// decryption key schedule: invskew x*B
	.quad	0x9A4FCA1F8550D500, 0x03D653861CC94C99
	.quad	0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
.Lk_dkse:	// decryption key schedule: invskew x*E + 0x63
	.quad	0xD5031CCA1FC9D600, 0x53859A4C994F5086
	.quad	0xA23196054FDC7BE8, 0xCD5EF96A20B31487
.Lk_dks9:	// decryption key schedule: invskew x*9
	.quad	0xB6116FC87ED9A700, 0x4AED933482255BFC
	.quad	0x4576516227143300, 0x8BB89FACE9DAFDCE
.Lk_rcon:	// rcon
	.quad	0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
.Lk_opt:	// output transform
	.quad	0xFF9F4929D6B66000, 0xF7974121DEBE6808
	.quad	0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
.Lk_deskew:	// deskew tables: inverts the sbox's "skew"
	.quad	0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
	.quad	0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77

.asciz	"Vector Permutation AES for ARMv8, Mike Hamburg (Stanford University)"
.size	_vpaes_consts,.-_vpaes_consts
my ($inp,$out,$key) = map("x$_",(0..2));

my ($invlo,$invhi,$iptlo,$ipthi,$sbou,$sbot) = map("v$_.16b",(18..23));
my ($sb1u,$sb1t,$sb2u,$sb2t) = map("v$_.16b",(24..27));
my ($sb9u,$sb9t,$sbdu,$sbdt,$sbbu,$sbbt,$sbeu,$sbet) = map("v$_.16b",(24..31));
// Fills register %r10 -> .aes_consts (so you can -fPIC)
// and %xmm9-%xmm15 as specified below.
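//
// Note: the x86 register names (%xmm0, %r9, %rdx, ...) appearing in
// comments throughout this file are kept verbatim from Mike Hamburg's
// original SSSE3 implementation; each NEON instruction is annotated
// with the SSSE3 instruction it replaces, so the two versions can be
// compared line by line. They do not refer to AArch64 state.
//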
.type	_vpaes_encrypt_preheat,%function
_vpaes_encrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x10],#64	// .Lk_ipt, .Lk_sbo
	ld1	{v24.2d-v27.2d}, [x10]		// .Lk_sb1, .Lk_sb2
	ret
.size	_vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat
//
//  _aes_encrypt_core
//
//  AES-encrypt %xmm0.
//
//  Inputs:
//     %xmm0 = input
//     %xmm9-%xmm15 as in _vpaes_preheat
//    (%rdx) = scheduled keys
//
//  Output in %xmm0
//  Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
//  Preserves %xmm6 - %xmm8 so you get some local vectors
//
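//
//  In NEON terms, as used by the callers in this file: the input block
//  is expected in v7, $key (x2) points at the scheduled keys, and the
//  result is returned in v0; v17 holds the 0x0f nibble mask and
//  v18-v27 the tables loaded by _vpaes_encrypt_preheat.
//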
.type	_vpaes_encrypt_core,%function
_vpaes_encrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds
	adr	x11, .Lk_mc_forward+16
	// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	eor	v0.16b, v1.16b, v16.16b	// vpxor %xmm5, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	b	.Lenc_entry

.Lenc_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	ld1	{v1.2d}, [x11], #16	// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	tbl	v5.16b, {$sb2t}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v2.16b, {$sb2u}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	ld1	{v4.2d}, [x10]		// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	eor	v2.16b, v2.16b, v5.16b	// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	eor	v0.16b, v0.16b, v3.16b	// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	and	x11, x11, #~(1<<6)	// and \$0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	sub	w8, w8, #1		// nr--
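	// Explanatory note: the A..D values in the comments above are the
	// vpaes MixColumns trick. A is the S-box output plus round key, and
	// .Lk_mc_forward/.Lk_mc_backward are byte rotations of each column,
	// so the eor chain accumulates 2A+3B+C+D (per the original
	// comments), realizing ShiftRows plus MixColumns with only tbl/eor
	// and no GF(2^8) multiplies; see the vector_aes paper cited in the
	// header.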
.Lenc_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl	v5.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v3.16b, v3.16b, v5.16b	// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v5.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_loop
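	// Explanatory note: .Lenc_entry above is the constant-time S-box.
	// The 0x0f mask in v17 splits each byte into two nibbles, and the
	// .Lk_inv tables evaluate the GF(2^8) inverse through GF(2^4)
	// arithmetic (the 1/i, a/k, jak, io, jo values named in the
	// comments); the sbo/sb1/sb2 lookups then apply the affine output
	// map. Every step is a tbl lookup, so there are no key- or
	// data-dependent memory accesses or branches.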
	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	ld1	{v1.2d}, [x10]		// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	ret
.size	_vpaes_encrypt_core,.-_vpaes_encrypt_core
.globl	vpaes_encrypt
.type	vpaes_encrypt,%function
vpaes_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_encrypt_preheat
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_encrypt,.-vpaes_encrypt
// v14-v15 input, v0-v1 output
.type	_vpaes_encrypt_2x,%function
_vpaes_encrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds
	adr	x11, .Lk_mc_forward+16
	// vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo
	ld1	{v16.2d}, [x9], #16	// vmovdqu (%r9), %xmm5 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v1.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm1
	tbl	v9.16b, {$iptlo}, v9.16b
	// vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi
	tbl	v2.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm3, %xmm2
	tbl	v10.16b, {$ipthi}, v8.16b
	eor	v0.16b, v1.16b, v16.16b	// vpxor %xmm5, %xmm1, %xmm0
	eor	v8.16b, v9.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b

	b	.Lenc_2x_entry
.Lenc_2x_loop:
	// middle of middle round
	add	x10, x11, #0x40
	tbl	v4.16b, {$sb1t}, v2.16b	// vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u
	tbl	v12.16b, {$sb1t}, v10.16b
	ld1	{v1.2d}, [x11], #16	// vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
	tbl	v0.16b, {$sb1u}, v3.16b	// vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t
	tbl	v8.16b, {$sb1u}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	tbl	v5.16b, {$sb2t}, v2.16b	// vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u
	tbl	v13.16b, {$sb2t}, v10.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v2.16b, {$sb2u}, v3.16b	// vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t
	tbl	v10.16b, {$sb2u}, v11.16b
	ld1	{v4.2d}, [x10]		// vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
	tbl	v3.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm3 # 0 = B
	tbl	v11.16b, {v8.16b}, v1.16b
	eor	v2.16b, v2.16b, v5.16b	// vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A
	eor	v10.16b, v10.16b, v13.16b
	tbl	v0.16b, {v0.16b}, v4.16b	// vpshufb %xmm4, %xmm0, %xmm0 # 3 = D
	tbl	v8.16b, {v8.16b}, v4.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B
	eor	v11.16b, v11.16b, v10.16b
	tbl	v4.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C
	tbl	v12.16b, {v11.16b},v1.16b
	eor	v0.16b, v0.16b, v3.16b	// vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D
	eor	v8.16b, v8.16b, v11.16b
	and	x11, x11, #~(1<<6)	// and \$0x30, %r11 # ... mod 4
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D
	eor	v8.16b, v8.16b, v12.16b
	sub	w8, w8, #1		// nr--
.Lenc_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm0, %xmm9, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v5.16b, {$invhi},v1.16b	// vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k
	tbl	v13.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v5.16b	// vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v13.16b
	eor	v4.16b, v4.16b, v5.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v13.16b
	tbl	v2.16b, {$invlo},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm5
	cbnz	w8, .Lenc_2x_loop
	// middle of last round
	add	x10, x11, #0x80
	// vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
	// vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	ld1	{v1.2d}, [x10]		// vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
	tbl	v0.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t
	tbl	v8.16b, {$sbot}, v11.16b
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 0 = A
	eor	v8.16b, v8.16b, v12.16b
	tbl	v0.16b, {v0.16b},v1.16b	// vpshufb %xmm1, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v1.16b
	ret
.size	_vpaes_encrypt_2x,.-_vpaes_encrypt_2x
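//
// _vpaes_encrypt_2x carries two blocks through the same dataflow as
// _vpaes_encrypt_core, with the second block's state shadowed in
// v8-v13 (inputs in v14-v15, outputs in v0-v1). Interleaving two
// independent dependency chains helps cover tbl/eor latency on
// in-order cores such as the Cortex-A53 this code targets.
//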
.type	_vpaes_decrypt_preheat,%function
_vpaes_decrypt_preheat:
	adr	x10, .Lk_inv
	movi	v17.16b, #0x0f
	adr	x11, .Lk_dipt
	ld1	{v18.2d-v19.2d}, [x10],#32	// .Lk_inv
	ld1	{v20.2d-v23.2d}, [x11],#64	// .Lk_dipt, .Lk_dsbo
	ld1	{v24.2d-v27.2d}, [x11],#64	// .Lk_dsb9, .Lk_dsbd
	ld1	{v28.2d-v31.2d}, [x11]		// .Lk_dsbb, .Lk_dsbe
	ret
.size	_vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat
//
//  Decryption core
//
//  Same API as encryption core.
//
.type	_vpaes_decrypt_core,%function
_vpaes_decrypt_core:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds

	// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl	x11, x8, #4		// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30		// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30		// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm4 # round0 key
	and	v1.16b, v7.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v7.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	ld1	{v5.2d}, [x10]		// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v2.16b, v2.16b, v16.16b	// vpxor %xmm4, %xmm2, %xmm2
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0

	b	.Ldec_entry
.Ldec_loop:
//
//  Inverse mix columns
//
	// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
	// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl	v1.16b, {$sb9t}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	eor	v0.16b, v4.16b, v16.16b	// vpxor %xmm4, %xmm0, %xmm0
	// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbdt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbbt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl	v0.16b, {v0.16b}, v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v1.16b, {$sbet}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	sub	w8, w8, #1		// sub \$1,%rax # nr--
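	// Explanatory note: decryption folds InvMixColumns into the table
	// lookups. dsb9/dsbd/dsbb/dsbe return the inverse-S-box output
	// multiplied by 9, 0xD, 0xB and 0xE (the InvMixColumns
	// coefficients), while v5, loaded from .Lk_mc_forward+48 and
	// realigned by 12 with ext each round, rotates the accumulator so
	// each contribution lands in the right byte lane.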
.Ldec_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v3.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_loop
	// middle of last round
	// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	ld1	{v2.2d}, [x11]		// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	tbl	v1.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v0.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	tbl	v0.16b, {v0.16b}, v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_decrypt_core,.-_vpaes_decrypt_core
.globl	vpaes_decrypt
.type	vpaes_decrypt,%function
vpaes_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!

	ld1	{v7.16b}, [$inp]
	bl	_vpaes_decrypt_preheat
	bl	_vpaes_decrypt_core
	st1	{v0.16b}, [$out]

	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_decrypt,.-vpaes_decrypt
// v14-v15 input, v0-v1 output
.type	_vpaes_decrypt_2x,%function
_vpaes_decrypt_2x:
	mov	x9, $key
	ldr	w8, [$key,#240]		// pull rounds

	// vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo
	lsl	x11, x8, #4		// mov %rax, %r11; shl \$4, %r11
	eor	x11, x11, #0x30		// xor \$0x30, %r11
	adr	x10, .Lk_sr
	and	x11, x11, #0x30		// and \$0x30, %r11
	add	x11, x11, x10
	adr	x10, .Lk_mc_forward+48

	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm4 # round0 key
	and	v1.16b, v14.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v14.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	and	v9.16b, v15.16b, v17.16b
	ushr	v8.16b, v15.16b, #4
	tbl	v2.16b, {$iptlo},v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	tbl	v10.16b, {$iptlo},v9.16b
	ld1	{v5.2d}, [x10]		// vmovdqa .Lk_mc_forward+48(%rip), %xmm5
	// vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi
	tbl	v0.16b, {$ipthi},v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	tbl	v8.16b, {$ipthi},v8.16b
	eor	v2.16b, v2.16b, v16.16b	// vpxor %xmm4, %xmm2, %xmm2
	eor	v10.16b, v10.16b, v16.16b
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	eor	v8.16b, v8.16b, v10.16b

	b	.Ldec_2x_entry
.Ldec_2x_loop:
//
//  Inverse mix columns
//
	// vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u
	// vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t
	tbl	v4.16b, {$sb9u}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u
	tbl	v12.16b, {$sb9u}, v10.16b
	tbl	v1.16b, {$sb9t}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t
	tbl	v9.16b, {$sb9t}, v11.16b
	eor	v0.16b, v4.16b, v16.16b	// vpxor %xmm4, %xmm0, %xmm0
	eor	v8.16b, v12.16b, v16.16b
	// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	// vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt

	tbl	v4.16b, {$sbdu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu
	tbl	v12.16b, {$sbdu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbdt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt
	tbl	v9.16b, {$sbdt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt

	tbl	v4.16b, {$sbbu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu
	tbl	v12.16b, {$sbbu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbbt}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt
	tbl	v9.16b, {$sbbt}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	// vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	// vmovdqa 0x50(%r10), %xmm1 # 0 : sbet

	tbl	v4.16b, {$sbeu}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu
	tbl	v12.16b, {$sbeu}, v10.16b
	tbl	v0.16b, {v0.16b},v5.16b	// vpshufb %xmm5, %xmm0, %xmm0 # MC ch
	tbl	v8.16b, {v8.16b},v5.16b
	tbl	v1.16b, {$sbet}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet
	tbl	v9.16b, {$sbet}, v11.16b
	eor	v0.16b, v0.16b, v4.16b	// vpxor %xmm4, %xmm0, %xmm0 # 4 = ch
	eor	v8.16b, v8.16b, v12.16b
	ext	v5.16b, v5.16b, v5.16b, #12	// vpalignr \$12, %xmm5, %xmm5, %xmm5
	eor	v0.16b, v0.16b, v1.16b	// vpxor %xmm1, %xmm0, %xmm0 # 0 = ch
	eor	v8.16b, v8.16b, v9.16b
	sub	w8, w8, #1		// sub \$1,%rax # nr--
.Ldec_2x_entry:
	// top of round
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	and	v9.16b, v8.16b, v17.16b
	ushr	v8.16b, v8.16b, #4
	tbl	v2.16b, {$invhi},v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	tbl	v10.16b, {$invhi},v9.16b
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	eor	v9.16b, v9.16b, v8.16b
	tbl	v3.16b, {$invlo},v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	tbl	v11.16b, {$invlo},v8.16b
	tbl	v4.16b, {$invlo},v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	tbl	v12.16b, {$invlo},v9.16b
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	eor	v11.16b, v11.16b, v10.16b
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	eor	v12.16b, v12.16b, v10.16b
	tbl	v2.16b, {$invlo},v3.16b	// vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak
	tbl	v10.16b, {$invlo},v11.16b
	tbl	v3.16b, {$invlo},v4.16b	// vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak
	tbl	v11.16b, {$invlo},v12.16b
	eor	v2.16b, v2.16b, v1.16b	// vpxor %xmm1, %xmm2, %xmm2 # 2 = io
	eor	v10.16b, v10.16b, v9.16b
	eor	v3.16b, v3.16b, v0.16b	// vpxor %xmm0, %xmm3, %xmm3 # 3 = jo
	eor	v11.16b, v11.16b, v8.16b
	ld1	{v16.2d}, [x9],#16	// vmovdqu (%r9), %xmm0
	cbnz	w8, .Ldec_2x_loop
	// middle of last round
	// vmovdqa 0x60(%r10), %xmm4 # 3 : sbou
	tbl	v4.16b, {$sbou}, v2.16b	// vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou
	tbl	v12.16b, {$sbou}, v10.16b
	// vmovdqa 0x70(%r10), %xmm1 # 0 : sbot
	tbl	v1.16b, {$sbot}, v3.16b	// vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t
	tbl	v9.16b, {$sbot}, v11.16b
	ld1	{v2.2d}, [x11]		// vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
	eor	v4.16b, v4.16b, v16.16b	// vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k
	eor	v12.16b, v12.16b, v16.16b
	eor	v0.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm0 # 0 = A
	eor	v8.16b, v9.16b, v12.16b
	tbl	v0.16b, {v0.16b},v2.16b	// vpshufb %xmm2, %xmm0, %xmm0
	tbl	v1.16b, {v8.16b},v2.16b
	ret
.size	_vpaes_decrypt_2x,.-_vpaes_decrypt_2x
my ($inp,$bits,$out,$dir) = ("x0","w1","x2","w3");
my ($invlo,$invhi,$iptlo,$ipthi,$rcon) = map("v$_.16b",(18..21,8));

////////////////////////////////////////////////////////
//                                                    //
//                  AES key schedule                  //
//                                                    //
////////////////////////////////////////////////////////
.type	_vpaes_key_preheat,%function
_vpaes_key_preheat:
	adr	x10, .Lk_inv
	movi	v16.16b, #0x5b		// .Lk_s63
	adr	x11, .Lk_sb1
	movi	v17.16b, #0x0f		// .Lk_s0F
	ld1	{v18.2d-v21.2d}, [x10]	// .Lk_inv, .Lk_ipt
	adr	x10, .Lk_dksd
	ld1	{v22.2d-v23.2d}, [x11]	// .Lk_sb1
	adr	x11, .Lk_mc_forward
	ld1	{v24.2d-v27.2d}, [x10],#64	// .Lk_dksd, .Lk_dksb
	ld1	{v28.2d-v31.2d}, [x10],#64	// .Lk_dkse, .Lk_dks9
	ld1	{v8.2d}, [x10]		// .Lk_rcon
	ld1	{v9.2d}, [x11]		// .Lk_mc_forward[0]
	ret
.size	_vpaes_key_preheat,.-_vpaes_key_preheat
.type	_vpaes_schedule_core,%function
_vpaes_schedule_core:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29, x30, [sp,#-16]!

	bl	_vpaes_key_preheat	// load the tables

	ld1	{v0.16b}, [$inp],#16	// vmovdqu (%rdi), %xmm0 # load key (unaligned)

	// input transform
	mov	v3.16b, v0.16b		// vmovdqa %xmm0, %xmm3
	bl	_vpaes_schedule_transform
	mov	v7.16b, v0.16b		// vmovdqa %xmm0, %xmm7

	adr	x10, .Lk_sr		// lea .Lk_sr(%rip),%r10
	add	x8, x8, x10
	cbnz	$dir, .Lschedule_am_decrypting

	// encrypting, output zeroth round key after transform
	st1	{v0.2d}, [$out]		// vmovdqu %xmm0, (%rdx)
	b	.Lschedule_go

.Lschedule_am_decrypting:
	// decrypting, output zeroth round key after shiftrows
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	st1	{v3.2d}, [$out]		// vmovdqu %xmm3, (%rdx)
	eor	x8, x8, #0x30		// xor \$0x30, %r8

.Lschedule_go:
	cmp	$bits, #192		// cmp \$192, %esi
	b.hi	.Lschedule_256
	b.eq	.Lschedule_192
	// 128: fall though
//
// .Lschedule_128
//
// 128-bit specific part of key schedule.
//
// This schedule is really simple, because all its parts
// are accomplished by the subroutines.
//
.Lschedule_128:
	mov	$inp, #10		// mov \$10, %esi

.Loop_schedule_128:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// write output
	b	.Loop_schedule_128
//
// .Lschedule_192
//
// 192-bit specific part of key schedule.
//
// The main body of this schedule is the same as the 128-bit
// schedule, but with more smearing. The long, high side is
// stored in %xmm7 as before, and the short, low side is in
// the high bits of %xmm6.
//
// This schedule is somewhat nastier, however, because each
// round produces 192 bits of key material, or 1.5 round keys.
// Therefore, on each cycle we do 2 rounds and produce 3 round
// keys.
//
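//
// Loop-bound sanity check: AES-192 needs 13 round keys (12 rounds plus
// the initial key add). The loop below runs 4 times and emits 3 keys
// per trip (the very last via .Lschedule_mangle_last), i.e. 12 keys on
// top of the zeroth key stored on entry.
//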
.Lschedule_192:
	sub	$inp, $inp, #8
	ld1	{v0.16b}, [$inp]	// vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	v6.16b, v0.16b		// vmovdqa %xmm0, %xmm6 # save short part
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4 # clear 4
	ins	v6.d[0], v4.d[0]	// vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros
	mov	$inp, #4		// mov \$4, %esi

.Loop_schedule_192:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_round
	ext	v0.16b, v6.16b, v0.16b, #8	// vpalignr \$8,%xmm6,%xmm0,%xmm0
	bl	_vpaes_schedule_mangle	// save key n
	bl	_vpaes_schedule_192_smear
	bl	_vpaes_schedule_mangle	// save key n+1
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle	// save key n+2
	bl	_vpaes_schedule_192_smear
	b	.Loop_schedule_192
//
// .Lschedule_256
//
// 256-bit specific part of key schedule.
//
// The structure here is very similar to the 128-bit
// schedule, but with an additional "low side" in
// %xmm6. The low side's rounds are the same as the
// high side's, except no rcon and no rotation.
//
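//
// Loop-bound sanity check: AES-256 needs 15 round keys (14 rounds plus
// the initial key add). The loop below runs 7 times and emits 2 keys
// per trip, one from the low round and one from the high round (the
// final one via .Lschedule_mangle_last), i.e. 14 keys on top of the
// zeroth key stored on entry.
//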
.Lschedule_256:
	ld1	{v0.16b}, [$inp]	// vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
	bl	_vpaes_schedule_transform	// input transform
	mov	$inp, #7		// mov \$7, %esi

.Loop_schedule_256:
	sub	$inp, $inp, #1		// dec %esi
	bl	_vpaes_schedule_mangle	// output low result
	mov	v6.16b, v0.16b		// vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6

	// high round
	bl	_vpaes_schedule_round
	cbz	$inp, .Lschedule_mangle_last
	bl	_vpaes_schedule_mangle

	// low round. swap xmm7 and xmm6
	dup	v0.4s, v0.s[3]		// vpshufd \$0xFF, %xmm0, %xmm0
	movi	v4.16b, #0
	mov	v5.16b, v7.16b		// vmovdqa %xmm7, %xmm5
	mov	v7.16b, v6.16b		// vmovdqa %xmm6, %xmm7
	bl	_vpaes_schedule_low_round
	mov	v7.16b, v5.16b		// vmovdqa %xmm5, %xmm7

	b	.Loop_schedule_256
//
// .aes_schedule_mangle_last
//
// Mangler for last round of key schedule
// Mangles %xmm0
//	when encrypting, outputs out(%xmm0) ^ 63
//	when decrypting, outputs unskew(%xmm0)
//
// Always called right before return... jumps to cleanup and exits
//
.Lschedule_mangle_last:
	// schedule last round key from xmm0
	adr	x11, .Lk_deskew		// lea .Lk_deskew(%rip),%r11 # prepare to deskew
	cbnz	$dir, .Lschedule_mangle_last_dec

	// encrypting
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10),%xmm1
	adr	x11, .Lk_opt		// lea .Lk_opt(%rip), %r11 # prepare to output transform
	add	$out, $out, #32		// add \$32, %rdx
	tbl	v0.16b, {v0.16b}, v1.16b	// vpshufb %xmm1, %xmm0, %xmm0 # output permute
.Lschedule_mangle_last_dec:
	ld1	{v20.2d-v21.2d}, [x11]	// reload constants
	sub	$out, $out, #16		// add \$-16, %rdx
	eor	v0.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm0
	bl	_vpaes_schedule_transform	// output transform
	st1	{v0.2d}, [$out]		// vmovdqu %xmm0, (%rdx) # save last key

	// cleanup
	eor	v0.16b, v0.16b, v0.16b	// vpxor %xmm0, %xmm0, %xmm0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v2.16b, v2.16b, v2.16b	// vpxor %xmm2, %xmm2, %xmm2
	eor	v3.16b, v3.16b, v3.16b	// vpxor %xmm3, %xmm3, %xmm3
	eor	v4.16b, v4.16b, v4.16b	// vpxor %xmm4, %xmm4, %xmm4
	eor	v5.16b, v5.16b, v5.16b	// vpxor %xmm5, %xmm5, %xmm5
	eor	v6.16b, v6.16b, v6.16b	// vpxor %xmm6, %xmm6, %xmm6
	eor	v7.16b, v7.16b, v7.16b	// vpxor %xmm7, %xmm7, %xmm7
	ldp	x29, x30, [sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	_vpaes_schedule_core,.-_vpaes_schedule_core
//
// .aes_schedule_192_smear
//
// Smear the short, low side in the 192-bit key schedule.
//
// Inputs:
//	%xmm7: high side, b a x y
//	%xmm6: low side, d c 0 0
//
// Outputs:
//	%xmm6: b+c+d b+c 0 0
//	%xmm0: b+c+d b+c b a
//
.type	_vpaes_schedule_192_smear,%function
_vpaes_schedule_192_smear:
	movi	v1.16b, #0
	dup	v0.4s, v7.s[3]
	ins	v1.s[3], v6.s[2]	// vpshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
	ins	v0.s[0], v7.s[2]	// vpshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
	eor	v6.16b, v6.16b, v1.16b	// vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0
	eor	v1.16b, v1.16b, v1.16b	// vpxor %xmm1, %xmm1, %xmm1
	eor	v6.16b, v6.16b, v0.16b	// vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a
	mov	v0.16b, v6.16b		// vmovdqa %xmm6, %xmm0
	ins	v6.d[0], v1.d[0]	// vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros
	ret
.size	_vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
//
// .aes_schedule_round
//
// Runs one main round of the key schedule on %xmm0, %xmm7
//
// Specifically, runs subbytes on the high dword of %xmm0
// then rotates it by one byte and xors into the low dword of
// %xmm7.
//
// Adds rcon from low byte of %xmm8, then rotates %xmm8 for
// the next rcon.
//
// Smears the dwords of %xmm7 by xoring the low into the
// second low, result into third, result into highest.
//
// Returns results in %xmm7 = %xmm0.
// Clobbers %xmm1-%xmm4, %r11.
//
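//
// In FIPS-197 terms this is the standard key-expansion step
//   w[i] = w[i-Nk] ^ SubWord(RotWord(w[i-1])) ^ rcon,
// with the "smear" implementing the running xors into the following
// words and SubWord evaluated through the same constant-time GF(2^4)
// tables as the cipher itself.
//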
.type	_vpaes_schedule_round,%function
_vpaes_schedule_round:
	// extract rcon from xmm8
	movi	v4.16b, #0		// vpxor %xmm4, %xmm4, %xmm4
	ext	v1.16b, $rcon, v4.16b, #15	// vpalignr \$15, %xmm8, %xmm4, %xmm1
	ext	$rcon, $rcon, $rcon, #15	// vpalignr \$15, %xmm8, %xmm8, %xmm8
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7

	// rotate
	dup	v0.4s, v0.s[3]		// vpshufd \$0xFF, %xmm0, %xmm0
	ext	v0.16b, v0.16b, v0.16b, #1	// vpalignr \$1, %xmm0, %xmm0, %xmm0
	// fall through...

	// low round: same as high round, but no rotation and no rcon.
_vpaes_schedule_low_round:
	// smear xmm7
	ext	v1.16b, v4.16b, v7.16b, #12	// vpslldq \$4, %xmm7, %xmm1
	eor	v7.16b, v7.16b, v1.16b	// vpxor %xmm1, %xmm7, %xmm7
	ext	v4.16b, v4.16b, v7.16b, #8	// vpslldq \$8, %xmm7, %xmm4

	// subbytes
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1 # 0 = k
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0 # 1 = i
	eor	v7.16b, v7.16b, v4.16b	// vpxor %xmm4, %xmm7, %xmm7
	tbl	v2.16b, {$invhi}, v1.16b	// vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k
	eor	v1.16b, v1.16b, v0.16b	// vpxor %xmm0, %xmm1, %xmm1 # 0 = j
	tbl	v3.16b, {$invlo}, v0.16b	// vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k
	tbl	v4.16b, {$invlo}, v1.16b	// vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j
	eor	v7.16b, v7.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm7, %xmm7
	tbl	v3.16b, {$invlo}, v3.16b	// vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak
	eor	v4.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k
	tbl	v2.16b, {$invlo}, v4.16b	// vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak
	eor	v3.16b, v3.16b, v1.16b	// vpxor %xmm1, %xmm3, %xmm3 # 2 = io
	eor	v2.16b, v2.16b, v0.16b	// vpxor %xmm0, %xmm2, %xmm2 # 3 = jo
	tbl	v4.16b, {v23.16b}, v3.16b	// vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou
	tbl	v1.16b, {v22.16b}, v2.16b	// vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t
	eor	v1.16b, v1.16b, v4.16b	// vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output

	// add in smeared stuff
	eor	v0.16b, v1.16b, v7.16b	// vpxor %xmm7, %xmm1, %xmm0
	eor	v7.16b, v1.16b, v7.16b	// vmovdqa %xmm0, %xmm7
	ret
.size	_vpaes_schedule_round,.-_vpaes_schedule_round
//
// .aes_schedule_transform
//
// Linear-transform %xmm0 according to tables at (%r11)
//
// Requires that %xmm9 = 0x0F0F... as in preheat
// Output in %xmm0
// Clobbers %xmm1, %xmm2
//
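//
// Per byte x of the input, the code below computes
//   out = lo_table[x & 0x0f] ^ hi_table[x >> 4],
// i.e. a nibble-indexed linear basis change, with the two 16-entry
// tables supplied in $iptlo/$ipthi (v20/v21).
//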
.type	_vpaes_schedule_transform,%function
_vpaes_schedule_transform:
	and	v1.16b, v0.16b, v17.16b	// vpand %xmm9, %xmm0, %xmm1
	ushr	v0.16b, v0.16b, #4	// vpsrlb \$4, %xmm0, %xmm0
	// vmovdqa (%r11), %xmm2 # lo
	tbl	v2.16b, {$iptlo}, v1.16b	// vpshufb %xmm1, %xmm2, %xmm2
	// vmovdqa 16(%r11), %xmm1 # hi
	tbl	v0.16b, {$ipthi}, v0.16b	// vpshufb %xmm0, %xmm1, %xmm0
	eor	v0.16b, v0.16b, v2.16b	// vpxor %xmm2, %xmm0, %xmm0
	ret
.size	_vpaes_schedule_transform,.-_vpaes_schedule_transform
//
// .aes_schedule_mangle
//
// Mangle xmm0 from (basis-transformed) standard version
// to our version.
//
// On encrypt,
//	xor with 0x63
//	multiply by circulant 0,1,1,1
//	apply shiftrows transform
//
// On decrypt,
//	multiply by "inverse mixcolumns" circulant E,B,D,9
//	deskew, and
//	apply shiftrows transform
//
// Writes out to (%rdx), and increments or decrements it
// Keeps track of round number mod 4 in %r8
// Preserves xmm0
// Clobbers xmm1-xmm5
//
.type	_vpaes_schedule_mangle,%function
_vpaes_schedule_mangle:
	mov	v4.16b, v0.16b		// vmovdqa %xmm0, %xmm4 # save xmm0 for later
	// vmovdqa .Lk_mc_forward(%rip),%xmm5
	cbnz	$dir, .Lschedule_mangle_dec

	// encrypting
	eor	v4.16b, v0.16b, v16.16b	// vpxor .Lk_s63(%rip), %xmm0, %xmm4
	add	$out, $out, #16		// add \$16, %rdx
	tbl	v4.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm4
	tbl	v1.16b, {v4.16b}, v9.16b	// vpshufb %xmm5, %xmm4, %xmm1
	tbl	v3.16b, {v1.16b}, v9.16b	// vpshufb %xmm5, %xmm1, %xmm3
	eor	v4.16b, v4.16b, v1.16b	// vpxor %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	eor	v3.16b, v3.16b, v4.16b	// vpxor %xmm4, %xmm3, %xmm3

	b	.Lschedule_mangle_both
.Lschedule_mangle_dec:
	// inverse mix columns
	// lea .Lk_dksd(%rip),%r11
	ushr	v1.16b, v4.16b, #4	// vpsrlb \$4, %xmm4, %xmm1 # 1 = hi
	and	v4.16b, v4.16b, v17.16b	// vpand %xmm9, %xmm4, %xmm4 # 4 = lo

	// vmovdqa 0x00(%r11), %xmm2
	tbl	v2.16b, {v24.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	// vmovdqa 0x10(%r11), %xmm3
	tbl	v3.16b, {v25.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

	// vmovdqa 0x20(%r11), %xmm2
	tbl	v2.16b, {v26.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa 0x30(%r11), %xmm3
	tbl	v3.16b, {v27.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3

	// vmovdqa 0x40(%r11), %xmm2
	tbl	v2.16b, {v28.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	// vmovdqa 0x50(%r11), %xmm3
	tbl	v3.16b, {v29.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	eor	v3.16b, v3.16b, v2.16b	// vpxor %xmm2, %xmm3, %xmm3

	// vmovdqa 0x60(%r11), %xmm2
	tbl	v2.16b, {v30.16b}, v4.16b	// vpshufb %xmm4, %xmm2, %xmm2
	tbl	v3.16b, {v3.16b}, v9.16b	// vpshufb %xmm5, %xmm3, %xmm3
	// vmovdqa 0x70(%r11), %xmm4
	tbl	v4.16b, {v31.16b}, v1.16b	// vpshufb %xmm1, %xmm4, %xmm4
	ld1	{v1.2d}, [x8]		// vmovdqa (%r8,%r10), %xmm1
	eor	v2.16b, v2.16b, v3.16b	// vpxor %xmm3, %xmm2, %xmm2
	eor	v3.16b, v4.16b, v2.16b	// vpxor %xmm2, %xmm4, %xmm3

	sub	$out, $out, #16		// add \$-16, %rdx
.Lschedule_mangle_both:
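	// The add/and pair below steps x8 backwards through the four
	// .Lk_sr rows: (x8 + 48) & ~64 is "subtract 16 mod 64" on the low
	// bits, which is safe to apply to the full pointer because
	// _vpaes_consts is 128-byte aligned (.align 7 above).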
	tbl	v3.16b, {v3.16b}, v1.16b	// vpshufb %xmm1, %xmm3, %xmm3
	add	x8, x8, #64-16		// add \$-16, %r8
	and	x8, x8, #~(1<<6)	// and \$0x30, %r8
	st1	{v3.2d}, [$out]		// vmovdqu %xmm3, (%rdx)
	ret
.size	_vpaes_schedule_mangle,.-_vpaes_schedule_mangle
.globl	vpaes_set_encrypt_key
.type	vpaes_set_encrypt_key,%function
vpaes_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// add \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
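	// Worked example: for a 128-bit key, 128/32 + 5 = 9, which is the
	// number of .Lenc_loop iterations in _vpaes_encrypt_core; the
	// initial key add and the final round sit outside that loop,
	// giving the usual 10 AES rounds.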
	mov	$dir, #0		// mov \$0,%ecx
	mov	x8, #0x30		// mov \$0x30,%r8d
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
.globl	vpaes_set_decrypt_key
.type	vpaes_set_decrypt_key,%function
vpaes_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!	// ABI spec says so

	lsr	w9, $bits, #5		// shr \$5,%eax
	add	w9, w9, #5		// add \$5,%eax
	str	w9, [$out,#240]		// mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
	lsl	w9, w9, #4		// shl \$4,%eax
	add	$out, $out, #16		// lea 16(%rdx,%rax),%rdx

	mov	$dir, #1		// mov \$1,%ecx
	lsr	w8, $bits, #1		// shr \$1,%r8d
	and	x8, x8, #32		// and \$32,%r8d
	eor	x8, x8, #32		// xor \$32,%r8d # nbits==192?0:32
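	// Worked example: 128 -> (64 & 32) ^ 32 = 32; 192 -> (96 & 32) ^ 32 = 0;
	// 256 -> (128 & 32) ^ 32 = 32, matching "nbits==192?0:32" above.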
	bl	_vpaes_schedule_core

	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
my ($inp,$out,$len,$key,$ivec,$dir) = map("x$_",(0..5));

.globl	vpaes_cbc_encrypt
.type	vpaes_cbc_encrypt,%function
vpaes_cbc_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	cbz	$len, .Lcbc_abort
	cmp	w5, #0			// check direction
	b.eq	vpaes_cbc_decrypt

	stp	x29,x30,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign

	ld1	{v0.16b}, [$ivec]	// load ivec
	bl	_vpaes_encrypt_preheat

.Lcbc_enc_loop:
	ld1	{v7.16b}, [$inp],#16	// load input
	eor	v7.16b, v7.16b, v0.16b	// xor with ivec
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16	// save output
	subs	x17, x17, #16
	b.hi	.Lcbc_enc_loop

	st1	{v0.16b}, [$ivec]	// write ivec

	ldp	x29,x30,[sp],#16
.Lcbc_abort:
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
.type	vpaes_cbc_decrypt,%function
vpaes_cbc_decrypt:
	// Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
	// only from vpaes_cbc_encrypt which has already signed the return address.
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	ld1	{v6.16b}, [$ivec]	// load ivec
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lcbc_dec_loop2x

.Lcbc_dec_loop:
	ld1	{v7.16b}, [$inp], #16	// load input
	bl	_vpaes_decrypt_core
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	orr	v6.16b, v7.16b, v7.16b	// next ivec value
	st1	{v0.16b}, [$out], #16
	subs	x17, x17, #16
	b.ls	.Lcbc_dec_done
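	// CBC decryption parallelizes: each plaintext block depends only on
	// two ciphertext blocks, so the loop below runs the 2x core on two
	// blocks per iteration.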
.Lcbc_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	eor	v0.16b, v0.16b, v6.16b	// xor with ivec
	eor	v1.16b, v1.16b, v14.16b
	orr	v6.16b, v15.16b, v15.16b
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lcbc_dec_loop2x

.Lcbc_dec_done:
	st1	{v6.16b}, [$ivec]

	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
.globl	vpaes_ecb_encrypt
.type	vpaes_ecb_encrypt,%function
vpaes_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	bl	_vpaes_encrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_enc_loop2x

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_encrypt_core
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.eq	.Lecb_enc_done

.Lecb_enc_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_encrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_enc_loop2x

.Lecb_enc_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
.globl	vpaes_ecb_decrypt
.type	vpaes_ecb_decrypt,%function
vpaes_ecb_decrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp	x29,x30,[sp,#-16]!
	stp	d8,d9,[sp,#-16]!	// ABI spec says so
	stp	d10,d11,[sp,#-16]!
	stp	d12,d13,[sp,#-16]!
	stp	d14,d15,[sp,#-16]!

	mov	x17, $len		// reassign
	mov	x2,  $key		// reassign
	bl	_vpaes_decrypt_preheat
	tst	x17, #16
	b.eq	.Lecb_dec_loop2x

	ld1	{v7.16b}, [$inp],#16
	bl	_vpaes_decrypt_core	// decrypt path: the decrypt core, not the encrypt one
	st1	{v0.16b}, [$out],#16
	subs	x17, x17, #16
	b.eq	.Lecb_dec_done

.Lecb_dec_loop2x:
	ld1	{v14.16b,v15.16b}, [$inp], #32
	bl	_vpaes_decrypt_2x
	st1	{v0.16b,v1.16b}, [$out], #32
	subs	x17, x17, #32
	b.hi	.Lecb_dec_loop2x

.Lecb_dec_done:
	ldp	d14,d15,[sp],#16
	ldp	d12,d13,[sp],#16
	ldp	d10,d11,[sp],#16
	ldp	d8,d9,[sp],#16
	ldp	x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
close STDOUT or die "error closing STDOUT: $!";