2 # Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 ######################################################################
11 ## Constant-time SSSE3 AES core implementation.
14 ## By Mike Hamburg (Stanford University), 2009
17 ## For details see http://shiftleft.org/papers/vector_aes/ and
18 ## http://crypto.stanford.edu/vpaes/.
20 ######################################################################
23 # Interface to OpenSSL as "almost" drop-in replacement for
24 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
25 # doesn't handle partial vectors (doesn't have to if called from
26 # EVP only). "Drop-in" implies that this module doesn't share key
27 # schedule structure with the original nor does it make assumptions
28 # about its alignment...
30 # Performance summary. aes-x86_64.pl column lists large-block CBC
31 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
32 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
33 # [also large-block CBC] encrypt/decrypt.
35 # aes-x86_64.pl vpaes-x86_64.pl
37 # Core 2(**) 29.6/41.1/14.3 21.9/25.2(***)
38 # Nehalem 29.6/40.3/14.6 10.0/11.8
39 # Atom 57.3/74.2/32.1 60.9/77.2(***)
40 # Silvermont 52.7/64.0/19.5 48.8/60.8(***)
41 # Goldmont 38.9/49.0/17.8 10.6/12.6
43 # (*) "Hyper-threading" in the context refers rather to cache shared
44 # among multiple cores, than to specifically Intel HTT. As vast
45 # majority of contemporary cores share cache, slower code path
46 # is commonplace. In other words "with-hyper-threading-off"
47 # results are presented mostly for reference purposes.
49 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
51 # (***) Less impressive improvement on Core 2 and Atom is due to slow
52 # pshufb, yet it's respectable +36%/62% improvement on Core 2
53 # (as implied, over "hyper-threading-safe" code path).
57 # $output is the last argument if it looks like a file (it has an extension)
58 # $flavour is the first argument if it doesn't look like a file
59 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
60 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Win64 targets (masm/nasm flavours, or an explicit .asm output file) need the
# SEH unwind tables and xmm6-xmm15 save/restore stanzas emitted further below.
62 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator next to this script, or in ../../perlasm.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
67 die "can't locate x86_64-xlate.pl";
# All generated code is piped through the translator, which converts this
# AT&T-style perlasm dialect into the requested assembler flavour/output.
69 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
70 or die "can't call $xlate: $!";
85 ## %xmm9-%xmm15 as in _vpaes_preheat
86 ## (%rdx) = scheduled keys
89 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
90 ## Preserves %xmm6 - %xmm8 so you get some local vectors
##
## AES encryption core: transforms the block in %xmm0 with the schedule at
## (%rdx) using only pshufb table lookups -- no data-dependent loads or
## branches, hence constant time.
93 .type _vpaes_encrypt_core,\@abi-omnipotent
101 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
103 movdqu (%r9), %xmm5 # round0 key
107 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
# %r10 = base of the mc tables; %r11 cycles 0x00..0x30 (round mod 4) to pick
# the .Lk_mc_forward/.Lk_mc_backward row for the current round.
112 lea .Lk_mc_backward(%rip),%r10
117 # middle of middle round
118 movdqa %xmm13, %xmm4 # 4 : sb1u
119 movdqa %xmm12, %xmm0 # 0 : sb1t
120 pshufb %xmm2, %xmm4 # 4 = sb1u
121 pshufb %xmm3, %xmm0 # 0 = sb1t
122 pxor %xmm5, %xmm4 # 4 = sb1u + k
123 movdqa %xmm15, %xmm5 # 4 : sb2u
124 pxor %xmm4, %xmm0 # 0 = A
125 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
126 pshufb %xmm2, %xmm5 # 4 = sb2u
127 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
128 movdqa %xmm14, %xmm2 # 2 : sb2t
129 pshufb %xmm3, %xmm2 # 2 = sb2t
130 movdqa %xmm0, %xmm3 # 3 = A
131 pxor %xmm5, %xmm2 # 2 = 2A
132 pshufb %xmm1, %xmm0 # 0 = B
133 add \$16, %r9 # next key
134 pxor %xmm2, %xmm0 # 0 = 2A+B
135 pshufb %xmm4, %xmm3 # 3 = D
136 add \$16, %r11 # next mc
137 pxor %xmm0, %xmm3 # 3 = 2A+B+D
138 pshufb %xmm1, %xmm0 # 0 = 2B+C
139 and \$0x30, %r11 # ... mod 4
141 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
# top of round: split the state into lo/hi nibbles and evaluate the S-box
# as a GF(2^4) inversion via the .Lk_inv tables in %xmm10/%xmm11.
145 movdqa %xmm9, %xmm1 # 1 : i
146 movdqa %xmm11, %xmm5 # 2 : a/k
147 pandn %xmm0, %xmm1 # 1 = i<<4
148 psrld \$4, %xmm1 # 1 = i
149 pand %xmm9, %xmm0 # 0 = k
150 pshufb %xmm0, %xmm5 # 2 = a/k
151 movdqa %xmm10, %xmm3 # 3 : 1/i
152 pxor %xmm1, %xmm0 # 0 = j
153 pshufb %xmm1, %xmm3 # 3 = 1/i
154 movdqa %xmm10, %xmm4 # 4 : 1/j
155 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
156 pshufb %xmm0, %xmm4 # 4 = 1/j
157 movdqa %xmm10, %xmm2 # 2 : 1/iak
158 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
159 pshufb %xmm3, %xmm2 # 2 = 1/iak
160 movdqa %xmm10, %xmm3 # 3 : 1/jak
161 pxor %xmm0, %xmm2 # 2 = io
162 pshufb %xmm4, %xmm3 # 3 = 1/jak
164 pxor %xmm1, %xmm3 # 3 = jo
167 # middle of last round
168 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
169 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
170 pshufb %xmm2, %xmm4 # 4 = sbou
171 pxor %xmm5, %xmm4 # 4 = sb1u + k
172 pshufb %xmm3, %xmm0 # 0 = sb1t
173 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
174 pxor %xmm4, %xmm0 # 0 = A
178 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
183 ## Same API as encryption core.
##
## AES decryption core: same register contract as _vpaes_encrypt_core, but
## driven by the decryption tables (.Lk_dipt / .Lk_dsb9/dsbd/dsbb/dsbe) and
## an inverse-MixColumns accumulation.
185 .type _vpaes_decrypt_core,\@abi-omnipotent
189 mov %rdx, %r9 # load key
192 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
196 movdqu (%r9), %xmm5 # round0 key
200 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
202 lea .Lk_dsbd(%rip),%r10
206 movdqa .Lk_mc_forward+48(%rip), %xmm5
215 ## Inverse mix columns
217 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
218 movdqa -0x10(%r10),%xmm1 # 0 : sb9t
219 pshufb %xmm2, %xmm4 # 4 = sb9u
220 pshufb %xmm3, %xmm1 # 0 = sb9t
222 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
223 pxor %xmm1, %xmm0 # 0 = ch
224 movdqa 0x10(%r10),%xmm1 # 0 : sbdt
226 pshufb %xmm2, %xmm4 # 4 = sbdu
227 pshufb %xmm5, %xmm0 # MC ch
228 pshufb %xmm3, %xmm1 # 0 = sbdt
229 pxor %xmm4, %xmm0 # 4 = ch
230 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
231 pxor %xmm1, %xmm0 # 0 = ch
232 movdqa 0x30(%r10),%xmm1 # 0 : sbbt
234 pshufb %xmm2, %xmm4 # 4 = sbbu
235 pshufb %xmm5, %xmm0 # MC ch
236 pshufb %xmm3, %xmm1 # 0 = sbbt
237 pxor %xmm4, %xmm0 # 4 = ch
238 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
239 pxor %xmm1, %xmm0 # 0 = ch
240 movdqa 0x50(%r10),%xmm1 # 0 : sbet
242 pshufb %xmm2, %xmm4 # 4 = sbeu
243 pshufb %xmm5, %xmm0 # MC ch
244 pshufb %xmm3, %xmm1 # 0 = sbet
245 pxor %xmm4, %xmm0 # 4 = ch
246 add \$16, %r9 # next round key
247 palignr \$12, %xmm5, %xmm5 # rotate the MC permute constant for next round
248 pxor %xmm1, %xmm0 # 0 = ch
# top of round: lo/hi nibble split and GF(2^4)-inversion S-box, identical in
# structure to the entry block of _vpaes_encrypt_core.
253 movdqa %xmm9, %xmm1 # 1 : i
254 pandn %xmm0, %xmm1 # 1 = i<<4
255 movdqa %xmm11, %xmm2 # 2 : a/k
256 psrld \$4, %xmm1 # 1 = i
257 pand %xmm9, %xmm0 # 0 = k
258 pshufb %xmm0, %xmm2 # 2 = a/k
259 movdqa %xmm10, %xmm3 # 3 : 1/i
260 pxor %xmm1, %xmm0 # 0 = j
261 pshufb %xmm1, %xmm3 # 3 = 1/i
262 movdqa %xmm10, %xmm4 # 4 : 1/j
263 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
264 pshufb %xmm0, %xmm4 # 4 = 1/j
265 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
266 movdqa %xmm10, %xmm2 # 2 : 1/iak
267 pshufb %xmm3, %xmm2 # 2 = 1/iak
268 movdqa %xmm10, %xmm3 # 3 : 1/jak
269 pxor %xmm0, %xmm2 # 2 = io
270 pshufb %xmm4, %xmm3 # 3 = 1/jak
272 pxor %xmm1, %xmm3 # 3 = jo
275 # middle of last round
276 movdqa 0x60(%r10), %xmm4 # 3 : sbou
277 pshufb %xmm2, %xmm4 # 4 = sbou
278 pxor %xmm0, %xmm4 # 4 = sb1u + k
279 movdqa 0x70(%r10), %xmm0 # 0 : sbot
280 movdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160
281 pshufb %xmm3, %xmm0 # 0 = sb1t
282 pxor %xmm4, %xmm0 # 0 = A
286 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
288 ########################################################
290 ## AES key schedule ##
292 ########################################################
293 .type _vpaes_schedule_core,\@abi-omnipotent
295 _vpaes_schedule_core:
## Expands the user key at (%rdi) into the schedule at (%rdx), dispatching
## on key size to the 128/192/256-bit loops below.
## NOTE(review): the full register contract (rdi=key, rdx=output, and the
## r8 round-mod-4 counter used with .Lk_sr) is partly established in lines
## not shown here -- confirm against the set_*_key callers.
300 # rcx = direction. 0=encrypt, 1=decrypt
302 call _vpaes_preheat # load the tables
303 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
304 movdqu (%rdi), %xmm0 # load key (unaligned)
308 lea .Lk_ipt(%rip), %r11
309 call _vpaes_schedule_transform
312 lea .Lk_sr(%rip),%r10
314 jnz .Lschedule_am_decrypting
316 # encrypting, output zeroth round key after transform
320 .Lschedule_am_decrypting:
321 # decrypting, output zeroth round key after shiftrows
322 movdqa (%r8,%r10),%xmm1 # .Lk_sr[] output permute
336 ## 128-bit specific part of key schedule.
338 ## This schedule is really simple, because all its parts
339 ## are accomplished by the subroutines.
345 call _vpaes_schedule_round
347 jz .Lschedule_mangle_last
348 call _vpaes_schedule_mangle # write output
349 jmp .Loop_schedule_128
354 ## 192-bit specific part of key schedule.
356 ## The main body of this schedule is the same as the 128-bit
357 ## schedule, but with more smearing. The long, high side is
358 ## stored in %xmm7 as before, and the short, low side is in
359 ## the high bits of %xmm6.
361 ## This schedule is somewhat nastier, however, because each
362 ## round produces 192 bits of key material, or 1.5 round keys.
363 ## Therefore, on each cycle we do 2 rounds and produce 3 round
368 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
369 call _vpaes_schedule_transform # input transform
370 movdqa %xmm0, %xmm6 # save short part
371 pxor %xmm4, %xmm4 # clear 4
372 movhlps %xmm4, %xmm6 # clobber low side with zeros
376 call _vpaes_schedule_round
377 palignr \$8,%xmm6,%xmm0
378 call _vpaes_schedule_mangle # save key n
379 call _vpaes_schedule_192_smear
380 call _vpaes_schedule_mangle # save key n+1
381 call _vpaes_schedule_round
383 jz .Lschedule_mangle_last
384 call _vpaes_schedule_mangle # save key n+2
385 call _vpaes_schedule_192_smear
386 jmp .Loop_schedule_192
391 ## 256-bit specific part of key schedule.
393 ## The structure here is very similar to the 128-bit
394 ## schedule, but with an additional "low side" in
395 ## %xmm6. The low side's rounds are the same as the
396 ## high side's, except no rcon and no rotation.
400 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
401 call _vpaes_schedule_transform # input transform
405 call _vpaes_schedule_mangle # output low result
406 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
409 call _vpaes_schedule_round
411 jz .Lschedule_mangle_last
412 call _vpaes_schedule_mangle
414 # low round. swap xmm7 and xmm6
415 pshufd \$0xFF, %xmm0, %xmm0
418 call _vpaes_schedule_low_round
421 jmp .Loop_schedule_256
425 ## .aes_schedule_mangle_last
427 ## Mangler for last round of key schedule
429 ## when encrypting, outputs out(%xmm0) ^ 63
430 ## when decrypting, outputs unskew(%xmm0)
432 ## Always called right before return... jumps to cleanup and exits
435 .Lschedule_mangle_last:
436 # schedule last round key from xmm0
437 lea .Lk_deskew(%rip),%r11 # prepare to deskew
439 jnz .Lschedule_mangle_last_dec
442 movdqa (%r8,%r10),%xmm1 # .Lk_sr[] output permute
443 pshufb %xmm1, %xmm0 # output permute
444 lea .Lk_opt(%rip), %r11 # prepare to output transform
447 .Lschedule_mangle_last_dec:
449 pxor .Lk_s63(%rip), %xmm0
450 call _vpaes_schedule_transform # output transform
451 movdqu %xmm0, (%rdx) # save last key
464 .size _vpaes_schedule_core,.-_vpaes_schedule_core
467 ## .aes_schedule_192_smear
469 ## Smear the short, low side in the 192-bit key schedule.
472 ## %xmm7: high side, b a x y
473 ## %xmm6: low side, d c 0 0
477 ## %xmm6: b+c+d b+c 0 0
478 ## %xmm0: b+c+d b+c b a
## Clobbers %xmm1.
480 .type _vpaes_schedule_192_smear,\@abi-omnipotent
482 _vpaes_schedule_192_smear:
484 pshufd \$0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0
485 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
486 pxor %xmm1, %xmm6 # -> c+d c 0 0
488 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
490 movhlps %xmm1, %xmm6 # clobber low side with zeros
493 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
496 ## .aes_schedule_round
498 ## Runs one main round of the key schedule on %xmm0, %xmm7
500 ## Specifically, runs subbytes on the high dword of %xmm0
501 ## then rotates it by one byte and xors into the low dword of
504 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
507 ## Smears the dwords of %xmm7 by xoring the low into the
508 ## second low, result into third, result into highest.
510 ## Returns results in %xmm7 = %xmm0.
511 ## Clobbers %xmm1-%xmm4, %r11.
513 .type _vpaes_schedule_round,\@abi-omnipotent
515 _vpaes_schedule_round:
517 # extract rcon from xmm8
519 palignr \$15, %xmm8, %xmm1
520 palignr \$15, %xmm8, %xmm8
524 pshufd \$0xFF, %xmm0, %xmm0
525 palignr \$1, %xmm0, %xmm0
529 # low round: same as high round, but no rotation and no rcon.
530 _vpaes_schedule_low_round:
538 pxor .Lk_s63(%rip), %xmm7
# subbytes: GF(2^4)-inversion S-box evaluation on the whole vector, same
# table scheme (%xmm9-%xmm13) as the entry block of _vpaes_encrypt_core.
543 psrld \$4, %xmm1 # 1 = i
544 pand %xmm9, %xmm0 # 0 = k
545 movdqa %xmm11, %xmm2 # 2 : a/k
546 pshufb %xmm0, %xmm2 # 2 = a/k
547 pxor %xmm1, %xmm0 # 0 = j
548 movdqa %xmm10, %xmm3 # 3 : 1/i
549 pshufb %xmm1, %xmm3 # 3 = 1/i
550 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
551 movdqa %xmm10, %xmm4 # 4 : 1/j
552 pshufb %xmm0, %xmm4 # 4 = 1/j
553 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
554 movdqa %xmm10, %xmm2 # 2 : 1/iak
555 pshufb %xmm3, %xmm2 # 2 = 1/iak
556 pxor %xmm0, %xmm2 # 2 = io
557 movdqa %xmm10, %xmm3 # 3 : 1/jak
558 pshufb %xmm4, %xmm3 # 3 = 1/jak
559 pxor %xmm1, %xmm3 # 3 = jo
560 movdqa %xmm13, %xmm4 # 4 : sbou
561 pshufb %xmm2, %xmm4 # 4 = sbou
562 movdqa %xmm12, %xmm0 # 0 : sbot
563 pshufb %xmm3, %xmm0 # 0 = sb1t
564 pxor %xmm4, %xmm0 # 0 = sbox output
566 # add in smeared stuff
571 .size _vpaes_schedule_round,.-_vpaes_schedule_round
574 ## .aes_schedule_transform
576 ## Linear-transform %xmm0 according to tables at (%r11)
578 ## Requires that %xmm9 = 0x0F0F... as in preheat
580 ## Clobbers %xmm1, %xmm2
## Output in %xmm0: lo-nibble lookup in (%r11) xored with hi-nibble lookup
## in 16(%r11).
582 .type _vpaes_schedule_transform,\@abi-omnipotent
584 _vpaes_schedule_transform:
590 movdqa (%r11), %xmm2 # lo
592 movdqa 16(%r11), %xmm0 # hi
597 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
600 ## .aes_schedule_mangle
602 ## Mangle xmm0 from (basis-transformed) standard version
607 ## multiply by circulant 0,1,1,1
608 ## apply shiftrows transform
612 ## multiply by "inverse mixcolumns" circulant E,B,D,9
614 ## apply shiftrows transform
617 ## Writes out to (%rdx), and increments or decrements it
618 ## Keeps track of round number mod 4 in %r8
620 ## Clobbers xmm1-xmm5
622 .type _vpaes_schedule_mangle,\@abi-omnipotent
624 _vpaes_schedule_mangle:
626 movdqa %xmm0, %xmm4 # save xmm0 for later
627 movdqa .Lk_mc_forward(%rip),%xmm5
629 jnz .Lschedule_mangle_dec
633 pxor .Lk_s63(%rip),%xmm4
641 jmp .Lschedule_mangle_both
643 .Lschedule_mangle_dec:
644 # inverse mix columns
645 lea .Lk_dksd(%rip),%r11
648 psrld \$4, %xmm1 # 1 = hi
649 pand %xmm9, %xmm4 # 4 = lo
# %r11 offsets 0x00-0x70 walk the four decrypt-keyschedule table pairs laid
# out consecutively in _vpaes_consts: dksd, dksb, dkse, dks9.
651 movdqa 0x00(%r11), %xmm2 # .Lk_dksd lo
653 movdqa 0x10(%r11), %xmm3 # .Lk_dksd hi
658 movdqa 0x20(%r11), %xmm2 # .Lk_dksb lo
661 movdqa 0x30(%r11), %xmm3 # .Lk_dksb hi
666 movdqa 0x40(%r11), %xmm2 # .Lk_dkse lo
669 movdqa 0x50(%r11), %xmm3 # .Lk_dkse hi
674 movdqa 0x60(%r11), %xmm2 # .Lk_dks9 lo
677 movdqa 0x70(%r11), %xmm3 # .Lk_dks9 hi
683 .Lschedule_mangle_both:
684 movdqa (%r8,%r10),%xmm1 # .Lk_sr[] output permute
691 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
694 # Interface to OpenSSL
696 .globl ${PREFIX}_set_encrypt_key
697 .type ${PREFIX}_set_encrypt_key,\@function,3
# NOTE(review): 3-arg entry; expands the user key into the AES_KEY at %rdx
# (rounds stored at offset 240 below). Assumed to match the standard
# AES_set_encrypt_key(userKey, bits, key) signature -- confirm with callers.
699 ${PREFIX}_set_encrypt_key:
702 $code.=<<___ if ($win64);
# Win64: xmm6-xmm15 are callee-saved; spill them to the local frame.
704 movaps %xmm6,0x10(%rsp)
705 movaps %xmm7,0x20(%rsp)
706 movaps %xmm8,0x30(%rsp)
707 movaps %xmm9,0x40(%rsp)
708 movaps %xmm10,0x50(%rsp)
709 movaps %xmm11,0x60(%rsp)
710 movaps %xmm12,0x70(%rsp)
711 movaps %xmm13,0x80(%rsp)
712 movaps %xmm14,0x90(%rsp)
713 movaps %xmm15,0xa0(%rsp)
720 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
724 call _vpaes_schedule_core
726 $code.=<<___ if ($win64);
# Win64: restore the callee-saved xmm registers.
727 movaps 0x10(%rsp),%xmm6
728 movaps 0x20(%rsp),%xmm7
729 movaps 0x30(%rsp),%xmm8
730 movaps 0x40(%rsp),%xmm9
731 movaps 0x50(%rsp),%xmm10
732 movaps 0x60(%rsp),%xmm11
733 movaps 0x70(%rsp),%xmm12
734 movaps 0x80(%rsp),%xmm13
735 movaps 0x90(%rsp),%xmm14
736 movaps 0xa0(%rsp),%xmm15
744 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
746 .globl ${PREFIX}_set_decrypt_key
747 .type ${PREFIX}_set_decrypt_key,\@function,3
# NOTE(review): as set_encrypt_key, but schedules for decryption; assumed to
# match AES_set_decrypt_key(userKey, bits, key) -- confirm with callers.
749 ${PREFIX}_set_decrypt_key:
752 $code.=<<___ if ($win64);
# Win64: xmm6-xmm15 are callee-saved; spill them to the local frame.
754 movaps %xmm6,0x10(%rsp)
755 movaps %xmm7,0x20(%rsp)
756 movaps %xmm8,0x30(%rsp)
757 movaps %xmm9,0x40(%rsp)
758 movaps %xmm10,0x50(%rsp)
759 movaps %xmm11,0x60(%rsp)
760 movaps %xmm12,0x70(%rsp)
761 movaps %xmm13,0x80(%rsp)
762 movaps %xmm14,0x90(%rsp)
763 movaps %xmm15,0xa0(%rsp)
770 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
772 lea 16(%rdx,%rax),%rdx # end of schedule; decrypt keys are written in reverse
778 xor \$32,%r8d # nbits==192?0:32
779 call _vpaes_schedule_core
781 $code.=<<___ if ($win64);
# Win64: restore the callee-saved xmm registers.
782 movaps 0x10(%rsp),%xmm6
783 movaps 0x20(%rsp),%xmm7
784 movaps 0x30(%rsp),%xmm8
785 movaps 0x40(%rsp),%xmm9
786 movaps 0x50(%rsp),%xmm10
787 movaps 0x60(%rsp),%xmm11
788 movaps 0x70(%rsp),%xmm12
789 movaps 0x80(%rsp),%xmm13
790 movaps 0x90(%rsp),%xmm14
791 movaps 0xa0(%rsp),%xmm15
799 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
801 .globl ${PREFIX}_encrypt
802 .type ${PREFIX}_encrypt,\@function,3
# NOTE(review): single-block entry wrapping _vpaes_encrypt_core; assumed to
# match AES_encrypt(in, out, key) -- confirm with callers.
807 $code.=<<___ if ($win64);
# Win64: xmm6-xmm15 are callee-saved; spill them to the local frame.
809 movaps %xmm6,0x10(%rsp)
810 movaps %xmm7,0x20(%rsp)
811 movaps %xmm8,0x30(%rsp)
812 movaps %xmm9,0x40(%rsp)
813 movaps %xmm10,0x50(%rsp)
814 movaps %xmm11,0x60(%rsp)
815 movaps %xmm12,0x70(%rsp)
816 movaps %xmm13,0x80(%rsp)
817 movaps %xmm14,0x90(%rsp)
818 movaps %xmm15,0xa0(%rsp)
824 call _vpaes_encrypt_core
827 $code.=<<___ if ($win64);
# Win64: restore the callee-saved xmm registers.
828 movaps 0x10(%rsp),%xmm6
829 movaps 0x20(%rsp),%xmm7
830 movaps 0x30(%rsp),%xmm8
831 movaps 0x40(%rsp),%xmm9
832 movaps 0x50(%rsp),%xmm10
833 movaps 0x60(%rsp),%xmm11
834 movaps 0x70(%rsp),%xmm12
835 movaps 0x80(%rsp),%xmm13
836 movaps 0x90(%rsp),%xmm14
837 movaps 0xa0(%rsp),%xmm15
844 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
846 .globl ${PREFIX}_decrypt
847 .type ${PREFIX}_decrypt,\@function,3
# NOTE(review): single-block entry wrapping _vpaes_decrypt_core; assumed to
# match AES_decrypt(in, out, key) -- confirm with callers.
852 $code.=<<___ if ($win64);
# Win64: xmm6-xmm15 are callee-saved; spill them to the local frame.
854 movaps %xmm6,0x10(%rsp)
855 movaps %xmm7,0x20(%rsp)
856 movaps %xmm8,0x30(%rsp)
857 movaps %xmm9,0x40(%rsp)
858 movaps %xmm10,0x50(%rsp)
859 movaps %xmm11,0x60(%rsp)
860 movaps %xmm12,0x70(%rsp)
861 movaps %xmm13,0x80(%rsp)
862 movaps %xmm14,0x90(%rsp)
863 movaps %xmm15,0xa0(%rsp)
869 call _vpaes_decrypt_core
872 $code.=<<___ if ($win64);
# Win64: restore the callee-saved xmm registers.
873 movaps 0x10(%rsp),%xmm6
874 movaps 0x20(%rsp),%xmm7
875 movaps 0x30(%rsp),%xmm8
876 movaps 0x40(%rsp),%xmm9
877 movaps 0x50(%rsp),%xmm10
878 movaps 0x60(%rsp),%xmm11
879 movaps 0x70(%rsp),%xmm12
880 movaps 0x80(%rsp),%xmm13
881 movaps 0x90(%rsp),%xmm14
882 movaps 0xa0(%rsp),%xmm15
889 .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
892 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
893 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
894 # size_t length, const AES_KEY *key,
895 # unsigned char *ivp,const int enc);
897 .globl ${PREFIX}_cbc_encrypt
898 .type ${PREFIX}_cbc_encrypt,\@function,6
900 ${PREFIX}_cbc_encrypt:
# NOTE(review): Perl-level rebinding keeping the symbolic names in step with
# a register exchange of the length/key arguments emitted just above in the
# generated code -- confirm against the full source.
904 ($len,$key)=($key,$len);
909 $code.=<<___ if ($win64);
# Win64: xmm6-xmm15 are callee-saved; spill them to the local frame.
911 movaps %xmm6,0x10(%rsp)
912 movaps %xmm7,0x20(%rsp)
913 movaps %xmm8,0x30(%rsp)
914 movaps %xmm9,0x40(%rsp)
915 movaps %xmm10,0x50(%rsp)
916 movaps %xmm11,0x60(%rsp)
917 movaps %xmm12,0x70(%rsp)
918 movaps %xmm13,0x80(%rsp)
919 movaps %xmm14,0x90(%rsp)
920 movaps %xmm15,0xa0(%rsp)
924 movdqu ($ivp),%xmm6 # load IV
934 call _vpaes_encrypt_core
936 movdqu %xmm0,($out,$inp) # write block; input pointer doubles as running offset
945 call _vpaes_decrypt_core
948 movdqu %xmm0,($out,$inp) # write block; input pointer doubles as running offset
953 movdqu %xmm6,($ivp) # save IV
955 $code.=<<___ if ($win64);
# Win64: restore the callee-saved xmm registers.
956 movaps 0x10(%rsp),%xmm6
957 movaps 0x20(%rsp),%xmm7
958 movaps 0x30(%rsp),%xmm8
959 movaps 0x40(%rsp),%xmm9
960 movaps 0x50(%rsp),%xmm10
961 movaps 0x60(%rsp),%xmm11
962 movaps 0x70(%rsp),%xmm12
963 movaps 0x80(%rsp),%xmm13
964 movaps 0x90(%rsp),%xmm14
965 movaps 0xa0(%rsp),%xmm15
973 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
980 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
981 ## and %xmm9-%xmm15 as specified below.
## %xmm9 = 0x0F nibble mask, %xmm10/%xmm11 = GF(2^4) inversion tables,
## %xmm12-%xmm15 = sbox output tables for the encrypt/schedule cores.
983 .type _vpaes_preheat,\@abi-omnipotent
987 lea .Lk_s0F(%rip), %r10
988 movdqa -0x20(%r10), %xmm10 # .Lk_inv
989 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
990 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
991 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
992 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
993 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
994 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
997 .size _vpaes_preheat,.-_vpaes_preheat
998 ########################################################
1002 ########################################################
1003 .type _vpaes_consts,\@object
1006 .Lk_inv: # inv, inva
1007 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
1008 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
# 0x0F nibble mask (.Lk_s0F), loaded into %xmm9 by _vpaes_preheat
1011 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
1013 .Lk_ipt: # input transform (lo, hi)
1014 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
1015 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
1017 .Lk_sb1: # sb1u, sb1t
1018 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
1019 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
1020 .Lk_sb2: # sb2u, sb2t
1021 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
1022 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
1023 .Lk_sbo: # sbou, sbot
1024 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
1025 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
1027 .Lk_mc_forward: # mc_forward
1028 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
1029 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
1030 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
1031 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
1033 .Lk_mc_backward:# mc_backward
1034 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
1035 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
1036 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
1037 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
# shiftrows permutations (.Lk_sr), indexed 0x00-0x30 by round number mod 4;
# first row is the identity permutation
1040 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1041 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1042 .quad 0x0F060D040B020900, 0x070E050C030A0108
1043 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
# round-constant vector (.Lk_rcon), loaded into %xmm8 by _vpaes_schedule_core
1046 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1048 .Lk_s63: # s63: all equal to 0x63 transformed
1049 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1051 .Lk_opt: # output transform
1052 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1053 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1055 .Lk_deskew: # deskew tables: inverts the sbox's "skew"
1056 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1057 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1061 ## Key schedule constants
1063 .Lk_dksd: # decryption key schedule: invskew x*D
1064 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1065 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1066 .Lk_dksb: # decryption key schedule: invskew x*B
1067 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1068 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1069 .Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1070 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1071 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1072 .Lk_dks9: # decryption key schedule: invskew x*9
1073 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1074 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1078 ## Round function constants
1080 .Lk_dipt: # decryption input transform
1081 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1082 .quad 0x86E383E660056500, 0x12771772F491F194
1084 .Lk_dsb9: # decryption sbox output *9*u, *9*t
1085 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1086 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1087 .Lk_dsbd: # decryption sbox output *D*u, *D*t
1088 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1089 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1090 .Lk_dsbb: # decryption sbox output *B*u, *B*t
1091 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1092 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1093 .Lk_dsbe: # decryption sbox output *E*u, *E*t
1094 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1095 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1096 .Lk_dsbo: # decryption sbox final output
1097 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1098 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1099 .asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
1101 .size _vpaes_consts,.-_vpaes_consts
1105 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1106 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Win64 SEH handler shared by all public entry points: if the faulting RIP
# lies between the prologue/epilogue labels recorded in HandlerData, the
# frame is live, so the saved xmm6-xmm15 are copied back into the CONTEXT
# and the 0xb8-byte frame is popped; unwinding then continues through
# RtlVirtualUnwind.
1113 .extern __imp_RtlVirtualUnwind
1114 .type se_handler,\@abi-omnipotent
1128 mov 120($context),%rax # pull context->Rax
1129 mov 248($context),%rbx # pull context->Rip
1131 mov 8($disp),%rsi # disp->ImageBase
1132 mov 56($disp),%r11 # disp->HandlerData
1134 mov 0(%r11),%r10d # HandlerData[0]
1135 lea (%rsi,%r10),%r10 # prologue label
1136 cmp %r10,%rbx # context->Rip<prologue label
1139 mov 152($context),%rax # pull context->Rsp
1141 mov 4(%r11),%r10d # HandlerData[1]
1142 lea (%rsi,%r10),%r10 # epilogue label
1143 cmp %r10,%rbx # context->Rip>=epilogue label
1146 lea 16(%rax),%rsi # %xmm save area
1147 lea 512($context),%rdi # &context.Xmm6
1148 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1149 .long 0xa548f3fc # cld; rep movsq
1150 lea 0xb8(%rax),%rax # adjust stack pointer
1155 mov %rax,152($context) # restore context->Rsp
1156 mov %rsi,168($context) # restore context->Rsi
1157 mov %rdi,176($context) # restore context->Rdi
1159 mov 40($disp),%rdi # disp->ContextRecord
1160 mov $context,%rsi # context
1161 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1162 .long 0xa548f3fc # cld; rep movsq
1165 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1166 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1167 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1168 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1169 mov 40(%rsi),%r10 # disp->ContextRecord
1170 lea 56(%rsi),%r11 # &disp->HandlerData
1171 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1172 mov %r10,32(%rsp) # arg5
1173 mov %r11,40(%rsp) # arg6
1174 mov %r12,48(%rsp) # arg7
1175 mov %rcx,56(%rsp) # arg8, (NULL)
1176 call *__imp_RtlVirtualUnwind(%rip)
1178 mov \$1,%eax # ExceptionContinueSearch
1190 .size se_handler,.-se_handler
# SEH tables: one begin/end/info triple per exported function, followed by
# the per-function handler data (prologue/epilogue label pairs consumed by
# se_handler above as HandlerData[0]/HandlerData[1]).
1194 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1195 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1196 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1198 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1199 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1200 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1202 .rva .LSEH_begin_${PREFIX}_encrypt
1203 .rva .LSEH_end_${PREFIX}_encrypt
1204 .rva .LSEH_info_${PREFIX}_encrypt
1206 .rva .LSEH_begin_${PREFIX}_decrypt
1207 .rva .LSEH_end_${PREFIX}_decrypt
1208 .rva .LSEH_info_${PREFIX}_decrypt
1210 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1211 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1212 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1216 .LSEH_info_${PREFIX}_set_encrypt_key:
1219 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1220 .LSEH_info_${PREFIX}_set_decrypt_key:
1223 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1224 .LSEH_info_${PREFIX}_encrypt:
1227 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1228 .LSEH_info_${PREFIX}_decrypt:
1231 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1232 .LSEH_info_${PREFIX}_cbc_encrypt:
1235 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
# Evaluate `...` constructs (compile-time arithmetic such as `1232/8`)
# before the code is emitted through the translator pipe.
1239 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Flush and close the translator pipe; propagate any write failure.
1243 close STDOUT or die "error closing STDOUT";