3 ######################################################################
4 ## Constant-time SSSE3 AES core implementation.
7 ## By Mike Hamburg (Stanford University), 2009
10 ## For details see http://shiftleft.org/papers/vector_aes/ and
11 ## http://crypto.stanford.edu/vpaes/.
13 ######################################################################
16 # Interface to OpenSSL as "almost" drop-in replacement for
17 # aes-x86_64.pl. "Almost" refers to the fact that AES_cbc_encrypt
18 # doesn't handle partial vectors (doesn't have to if called from
19 # EVP only). "Drop-in" implies that this module doesn't share key
20 # schedule structure with the original nor does it make assumption
21 # about its alignment...
23 # Performance summary. aes-x86_64.pl column lists large-block CBC
24 # encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
25 # byte processed with 128-bit key, and vpaes-x86_64.pl column -
28 # aes-x86_64.pl vpaes-x86_64.pl
30 # Core 2(**) 30.5/43.7/14.3 21.8/25.7(***)
31 # Nehalem 30.5/42.2/14.6 9.8/11.8
32 # Atom 63.9/79.0/32.1 64.0/84.8(***)
34 # (*) "Hyper-threading" in the context refers rather to cache shared
35 # among multiple cores, than to specifically Intel HTT. As vast
36 # majority of contemporary cores share cache, slower code path
37 # is common place. In other words "with-hyper-threading-off"
38 # results are presented mostly for reference purposes.
40 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
42 # (***) Less impressive improvement on Core 2 and Atom is due to slow
43 # pshufb, yet it's respectable +40%/78% improvement on Core 2.
49 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
51 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
53 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
54 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
55 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
56 die "can't locate x86_64-xlate.pl";
58 open STDOUT,"| $^X $xlate $flavour $output";
72 ## %xmm9-%xmm15 as in _vpaes_preheat
73 ## (%rdx) = scheduled keys
76 ## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax
77 ## Preserves %xmm6 - %xmm8 so you get some local vectors
80 .type _vpaes_encrypt_core,\@abi-omnipotent
87 movdqa .Lk_ipt(%rip), %xmm2 # iptlo
89 movdqu (%r9), %xmm5 # round0 key
93 movdqa .Lk_ipt+16(%rip), %xmm0 # ipthi
98 lea .Lk_mc_backward(%rip),%r10
103 # middle of middle round
104 movdqa %xmm13, %xmm4 # 4 : sb1u
105 pshufb %xmm2, %xmm4 # 4 = sb1u
106 pxor %xmm5, %xmm4 # 4 = sb1u + k
107 movdqa %xmm12, %xmm0 # 0 : sb1t
108 pshufb %xmm3, %xmm0 # 0 = sb1t
109 pxor %xmm4, %xmm0 # 0 = A
110 movdqa %xmm15, %xmm5 # 4 : sb2u
111 pshufb %xmm2, %xmm5 # 4 = sb2u
112 movdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[]
113 movdqa %xmm14, %xmm2 # 2 : sb2t
114 pshufb %xmm3, %xmm2 # 2 = sb2t
115 pxor %xmm5, %xmm2 # 2 = 2A
116 movdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[]
117 movdqa %xmm0, %xmm3 # 3 = A
118 pshufb %xmm1, %xmm0 # 0 = B
119 add \$16, %r9 # next key
120 pxor %xmm2, %xmm0 # 0 = 2A+B
121 pshufb %xmm4, %xmm3 # 3 = D
122 add \$16, %r11 # next mc
123 pxor %xmm0, %xmm3 # 3 = 2A+B+D
124 pshufb %xmm1, %xmm0 # 0 = 2B+C
125 and \$0x30, %r11 # ... mod 4
126 pxor %xmm3, %xmm0 # 0 = 2A+3B+C+D
131 movdqa %xmm9, %xmm1 # 1 : i
132 pandn %xmm0, %xmm1 # 1 = i<<4
133 psrld \$4, %xmm1 # 1 = i
134 pand %xmm9, %xmm0 # 0 = k
135 movdqa %xmm11, %xmm5 # 2 : a/k
136 pshufb %xmm0, %xmm5 # 2 = a/k
137 pxor %xmm1, %xmm0 # 0 = j
138 movdqa %xmm10, %xmm3 # 3 : 1/i
139 pshufb %xmm1, %xmm3 # 3 = 1/i
140 pxor %xmm5, %xmm3 # 3 = iak = 1/i + a/k
141 movdqa %xmm10, %xmm4 # 4 : 1/j
142 pshufb %xmm0, %xmm4 # 4 = 1/j
143 pxor %xmm5, %xmm4 # 4 = jak = 1/j + a/k
144 movdqa %xmm10, %xmm2 # 2 : 1/iak
145 pshufb %xmm3, %xmm2 # 2 = 1/iak
146 pxor %xmm0, %xmm2 # 2 = io
147 movdqa %xmm10, %xmm3 # 3 : 1/jak
149 pshufb %xmm4, %xmm3 # 3 = 1/jak
150 pxor %xmm1, %xmm3 # 3 = jo
153 # middle of last round
154 movdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo
155 movdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16
156 pshufb %xmm2, %xmm4 # 4 = sbou
157 pxor %xmm5, %xmm4 # 4 = sb1u + k
158 pshufb %xmm3, %xmm0 # 0 = sb1t
159 movdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[]
160 pxor %xmm4, %xmm0 # 0 = A
163 .size _vpaes_encrypt_core,.-_vpaes_encrypt_core
168 ## Same API as encryption core.
170 .type _vpaes_decrypt_core,\@abi-omnipotent
173 mov %rdx, %r9 # load key
176 movdqa .Lk_dipt(%rip), %xmm2 # iptlo
180 movdqu (%r9), %xmm5 # round0 key
184 movdqa .Lk_dipt+16(%rip), %xmm0 # ipthi
186 lea .Lk_dsbd(%rip),%r10
190 movdqa .Lk_mc_forward+48(%rip), %xmm5
199 ## Inverse mix columns
201 movdqa -0x20(%r10),%xmm4 # 4 : sb9u
202 pshufb %xmm2, %xmm4 # 4 = sb9u
204 movdqa -0x10(%r10),%xmm0 # 0 : sb9t
205 pshufb %xmm3, %xmm0 # 0 = sb9t
206 pxor %xmm4, %xmm0 # 0 = ch
207 add \$16, %r9 # next round key
209 pshufb %xmm5, %xmm0 # MC ch
210 movdqa 0x00(%r10),%xmm4 # 4 : sbdu
211 pshufb %xmm2, %xmm4 # 4 = sbdu
212 pxor %xmm0, %xmm4 # 4 = ch
213 movdqa 0x10(%r10),%xmm0 # 0 : sbdt
214 pshufb %xmm3, %xmm0 # 0 = sbdt
215 pxor %xmm4, %xmm0 # 0 = ch
218 pshufb %xmm5, %xmm0 # MC ch
219 movdqa 0x20(%r10),%xmm4 # 4 : sbbu
220 pshufb %xmm2, %xmm4 # 4 = sbbu
221 pxor %xmm0, %xmm4 # 4 = ch
222 movdqa 0x30(%r10),%xmm0 # 0 : sbbt
223 pshufb %xmm3, %xmm0 # 0 = sbbt
224 pxor %xmm4, %xmm0 # 0 = ch
226 pshufb %xmm5, %xmm0 # MC ch
227 movdqa 0x40(%r10),%xmm4 # 4 : sbeu
228 pshufb %xmm2, %xmm4 # 4 = sbeu
229 pxor %xmm0, %xmm4 # 4 = ch
230 movdqa 0x50(%r10),%xmm0 # 0 : sbet
231 pshufb %xmm3, %xmm0 # 0 = sbet
232 pxor %xmm4, %xmm0 # 0 = ch
234 palignr \$12, %xmm5, %xmm5
238 movdqa %xmm9, %xmm1 # 1 : i
239 pandn %xmm0, %xmm1 # 1 = i<<4
240 psrld \$4, %xmm1 # 1 = i
241 pand %xmm9, %xmm0 # 0 = k
242 movdqa %xmm11, %xmm2 # 2 : a/k
243 pshufb %xmm0, %xmm2 # 2 = a/k
244 pxor %xmm1, %xmm0 # 0 = j
245 movdqa %xmm10, %xmm3 # 3 : 1/i
246 pshufb %xmm1, %xmm3 # 3 = 1/i
247 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
248 movdqa %xmm10, %xmm4 # 4 : 1/j
249 pshufb %xmm0, %xmm4 # 4 = 1/j
250 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
251 movdqa %xmm10, %xmm2 # 2 : 1/iak
252 pshufb %xmm3, %xmm2 # 2 = 1/iak
253 pxor %xmm0, %xmm2 # 2 = io
254 movdqa %xmm10, %xmm3 # 3 : 1/jak
255 pshufb %xmm4, %xmm3 # 3 = 1/jak
256 pxor %xmm1, %xmm3 # 3 = jo
260 # middle of last round
261 movdqa 0x60(%r10), %xmm4 # 3 : sbou
262 pshufb %xmm2, %xmm4 # 4 = sbou
263 pxor %xmm0, %xmm4 # 4 = sb1u + k
264 movdqa 0x70(%r10), %xmm0 # 0 : sbot
265 movdqa .Lk_sr-.Lk_dsbd(%r11), %xmm2
266 pshufb %xmm3, %xmm0 # 0 = sb1t
267 pxor %xmm4, %xmm0 # 0 = A
270 .size _vpaes_decrypt_core,.-_vpaes_decrypt_core
272 ########################################################
274 ## AES key schedule ##
276 ########################################################
277 .type _vpaes_schedule_core,\@abi-omnipotent
279 _vpaes_schedule_core:
283 # rcx = direction. 0=encrypt, 1=decrypt
285 call _vpaes_preheat # load the tables
286 movdqa .Lk_rcon(%rip), %xmm8 # load rcon
287 movdqu (%rdi), %xmm0 # load key (unaligned)
291 lea .Lk_ipt(%rip), %r11
292 call _vpaes_schedule_transform
295 lea .Lk_sr(%rip),%r10
297 jnz .Lschedule_am_decrypting
299 # encrypting, output zeroth round key after transform
303 .Lschedule_am_decrypting:
304 # decrypting, output zeroth round key after shiftrows
305 movdqa (%r8,%r10),%xmm1
319 ## 128-bit specific part of key schedule.
321 ## This schedule is really simple, because all its parts
322 ## are accomplished by the subroutines.
328 call _vpaes_schedule_round
330 jz .Lschedule_mangle_last
331 call _vpaes_schedule_mangle # write output
332 jmp .Loop_schedule_128
337 ## 192-bit specific part of key schedule.
339 ## The main body of this schedule is the same as the 128-bit
340 ## schedule, but with more smearing. The long, high side is
341 ## stored in %xmm7 as before, and the short, low side is in
342 ## the high bits of %xmm6.
344 ## This schedule is somewhat nastier, however, because each
345 ## round produces 192 bits of key material, or 1.5 round keys.
346 ## Therefore, on each cycle we do 2 rounds and produce 3 round
351 movdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned)
352 call _vpaes_schedule_transform # input transform
353 movdqa %xmm0, %xmm6 # save short part
354 pxor %xmm4, %xmm4 # clear 4
355 movhlps %xmm4, %xmm6 # clobber low side with zeros
359 call _vpaes_schedule_round
360 palignr \$8,%xmm6,%xmm0
361 call _vpaes_schedule_mangle # save key n
362 call _vpaes_schedule_192_smear
363 call _vpaes_schedule_mangle # save key n+1
364 call _vpaes_schedule_round
366 jz .Lschedule_mangle_last
367 call _vpaes_schedule_mangle # save key n+2
368 call _vpaes_schedule_192_smear
369 jmp .Loop_schedule_192
374 ## 256-bit specific part of key schedule.
376 ## The structure here is very similar to the 128-bit
377 ## schedule, but with an additional "low side" in
378 ## %xmm6. The low side's rounds are the same as the
379 ## high side's, except no rcon and no rotation.
383 movdqu 16(%rdi),%xmm0 # load key part 2 (unaligned)
384 call _vpaes_schedule_transform # input transform
388 call _vpaes_schedule_mangle # output low result
389 movdqa %xmm0, %xmm6 # save cur_lo in xmm6
392 call _vpaes_schedule_round
394 jz .Lschedule_mangle_last
395 call _vpaes_schedule_mangle
397 # low round. swap xmm7 and xmm6
398 pshufd \$0xFF, %xmm0, %xmm0
401 call _vpaes_schedule_low_round
404 jmp .Loop_schedule_256
408 ## .aes_schedule_mangle_last
410 ## Mangler for last round of key schedule
412 ## when encrypting, outputs out(%xmm0) ^ 63
413 ## when decrypting, outputs unskew(%xmm0)
415 ## Always called right before return... jumps to cleanup and exits
418 .Lschedule_mangle_last:
419 # schedule last round key from xmm0
420 lea .Lk_deskew(%rip),%r11 # prepare to deskew
422 jnz .Lschedule_mangle_last_dec
425 movdqa (%r8,%r10),%xmm1
426 pshufb %xmm1, %xmm0 # output permute
427 lea .Lk_opt(%rip), %r11 # prepare to output transform
430 .Lschedule_mangle_last_dec:
432 pxor .Lk_s63(%rip), %xmm0
433 call _vpaes_schedule_transform # output transform
434 movdqu %xmm0, (%rdx) # save last key
446 .size _vpaes_schedule_core,.-_vpaes_schedule_core
449 ## .aes_schedule_192_smear
451 ## Smear the short, low side in the 192-bit key schedule.
454 ## %xmm7: high side, b a x y
455 ## %xmm6: low side, d c 0 0
459 ## %xmm6: b+c+d b+c 0 0
460 ## %xmm0: b+c+d b+c b a
462 .type _vpaes_schedule_192_smear,\@abi-omnipotent
464 _vpaes_schedule_192_smear:
465 pshufd \$0x80, %xmm6, %xmm0 # d c 0 0 -> c 0 0 0
466 pxor %xmm0, %xmm6 # -> c+d c 0 0
467 pshufd \$0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a
468 pxor %xmm0, %xmm6 # -> b+c+d b+c b a
471 movhlps %xmm1, %xmm6 # clobber low side with zeros
473 .size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
476 ## .aes_schedule_round
478 ## Runs one main round of the key schedule on %xmm0, %xmm7
480 ## Specifically, runs subbytes on the high dword of %xmm0
481 ## then rotates it by one byte and xors into the low dword of
484 ## Adds rcon from low byte of %xmm8, then rotates %xmm8 for
487 ## Smears the dwords of %xmm7 by xoring the low into the
488 ## second low, result into third, result into highest.
490 ## Returns results in %xmm7 = %xmm0.
491 ## Clobbers %xmm1-%xmm4, %r11.
493 .type _vpaes_schedule_round,\@abi-omnipotent
495 _vpaes_schedule_round:
496 # extract rcon from xmm8
498 palignr \$15, %xmm8, %xmm1
499 palignr \$15, %xmm8, %xmm8
503 pshufd \$0xFF, %xmm0, %xmm0
504 palignr \$1, %xmm0, %xmm0
508 # low round: same as high round, but no rotation and no rcon.
509 _vpaes_schedule_low_round:
517 pxor .Lk_s63(%rip), %xmm7
522 psrld \$4, %xmm1 # 1 = i
523 pand %xmm9, %xmm0 # 0 = k
524 movdqa %xmm11, %xmm2 # 2 : a/k
525 pshufb %xmm0, %xmm2 # 2 = a/k
526 pxor %xmm1, %xmm0 # 0 = j
527 movdqa %xmm10, %xmm3 # 3 : 1/i
528 pshufb %xmm1, %xmm3 # 3 = 1/i
529 pxor %xmm2, %xmm3 # 3 = iak = 1/i + a/k
530 movdqa %xmm10, %xmm4 # 4 : 1/j
531 pshufb %xmm0, %xmm4 # 4 = 1/j
532 pxor %xmm2, %xmm4 # 4 = jak = 1/j + a/k
533 movdqa %xmm10, %xmm2 # 2 : 1/iak
534 pshufb %xmm3, %xmm2 # 2 = 1/iak
535 pxor %xmm0, %xmm2 # 2 = io
536 movdqa %xmm10, %xmm3 # 3 : 1/jak
537 pshufb %xmm4, %xmm3 # 3 = 1/jak
538 pxor %xmm1, %xmm3 # 3 = jo
539 movdqa %xmm13, %xmm4 # 4 : sbou
540 pshufb %xmm2, %xmm4 # 4 = sbou
541 movdqa %xmm12, %xmm0 # 0 : sbot
542 pshufb %xmm3, %xmm0 # 0 = sb1t
543 pxor %xmm4, %xmm0 # 0 = sbox output
545 # add in smeared stuff
549 .size _vpaes_schedule_round,.-_vpaes_schedule_round
552 ## .aes_schedule_transform
554 ## Linear-transform %xmm0 according to tables at (%r11)
556 ## Requires that %xmm9 = 0x0F0F... as in preheat
558 ## Clobbers %xmm1, %xmm2
560 .type _vpaes_schedule_transform,\@abi-omnipotent
562 _vpaes_schedule_transform:
567 movdqa (%r11), %xmm2 # lo
569 movdqa 16(%r11), %xmm0 # hi
573 .size _vpaes_schedule_transform,.-_vpaes_schedule_transform
576 ## .aes_schedule_mangle
578 ## Mangle xmm0 from (basis-transformed) standard version
583 ## multiply by circulant 0,1,1,1
584 ## apply shiftrows transform
588 ## multiply by "inverse mixcolumns" circulant E,B,D,9
590 ## apply shiftrows transform
593 ## Writes out to (%rdx), and increments or decrements it
594 ## Keeps track of round number mod 4 in %r8
596 ## Clobbers xmm1-xmm5
598 .type _vpaes_schedule_mangle,\@abi-omnipotent
600 _vpaes_schedule_mangle:
601 movdqa %xmm0, %xmm4 # save xmm0 for later
602 movdqa .Lk_mc_forward(%rip),%xmm5
604 jnz .Lschedule_mangle_dec
608 pxor .Lk_s63(%rip),%xmm4
616 jmp .Lschedule_mangle_both
618 .Lschedule_mangle_dec:
619 # inverse mix columns
620 lea .Lk_dksd(%rip),%r11
623 psrld \$4, %xmm1 # 1 = hi
624 pand %xmm9, %xmm4 # 4 = lo
626 movdqa 0x00(%r11), %xmm2
628 movdqa 0x10(%r11), %xmm3
633 movdqa 0x20(%r11), %xmm2
636 movdqa 0x30(%r11), %xmm3
641 movdqa 0x40(%r11), %xmm2
644 movdqa 0x50(%r11), %xmm3
649 movdqa 0x60(%r11), %xmm2
652 movdqa 0x70(%r11), %xmm3
658 .Lschedule_mangle_both:
659 movdqa (%r8,%r10),%xmm1
665 .size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
668 # Interface to OpenSSL
670 .globl ${PREFIX}_set_encrypt_key
671 .type ${PREFIX}_set_encrypt_key,\@function,3
673 ${PREFIX}_set_encrypt_key:
675 $code.=<<___ if ($win64);
677 movaps %xmm6,0x10(%rsp)
678 movaps %xmm7,0x20(%rsp)
679 movaps %xmm8,0x30(%rsp)
680 movaps %xmm9,0x40(%rsp)
681 movaps %xmm10,0x50(%rsp)
682 movaps %xmm11,0x60(%rsp)
683 movaps %xmm12,0x70(%rsp)
684 movaps %xmm13,0x80(%rsp)
685 movaps %xmm14,0x90(%rsp)
686 movaps %xmm15,0xa0(%rsp)
693 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
697 call _vpaes_schedule_core
699 $code.=<<___ if ($win64);
700 movaps 0x10(%rsp),%xmm6
701 movaps 0x20(%rsp),%xmm7
702 movaps 0x30(%rsp),%xmm8
703 movaps 0x40(%rsp),%xmm9
704 movaps 0x50(%rsp),%xmm10
705 movaps 0x60(%rsp),%xmm11
706 movaps 0x70(%rsp),%xmm12
707 movaps 0x80(%rsp),%xmm13
708 movaps 0x90(%rsp),%xmm14
709 movaps 0xa0(%rsp),%xmm15
716 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
718 .globl ${PREFIX}_set_decrypt_key
719 .type ${PREFIX}_set_decrypt_key,\@function,3
721 ${PREFIX}_set_decrypt_key:
723 $code.=<<___ if ($win64);
725 movaps %xmm6,0x10(%rsp)
726 movaps %xmm7,0x20(%rsp)
727 movaps %xmm8,0x30(%rsp)
728 movaps %xmm9,0x40(%rsp)
729 movaps %xmm10,0x50(%rsp)
730 movaps %xmm11,0x60(%rsp)
731 movaps %xmm12,0x70(%rsp)
732 movaps %xmm13,0x80(%rsp)
733 movaps %xmm14,0x90(%rsp)
734 movaps %xmm15,0xa0(%rsp)
741 mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5;
743 lea 16(%rdx,%rax),%rdx
749 xor \$32,%r8d # nbits==192?0:32
750 call _vpaes_schedule_core
752 $code.=<<___ if ($win64);
753 movaps 0x10(%rsp),%xmm6
754 movaps 0x20(%rsp),%xmm7
755 movaps 0x30(%rsp),%xmm8
756 movaps 0x40(%rsp),%xmm9
757 movaps 0x50(%rsp),%xmm10
758 movaps 0x60(%rsp),%xmm11
759 movaps 0x70(%rsp),%xmm12
760 movaps 0x80(%rsp),%xmm13
761 movaps 0x90(%rsp),%xmm14
762 movaps 0xa0(%rsp),%xmm15
769 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
771 .globl ${PREFIX}_encrypt
772 .type ${PREFIX}_encrypt,\@function,3
776 $code.=<<___ if ($win64);
778 movaps %xmm6,0x10(%rsp)
779 movaps %xmm7,0x20(%rsp)
780 movaps %xmm8,0x30(%rsp)
781 movaps %xmm9,0x40(%rsp)
782 movaps %xmm10,0x50(%rsp)
783 movaps %xmm11,0x60(%rsp)
784 movaps %xmm12,0x70(%rsp)
785 movaps %xmm13,0x80(%rsp)
786 movaps %xmm14,0x90(%rsp)
787 movaps %xmm15,0xa0(%rsp)
793 call _vpaes_encrypt_core
796 $code.=<<___ if ($win64);
797 movaps 0x10(%rsp),%xmm6
798 movaps 0x20(%rsp),%xmm7
799 movaps 0x30(%rsp),%xmm8
800 movaps 0x40(%rsp),%xmm9
801 movaps 0x50(%rsp),%xmm10
802 movaps 0x60(%rsp),%xmm11
803 movaps 0x70(%rsp),%xmm12
804 movaps 0x80(%rsp),%xmm13
805 movaps 0x90(%rsp),%xmm14
806 movaps 0xa0(%rsp),%xmm15
812 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
814 .globl ${PREFIX}_decrypt
815 .type ${PREFIX}_decrypt,\@function,3
819 $code.=<<___ if ($win64);
821 movaps %xmm6,0x10(%rsp)
822 movaps %xmm7,0x20(%rsp)
823 movaps %xmm8,0x30(%rsp)
824 movaps %xmm9,0x40(%rsp)
825 movaps %xmm10,0x50(%rsp)
826 movaps %xmm11,0x60(%rsp)
827 movaps %xmm12,0x70(%rsp)
828 movaps %xmm13,0x80(%rsp)
829 movaps %xmm14,0x90(%rsp)
830 movaps %xmm15,0xa0(%rsp)
836 call _vpaes_decrypt_core
839 $code.=<<___ if ($win64);
840 movaps 0x10(%rsp),%xmm6
841 movaps 0x20(%rsp),%xmm7
842 movaps 0x30(%rsp),%xmm8
843 movaps 0x40(%rsp),%xmm9
844 movaps 0x50(%rsp),%xmm10
845 movaps 0x60(%rsp),%xmm11
846 movaps 0x70(%rsp),%xmm12
847 movaps 0x80(%rsp),%xmm13
848 movaps 0x90(%rsp),%xmm14
849 movaps 0xa0(%rsp),%xmm15
855 .size ${PREFIX}_decrypt,.-${PREFIX}_decrypt
858 my ($inp,$out,$len,$key,$ivp,$enc)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
859 # void AES_cbc_encrypt (const void char *inp, unsigned char *out,
860 # size_t length, const AES_KEY *key,
861 # unsigned char *ivp,const int enc);
863 .globl ${PREFIX}_cbc_encrypt
864 .type ${PREFIX}_cbc_encrypt,\@function,6
866 ${PREFIX}_cbc_encrypt:
869 ($len,$key)=($key,$len);
872 $code.=<<___ if ($win64);
874 movaps %xmm6,0x10(%rsp)
875 movaps %xmm7,0x20(%rsp)
876 movaps %xmm8,0x30(%rsp)
877 movaps %xmm9,0x40(%rsp)
878 movaps %xmm10,0x50(%rsp)
879 movaps %xmm11,0x60(%rsp)
880 movaps %xmm12,0x70(%rsp)
881 movaps %xmm13,0x80(%rsp)
882 movaps %xmm14,0x90(%rsp)
883 movaps %xmm15,0xa0(%rsp)
887 movdqu ($ivp),%xmm6 # load IV
898 call _vpaes_encrypt_core
900 movdqu %xmm0,($out,$inp)
909 call _vpaes_decrypt_core
912 movdqu %xmm0,($out,$inp)
917 movdqu %xmm6,($ivp) # save IV
919 $code.=<<___ if ($win64);
920 movaps 0x10(%rsp),%xmm6
921 movaps 0x20(%rsp),%xmm7
922 movaps 0x30(%rsp),%xmm8
923 movaps 0x40(%rsp),%xmm9
924 movaps 0x50(%rsp),%xmm10
925 movaps 0x60(%rsp),%xmm11
926 movaps 0x70(%rsp),%xmm12
927 movaps 0x80(%rsp),%xmm13
928 movaps 0x90(%rsp),%xmm14
929 movaps 0xa0(%rsp),%xmm15
935 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
942 ## Fills register %r10 -> .aes_consts (so you can -fPIC)
943 ## and %xmm9-%xmm15 as specified below.
945 .type _vpaes_preheat,\@abi-omnipotent
948 lea .Lk_s0F(%rip), %r10
949 movdqa -0x20(%r10), %xmm10 # .Lk_inv
950 movdqa -0x10(%r10), %xmm11 # .Lk_inv+16
951 movdqa 0x00(%r10), %xmm9 # .Lk_s0F
952 movdqa 0x30(%r10), %xmm13 # .Lk_sb1
953 movdqa 0x40(%r10), %xmm12 # .Lk_sb1+16
954 movdqa 0x50(%r10), %xmm15 # .Lk_sb2
955 movdqa 0x60(%r10), %xmm14 # .Lk_sb2+16
957 .size _vpaes_preheat,.-_vpaes_preheat
958 ########################################################
962 ########################################################
963 .type _vpaes_consts,\@object
967 .quad 0x0E05060F0D080180, 0x040703090A0B0C02
968 .quad 0x01040A060F0B0780, 0x030D0E0C02050809
971 .quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F
973 .Lk_ipt: # input transform (lo, hi)
974 .quad 0xC2B2E8985A2A7000, 0xCABAE09052227808
975 .quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81
977 .Lk_sb1: # sb1u, sb1t
978 .quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544
979 .quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF
980 .Lk_sb2: # sb2u, sb2t
981 .quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD
982 .quad 0x69EB88400AE12900, 0xC2A163C8AB82234A
983 .Lk_sbo: # sbou, sbot
984 .quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878
985 .quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA
987 .Lk_mc_forward: # mc_forward
988 .quad 0x0407060500030201, 0x0C0F0E0D080B0A09
989 .quad 0x080B0A0904070605, 0x000302010C0F0E0D
990 .quad 0x0C0F0E0D080B0A09, 0x0407060500030201
991 .quad 0x000302010C0F0E0D, 0x080B0A0904070605
993 .Lk_mc_backward:# mc_backward
994 .quad 0x0605040702010003, 0x0E0D0C0F0A09080B
995 .quad 0x020100030E0D0C0F, 0x0A09080B06050407
996 .quad 0x0E0D0C0F0A09080B, 0x0605040702010003
997 .quad 0x0A09080B06050407, 0x020100030E0D0C0F
1000 .quad 0x0706050403020100, 0x0F0E0D0C0B0A0908
1001 .quad 0x030E09040F0A0500, 0x0B06010C07020D08
1002 .quad 0x0F060D040B020900, 0x070E050C030A0108
1003 .quad 0x0B0E0104070A0D00, 0x0306090C0F020508
1006 .quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81
1008 .Lk_s63: # s63: all equal to 0x63 transformed
1009 .quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B
1011 .Lk_opt: # output transform
1012 .quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808
1013 .quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0
1015 .Lk_deskew: # deskew tables: inverts the sbox's "skew"
1016 .quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A
1017 .quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77
1021 ## Key schedule constants
1023 .Lk_dksd: # decryption key schedule: invskew x*D
1024 .quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9
1025 .quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E
1026 .Lk_dksb: # decryption key schedule: invskew x*B
1027 .quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99
1028 .quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8
1029 .Lk_dkse: # decryption key schedule: invskew x*E + 0x63
1030 .quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086
1031 .quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487
1032 .Lk_dks9: # decryption key schedule: invskew x*9
1033 .quad 0xB6116FC87ED9A700, 0x4AED933482255BFC
1034 .quad 0x4576516227143300, 0x8BB89FACE9DAFDCE
1038 ## Round function constants
1040 .Lk_dipt: # decryption input transform
1041 .quad 0x0F505B040B545F00, 0x154A411E114E451A
1042 .quad 0x86E383E660056500, 0x12771772F491F194
1044 .Lk_dsb9: # decryption sbox output *9*u, *9*t
1045 .quad 0x851C03539A86D600, 0xCAD51F504F994CC9
1046 .quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565
1047 .Lk_dsbd: # decryption sbox output *D*u, *D*t
1048 .quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439
1049 .quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3
1050 .Lk_dsbb: # decryption sbox output *B*u, *B*t
1051 .quad 0xD022649296B44200, 0x602646F6B0F2D404
1052 .quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B
1053 .Lk_dsbe: # decryption sbox output *E*u, *E*t
1054 .quad 0x46F2929626D4D000, 0x2242600464B4F6B0
1055 .quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32
1056 .Lk_dsbo: # decryption sbox final output
1057 .quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
1058 .quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C
1059 .asciz "Vector Permutaion AES for x86_64, Mike Hamburg (Stanford University)"
1061 .size _vpaes_consts,.-_vpaes_consts
1065 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1066 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1073 .extern __imp_RtlVirtualUnwind
1074 .type se_handler,\@abi-omnipotent
1088 mov 120($context),%rax # pull context->Rax
1089 mov 248($context),%rbx # pull context->Rip
1091 mov 8($disp),%rsi # disp->ImageBase
1092 mov 56($disp),%r11 # disp->HandlerData
1094 mov 0(%r11),%r10d # HandlerData[0]
1095 lea (%rsi,%r10),%r10 # prologue label
1096 cmp %r10,%rbx # context->Rip<prologue label
1099 mov 152($context),%rax # pull context->Rsp
1101 mov 4(%r11),%r10d # HandlerData[1]
1102 lea (%rsi,%r10),%r10 # epilogue label
1103 cmp %r10,%rbx # context->Rip>=epilogue label
1106 lea 16(%rax),%rsi # %xmm save area
1107 lea 512($context),%rdi # &context.Xmm6
1108 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
1109 .long 0xa548f3fc # cld; rep movsq
1110 lea 0xb8(%rax),%rax # adjust stack pointer
1115 mov %rax,152($context) # restore context->Rsp
1116 mov %rsi,168($context) # restore context->Rsi
1117 mov %rdi,176($context) # restore context->Rdi
1119 mov 40($disp),%rdi # disp->ContextRecord
1120 mov $context,%rsi # context
1121 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1122 .long 0xa548f3fc # cld; rep movsq
1125 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1126 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1127 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1128 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1129 mov 40(%rsi),%r10 # disp->ContextRecord
1130 lea 56(%rsi),%r11 # &disp->HandlerData
1131 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1132 mov %r10,32(%rsp) # arg5
1133 mov %r11,40(%rsp) # arg6
1134 mov %r12,48(%rsp) # arg7
1135 mov %rcx,56(%rsp) # arg8, (NULL)
1136 call *__imp_RtlVirtualUnwind(%rip)
1138 mov \$1,%eax # ExceptionContinueSearch
1150 .size se_handler,.-se_handler
1154 .rva .LSEH_begin_${PREFIX}_set_encrypt_key
1155 .rva .LSEH_end_${PREFIX}_set_encrypt_key
1156 .rva .LSEH_info_${PREFIX}_set_encrypt_key
1158 .rva .LSEH_begin_${PREFIX}_set_decrypt_key
1159 .rva .LSEH_end_${PREFIX}_set_decrypt_key
1160 .rva .LSEH_info_${PREFIX}_set_decrypt_key
1162 .rva .LSEH_begin_${PREFIX}_encrypt
1163 .rva .LSEH_end_${PREFIX}_encrypt
1164 .rva .LSEH_info_${PREFIX}_encrypt
1166 .rva .LSEH_begin_${PREFIX}_decrypt
1167 .rva .LSEH_end_${PREFIX}_decrypt
1168 .rva .LSEH_info_${PREFIX}_decrypt
1170 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
1171 .rva .LSEH_end_${PREFIX}_cbc_encrypt
1172 .rva .LSEH_info_${PREFIX}_cbc_encrypt
1176 .LSEH_info_${PREFIX}_set_encrypt_key:
1179 .rva .Lenc_key_body,.Lenc_key_epilogue # HandlerData[]
1180 .LSEH_info_${PREFIX}_set_decrypt_key:
1183 .rva .Ldec_key_body,.Ldec_key_epilogue # HandlerData[]
1184 .LSEH_info_${PREFIX}_encrypt:
1187 .rva .Lenc_body,.Lenc_epilogue # HandlerData[]
1188 .LSEH_info_${PREFIX}_decrypt:
1191 .rva .Ldec_body,.Ldec_epilogue # HandlerData[]
1192 .LSEH_info_${PREFIX}_cbc_encrypt:
1195 .rva .Lcbc_body,.Lcbc_epilogue # HandlerData[]
1199 $code =~ s/\`([^\`]*)\`/eval($1)/gem;