2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
51 # Looking at the results for 8-KB buffer.
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
67 # Looking at how results vary with buffer size.
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is "slowest" changing one
72 # with "256-byte" result being 87% of "8-KB." This is because overhead
73 # in CTR mode is most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
77 # Results for 192- and 256-bit keys.
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87 # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
91 # While Westmere processor features 6 cycles latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
135 # "as if" interleave factor 4.7x 5.8x 6.0x
137 # Further data for other parallelizable modes:
139 # CBC decrypt 1.16 0.93 0.74
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim disproportionally small amount
149 # of additional cycles, but in 8x case number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
155 # Higher interleave factors do have negative impact on Westmere
156 # performance. While for ECB mode it's negligible ~1.5%, other
157 # parallelizables perform ~5% worse, which is outweighed by ~25%
158 # improvement on Sandy Bridge. To balance regression on Westmere
159 # CTR mode was implemented with 6x aesenc interleave factor.
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
183 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
185 # (*) Atom Silvermont ECB result is suboptimal because of penalties
186 # incurred by operations on %xmm8-15. As ECB is not considered
187 # critical, nothing was done to mitigate the problem.
189 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
190 # generates drop-in replacement for
191 # crypto/aes/asm/aes-x86_64.pl:-)
# A first argument that contains a dot is treated as the output file name
# rather than as a flavour.
195 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 conventions are selected either by flavour or by an .asm output name.
197 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it; give up if neither file exists.
199 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
200 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
201 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
202 die "can't locate x86_64-xlate.pl";
# OUT is a pipe into the translator, run with the current perl ($^X).
# Abort with the system error if the pipe cannot be started instead of
# silently generating nothing.
204 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
# Instruction used to load round keys. Both branches of the conditional
# currently yield "movups", so the choice does not depend on $PREFIX.
207 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
# Registers carrying the first four integer arguments, in the order of the
# calling convention selected by $win64.
208 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
209 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
212 $code.=".extern OPENSSL_ia32cap_P\n";
214 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
215 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
219 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
220 $ivp="%r8"; # cbc, ctr, ...
222 $rnds_="%r10d"; # backup copy for $rounds
223 $key_="%r11"; # backup copy for $key
225 # %xmm register layout
226 $rndkey0="%xmm0"; $rndkey1="%xmm1";
227 $inout0="%xmm2"; $inout1="%xmm3";
228 $inout2="%xmm4"; $inout3="%xmm5";
229 $inout4="%xmm6"; $inout5="%xmm7";
230 $inout6="%xmm8"; $inout7="%xmm9";
# Note the aliasing: $in2/$in1 name the same registers as $inout4/$inout5,
# and $in0/$iv the same registers as $inout6/$inout7, so these names cannot
# be live at the same time as the corresponding $inoutN.
232 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
233 $in0="%xmm8"; $iv="%xmm9";
235 # Inline version of internal aesni_[en|de]crypt1.
237 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
238 # cycles which take care of loop variables...
240 sub aesni_generate1 {
# Appends an inline single-block AES round sequence to the generated code.
#   $p      - instruction suffix: the visible call sites pass "enc" or "dec",
#             giving aes${p} / aes${p}last
#   $key    - register holding the key schedule pointer (advanced by the
#             emitted code, see the $movkey ($key) reload inside the loop)
#   $rounds - register holding the round count
#   $inout  - register holding the block; defaults to $inout0
#   $ivec   - optional; when defined a different prologue is emitted in place
#             of the plain round-0 xor (that prologue is not visible in this
#             excerpt - presumably it folds $ivec into $inout; TODO confirm)
# The loop label embeds $sn; NOTE(review): $sn is expected to be unique per
# expansion, but its update is not visible here.
241 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
244 $movkey ($key),$rndkey0
245 $movkey 16($key),$rndkey1
247 $code.=<<___ if (defined($ivec));
252 $code.=<<___ if (!defined($ivec));
254 xorps $rndkey0,$inout
258 aes${p} $rndkey1,$inout
260 $movkey ($key),$rndkey1
262 jnz .Loop_${p}1_$sn # loop body is 16 bytes
263 aes${p}last $rndkey1,$inout
266 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
# Emits the two public single-block entry points ${PREFIX}_encrypt and
# ${PREFIX}_decrypt. Each one loads the input block from inp, reads the
# round count from offset 240 of the key structure, expands the inline
# single-block sequence from aesni_generate1, zeroes both round-key
# registers and stores the result to out.
# The argument registers are taken from @_4args, so the same body serves
# both calling conventions.
268 { my ($inp,$out,$key) = @_4args;
271 .globl ${PREFIX}_encrypt
272 .type ${PREFIX}_encrypt,\@abi-omnipotent
275 movups ($inp),$inout0 # load input
276 mov 240($key),$rounds # key->rounds
278 &aesni_generate1("enc",$key,$rounds);
280 pxor $rndkey0,$rndkey0 # clear register bank
281 pxor $rndkey1,$rndkey1
282 movups $inout0,($out) # output
285 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
287 .globl ${PREFIX}_decrypt
288 .type ${PREFIX}_decrypt,\@abi-omnipotent
291 movups ($inp),$inout0 # load input
292 mov 240($key),$rounds # key->rounds
294 &aesni_generate1("dec",$key,$rounds);
296 pxor $rndkey0,$rndkey0 # clear register bank
297 pxor $rndkey1,$rndkey1
298 movups $inout0,($out) # output
301 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
305 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
306 # factor. Why were 3x subroutines originally used in loops? Even though
307 # aes[enc|dec] latency was originally 6, it could be scheduled only
308 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
309 # utilization, i.e. when subroutine's throughput is virtually same as
310 # of non-interleaved subroutine [for number of input blocks up to 3].
311 # This is why it originally made no sense to implement 2x subroutine.
312 # But times change and it became appropriate to spend extra 192 bytes
313 # on 2x subroutine on Atom Silvermont account. For processors that
314 # can schedule aes[enc|dec] every cycle optimal interleave factor
315 # equals the corresponding instruction's latency. 8x is optimal for
316 # * Bridge and "super-optimal" for other Intel CPUs...
318 sub aesni_generate2 {
# Emits the private routine _aesni_${dir}rypt2, which runs the AES rounds
# on $inout0 and $inout1 side by side. $dir selects the instruction family
# (the call sites pass "enc" or "dec"; presumably it is this sub's only
# argument - its declaration is not visible in this excerpt).
# Emitted sequence: xor the round-0 key into both blocks, then alternate
# rounds keyed by $rndkey1 and $rndkey0 while the following round key is
# fetched through ($key,%rax), and finish both blocks with aes${dir}last.
320 # As already mentioned it takes in $key and $rounds, which are *not*
321 # preserved. $inout[0-1] is cipher/clear text...
323 .type _aesni_${dir}rypt2,\@abi-omnipotent
326 $movkey ($key),$rndkey0
328 $movkey 16($key),$rndkey1
329 xorps $rndkey0,$inout0
330 xorps $rndkey0,$inout1
331 $movkey 32($key),$rndkey0
332 lea 32($key,$rounds),$key
337 aes${dir} $rndkey1,$inout0
338 aes${dir} $rndkey1,$inout1
339 $movkey ($key,%rax),$rndkey1
341 aes${dir} $rndkey0,$inout0
342 aes${dir} $rndkey0,$inout1
343 $movkey -16($key,%rax),$rndkey0
346 aes${dir} $rndkey1,$inout0
347 aes${dir} $rndkey1,$inout1
348 aes${dir}last $rndkey0,$inout0
349 aes${dir}last $rndkey0,$inout1
351 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
354 sub aesni_generate3 {
# Emits the private routine _aesni_${dir}rypt3, the three-block variant:
# $inout0..$inout2 go through the same round sequence in lock step. $dir
# selects the instruction family (the call sites pass "enc" or "dec";
# presumably the sub's only argument - declaration not visible here).
# Rounds alternate between $rndkey1 and $rndkey0, each reloaded from the
# key schedule through ($key,%rax) right after its use.
356 # As already mentioned it takes in $key and $rounds, which are *not*
357 # preserved. $inout[0-2] is cipher/clear text...
359 .type _aesni_${dir}rypt3,\@abi-omnipotent
362 $movkey ($key),$rndkey0
364 $movkey 16($key),$rndkey1
365 xorps $rndkey0,$inout0
366 xorps $rndkey0,$inout1
367 xorps $rndkey0,$inout2
368 $movkey 32($key),$rndkey0
369 lea 32($key,$rounds),$key
374 aes${dir} $rndkey1,$inout0
375 aes${dir} $rndkey1,$inout1
376 aes${dir} $rndkey1,$inout2
377 $movkey ($key,%rax),$rndkey1
379 aes${dir} $rndkey0,$inout0
380 aes${dir} $rndkey0,$inout1
381 aes${dir} $rndkey0,$inout2
382 $movkey -16($key,%rax),$rndkey0
385 aes${dir} $rndkey1,$inout0
386 aes${dir} $rndkey1,$inout1
387 aes${dir} $rndkey1,$inout2
388 aes${dir}last $rndkey0,$inout0
389 aes${dir}last $rndkey0,$inout1
390 aes${dir}last $rndkey0,$inout2
392 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
395 # 4x interleave is implemented to improve small block performance,
396 # most notably [and naturally] 4 block by ~30%. One can argue that one
397 # should have implemented 5x as well, but improvement would be <20%,
398 # so it's not worth it...
399 sub aesni_generate4 {
# Emits the private routine _aesni_${dir}rypt4, the four-block variant
# operating on $inout0..$inout3. $dir selects the instruction family (the
# call sites pass "enc" or "dec"; presumably the sub's only argument -
# declaration not visible here).
# Structure matches the 2x/3x routines: round-0 xor, alternating
# $rndkey1/$rndkey0 rounds with key reloads through ($key,%rax), then
# aes${dir}last on every block.
401 # As already mentioned it takes in $key and $rounds, which are *not*
402 # preserved. $inout[0-3] is cipher/clear text...
404 .type _aesni_${dir}rypt4,\@abi-omnipotent
407 $movkey ($key),$rndkey0
409 $movkey 16($key),$rndkey1
410 xorps $rndkey0,$inout0
411 xorps $rndkey0,$inout1
412 xorps $rndkey0,$inout2
413 xorps $rndkey0,$inout3
414 $movkey 32($key),$rndkey0
415 lea 32($key,$rounds),$key
421 aes${dir} $rndkey1,$inout0
422 aes${dir} $rndkey1,$inout1
423 aes${dir} $rndkey1,$inout2
424 aes${dir} $rndkey1,$inout3
425 $movkey ($key,%rax),$rndkey1
427 aes${dir} $rndkey0,$inout0
428 aes${dir} $rndkey0,$inout1
429 aes${dir} $rndkey0,$inout2
430 aes${dir} $rndkey0,$inout3
431 $movkey -16($key,%rax),$rndkey0
434 aes${dir} $rndkey1,$inout0
435 aes${dir} $rndkey1,$inout1
436 aes${dir} $rndkey1,$inout2
437 aes${dir} $rndkey1,$inout3
438 aes${dir}last $rndkey0,$inout0
439 aes${dir}last $rndkey0,$inout1
440 aes${dir}last $rndkey0,$inout2
441 aes${dir}last $rndkey0,$inout3
443 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
446 sub aesni_generate6 {
# Emits the private routine _aesni_${dir}rypt6, the six-block variant
# operating on $inout0..$inout5. $dir selects the instruction family (the
# call sites pass "enc" or "dec"; presumably the sub's only argument -
# declaration not visible here).
# Unlike the smaller variants, the prologue is software-pipelined: the
# first aes${dir} round for $inout0..$inout2 is issued in between the
# round-0 xors of the remaining blocks, and the loop is then entered at
# .L${dir}_loop6_enter, i.e. past the three instructions already done.
448 # As already mentioned it takes in $key and $rounds, which are *not*
449 # preserved. $inout[0-5] is cipher/clear text...
451 .type _aesni_${dir}rypt6,\@abi-omnipotent
454 $movkey ($key),$rndkey0
456 $movkey 16($key),$rndkey1
457 xorps $rndkey0,$inout0
458 pxor $rndkey0,$inout1
459 pxor $rndkey0,$inout2
460 aes${dir} $rndkey1,$inout0
461 lea 32($key,$rounds),$key
463 aes${dir} $rndkey1,$inout1
464 pxor $rndkey0,$inout3
465 pxor $rndkey0,$inout4
466 aes${dir} $rndkey1,$inout2
467 pxor $rndkey0,$inout5
468 $movkey ($key,%rax),$rndkey0
470 jmp .L${dir}_loop6_enter
473 aes${dir} $rndkey1,$inout0
474 aes${dir} $rndkey1,$inout1
475 aes${dir} $rndkey1,$inout2
476 .L${dir}_loop6_enter:
477 aes${dir} $rndkey1,$inout3
478 aes${dir} $rndkey1,$inout4
479 aes${dir} $rndkey1,$inout5
480 $movkey ($key,%rax),$rndkey1
482 aes${dir} $rndkey0,$inout0
483 aes${dir} $rndkey0,$inout1
484 aes${dir} $rndkey0,$inout2
485 aes${dir} $rndkey0,$inout3
486 aes${dir} $rndkey0,$inout4
487 aes${dir} $rndkey0,$inout5
488 $movkey -16($key,%rax),$rndkey0
491 aes${dir} $rndkey1,$inout0
492 aes${dir} $rndkey1,$inout1
493 aes${dir} $rndkey1,$inout2
494 aes${dir} $rndkey1,$inout3
495 aes${dir} $rndkey1,$inout4
496 aes${dir} $rndkey1,$inout5
497 aes${dir}last $rndkey0,$inout0
498 aes${dir}last $rndkey0,$inout1
499 aes${dir}last $rndkey0,$inout2
500 aes${dir}last $rndkey0,$inout3
501 aes${dir}last $rndkey0,$inout4
502 aes${dir}last $rndkey0,$inout5
504 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
507 sub aesni_generate8 {
# Emits the private routine _aesni_${dir}rypt8, the eight-block variant
# operating on $inout0..$inout7. $dir selects the instruction family (the
# call sites pass "enc" or "dec"; presumably the sub's only argument -
# declaration not visible here).
# The prologue is software-pipelined: the first aes${dir} round for
# $inout0 and $inout1 is issued in between the round-0 xors of the other
# blocks, and the loop is entered at .L${dir}_loop8_inner, past those two
# instructions. A second label, .L${dir}_loop8_enter, is defined just
# before the $rndkey1 reload.
509 # As already mentioned it takes in $key and $rounds, which are *not*
510 # preserved. $inout[0-7] is cipher/clear text...
512 .type _aesni_${dir}rypt8,\@abi-omnipotent
515 $movkey ($key),$rndkey0
517 $movkey 16($key),$rndkey1
518 xorps $rndkey0,$inout0
519 xorps $rndkey0,$inout1
520 pxor $rndkey0,$inout2
521 pxor $rndkey0,$inout3
522 pxor $rndkey0,$inout4
523 lea 32($key,$rounds),$key
525 aes${dir} $rndkey1,$inout0
526 pxor $rndkey0,$inout5
527 pxor $rndkey0,$inout6
528 aes${dir} $rndkey1,$inout1
529 pxor $rndkey0,$inout7
530 $movkey ($key,%rax),$rndkey0
532 jmp .L${dir}_loop8_inner
535 aes${dir} $rndkey1,$inout0
536 aes${dir} $rndkey1,$inout1
537 .L${dir}_loop8_inner:
538 aes${dir} $rndkey1,$inout2
539 aes${dir} $rndkey1,$inout3
540 aes${dir} $rndkey1,$inout4
541 aes${dir} $rndkey1,$inout5
542 aes${dir} $rndkey1,$inout6
543 aes${dir} $rndkey1,$inout7
544 .L${dir}_loop8_enter:
545 $movkey ($key,%rax),$rndkey1
547 aes${dir} $rndkey0,$inout0
548 aes${dir} $rndkey0,$inout1
549 aes${dir} $rndkey0,$inout2
550 aes${dir} $rndkey0,$inout3
551 aes${dir} $rndkey0,$inout4
552 aes${dir} $rndkey0,$inout5
553 aes${dir} $rndkey0,$inout6
554 aes${dir} $rndkey0,$inout7
555 $movkey -16($key,%rax),$rndkey0
558 aes${dir} $rndkey1,$inout0
559 aes${dir} $rndkey1,$inout1
560 aes${dir} $rndkey1,$inout2
561 aes${dir} $rndkey1,$inout3
562 aes${dir} $rndkey1,$inout4
563 aes${dir} $rndkey1,$inout5
564 aes${dir} $rndkey1,$inout6
565 aes${dir} $rndkey1,$inout7
566 aes${dir}last $rndkey0,$inout0
567 aes${dir}last $rndkey0,$inout1
568 aes${dir}last $rndkey0,$inout2
569 aes${dir}last $rndkey0,$inout3
570 aes${dir}last $rndkey0,$inout4
571 aes${dir}last $rndkey0,$inout5
572 aes${dir}last $rndkey0,$inout6
573 aes${dir}last $rndkey0,$inout7
575 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
# Expand the private interleaved helpers in ascending interleave order.
# The "dec" variants are always generated; the "enc" variants are generated
# only when $PREFIX is "aesni" (presumably the drop-in "AES" build has no
# caller for them - TODO confirm). The call order fixes the order in which
# the routines appear in the generated assembly.
578 &aesni_generate2("enc") if ($PREFIX eq "aesni");
579 &aesni_generate2("dec");
580 &aesni_generate3("enc") if ($PREFIX eq "aesni");
581 &aesni_generate3("dec");
582 &aesni_generate4("enc") if ($PREFIX eq "aesni");
583 &aesni_generate4("dec");
584 &aesni_generate6("enc") if ($PREFIX eq "aesni");
585 &aesni_generate6("dec");
586 &aesni_generate8("enc") if ($PREFIX eq "aesni");
587 &aesni_generate8("dec");
589 if ($PREFIX eq "aesni") {
590 ########################################################################
591 # void aesni_ecb_encrypt (const void *in, void *out,
592 # size_t length, const AES_KEY *key,
595 .globl aesni_ecb_encrypt
596 .type aesni_ecb_encrypt,\@function,5
600 $code.=<<___ if ($win64);
602 movaps %xmm6,(%rsp) # offload $inout4..7
603 movaps %xmm7,0x10(%rsp)
604 movaps %xmm8,0x20(%rsp)
605 movaps %xmm9,0x30(%rsp)
609 and \$-16,$len # if ($len<16)
610 jz .Lecb_ret # return
612 mov 240($key),$rounds # key->rounds
613 $movkey ($key),$rndkey0
614 mov $key,$key_ # backup $key
615 mov $rounds,$rnds_ # backup $rounds
616 test %r8d,%r8d # 5th argument
618 #--------------------------- ECB ENCRYPT ------------------------------#
619 cmp \$0x80,$len # if ($len<8*16)
620 jb .Lecb_enc_tail # short input
622 movdqu ($inp),$inout0 # load 8 input blocks
623 movdqu 0x10($inp),$inout1
624 movdqu 0x20($inp),$inout2
625 movdqu 0x30($inp),$inout3
626 movdqu 0x40($inp),$inout4
627 movdqu 0x50($inp),$inout5
628 movdqu 0x60($inp),$inout6
629 movdqu 0x70($inp),$inout7
630 lea 0x80($inp),$inp # $inp+=8*16
631 sub \$0x80,$len # $len-=8*16 (can be zero)
632 jmp .Lecb_enc_loop8_enter
635 movups $inout0,($out) # store 8 output blocks
636 mov $key_,$key # restore $key
637 movdqu ($inp),$inout0 # load 8 input blocks
638 mov $rnds_,$rounds # restore $rounds
639 movups $inout1,0x10($out)
640 movdqu 0x10($inp),$inout1
641 movups $inout2,0x20($out)
642 movdqu 0x20($inp),$inout2
643 movups $inout3,0x30($out)
644 movdqu 0x30($inp),$inout3
645 movups $inout4,0x40($out)
646 movdqu 0x40($inp),$inout4
647 movups $inout5,0x50($out)
648 movdqu 0x50($inp),$inout5
649 movups $inout6,0x60($out)
650 movdqu 0x60($inp),$inout6
651 movups $inout7,0x70($out)
652 lea 0x80($out),$out # $out+=8*16
653 movdqu 0x70($inp),$inout7
654 lea 0x80($inp),$inp # $inp+=8*16
655 .Lecb_enc_loop8_enter:
660 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
662 movups $inout0,($out) # store 8 output blocks
663 mov $key_,$key # restore $key
664 movups $inout1,0x10($out)
665 mov $rnds_,$rounds # restore $rounds
666 movups $inout2,0x20($out)
667 movups $inout3,0x30($out)
668 movups $inout4,0x40($out)
669 movups $inout5,0x50($out)
670 movups $inout6,0x60($out)
671 movups $inout7,0x70($out)
672 lea 0x80($out),$out # $out+=8*16
673 add \$0x80,$len # restore real remaining $len
674 jz .Lecb_ret # done if ($len==0)
676 .Lecb_enc_tail: # $len is less than 8*16
677 movups ($inp),$inout0
680 movups 0x10($inp),$inout1
682 movups 0x20($inp),$inout2
685 movups 0x30($inp),$inout3
687 movups 0x40($inp),$inout4
690 movups 0x50($inp),$inout5
692 movdqu 0x60($inp),$inout6
693 xorps $inout7,$inout7
695 movups $inout0,($out) # store 7 output blocks
696 movups $inout1,0x10($out)
697 movups $inout2,0x20($out)
698 movups $inout3,0x30($out)
699 movups $inout4,0x40($out)
700 movups $inout5,0x50($out)
701 movups $inout6,0x60($out)
706 &aesni_generate1("enc",$key,$rounds);
708 movups $inout0,($out) # store one output block
713 movups $inout0,($out) # store 2 output blocks
714 movups $inout1,0x10($out)
719 movups $inout0,($out) # store 3 output blocks
720 movups $inout1,0x10($out)
721 movups $inout2,0x20($out)
726 movups $inout0,($out) # store 4 output blocks
727 movups $inout1,0x10($out)
728 movups $inout2,0x20($out)
729 movups $inout3,0x30($out)
733 xorps $inout5,$inout5
735 movups $inout0,($out) # store 5 output blocks
736 movups $inout1,0x10($out)
737 movups $inout2,0x20($out)
738 movups $inout3,0x30($out)
739 movups $inout4,0x40($out)
744 movups $inout0,($out) # store 6 output blocks
745 movups $inout1,0x10($out)
746 movups $inout2,0x20($out)
747 movups $inout3,0x30($out)
748 movups $inout4,0x40($out)
749 movups $inout5,0x50($out)
751 \f#--------------------------- ECB DECRYPT ------------------------------#
754 cmp \$0x80,$len # if ($len<8*16)
755 jb .Lecb_dec_tail # short input
757 movdqu ($inp),$inout0 # load 8 input blocks
758 movdqu 0x10($inp),$inout1
759 movdqu 0x20($inp),$inout2
760 movdqu 0x30($inp),$inout3
761 movdqu 0x40($inp),$inout4
762 movdqu 0x50($inp),$inout5
763 movdqu 0x60($inp),$inout6
764 movdqu 0x70($inp),$inout7
765 lea 0x80($inp),$inp # $inp+=8*16
766 sub \$0x80,$len # $len-=8*16 (can be zero)
767 jmp .Lecb_dec_loop8_enter
770 movups $inout0,($out) # store 8 output blocks
771 mov $key_,$key # restore $key
772 movdqu ($inp),$inout0 # load 8 input blocks
773 mov $rnds_,$rounds # restore $rounds
774 movups $inout1,0x10($out)
775 movdqu 0x10($inp),$inout1
776 movups $inout2,0x20($out)
777 movdqu 0x20($inp),$inout2
778 movups $inout3,0x30($out)
779 movdqu 0x30($inp),$inout3
780 movups $inout4,0x40($out)
781 movdqu 0x40($inp),$inout4
782 movups $inout5,0x50($out)
783 movdqu 0x50($inp),$inout5
784 movups $inout6,0x60($out)
785 movdqu 0x60($inp),$inout6
786 movups $inout7,0x70($out)
787 lea 0x80($out),$out # $out+=8*16
788 movdqu 0x70($inp),$inout7
789 lea 0x80($inp),$inp # $inp+=8*16
790 .Lecb_dec_loop8_enter:
794 $movkey ($key_),$rndkey0
796 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
798 movups $inout0,($out) # store 8 output blocks
799 pxor $inout0,$inout0 # clear register bank
800 mov $key_,$key # restore $key
801 movups $inout1,0x10($out)
803 mov $rnds_,$rounds # restore $rounds
804 movups $inout2,0x20($out)
806 movups $inout3,0x30($out)
808 movups $inout4,0x40($out)
810 movups $inout5,0x50($out)
812 movups $inout6,0x60($out)
814 movups $inout7,0x70($out)
816 lea 0x80($out),$out # $out+=8*16
817 add \$0x80,$len # restore real remaining $len
818 jz .Lecb_ret # done if ($len==0)
821 movups ($inp),$inout0
824 movups 0x10($inp),$inout1
826 movups 0x20($inp),$inout2
829 movups 0x30($inp),$inout3
831 movups 0x40($inp),$inout4
834 movups 0x50($inp),$inout5
836 movups 0x60($inp),$inout6
837 $movkey ($key),$rndkey0
838 xorps $inout7,$inout7
840 movups $inout0,($out) # store 7 output blocks
841 pxor $inout0,$inout0 # clear register bank
842 movups $inout1,0x10($out)
844 movups $inout2,0x20($out)
846 movups $inout3,0x30($out)
848 movups $inout4,0x40($out)
850 movups $inout5,0x50($out)
852 movups $inout6,0x60($out)
859 &aesni_generate1("dec",$key,$rounds);
861 movups $inout0,($out) # store one output block
862 pxor $inout0,$inout0 # clear register bank
867 movups $inout0,($out) # store 2 output blocks
868 pxor $inout0,$inout0 # clear register bank
869 movups $inout1,0x10($out)
875 movups $inout0,($out) # store 3 output blocks
876 pxor $inout0,$inout0 # clear register bank
877 movups $inout1,0x10($out)
879 movups $inout2,0x20($out)
885 movups $inout0,($out) # store 4 output blocks
886 pxor $inout0,$inout0 # clear register bank
887 movups $inout1,0x10($out)
889 movups $inout2,0x20($out)
891 movups $inout3,0x30($out)
896 xorps $inout5,$inout5
898 movups $inout0,($out) # store 5 output blocks
899 pxor $inout0,$inout0 # clear register bank
900 movups $inout1,0x10($out)
902 movups $inout2,0x20($out)
904 movups $inout3,0x30($out)
906 movups $inout4,0x40($out)
913 movups $inout0,($out) # store 6 output blocks
914 pxor $inout0,$inout0 # clear register bank
915 movups $inout1,0x10($out)
917 movups $inout2,0x20($out)
919 movups $inout3,0x30($out)
921 movups $inout4,0x40($out)
923 movups $inout5,0x50($out)
927 xorps $rndkey0,$rndkey0 # %xmm0
928 pxor $rndkey1,$rndkey1
930 $code.=<<___ if ($win64);
932 movaps %xmm0,(%rsp) # clear stack
933 movaps 0x10(%rsp),%xmm7
934 movaps %xmm0,0x10(%rsp)
935 movaps 0x20(%rsp),%xmm8
936 movaps %xmm0,0x20(%rsp)
937 movaps 0x30(%rsp),%xmm9
938 movaps %xmm0,0x30(%rsp)
944 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
948 ######################################################################
949 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
950 # size_t blocks, const AES_KEY *key,
951 # const char *ivec,char *cmac);
953 # Handles only complete blocks, operates on 64-bit counter and
954 # does not update *ivec! Nor does it finalize CMAC value
955 # (see engine/eng_aesni.c for details)
958 my $cmac="%r9"; # 6th argument
960 my $increment="%xmm9";
962 my $bswap_mask="%xmm7";
965 .globl aesni_ccm64_encrypt_blocks
966 .type aesni_ccm64_encrypt_blocks,\@function,6
968 aesni_ccm64_encrypt_blocks:
970 $code.=<<___ if ($win64);
972 movaps %xmm6,(%rsp) # $iv
973 movaps %xmm7,0x10(%rsp) # $bswap_mask
974 movaps %xmm8,0x20(%rsp) # $in0
975 movaps %xmm9,0x30(%rsp) # $increment
979 mov 240($key),$rounds # key->rounds
981 movdqa .Lincrement64(%rip),$increment
982 movdqa .Lbswap_mask(%rip),$bswap_mask
987 movdqu ($cmac),$inout1
989 lea 32($key,$rounds),$key # end of key schedule
990 pshufb $bswap_mask,$iv
991 sub %rax,%r10 # twisted $rounds
992 jmp .Lccm64_enc_outer
995 $movkey ($key_),$rndkey0
997 movups ($inp),$in0 # load inp
999 xorps $rndkey0,$inout0 # counter
1000 $movkey 16($key_),$rndkey1
1002 xorps $rndkey0,$inout1 # cmac^=inp
1003 $movkey 32($key_),$rndkey0
1006 aesenc $rndkey1,$inout0
1007 aesenc $rndkey1,$inout1
1008 $movkey ($key,%rax),$rndkey1
1010 aesenc $rndkey0,$inout0
1011 aesenc $rndkey0,$inout1
1012 $movkey -16($key,%rax),$rndkey0
1013 jnz .Lccm64_enc2_loop
1014 aesenc $rndkey1,$inout0
1015 aesenc $rndkey1,$inout1
1016 paddq $increment,$iv
1017 dec $len # $len-- ($len is in blocks)
1018 aesenclast $rndkey0,$inout0
1019 aesenclast $rndkey0,$inout1
1022 xorps $inout0,$in0 # inp ^= E(iv)
1024 movups $in0,($out) # save output
1025 pshufb $bswap_mask,$inout0
1026 lea 16($out),$out # $out+=16
1027 jnz .Lccm64_enc_outer # loop if ($len!=0)
1029 pxor $rndkey0,$rndkey0 # clear register bank
1030 pxor $rndkey1,$rndkey1
1031 pxor $inout0,$inout0
1032 movups $inout1,($cmac) # store resulting mac
1033 pxor $inout1,$inout1
1037 $code.=<<___ if ($win64);
1039 movaps %xmm0,(%rsp) # clear stack
1040 movaps 0x10(%rsp),%xmm7
1041 movaps %xmm0,0x10(%rsp)
1042 movaps 0x20(%rsp),%xmm8
1043 movaps %xmm0,0x20(%rsp)
1044 movaps 0x30(%rsp),%xmm9
1045 movaps %xmm0,0x30(%rsp)
1051 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1053 ######################################################################
1055 .globl aesni_ccm64_decrypt_blocks
1056 .type aesni_ccm64_decrypt_blocks,\@function,6
1058 aesni_ccm64_decrypt_blocks:
1060 $code.=<<___ if ($win64);
1061 lea -0x58(%rsp),%rsp
1062 movaps %xmm6,(%rsp) # $iv
1063 movaps %xmm7,0x10(%rsp) # $bswap_mask
1064 movaps %xmm8,0x20(%rsp) # $in0
1065 movaps %xmm9,0x30(%rsp) # $increment
1069 mov 240($key),$rounds # key->rounds
1071 movdqu ($cmac),$inout1
1072 movdqa .Lincrement64(%rip),$increment
1073 movdqa .Lbswap_mask(%rip),$bswap_mask
1078 pshufb $bswap_mask,$iv
1080 &aesni_generate1("enc",$key,$rounds);
1084 movups ($inp),$in0 # load inp
1085 paddq $increment,$iv
1086 lea 16($inp),$inp # $inp+=16
1087 sub %r10,%rax # twisted $rounds
1088 lea 32($key_,$rnds_),$key # end of key schedule
1090 jmp .Lccm64_dec_outer
1093 xorps $inout0,$in0 # inp ^= E(iv)
1095 movups $in0,($out) # save output
1096 lea 16($out),$out # $out+=16
1097 pshufb $bswap_mask,$inout0
1099 sub \$1,$len # $len-- ($len is in blocks)
1100 jz .Lccm64_dec_break # if ($len==0) break
1102 $movkey ($key_),$rndkey0
1104 $movkey 16($key_),$rndkey1
1106 xorps $rndkey0,$inout0
1107 xorps $in0,$inout1 # cmac^=out
1108 $movkey 32($key_),$rndkey0
1109 jmp .Lccm64_dec2_loop
1112 aesenc $rndkey1,$inout0
1113 aesenc $rndkey1,$inout1
1114 $movkey ($key,%rax),$rndkey1
1116 aesenc $rndkey0,$inout0
1117 aesenc $rndkey0,$inout1
1118 $movkey -16($key,%rax),$rndkey0
1119 jnz .Lccm64_dec2_loop
1120 movups ($inp),$in0 # load input
1121 paddq $increment,$iv
1122 aesenc $rndkey1,$inout0
1123 aesenc $rndkey1,$inout1
1124 aesenclast $rndkey0,$inout0
1125 aesenclast $rndkey0,$inout1
1126 lea 16($inp),$inp # $inp+=16
1127 jmp .Lccm64_dec_outer
1131 #xorps $in0,$inout1 # cmac^=out
1132 mov 240($key_),$rounds
1134 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1136 pxor $rndkey0,$rndkey0 # clear register bank
1137 pxor $rndkey1,$rndkey1
1138 pxor $inout0,$inout0
1139 movups $inout1,($cmac) # store resulting mac
1140 pxor $inout1,$inout1
1144 $code.=<<___ if ($win64);
1146 movaps %xmm0,(%rsp) # clear stack
1147 movaps 0x10(%rsp),%xmm7
1148 movaps %xmm0,0x10(%rsp)
1149 movaps 0x20(%rsp),%xmm8
1150 movaps %xmm0,0x20(%rsp)
1151 movaps 0x30(%rsp),%xmm9
1152 movaps %xmm0,0x30(%rsp)
1158 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1161 ######################################################################
1162 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1163 # size_t blocks, const AES_KEY *key,
1164 # const char *ivec);
1166 # Handles only complete blocks, operates on 32-bit counter and
1167 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1169 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1170 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1171 # Keywords are full unroll and modulo-schedule counter calculations
1172 # with zero-round key xor.
# Perl-level aliases used while generating aesni_ctr32_encrypt_blocks:
# six extra xmm registers (%xmm10..%xmm15) for buffered input blocks,
# %ebp for a 32-bit word of the round-0 key, and the 32-bit view of the
# iv pointer register for the counter.  The frame is 0x80 bytes (eight
# 16-byte counter slots) plus 160 bytes on Win64 for the ten xmm
# registers saved below.
1174 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1175 my ($key0,$ctr)=("%ebp","${ivp}d");
1176 my $frame_size = 0x80 + ($win64?160:0);
1179 .globl aesni_ctr32_encrypt_blocks
1180 .type aesni_ctr32_encrypt_blocks,\@function,5
1182 aesni_ctr32_encrypt_blocks:
1187 # handle single block without allocating stack frame,
1188 # useful when handling edges
1189 movups ($ivp),$inout0
1190 movups ($inp),$inout1
1191 mov 240($key),%edx # key->rounds
# Perl helper call: emits a one-block AES encryption of the counter block.
1193 &aesni_generate1("enc",$key,"%edx");
# XOR keystream with the input block, store the result, and wipe every
# xmm register that was touched on this path.
1195 pxor $rndkey0,$rndkey0 # clear register bank
1196 pxor $rndkey1,$rndkey1
1197 xorps $inout1,$inout0
1198 pxor $inout1,$inout1
1199 movups $inout0,($out)
1200 xorps $inout0,$inout0
1201 jmp .Lctr32_epilogue
# Multi-block path: remember the incoming stack pointer in the frame
# pointer register, reserve the frame and align it to 16 bytes so that
# aligned moves can be used on the counter slots.
1205 lea (%rsp),$key_ # use $key_ as frame pointer
1206 .cfi_def_cfa_register $key_
1209 sub \$$frame_size,%rsp
1210 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
# Win64 only: %xmm6-%xmm15 are callee-saved under that ABI; park them
# just below the saved stack pointer.
1212 $code.=<<___ if ($win64);
1213 movaps %xmm6,-0xa8($key_) # offload everything
1214 movaps %xmm7,-0x98($key_)
1215 movaps %xmm8,-0x88($key_)
1216 movaps %xmm9,-0x78($key_)
1217 movaps %xmm10,-0x68($key_)
1218 movaps %xmm11,-0x58($key_)
1219 movaps %xmm12,-0x48($key_)
1220 movaps %xmm13,-0x38($key_)
1221 movaps %xmm14,-0x28($key_)
1222 movaps %xmm15,-0x18($key_)
1227 # 8 16-byte words on top of stack are counter values
1228 # xor-ed with zero-round key
1230 movdqu ($ivp),$inout0
1231 movdqu ($key),$rndkey0
1232 mov 12($ivp),$ctr # counter LSB
1233 pxor $rndkey0,$inout0
1234 mov 12($key),$key0 # 0-round key LSB
1235 movdqa $inout0,0x00(%rsp) # populate counter block
# Replicate the whitened counter block into registers and stack slots.
# Only bytes 12..15 (the 32-bit counter word) differ between copies; they
# are patched below with pinsrd (lane 3) for the register copies and with
# 32-bit stores at slot+12 for the stack copies.
1237 movdqa $inout0,$inout1
1238 movdqa $inout0,$inout2
1239 movdqa $inout0,$inout3
1240 movdqa $inout0,0x40(%rsp)
1241 movdqa $inout0,0x50(%rsp)
1242 movdqa $inout0,0x60(%rsp)
1243 mov %rdx,%r10 # about to borrow %rdx
1244 movdqa $inout0,0x70(%rsp)
1252 pinsrd \$3,%eax,$inout1
1254 movdqa $inout1,0x10(%rsp)
1255 pinsrd \$3,%edx,$inout2
1257 mov %r10,%rdx # restore %rdx
1259 movdqa $inout2,0x20(%rsp)
1262 pinsrd \$3,%eax,$inout3
1264 movdqa $inout3,0x30(%rsp)
1266 mov %r10d,0x40+12(%rsp)
1269 mov 240($key),$rounds # key->rounds
1272 mov %r9d,0x50+12(%rsp)
1275 mov %r10d,0x60+12(%rsp)
1277 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1279 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1280 mov %r9d,0x70+12(%rsp)
1282 $movkey 0x10($key),$rndkey1
1284 movdqa 0x40(%rsp),$inout4
1285 movdqa 0x50(%rsp),$inout5
# Dispatch: fewer than 8 blocks go straight to the tail code; otherwise
# the six-block loop is selected on MOVBE-without-XSAVE processors and,
# presumably, the eight-block loop on everything else.
1287 cmp \$8,$len # $len is in blocks
1288 jb .Lctr32_tail # short input if ($len<8)
1290 sub \$6,$len # $len is biased by -6
1291 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1292 je .Lctr32_6x # [which denotes Atom Silvermont]
1294 lea 0x80($key),$key # size optimization
1295 sub \$2,$len # $len is biased by -8
# Six-block variant: the key pointer is moved to the end of the schedule
# and round keys are fetched with a running offset from there.
1303 lea 32($key,$rounds),$key # end of key schedule
1304 sub %rax,%r10 # twisted $rounds
# Six-block loop body: AES rounds on six counter blocks are interleaved
# with movbe stores that write the next counter words into the stack
# slots.
1309 add \$6,$ctr # next counter value
1310 $movkey -48($key,$rnds_),$rndkey0
1311 aesenc $rndkey1,$inout0
1314 aesenc $rndkey1,$inout1
1315 movbe %eax,`0x00+12`(%rsp) # store next counter value
1317 aesenc $rndkey1,$inout2
1319 movbe %eax,`0x10+12`(%rsp)
1320 aesenc $rndkey1,$inout3
1323 aesenc $rndkey1,$inout4
1324 movbe %eax,`0x20+12`(%rsp)
1326 aesenc $rndkey1,$inout5
1327 $movkey -32($key,$rnds_),$rndkey1
1330 aesenc $rndkey0,$inout0
1331 movbe %eax,`0x30+12`(%rsp)
1333 aesenc $rndkey0,$inout1
1335 movbe %eax,`0x40+12`(%rsp)
1336 aesenc $rndkey0,$inout2
1339 aesenc $rndkey0,$inout3
1340 movbe %eax,`0x50+12`(%rsp)
1341 mov %r10,%rax # mov $rnds_,$rounds
1342 aesenc $rndkey0,$inout4
1343 aesenc $rndkey0,$inout5
1344 $movkey -16($key,$rnds_),$rndkey0
# Combine the keystream with six input blocks while reloading the
# precomputed counter blocks for the next iteration.
1348 movdqu ($inp),$inout6 # load 6 input blocks
1349 movdqu 0x10($inp),$inout7
1350 movdqu 0x20($inp),$in0
1351 movdqu 0x30($inp),$in1
1352 movdqu 0x40($inp),$in2
1353 movdqu 0x50($inp),$in3
1354 lea 0x60($inp),$inp # $inp+=6*16
1355 $movkey -64($key,$rnds_),$rndkey1
1356 pxor $inout0,$inout6 # inp^=E(ctr)
1357 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1358 pxor $inout1,$inout7
1359 movaps 0x10(%rsp),$inout1
1361 movaps 0x20(%rsp),$inout2
1363 movaps 0x30(%rsp),$inout3
1365 movaps 0x40(%rsp),$inout4
1367 movaps 0x50(%rsp),$inout5
1368 movdqu $inout6,($out) # store 6 output blocks
1369 movdqu $inout7,0x10($out)
1370 movdqu $in0,0x20($out)
1371 movdqu $in1,0x30($out)
1372 movdqu $in2,0x40($out)
1373 movdqu $in3,0x50($out)
1374 lea 0x60($out),$out # $out+=6*16
1377 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1379 add \$6,$len # restore real remaining $len
1380 jz .Lctr32_done # done if ($len==0)
1382 lea -48($rnds_),$rounds
1383 lea -80($key,$rnds_),$key # restore $key
1385 shr \$4,$rounds # restore $rounds
# Eight-block loop body.  The first round is applied while the two
# remaining counter blocks are fetched from the stack; round keys are
# addressed relative to the key pointer biased by 0x80 (see the size
# optimization above).
1390 add \$8,$ctr # next counter value
1391 movdqa 0x60(%rsp),$inout6
1392 aesenc $rndkey1,$inout0
1394 movdqa 0x70(%rsp),$inout7
1395 aesenc $rndkey1,$inout1
1397 $movkey 0x20-0x80($key),$rndkey0
1398 aesenc $rndkey1,$inout2
1401 aesenc $rndkey1,$inout3
1402 mov %r9d,0x00+12(%rsp) # store next counter value
1404 aesenc $rndkey1,$inout4
1405 aesenc $rndkey1,$inout5
1406 aesenc $rndkey1,$inout6
1407 aesenc $rndkey1,$inout7
1408 $movkey 0x30-0x80($key),$rndkey1
# Perl loop: emits six further rounds for all eight blocks, alternating
# between the two round-key registers, with one counter-word store
# slipped into each round.
1410 for($i=2;$i<8;$i++) {
1411 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
1414 aesenc $rndkeyx,$inout0
1415 aesenc $rndkeyx,$inout1
1418 aesenc $rndkeyx,$inout2
1419 aesenc $rndkeyx,$inout3
1420 mov %r9d,`0x10*($i-1)`+12(%rsp)
1422 aesenc $rndkeyx,$inout4
1423 aesenc $rndkeyx,$inout5
1424 aesenc $rndkeyx,$inout6
1425 aesenc $rndkeyx,$inout7
1426 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
# One more round for all eight blocks, overlapped with the first input
# load and the last counter-word store.
1431 aesenc $rndkey0,$inout0
1432 aesenc $rndkey0,$inout1
1433 aesenc $rndkey0,$inout2
1435 movdqu 0x00($inp),$in0 # start loading input
1436 aesenc $rndkey0,$inout3
1437 mov %r9d,0x70+12(%rsp)
1439 aesenc $rndkey0,$inout4
1440 aesenc $rndkey0,$inout5
1441 aesenc $rndkey0,$inout6
1442 aesenc $rndkey0,$inout7
1443 $movkey 0xa0-0x80($key),$rndkey0
# Further rounds using key offsets 0xb0..0xe0.
# NOTE(review): presumably these are the extra rounds required by the
# longer key sizes and are skipped for shorter ones -- confirm.
1447 aesenc $rndkey1,$inout0
1448 aesenc $rndkey1,$inout1
1449 aesenc $rndkey1,$inout2
1450 aesenc $rndkey1,$inout3
1451 aesenc $rndkey1,$inout4
1452 aesenc $rndkey1,$inout5
1453 aesenc $rndkey1,$inout6
1454 aesenc $rndkey1,$inout7
1455 $movkey 0xb0-0x80($key),$rndkey1
1457 aesenc $rndkey0,$inout0
1458 aesenc $rndkey0,$inout1
1459 aesenc $rndkey0,$inout2
1460 aesenc $rndkey0,$inout3
1461 aesenc $rndkey0,$inout4
1462 aesenc $rndkey0,$inout5
1463 aesenc $rndkey0,$inout6
1464 aesenc $rndkey0,$inout7
1465 $movkey 0xc0-0x80($key),$rndkey0
1468 aesenc $rndkey1,$inout0
1469 aesenc $rndkey1,$inout1
1470 aesenc $rndkey1,$inout2
1471 aesenc $rndkey1,$inout3
1472 aesenc $rndkey1,$inout4
1473 aesenc $rndkey1,$inout5
1474 aesenc $rndkey1,$inout6
1475 aesenc $rndkey1,$inout7
1476 $movkey 0xd0-0x80($key),$rndkey1
1478 aesenc $rndkey0,$inout0
1479 aesenc $rndkey0,$inout1
1480 aesenc $rndkey0,$inout2
1481 aesenc $rndkey0,$inout3
1482 aesenc $rndkey0,$inout4
1483 aesenc $rndkey0,$inout5
1484 aesenc $rndkey0,$inout6
1485 aesenc $rndkey0,$inout7
1486 $movkey 0xe0-0x80($key),$rndkey0
1487 jmp .Lctr32_enc_done
# Final stage: each input block is pre-XORed with the last round key and
# then used as the aesenclast operand, so the last AES round and the
# keystream XOR happen in a single instruction per block.
1491 movdqu 0x10($inp),$in1
1492 pxor $rndkey0,$in0 # input^=round[last]
1493 movdqu 0x20($inp),$in2
1495 movdqu 0x30($inp),$in3
1497 movdqu 0x40($inp),$in4
1499 movdqu 0x50($inp),$in5
1502 aesenc $rndkey1,$inout0
1503 aesenc $rndkey1,$inout1
1504 aesenc $rndkey1,$inout2
1505 aesenc $rndkey1,$inout3
1506 aesenc $rndkey1,$inout4
1507 aesenc $rndkey1,$inout5
1508 aesenc $rndkey1,$inout6
1509 aesenc $rndkey1,$inout7
1510 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1511 lea 0x80($inp),$inp # $inp+=8*16
1513 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1514 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1515 movdqu 0x70-0x80($inp),$in0
1516 aesenclast $in1,$inout1
1518 movdqa 0x00(%rsp),$in1 # load next counter block
1519 aesenclast $in2,$inout2
1520 aesenclast $in3,$inout3
1521 movdqa 0x10(%rsp),$in2
1522 movdqa 0x20(%rsp),$in3
1523 aesenclast $in4,$inout4
1524 aesenclast $in5,$inout5
1525 movdqa 0x30(%rsp),$in4
1526 movdqa 0x40(%rsp),$in5
1527 aesenclast $rndkey1,$inout6
1528 movdqa 0x50(%rsp),$rndkey0
1529 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1530 aesenclast $in0,$inout7
1532 movups $inout0,($out) # store 8 output blocks
1534 movups $inout1,0x10($out)
1536 movups $inout2,0x20($out)
1538 movups $inout3,0x30($out)
1540 movups $inout4,0x40($out)
1542 movups $inout5,0x50($out)
1543 movdqa $rndkey0,$inout5
1544 movups $inout6,0x60($out)
1545 movups $inout7,0x70($out)
1546 lea 0x80($out),$out # $out+=8*16
1549 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1551 add \$8,$len # restore real remaining $len
1552 jz .Lctr32_done # done if ($len==0)
1553 lea -0x80($key),$key
1556 # note that at this point $inout0..5 are populated with
1557 # counter values xor-ed with 0-round key
1563 # if ($len>4) compute 7 E(counter)
1565 movdqa 0x60(%rsp),$inout6
1566 pxor $inout7,$inout7
1568 $movkey 16($key),$rndkey0
1569 aesenc $rndkey1,$inout0
1570 aesenc $rndkey1,$inout1
1571 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1573 aesenc $rndkey1,$inout2
1574 add \$16,%rax # prepare for .Lenc_loop8_enter
1576 aesenc $rndkey1,$inout3
1577 aesenc $rndkey1,$inout4
1578 movups 0x10($inp),$in1 # pre-load input
1579 movups 0x20($inp),$in2
1580 aesenc $rndkey1,$inout5
1581 aesenc $rndkey1,$inout6
# The shared eight-block round loop is entered part-way through, after
# the first round has already been applied above.
1583 call .Lenc_loop8_enter
# Store five blocks unconditionally, then a sixth and seventh depending
# on the flags from a length comparison.
1585 movdqu 0x30($inp),$in3
1587 movdqu 0x40($inp),$in0
1589 movdqu $inout0,($out) # store output
1591 movdqu $inout1,0x10($out)
1593 movdqu $inout2,0x20($out)
1595 movdqu $inout3,0x30($out)
1596 movdqu $inout4,0x40($out)
1598 jb .Lctr32_done # $len was 5, stop store
1600 movups 0x50($inp),$in1
1602 movups $inout5,0x50($out)
1603 je .Lctr32_done # $len was 6, stop store
1605 movups 0x60($inp),$in2
1607 movups $inout6,0x60($out)
1608 jmp .Lctr32_done # $len was 7, stop store
# Tail for exactly four blocks: a four-wide round loop followed by the
# last round and the stores.
1612 aesenc $rndkey1,$inout0
1615 aesenc $rndkey1,$inout1
1616 aesenc $rndkey1,$inout2
1617 aesenc $rndkey1,$inout3
1618 $movkey ($key),$rndkey1
1620 aesenclast $rndkey1,$inout0
1621 aesenclast $rndkey1,$inout1
1622 movups ($inp),$in0 # load input
1623 movups 0x10($inp),$in1
1624 aesenclast $rndkey1,$inout2
1625 aesenclast $rndkey1,$inout3
1626 movups 0x20($inp),$in2
1627 movups 0x30($inp),$in3
1630 movups $inout0,($out) # store output
1632 movups $inout1,0x10($out)
1634 movdqu $inout2,0x20($out)
1636 movdqu $inout3,0x30($out)
1637 jmp .Lctr32_done # $len was 4, stop store
# Tail for one to three blocks: always computes three keystream blocks
# and stores only as many as the length requires.
1641 aesenc $rndkey1,$inout0
1644 aesenc $rndkey1,$inout1
1645 aesenc $rndkey1,$inout2
1646 $movkey ($key),$rndkey1
1648 aesenclast $rndkey1,$inout0
1649 aesenclast $rndkey1,$inout1
1650 aesenclast $rndkey1,$inout2
1652 movups ($inp),$in0 # load input
1654 movups $inout0,($out) # store output
1656 jb .Lctr32_done # $len was 1, stop store
1658 movups 0x10($inp),$in1
1660 movups $inout1,0x10($out)
1661 je .Lctr32_done # $len was 2, stop store
1663 movups 0x20($inp),$in2
1665 movups $inout2,0x20($out) # $len was 3, stop store
# Common exit: wipe the xmm registers and the counter slots on the stack
# so that no key-dependent material is left behind.
1668 xorps %xmm0,%xmm0 # clear register bank
1676 $code.=<<___ if (!$win64);
1679 movaps %xmm0,0x00(%rsp) # clear stack
1681 movaps %xmm0,0x10(%rsp)
1683 movaps %xmm0,0x20(%rsp)
1685 movaps %xmm0,0x30(%rsp)
1687 movaps %xmm0,0x40(%rsp)
1689 movaps %xmm0,0x50(%rsp)
1691 movaps %xmm0,0x60(%rsp)
1693 movaps %xmm0,0x70(%rsp)
# Win64 only: restore %xmm6-%xmm15 from the save area, zeroing each save
# slot right after it is read, then zero the counter slots.
1696 $code.=<<___ if ($win64);
1697 movaps -0xa8($key_),%xmm6
1698 movaps %xmm0,-0xa8($key_) # clear stack
1699 movaps -0x98($key_),%xmm7
1700 movaps %xmm0,-0x98($key_)
1701 movaps -0x88($key_),%xmm8
1702 movaps %xmm0,-0x88($key_)
1703 movaps -0x78($key_),%xmm9
1704 movaps %xmm0,-0x78($key_)
1705 movaps -0x68($key_),%xmm10
1706 movaps %xmm0,-0x68($key_)
1707 movaps -0x58($key_),%xmm11
1708 movaps %xmm0,-0x58($key_)
1709 movaps -0x48($key_),%xmm12
1710 movaps %xmm0,-0x48($key_)
1711 movaps -0x38($key_),%xmm13
1712 movaps %xmm0,-0x38($key_)
1713 movaps -0x28($key_),%xmm14
1714 movaps %xmm0,-0x28($key_)
1715 movaps -0x18($key_),%xmm15
1716 movaps %xmm0,-0x18($key_)
1717 movaps %xmm0,0x00(%rsp)
1718 movaps %xmm0,0x10(%rsp)
1719 movaps %xmm0,0x20(%rsp)
1720 movaps %xmm0,0x30(%rsp)
1721 movaps %xmm0,0x40(%rsp)
1722 movaps %xmm0,0x50(%rsp)
1723 movaps %xmm0,0x60(%rsp)
1724 movaps %xmm0,0x70(%rsp)
1730 .cfi_def_cfa_register %rsp
1734 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1738 ######################################################################
1739 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1740 # const AES_KEY *key1, const AES_KEY *key2,
1741 # const unsigned char iv[16]);
# Register plan for the XTS routines: six tweak registers
# (%xmm10..%xmm15), %xmm8 for the tweak mask and %xmm9 for the carry
# helper; the temporary shares a register with the fifth tweak.  The
# length backup reuses %r9, the register the iv pointer arrives in, so it
# may only be written once the iv has been loaded.  The frame is 0x70
# bytes (six tweak slots plus one slot for round[0]^round[last]) plus 160
# bytes on Win64 for saved xmm registers.
1744 my @tweak=map("%xmm$_",(10..15));
1745 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
1746 my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
1747 my $frame_size = 0x70 + ($win64?160:0);
1748 my $key_ = "%rbp"; # override so that we can use %r11 as FP
1751 .globl aesni_xts_encrypt
1752 .type aesni_xts_encrypt,\@function,6
# Keep the incoming stack pointer in %r11, reserve the frame and align it
# to 16 bytes so that aligned moves can be used on the tweak slots.
1756 lea (%rsp),%r11 # frame pointer
1757 .cfi_def_cfa_register %r11
1760 sub \$$frame_size,%rsp
1761 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
# Win64 only: %xmm6-%xmm15 are callee-saved under that ABI; park them
# just below the saved stack pointer.
1763 $code.=<<___ if ($win64);
1764 movaps %xmm6,-0xa8(%r11) # offload everything
1765 movaps %xmm7,-0x98(%r11)
1766 movaps %xmm8,-0x88(%r11)
1767 movaps %xmm9,-0x78(%r11)
1768 movaps %xmm10,-0x68(%r11)
1769 movaps %xmm11,-0x58(%r11)
1770 movaps %xmm12,-0x48(%r11)
1771 movaps %xmm13,-0x38(%r11)
1772 movaps %xmm14,-0x28(%r11)
1773 movaps %xmm15,-0x18(%r11)
1777 movups ($ivp),$inout0 # load clear-text tweak
1778 mov 240(%r8),$rounds # key2->rounds
1779 mov 240($key),$rnds_ # key1->rounds
1781 # generate the tweak
1782 &aesni_generate1("enc",$key2,$rounds,$inout0);
# From here on only key1 is used.  Its first and last round keys are
# combined so that the tweak XOR can later be folded into the last round.
1784 $movkey ($key),$rndkey0 # zero round key
1785 mov $key,$key_ # backup $key
1786 mov $rnds_,$rounds # backup $rounds
1788 mov $len,$len_ # backup $len
1791 $movkey 16($key,$rnds_),$rndkey1 # last round key
1793 movdqa .Lxts_magic(%rip),$twmask
1794 movdqa $inout0,@tweak[5]
1795 pshufd \$0x5f,$inout0,$twres
1796 pxor $rndkey0,$rndkey1
1798 # alternative tweak calculation algorithm is based on suggestions
1799 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1800 # and should help in the future...
# Perl loop: derives four successive tweaks; each stored copy is already
# XORed with the round-0 key.
1801 for ($i=0;$i<4;$i++) {
1803 movdqa $twres,$twtmp
1805 movdqa @tweak[5],@tweak[$i]
1806 psrad \$31,$twtmp # broadcast upper bits
1807 paddq @tweak[5],@tweak[5]
1809 pxor $rndkey0,@tweak[$i]
1810 pxor $twtmp,@tweak[5]
# Fifth tweak, handled outside the loop because its register doubles as
# the loop temporary.
1814 movdqa @tweak[5],@tweak[4]
1816 paddq @tweak[5],@tweak[5]
1818 pxor $rndkey0,@tweak[4]
1819 pxor $twres,@tweak[5]
1820 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1823 jc .Lxts_enc_short # if $len-=6*16 borrowed
1826 lea 32($key_,$rnds_),$key # end of key schedule
1827 sub %r10,%rax # twisted $rounds
1828 $movkey 16($key_),$rndkey1
1829 mov %rax,%r10 # backup twisted $rounds
1830 lea .Lxts_magic(%rip),%r8
1831 jmp .Lxts_enc_grandloop
# Main loop: six blocks per iteration.  The tweak XOR is merged with the
# round-0 key on the way in and with the last round key on the way out,
# so no separate whitening instructions are needed.
1834 .Lxts_enc_grandloop:
1835 movdqu `16*0`($inp),$inout0 # load input
1836 movdqa $rndkey0,$twmask
1837 movdqu `16*1`($inp),$inout1
1838 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1839 movdqu `16*2`($inp),$inout2
1840 pxor @tweak[1],$inout1
1841 aesenc $rndkey1,$inout0
1842 movdqu `16*3`($inp),$inout3
1843 pxor @tweak[2],$inout2
1844 aesenc $rndkey1,$inout1
1845 movdqu `16*4`($inp),$inout4
1846 pxor @tweak[3],$inout3
1847 aesenc $rndkey1,$inout2
1848 movdqu `16*5`($inp),$inout5
1849 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1850 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1851 pxor @tweak[4],$inout4
1852 aesenc $rndkey1,$inout3
1853 $movkey 32($key_),$rndkey0
1854 lea `16*6`($inp),$inp
1855 pxor $twmask,$inout5
1857 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1858 aesenc $rndkey1,$inout4
1859 pxor $twres,@tweak[1]
1860 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1861 aesenc $rndkey1,$inout5
1862 $movkey 48($key_),$rndkey1
1863 pxor $twres,@tweak[2]
1865 aesenc $rndkey0,$inout0
1866 pxor $twres,@tweak[3]
1867 movdqa @tweak[1],`16*1`(%rsp)
1868 aesenc $rndkey0,$inout1
1869 pxor $twres,@tweak[4]
1870 movdqa @tweak[2],`16*2`(%rsp)
1871 aesenc $rndkey0,$inout2
1872 aesenc $rndkey0,$inout3
1874 movdqa @tweak[4],`16*4`(%rsp)
1875 aesenc $rndkey0,$inout4
1876 aesenc $rndkey0,$inout5
1877 $movkey 64($key_),$rndkey0
1878 movdqa $twmask,`16*5`(%rsp)
1879 pshufd \$0x5f,@tweak[5],$twres
# Middle rounds: plain six-wide AES rounds, with round keys addressed
# from the end of the schedule via the running offset in %rax.
1883 aesenc $rndkey1,$inout0
1884 aesenc $rndkey1,$inout1
1885 aesenc $rndkey1,$inout2
1886 aesenc $rndkey1,$inout3
1887 aesenc $rndkey1,$inout4
1888 aesenc $rndkey1,$inout5
1889 $movkey -64($key,%rax),$rndkey1
1892 aesenc $rndkey0,$inout0
1893 aesenc $rndkey0,$inout1
1894 aesenc $rndkey0,$inout2
1895 aesenc $rndkey0,$inout3
1896 aesenc $rndkey0,$inout4
1897 aesenc $rndkey0,$inout5
1898 $movkey -80($key,%rax),$rndkey0
# Closing rounds, interleaved with deriving the six tweaks for the next
# iteration; each new tweak is XORed into a copy of the round-0 key.
1901 movdqa (%r8),$twmask # start calculating next tweak
1902 movdqa $twres,$twtmp
1904 aesenc $rndkey1,$inout0
1905 paddq @tweak[5],@tweak[5]
1907 aesenc $rndkey1,$inout1
1909 $movkey ($key_),@tweak[0] # load round[0]
1910 aesenc $rndkey1,$inout2
1911 aesenc $rndkey1,$inout3
1912 aesenc $rndkey1,$inout4
1913 pxor $twtmp,@tweak[5]
1914 movaps @tweak[0],@tweak[1] # copy round[0]
1915 aesenc $rndkey1,$inout5
1916 $movkey -64($key),$rndkey1
1918 movdqa $twres,$twtmp
1919 aesenc $rndkey0,$inout0
1921 pxor @tweak[5],@tweak[0]
1922 aesenc $rndkey0,$inout1
1924 paddq @tweak[5],@tweak[5]
1925 aesenc $rndkey0,$inout2
1926 aesenc $rndkey0,$inout3
1928 movaps @tweak[1],@tweak[2]
1929 aesenc $rndkey0,$inout4
1930 pxor $twtmp,@tweak[5]
1931 movdqa $twres,$twtmp
1932 aesenc $rndkey0,$inout5
1933 $movkey -48($key),$rndkey0
1936 aesenc $rndkey1,$inout0
1937 pxor @tweak[5],@tweak[1]
1939 aesenc $rndkey1,$inout1
1940 paddq @tweak[5],@tweak[5]
1942 aesenc $rndkey1,$inout2
1943 aesenc $rndkey1,$inout3
1944 movdqa @tweak[3],`16*3`(%rsp)
1945 pxor $twtmp,@tweak[5]
1946 aesenc $rndkey1,$inout4
1947 movaps @tweak[2],@tweak[3]
1948 movdqa $twres,$twtmp
1949 aesenc $rndkey1,$inout5
1950 $movkey -32($key),$rndkey1
1953 aesenc $rndkey0,$inout0
1954 pxor @tweak[5],@tweak[2]
1956 aesenc $rndkey0,$inout1
1957 paddq @tweak[5],@tweak[5]
1959 aesenc $rndkey0,$inout2
1960 aesenc $rndkey0,$inout3
1961 aesenc $rndkey0,$inout4
1962 pxor $twtmp,@tweak[5]
1963 movaps @tweak[3],@tweak[4]
1964 aesenc $rndkey0,$inout5
1966 movdqa $twres,$rndkey0
1968 aesenc $rndkey1,$inout0
1969 pxor @tweak[5],@tweak[3]
1971 aesenc $rndkey1,$inout1
1972 paddq @tweak[5],@tweak[5]
1973 pand $twmask,$rndkey0
1974 aesenc $rndkey1,$inout2
1975 aesenc $rndkey1,$inout3
1976 pxor $rndkey0,@tweak[5]
1977 $movkey ($key_),$rndkey0
1978 aesenc $rndkey1,$inout4
1979 aesenc $rndkey1,$inout5
1980 $movkey 16($key_),$rndkey1
# Last round: the operand comes from the stack slots holding
# tweak^round[last], so the output whitening is applied here as well.
1982 pxor @tweak[5],@tweak[4]
1983 aesenclast `16*0`(%rsp),$inout0
1985 paddq @tweak[5],@tweak[5]
1986 aesenclast `16*1`(%rsp),$inout1
1987 aesenclast `16*2`(%rsp),$inout2
1989 mov %r10,%rax # restore $rounds
1990 aesenclast `16*3`(%rsp),$inout3
1991 aesenclast `16*4`(%rsp),$inout4
1992 aesenclast `16*5`(%rsp),$inout5
1993 pxor $twres,@tweak[5]
1995 lea `16*6`($out),$out # $out+=6*16
1996 movups $inout0,`-16*6`($out) # store 6 output blocks
1997 movups $inout1,`-16*5`($out)
1998 movups $inout2,`-16*4`($out)
1999 movups $inout3,`-16*3`($out)
2000 movups $inout4,`-16*2`($out)
2001 movups $inout5,`-16*1`($out)
2003 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
2007 mov $key_,$key # restore $key
2008 shr \$4,$rounds # restore original value
# Remainder handling: the round-0 key is stripped from the tweaks again
# (one pxor per tweak, interleaved with the length tests) because the
# helper routines used below apply round 0 themselves.
2011 # at this point @tweak[0..5] are populated with tweak values
2012 mov $rounds,$rnds_ # backup $rounds
2013 pxor $rndkey0,@tweak[0]
2014 add \$16*6,$len # restore real remaining $len
2015 jz .Lxts_enc_done # done if ($len==0)
2017 pxor $rndkey0,@tweak[1]
2019 jb .Lxts_enc_one # $len is 1*16
2020 pxor $rndkey0,@tweak[2]
2021 je .Lxts_enc_two # $len is 2*16
2023 pxor $rndkey0,@tweak[3]
2025 jb .Lxts_enc_three # $len is 3*16
2026 pxor $rndkey0,@tweak[4]
2027 je .Lxts_enc_four # $len is 4*16
# Five-block remainder: the sixth lane is zeroed and the six-block helper
# is used.  In every remainder case the next unused tweak is moved into
# the first tweak register, which the partial-block code later reads.
2029 movdqu ($inp),$inout0 # $len is 5*16
2030 movdqu 16*1($inp),$inout1
2031 movdqu 16*2($inp),$inout2
2032 pxor @tweak[0],$inout0
2033 movdqu 16*3($inp),$inout3
2034 pxor @tweak[1],$inout1
2035 movdqu 16*4($inp),$inout4
2036 lea 16*5($inp),$inp # $inp+=5*16
2037 pxor @tweak[2],$inout2
2038 pxor @tweak[3],$inout3
2039 pxor @tweak[4],$inout4
2040 pxor $inout5,$inout5
2042 call _aesni_encrypt6
2044 xorps @tweak[0],$inout0
2045 movdqa @tweak[5],@tweak[0]
2046 xorps @tweak[1],$inout1
2047 xorps @tweak[2],$inout2
2048 movdqu $inout0,($out) # store 5 output blocks
2049 xorps @tweak[3],$inout3
2050 movdqu $inout1,16*1($out)
2051 xorps @tweak[4],$inout4
2052 movdqu $inout2,16*2($out)
2053 movdqu $inout3,16*3($out)
2054 movdqu $inout4,16*4($out)
2055 lea 16*5($out),$out # $out+=5*16
# One-block remainder.
2060 movups ($inp),$inout0
2061 lea 16*1($inp),$inp # inp+=1*16
2062 xorps @tweak[0],$inout0
2064 &aesni_generate1("enc",$key,$rounds);
2066 xorps @tweak[0],$inout0
2067 movdqa @tweak[1],@tweak[0]
2068 movups $inout0,($out) # store one output block
2069 lea 16*1($out),$out # $out+=1*16
# Two-block remainder.
2074 movups ($inp),$inout0
2075 movups 16($inp),$inout1
2076 lea 32($inp),$inp # $inp+=2*16
2077 xorps @tweak[0],$inout0
2078 xorps @tweak[1],$inout1
2080 call _aesni_encrypt2
2082 xorps @tweak[0],$inout0
2083 movdqa @tweak[2],@tweak[0]
2084 xorps @tweak[1],$inout1
2085 movups $inout0,($out) # store 2 output blocks
2086 movups $inout1,16*1($out)
2087 lea 16*2($out),$out # $out+=2*16
# Three-block remainder.
2092 movups ($inp),$inout0
2093 movups 16*1($inp),$inout1
2094 movups 16*2($inp),$inout2
2095 lea 16*3($inp),$inp # $inp+=3*16
2096 xorps @tweak[0],$inout0
2097 xorps @tweak[1],$inout1
2098 xorps @tweak[2],$inout2
2100 call _aesni_encrypt3
2102 xorps @tweak[0],$inout0
2103 movdqa @tweak[3],@tweak[0]
2104 xorps @tweak[1],$inout1
2105 xorps @tweak[2],$inout2
2106 movups $inout0,($out) # store 3 output blocks
2107 movups $inout1,16*1($out)
2108 movups $inout2,16*2($out)
2109 lea 16*3($out),$out # $out+=3*16
# Four-block remainder.
2114 movups ($inp),$inout0
2115 movups 16*1($inp),$inout1
2116 movups 16*2($inp),$inout2
2117 xorps @tweak[0],$inout0
2118 movups 16*3($inp),$inout3
2119 lea 16*4($inp),$inp # $inp+=4*16
2120 xorps @tweak[1],$inout1
2121 xorps @tweak[2],$inout2
2122 xorps @tweak[3],$inout3
2124 call _aesni_encrypt4
2126 pxor @tweak[0],$inout0
2127 movdqa @tweak[4],@tweak[0]
2128 pxor @tweak[1],$inout1
2129 pxor @tweak[2],$inout2
2130 movdqu $inout0,($out) # store 4 output blocks
2131 pxor @tweak[3],$inout3
2132 movdqu $inout1,16*1($out)
2133 movdqu $inout2,16*2($out)
2134 movdqu $inout3,16*3($out)
2135 lea 16*4($out),$out # $out+=4*16
# Trailing partial block (XTS ciphertext stealing): bytes are exchanged
# between the input tail and the previous output block, after which that
# previous block is encrypted once more with the tweak kept in the first
# tweak register.
2140 and \$15,$len_ # see if $len%16 is 0
2145 movzb ($inp),%eax # borrow $rounds ...
2146 movzb -16($out),%ecx # ... and $key
2154 sub $len_,$out # rewind $out
2155 mov $key_,$key # restore $key
2156 mov $rnds_,$rounds # restore $rounds
2158 movups -16($out),$inout0
2159 xorps @tweak[0],$inout0
2161 &aesni_generate1("enc",$key,$rounds);
2163 xorps @tweak[0],$inout0
2164 movups $inout0,-16($out)
# Common exit: wipe the xmm registers and the scratch slots on the stack
# so that no key- or tweak-dependent material is left behind.
2167 xorps %xmm0,%xmm0 # clear register bank
2174 $code.=<<___ if (!$win64);
2177 movaps %xmm0,0x00(%rsp) # clear stack
2179 movaps %xmm0,0x10(%rsp)
2181 movaps %xmm0,0x20(%rsp)
2183 movaps %xmm0,0x30(%rsp)
2185 movaps %xmm0,0x40(%rsp)
2187 movaps %xmm0,0x50(%rsp)
2189 movaps %xmm0,0x60(%rsp)
# Win64 only: restore %xmm6-%xmm15 from the save area, zeroing each save
# slot right after it is read, then zero the scratch slots.
2193 $code.=<<___ if ($win64);
2194 movaps -0xa8(%r11),%xmm6
2195 movaps %xmm0,-0xa8(%r11) # clear stack
2196 movaps -0x98(%r11),%xmm7
2197 movaps %xmm0,-0x98(%r11)
2198 movaps -0x88(%r11),%xmm8
2199 movaps %xmm0,-0x88(%r11)
2200 movaps -0x78(%r11),%xmm9
2201 movaps %xmm0,-0x78(%r11)
2202 movaps -0x68(%r11),%xmm10
2203 movaps %xmm0,-0x68(%r11)
2204 movaps -0x58(%r11),%xmm11
2205 movaps %xmm0,-0x58(%r11)
2206 movaps -0x48(%r11),%xmm12
2207 movaps %xmm0,-0x48(%r11)
2208 movaps -0x38(%r11),%xmm13
2209 movaps %xmm0,-0x38(%r11)
2210 movaps -0x28(%r11),%xmm14
2211 movaps %xmm0,-0x28(%r11)
2212 movaps -0x18(%r11),%xmm15
2213 movaps %xmm0,-0x18(%r11)
2214 movaps %xmm0,0x00(%rsp)
2215 movaps %xmm0,0x10(%rsp)
2216 movaps %xmm0,0x20(%rsp)
2217 movaps %xmm0,0x30(%rsp)
2218 movaps %xmm0,0x40(%rsp)
2219 movaps %xmm0,0x50(%rsp)
2220 movaps %xmm0,0x60(%rsp)
2226 .cfi_def_cfa_register %rsp
2230 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2234 .globl aesni_xts_decrypt
2235 .type aesni_xts_decrypt,\@function,6
2239 lea (%rsp),%r11 # frame pointer
2240 .cfi_def_cfa_register %r11
2243 sub \$$frame_size,%rsp
2244 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2246 $code.=<<___ if ($win64);
2247 movaps %xmm6,-0xa8(%r11) # offload everything
2248 movaps %xmm7,-0x98(%r11)
2249 movaps %xmm8,-0x88(%r11)
2250 movaps %xmm9,-0x78(%r11)
2251 movaps %xmm10,-0x68(%r11)
2252 movaps %xmm11,-0x58(%r11)
2253 movaps %xmm12,-0x48(%r11)
2254 movaps %xmm13,-0x38(%r11)
2255 movaps %xmm14,-0x28(%r11)
2256 movaps %xmm15,-0x18(%r11)
2260 movups ($ivp),$inout0 # load clear-text tweak
2261 mov 240($key2),$rounds # key2->rounds
2262 mov 240($key),$rnds_ # key1->rounds
2264 # generate the tweak
2265 &aesni_generate1("enc",$key2,$rounds,$inout0);
2267 xor %eax,%eax # if ($len%16) len-=16;
2273 $movkey ($key),$rndkey0 # zero round key
2274 mov $key,$key_ # backup $key
2275 mov $rnds_,$rounds # backup $rounds
2277 mov $len,$len_ # backup $len
2280 $movkey 16($key,$rnds_),$rndkey1 # last round key
2282 movdqa .Lxts_magic(%rip),$twmask
2283 movdqa $inout0,@tweak[5]
2284 pshufd \$0x5f,$inout0,$twres
2285 pxor $rndkey0,$rndkey1
2287 for ($i=0;$i<4;$i++) {
2289 movdqa $twres,$twtmp
2291 movdqa @tweak[5],@tweak[$i]
2292 psrad \$31,$twtmp # broadcast upper bits
2293 paddq @tweak[5],@tweak[5]
2295 pxor $rndkey0,@tweak[$i]
2296 pxor $twtmp,@tweak[5]
2300 movdqa @tweak[5],@tweak[4]
2302 paddq @tweak[5],@tweak[5]
2304 pxor $rndkey0,@tweak[4]
2305 pxor $twres,@tweak[5]
2306 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2309 jc .Lxts_dec_short # if $len-=6*16 borrowed
2312 lea 32($key_,$rnds_),$key # end of key schedule
2313 sub %r10,%rax # twisted $rounds
2314 $movkey 16($key_),$rndkey1
2315 mov %rax,%r10 # backup twisted $rounds
2316 lea .Lxts_magic(%rip),%r8
2317 jmp .Lxts_dec_grandloop
2320 .Lxts_dec_grandloop:
2321 movdqu `16*0`($inp),$inout0 # load input
2322 movdqa $rndkey0,$twmask
2323 movdqu `16*1`($inp),$inout1
2324 pxor @tweak[0],$inout0 # intput^=tweak^round[0]
2325 movdqu `16*2`($inp),$inout2
2326 pxor @tweak[1],$inout1
2327 aesdec $rndkey1,$inout0
2328 movdqu `16*3`($inp),$inout3
2329 pxor @tweak[2],$inout2
2330 aesdec $rndkey1,$inout1
2331 movdqu `16*4`($inp),$inout4
2332 pxor @tweak[3],$inout3
2333 aesdec $rndkey1,$inout2
2334 movdqu `16*5`($inp),$inout5
2335 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2336 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2337 pxor @tweak[4],$inout4
2338 aesdec $rndkey1,$inout3
2339 $movkey 32($key_),$rndkey0
2340 lea `16*6`($inp),$inp
2341 pxor $twmask,$inout5
2343 pxor $twres,@tweak[0] # calclulate tweaks^round[last]
2344 aesdec $rndkey1,$inout4
2345 pxor $twres,@tweak[1]
2346 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2347 aesdec $rndkey1,$inout5
2348 $movkey 48($key_),$rndkey1
2349 pxor $twres,@tweak[2]
2351 aesdec $rndkey0,$inout0
2352 pxor $twres,@tweak[3]
2353 movdqa @tweak[1],`16*1`(%rsp)
2354 aesdec $rndkey0,$inout1
2355 pxor $twres,@tweak[4]
2356 movdqa @tweak[2],`16*2`(%rsp)
2357 aesdec $rndkey0,$inout2
2358 aesdec $rndkey0,$inout3
2360 movdqa @tweak[4],`16*4`(%rsp)
2361 aesdec $rndkey0,$inout4
2362 aesdec $rndkey0,$inout5
2363 $movkey 64($key_),$rndkey0
2364 movdqa $twmask,`16*5`(%rsp)
2365 pshufd \$0x5f,@tweak[5],$twres
2369 aesdec $rndkey1,$inout0
2370 aesdec $rndkey1,$inout1
2371 aesdec $rndkey1,$inout2
2372 aesdec $rndkey1,$inout3
2373 aesdec $rndkey1,$inout4
2374 aesdec $rndkey1,$inout5
2375 $movkey -64($key,%rax),$rndkey1
2378 aesdec $rndkey0,$inout0
2379 aesdec $rndkey0,$inout1
2380 aesdec $rndkey0,$inout2
2381 aesdec $rndkey0,$inout3
2382 aesdec $rndkey0,$inout4
2383 aesdec $rndkey0,$inout5
2384 $movkey -80($key,%rax),$rndkey0
2387 movdqa (%r8),$twmask # start calculating next tweak
2388 movdqa $twres,$twtmp
2390 aesdec $rndkey1,$inout0
2391 paddq @tweak[5],@tweak[5]
2393 aesdec $rndkey1,$inout1
2395 $movkey ($key_),@tweak[0] # load round[0]
2396 aesdec $rndkey1,$inout2
2397 aesdec $rndkey1,$inout3
2398 aesdec $rndkey1,$inout4
2399 pxor $twtmp,@tweak[5]
2400 movaps @tweak[0],@tweak[1] # copy round[0]
2401 aesdec $rndkey1,$inout5
2402 $movkey -64($key),$rndkey1
2404 movdqa $twres,$twtmp
2405 aesdec $rndkey0,$inout0
2407 pxor @tweak[5],@tweak[0]
2408 aesdec $rndkey0,$inout1
2410 paddq @tweak[5],@tweak[5]
2411 aesdec $rndkey0,$inout2
2412 aesdec $rndkey0,$inout3
2414 movaps @tweak[1],@tweak[2]
2415 aesdec $rndkey0,$inout4
2416 pxor $twtmp,@tweak[5]
2417 movdqa $twres,$twtmp
2418 aesdec $rndkey0,$inout5
2419 $movkey -48($key),$rndkey0
2422 aesdec $rndkey1,$inout0
2423 pxor @tweak[5],@tweak[1]
2425 aesdec $rndkey1,$inout1
2426 paddq @tweak[5],@tweak[5]
2428 aesdec $rndkey1,$inout2
2429 aesdec $rndkey1,$inout3
2430 movdqa @tweak[3],`16*3`(%rsp)
2431 pxor $twtmp,@tweak[5]
2432 aesdec $rndkey1,$inout4
2433 movaps @tweak[2],@tweak[3]
2434 movdqa $twres,$twtmp
2435 aesdec $rndkey1,$inout5
2436 $movkey -32($key),$rndkey1
2439 aesdec $rndkey0,$inout0
2440 pxor @tweak[5],@tweak[2]
2442 aesdec $rndkey0,$inout1
2443 paddq @tweak[5],@tweak[5]
2445 aesdec $rndkey0,$inout2
2446 aesdec $rndkey0,$inout3
2447 aesdec $rndkey0,$inout4
2448 pxor $twtmp,@tweak[5]
2449 movaps @tweak[3],@tweak[4]
2450 aesdec $rndkey0,$inout5
2452 movdqa $twres,$rndkey0
2454 aesdec $rndkey1,$inout0
2455 pxor @tweak[5],@tweak[3]
2457 aesdec $rndkey1,$inout1
2458 paddq @tweak[5],@tweak[5]
2459 pand $twmask,$rndkey0
2460 aesdec $rndkey1,$inout2
2461 aesdec $rndkey1,$inout3
2462 pxor $rndkey0,@tweak[5]
2463 $movkey ($key_),$rndkey0
2464 aesdec $rndkey1,$inout4
2465 aesdec $rndkey1,$inout5
2466 $movkey 16($key_),$rndkey1
2468 pxor @tweak[5],@tweak[4]
2469 aesdeclast `16*0`(%rsp),$inout0
2471 paddq @tweak[5],@tweak[5]
2472 aesdeclast `16*1`(%rsp),$inout1
2473 aesdeclast `16*2`(%rsp),$inout2
2475 mov %r10,%rax # restore $rounds
2476 aesdeclast `16*3`(%rsp),$inout3
2477 aesdeclast `16*4`(%rsp),$inout4
2478 aesdeclast `16*5`(%rsp),$inout5
2479 pxor $twres,@tweak[5]
2481 lea `16*6`($out),$out # $out+=6*16
2482 movups $inout0,`-16*6`($out) # store 6 output blocks
2483 movups $inout1,`-16*5`($out)
2484 movups $inout2,`-16*4`($out)
2485 movups $inout3,`-16*3`($out)
2486 movups $inout4,`-16*2`($out)
2487 movups $inout5,`-16*1`($out)
2489 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2493 mov $key_,$key # restore $key
2494 shr \$4,$rounds # restore original value
2497 # at the point @tweak[0..5] are populated with tweak values
2498 mov $rounds,$rnds_ # backup $rounds
2499 pxor $rndkey0,@tweak[0]
2500 pxor $rndkey0,@tweak[1]
2501 add \$16*6,$len # restore real remaining $len
2502 jz .Lxts_dec_done # done if ($len==0)
2504 pxor $rndkey0,@tweak[2]
2506 jb .Lxts_dec_one # $len is 1*16
2507 pxor $rndkey0,@tweak[3]
2508 je .Lxts_dec_two # $len is 2*16
2510 pxor $rndkey0,@tweak[4]
2512 jb .Lxts_dec_three # $len is 3*16
2513 je .Lxts_dec_four # $len is 4*16
2515 movdqu ($inp),$inout0 # $len is 5*16
2516 movdqu 16*1($inp),$inout1
2517 movdqu 16*2($inp),$inout2
2518 pxor @tweak[0],$inout0
2519 movdqu 16*3($inp),$inout3
2520 pxor @tweak[1],$inout1
2521 movdqu 16*4($inp),$inout4
2522 lea 16*5($inp),$inp # $inp+=5*16
2523 pxor @tweak[2],$inout2
2524 pxor @tweak[3],$inout3
2525 pxor @tweak[4],$inout4
2527 call _aesni_decrypt6
2529 xorps @tweak[0],$inout0
2530 xorps @tweak[1],$inout1
2531 xorps @tweak[2],$inout2
2532 movdqu $inout0,($out) # store 5 output blocks
2533 xorps @tweak[3],$inout3
2534 movdqu $inout1,16*1($out)
2535 xorps @tweak[4],$inout4
2536 movdqu $inout2,16*2($out)
2538 movdqu $inout3,16*3($out)
2539 pcmpgtd @tweak[5],$twtmp
2540 movdqu $inout4,16*4($out)
2541 lea 16*5($out),$out # $out+=5*16
2542 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2546 movdqa @tweak[5],@tweak[0]
2547 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2548 pand $twmask,@tweak[1] # isolate carry and residue
2549 pxor @tweak[5],@tweak[1]
2554 movups ($inp),$inout0
2555 lea 16*1($inp),$inp # $inp+=1*16
2556 xorps @tweak[0],$inout0
2558 &aesni_generate1("dec",$key,$rounds);
2560 xorps @tweak[0],$inout0
2561 movdqa @tweak[1],@tweak[0]
2562 movups $inout0,($out) # store one output block
2563 movdqa @tweak[2],@tweak[1]
2564 lea 16*1($out),$out # $out+=1*16
2569 movups ($inp),$inout0
2570 movups 16($inp),$inout1
2571 lea 32($inp),$inp # $inp+=2*16
2572 xorps @tweak[0],$inout0
2573 xorps @tweak[1],$inout1
2575 call _aesni_decrypt2
2577 xorps @tweak[0],$inout0
2578 movdqa @tweak[2],@tweak[0]
2579 xorps @tweak[1],$inout1
2580 movdqa @tweak[3],@tweak[1]
2581 movups $inout0,($out) # store 2 output blocks
2582 movups $inout1,16*1($out)
2583 lea 16*2($out),$out # $out+=2*16
2588 movups ($inp),$inout0
2589 movups 16*1($inp),$inout1
2590 movups 16*2($inp),$inout2
2591 lea 16*3($inp),$inp # $inp+=3*16
2592 xorps @tweak[0],$inout0
2593 xorps @tweak[1],$inout1
2594 xorps @tweak[2],$inout2
2596 call _aesni_decrypt3
2598 xorps @tweak[0],$inout0
2599 movdqa @tweak[3],@tweak[0]
2600 xorps @tweak[1],$inout1
2601 movdqa @tweak[4],@tweak[1]
2602 xorps @tweak[2],$inout2
2603 movups $inout0,($out) # store 3 output blocks
2604 movups $inout1,16*1($out)
2605 movups $inout2,16*2($out)
2606 lea 16*3($out),$out # $out+=3*16
2611 movups ($inp),$inout0
2612 movups 16*1($inp),$inout1
2613 movups 16*2($inp),$inout2
2614 xorps @tweak[0],$inout0
2615 movups 16*3($inp),$inout3
2616 lea 16*4($inp),$inp # $inp+=4*16
2617 xorps @tweak[1],$inout1
2618 xorps @tweak[2],$inout2
2619 xorps @tweak[3],$inout3
2621 call _aesni_decrypt4
2623 pxor @tweak[0],$inout0
2624 movdqa @tweak[4],@tweak[0]
2625 pxor @tweak[1],$inout1
2626 movdqa @tweak[5],@tweak[1]
2627 pxor @tweak[2],$inout2
2628 movdqu $inout0,($out) # store 4 output blocks
2629 pxor @tweak[3],$inout3
2630 movdqu $inout1,16*1($out)
2631 movdqu $inout2,16*2($out)
2632 movdqu $inout3,16*3($out)
2633 lea 16*4($out),$out # $out+=4*16
2638 and \$15,$len_ # see if $len%16 is 0
2642 mov $key_,$key # restore $key
2643 mov $rnds_,$rounds # restore $rounds
2645 movups ($inp),$inout0
2646 xorps @tweak[1],$inout0
2648 &aesni_generate1("dec",$key,$rounds);
2650 xorps @tweak[1],$inout0
2651 movups $inout0,($out)
2654 movzb 16($inp),%eax # borrow $rounds ...
2655 movzb ($out),%ecx # ... and $key
2663 sub $len_,$out # rewind $out
2664 mov $key_,$key # restore $key
2665 mov $rnds_,$rounds # restore $rounds
2667 movups ($out),$inout0
2668 xorps @tweak[0],$inout0
2670 &aesni_generate1("dec",$key,$rounds);
2672 xorps @tweak[0],$inout0
2673 movups $inout0,($out)
2676 xorps %xmm0,%xmm0 # clear register bank
2683 $code.=<<___ if (!$win64);
2686 movaps %xmm0,0x00(%rsp) # clear stack
2688 movaps %xmm0,0x10(%rsp)
2690 movaps %xmm0,0x20(%rsp)
2692 movaps %xmm0,0x30(%rsp)
2694 movaps %xmm0,0x40(%rsp)
2696 movaps %xmm0,0x50(%rsp)
2698 movaps %xmm0,0x60(%rsp)
2702 $code.=<<___ if ($win64);
2703 movaps -0xa8(%r11),%xmm6
2704 movaps %xmm0,-0xa8(%r11) # clear stack
2705 movaps -0x98(%r11),%xmm7
2706 movaps %xmm0,-0x98(%r11)
2707 movaps -0x88(%r11),%xmm8
2708 movaps %xmm0,-0x88(%r11)
2709 movaps -0x78(%r11),%xmm9
2710 movaps %xmm0,-0x78(%r11)
2711 movaps -0x68(%r11),%xmm10
2712 movaps %xmm0,-0x68(%r11)
2713 movaps -0x58(%r11),%xmm11
2714 movaps %xmm0,-0x58(%r11)
2715 movaps -0x48(%r11),%xmm12
2716 movaps %xmm0,-0x48(%r11)
2717 movaps -0x38(%r11),%xmm13
2718 movaps %xmm0,-0x38(%r11)
2719 movaps -0x28(%r11),%xmm14
2720 movaps %xmm0,-0x28(%r11)
2721 movaps -0x18(%r11),%xmm15
2722 movaps %xmm0,-0x18(%r11)
2723 movaps %xmm0,0x00(%rsp)
2724 movaps %xmm0,0x10(%rsp)
2725 movaps %xmm0,0x20(%rsp)
2726 movaps %xmm0,0x30(%rsp)
2727 movaps %xmm0,0x40(%rsp)
2728 movaps %xmm0,0x50(%rsp)
2729 movaps %xmm0,0x60(%rsp)
2735 .cfi_def_cfa_register %rsp
2739 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2743 ######################################################################
2744 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2745 # const AES_KEY *key, unsigned int start_block_num,
2746 # unsigned char offset_i[16], const unsigned char L_[][16],
2747 # unsigned char checksum[16]);
# Register assignment shared by the OCB encrypt/decrypt routines.
# offset[0..5]: six per-block offset lanes (xmm10..xmm15). The routines keep
# the L_0 table entry in offset[0] and the running offset_i in offset[5]
# between batches.
2750 my @offset=map("%xmm$_",(10..15));
# checksum: running OCB checksum. rndkey0l: round[0] ^ round[last]; xoring it
# into an offset flips that offset between its "^ round[0]" form and its
# "^ round[last]" form.
2751 my ($checksum,$rndkey0l)=("%xmm8","%xmm9");
2752 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
# Pointers to the L_ table and to the checksum buffer; both are fetched from
# the stack as the 7th and 8th arguments.
2753 my ($L_p,$checksum_p) = ("%rbx","%rbp");
# Byte offsets used to index the L_ table. i1 is derived as
# ntz(block number) << 4; i3 and i5 are seeded from block_num+3 and
# block_num+5 and used as L_ indices in the same way.
2754 my ($i1,$i3,$i5) = ("%r12","%r13","%r14");
# Displacement (relative to rax) at which the 7th argument is read. The
# Win64 value is larger, presumably to skip the register home area and the
# stack slots of the 5th/6th arguments -- TODO confirm against the prologue.
2755 my $seventh_arg = $win64 ? 56 : 8;
# aesni_ocb_encrypt -- OCB-mode encryption of whole 16-byte blocks.
# Arguments, in order: inp, out, blocks, key, start_block_num,
# offset_i[16], L_[][16], checksum[16].
# While in registers the running offset is kept xored with round[last];
# round[last] is removed again before the offset is written back. The
# updated checksum and offset_i are stored through their pointers on exit.
# The main loop handles six blocks per iteration; separate paths cover a
# possible single leading block (chosen by the parity of the first block
# number) and one to five trailing blocks.
2759 .globl aesni_ocb_encrypt
2760 .type aesni_ocb_encrypt,\@function,6
2776 $code.=<<___ if ($win64);
# Win64 only: reserve 0xa0 bytes of stack and save xmm6..xmm15 there.
2777 lea -0xa0(%rsp),%rsp
2778 movaps %xmm6,0x00(%rsp) # offload everything
2779 movaps %xmm7,0x10(%rsp)
2780 movaps %xmm8,0x20(%rsp)
2781 movaps %xmm9,0x30(%rsp)
2782 movaps %xmm10,0x40(%rsp)
2783 movaps %xmm11,0x50(%rsp)
2784 movaps %xmm12,0x60(%rsp)
2785 movaps %xmm13,0x70(%rsp)
2786 movaps %xmm14,0x80(%rsp)
2787 movaps %xmm15,0x90(%rsp)
# Fetch the two stack-passed arguments (L_ table and checksum pointers).
2791 mov $seventh_arg(%rax),$L_p # 7th argument
2792 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
# Round count lives at byte offset 240 of the key schedule.
2794 mov 240($key),$rnds_
# Precompute round[0] ^ round[last] and fold round[last] into the running
# offset, so that the final AES round can also apply the offset.
2797 $movkey ($key),$rndkey0l # round[0]
2798 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2800 movdqu ($offset_p),@offset[5] # load last offset_i
2801 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2802 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
# Point key near the end of the schedule; rax becomes the round-key index
# ("twisted rounds") and r10 keeps a copy for restoring it later.
2805 lea 32($key_,$rnds_),$key
2806 $movkey 16($key_),$rndkey1 # round[1]
2807 sub %r10,%rax # twisted $rounds
2808 mov %rax,%r10 # backup twisted $rounds
2810 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2811 movdqu ($checksum_p),$checksum # load checksum
# Parity check on the first block number selects whether a single leading
# block is processed before the bulk path.
2813 test \$1,$block_num # is first block number odd?
# Single leading block: the L_ entry is staged in inout5, and inout5 is
# adopted as the new running offset once the block has been processed.
2819 movdqu ($L_p,$i1),$inout5 # borrow
2820 movdqu ($inp),$inout0
2825 movdqa $inout5,@offset[5]
2826 movups $inout0,($out)
# Prepare L_ table indices for the next batch and advance the block number
# by six.
2832 lea 1($block_num),$i1 # even-numbered blocks
2833 lea 3($block_num),$i3
2834 lea 5($block_num),$i5
2835 lea 6($block_num),$block_num
2836 bsf $i1,$i1 # ntz(block)
2839 shl \$4,$i1 # ntz(block) -> table offset
2845 jmp .Locb_enc_grandloop
# Main loop: six blocks are loaded, processed and stored per iteration.
2848 .Locb_enc_grandloop:
2849 movdqu `16*0`($inp),$inout0 # load input
2850 movdqu `16*1`($inp),$inout1
2851 movdqu `16*2`($inp),$inout2
2852 movdqu `16*3`($inp),$inout3
2853 movdqu `16*4`($inp),$inout4
2854 movdqu `16*5`($inp),$inout5
2855 lea `16*6`($inp),$inp
2859 movups $inout0,`16*0`($out) # store output
2860 movups $inout1,`16*1`($out)
2861 movups $inout2,`16*2`($out)
2862 movups $inout3,`16*3`($out)
2863 movups $inout4,`16*4`($out)
2864 movups $inout5,`16*5`($out)
2865 lea `16*6`($out),$out
2867 jnc .Locb_enc_grandloop
# Tail of five blocks: the sixth lane is zeroed (presumably so that it adds
# nothing to the checksum in the six-block helper) and offset[4] becomes
# the running offset.
2873 movdqu `16*0`($inp),$inout0
2876 movdqu `16*1`($inp),$inout1
2879 movdqu `16*2`($inp),$inout2
2882 movdqu `16*3`($inp),$inout3
2885 movdqu `16*4`($inp),$inout4
2886 pxor $inout5,$inout5
2890 movdqa @offset[4],@offset[5]
2891 movups $inout0,`16*0`($out)
2892 movups $inout1,`16*1`($out)
2893 movups $inout2,`16*2`($out)
2894 movups $inout3,`16*3`($out)
2895 movups $inout4,`16*4`($out)
# Tail of one block: L_0 is staged in inout5, and inout5 is adopted as the
# running offset afterwards.
2901 movdqa @offset[0],$inout5 # borrow
2905 movdqa $inout5,@offset[5]
2906 movups $inout0,`16*0`($out)
# Tail of two blocks: unused lanes 2 and 3 are zeroed; offset[1] becomes
# the running offset.
2911 pxor $inout2,$inout2
2912 pxor $inout3,$inout3
2916 movdqa @offset[1],@offset[5]
2917 movups $inout0,`16*0`($out)
2918 movups $inout1,`16*1`($out)
# Tail of three blocks: unused lane 3 is zeroed; offset[2] becomes the
# running offset.
2924 pxor $inout3,$inout3
2928 movdqa @offset[2],@offset[5]
2929 movups $inout0,`16*0`($out)
2930 movups $inout1,`16*1`($out)
2931 movups $inout2,`16*2`($out)
# Tail of four blocks: offset[3] becomes the running offset.
2939 movdqa @offset[3],@offset[5]
2940 movups $inout0,`16*0`($out)
2941 movups $inout1,`16*1`($out)
2942 movups $inout2,`16*2`($out)
2943 movups $inout3,`16*3`($out)
# Done: strip round[last] from the running offset and write the checksum
# and offset back to the caller's buffers.
2946 pxor $rndkey0,@offset[5] # "remove" round[last]
2947 movdqu $checksum,($checksum_p) # store checksum
2948 movdqu @offset[5],($offset_p) # store last offset_i
2950 xorps %xmm0,%xmm0 # clear register bank
2957 $code.=<<___ if (!$win64);
2971 $code.=<<___ if ($win64);
# Win64 epilogue: restore xmm6..xmm15 and overwrite each save slot with the
# zeroed xmm0 so no register contents remain on the stack.
2972 movaps 0x00(%rsp),%xmm6
2973 movaps %xmm0,0x00(%rsp) # clear stack
2974 movaps 0x10(%rsp),%xmm7
2975 movaps %xmm0,0x10(%rsp)
2976 movaps 0x20(%rsp),%xmm8
2977 movaps %xmm0,0x20(%rsp)
2978 movaps 0x30(%rsp),%xmm9
2979 movaps %xmm0,0x30(%rsp)
2980 movaps 0x40(%rsp),%xmm10
2981 movaps %xmm0,0x40(%rsp)
2982 movaps 0x50(%rsp),%xmm11
2983 movaps %xmm0,0x50(%rsp)
2984 movaps 0x60(%rsp),%xmm12
2985 movaps %xmm0,0x60(%rsp)
2986 movaps 0x70(%rsp),%xmm13
2987 movaps %xmm0,0x70(%rsp)
2988 movaps 0x80(%rsp),%xmm14
2989 movaps %xmm0,0x80(%rsp)
2990 movaps 0x90(%rsp),%xmm15
2991 movaps %xmm0,0x90(%rsp)
# rax = address just past the 0xa0-byte save area plus a further 0x28 bytes
# (presumably the pushed general-purpose registers -- TODO confirm).
2992 lea 0xa0+0x28(%rsp),%rax
3007 .cfi_def_cfa_register %rsp
3011 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
# __ocb_encrypt6 -- internal helper: OCB-encrypt the six blocks held in
# inout0..inout5, in place.
# On entry: offset[5] = running offset ^ round[last], offset[0] = L_0,
# i1/i3/i5 = byte offsets into the L_ table, rndkey1 = round[1],
# rndkey0l = round[0] ^ round[last], rax = round-key index.
# The six per-block offsets are built as a xor chain:
#   offset[0] = running ^ L_0        offset[1] = offset[0] ^ L_[i1]
#   offset[2] = offset[1] ^ L_0      offset[3] = offset[2] ^ L_[i3]
#   offset[4] = offset[3] ^ L_0      offset[5] = offset[4] ^ L_[i5]
# Each input block is xored into the checksum before it is whitened with
# its offset (which already includes round[0]).
# On exit: offset[0] is reloaded with L_0, rndkey1 with round[1], rax is
# restored from r10, and offset[5] holds the last block's offset in its
# "^ round[last]" form.
3013 .type __ocb_encrypt6,\@abi-omnipotent
3016 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3017 movdqu ($L_p,$i1),@offset[1]
3018 movdqa @offset[0],@offset[2]
3019 movdqu ($L_p,$i3),@offset[3]
3020 movdqa @offset[0],@offset[4]
3021 pxor @offset[5],@offset[0]
3022 movdqu ($L_p,$i5),@offset[5]
3023 pxor @offset[0],@offset[1]
3024 pxor $inout0,$checksum # accumulate checksum
3025 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3026 pxor @offset[1],@offset[2]
3027 pxor $inout1,$checksum
3028 pxor @offset[1],$inout1
3029 pxor @offset[2],@offset[3]
3030 pxor $inout2,$checksum
3031 pxor @offset[2],$inout2
3032 pxor @offset[3],@offset[4]
3033 pxor $inout3,$checksum
3034 pxor @offset[3],$inout3
3035 pxor @offset[4],@offset[5]
3036 pxor $inout4,$checksum
3037 pxor @offset[4],$inout4
3038 pxor $inout5,$checksum
3039 pxor @offset[5],$inout5
3040 $movkey 32($key_),$rndkey0
# Start computing the L_ table indices for the next batch of six blocks.
3042 lea 1($block_num),$i1 # even-numbered blocks
3043 lea 3($block_num),$i3
3044 lea 5($block_num),$i5
3046 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3047 bsf $i1,$i1 # ntz(block)
# Round with the key at key_+16, interleaved with switching the remaining
# offsets to their "^ round[last]" form.
3051 aesenc $rndkey1,$inout0
3052 aesenc $rndkey1,$inout1
3053 aesenc $rndkey1,$inout2
3054 aesenc $rndkey1,$inout3
3055 pxor $rndkey0l,@offset[1]
3056 pxor $rndkey0l,@offset[2]
3057 aesenc $rndkey1,$inout4
3058 pxor $rndkey0l,@offset[3]
3059 pxor $rndkey0l,@offset[4]
3060 aesenc $rndkey1,$inout5
3061 $movkey 48($key_),$rndkey1
3062 pxor $rndkey0l,@offset[5]
# Round with the key at key_+32.
3064 aesenc $rndkey0,$inout0
3065 aesenc $rndkey0,$inout1
3066 aesenc $rndkey0,$inout2
3067 aesenc $rndkey0,$inout3
3068 aesenc $rndkey0,$inout4
3069 aesenc $rndkey0,$inout5
3070 $movkey 64($key_),$rndkey0
3071 shl \$4,$i1 # ntz(block) -> table offset
# Round with the key at key_+48; later keys are fetched relative to key
# using rax as the index.
3077 aesenc $rndkey1,$inout0
3078 aesenc $rndkey1,$inout1
3079 aesenc $rndkey1,$inout2
3080 aesenc $rndkey1,$inout3
3081 aesenc $rndkey1,$inout4
3082 aesenc $rndkey1,$inout5
3083 $movkey ($key,%rax),$rndkey1
3086 aesenc $rndkey0,$inout0
3087 aesenc $rndkey0,$inout1
3088 aesenc $rndkey0,$inout2
3089 aesenc $rndkey0,$inout3
3090 aesenc $rndkey0,$inout4
3091 aesenc $rndkey0,$inout5
3092 $movkey -16($key,%rax),$rndkey0
3095 aesenc $rndkey1,$inout0
3096 aesenc $rndkey1,$inout1
3097 aesenc $rndkey1,$inout2
3098 aesenc $rndkey1,$inout3
3099 aesenc $rndkey1,$inout4
3100 aesenc $rndkey1,$inout5
3101 $movkey 16($key_),$rndkey1
# Final round: each offset register doubles as the last round key, since it
# holds offset ^ round[last]; this applies the last key and the output
# whitening in a single instruction.
3104 aesenclast @offset[0],$inout0
3105 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3106 mov %r10,%rax # restore twisted rounds
3107 aesenclast @offset[1],$inout1
3108 aesenclast @offset[2],$inout2
3109 aesenclast @offset[3],$inout3
3110 aesenclast @offset[4],$inout4
3111 aesenclast @offset[5],$inout5
3113 .size __ocb_encrypt6,.-__ocb_encrypt6
# __ocb_encrypt4 -- internal helper: OCB-encrypt the four blocks held in
# inout0..inout3, in place.
# On entry: offset[5] = running offset ^ round[last], offset[0] = L_0,
# i1/i3 = byte offsets into the L_ table, rndkey1 = round[1],
# rndkey0l = round[0] ^ round[last], rax = round-key index.
# Offsets are chained as
#   offset[0] = running ^ L_0        offset[1] = offset[0] ^ L_[i1]
#   offset[2] = offset[1] ^ L_0      offset[3] = offset[2] ^ L_[i3]
# and every input block is xored into the checksum before whitening.
# offset[5] is left in its "^ round[0]" form and is not advanced here.
# On exit: offset[0..3] hold the per-block offsets in "^ round[last]" form,
# rndkey1 is reloaded with round[1] and rax is restored from r10.
3115 .type __ocb_encrypt4,\@abi-omnipotent
3118 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3119 movdqu ($L_p,$i1),@offset[1]
3120 movdqa @offset[0],@offset[2]
3121 movdqu ($L_p,$i3),@offset[3]
3122 pxor @offset[5],@offset[0]
3123 pxor @offset[0],@offset[1]
3124 pxor $inout0,$checksum # accumulate checksum
3125 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3126 pxor @offset[1],@offset[2]
3127 pxor $inout1,$checksum
3128 pxor @offset[1],$inout1
3129 pxor @offset[2],@offset[3]
3130 pxor $inout2,$checksum
3131 pxor @offset[2],$inout2
3132 pxor $inout3,$checksum
3133 pxor @offset[3],$inout3
3134 $movkey 32($key_),$rndkey0
# Switch all four offsets to their "^ round[last]" form for the final round.
3136 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3137 pxor $rndkey0l,@offset[1]
3138 pxor $rndkey0l,@offset[2]
3139 pxor $rndkey0l,@offset[3]
# Rounds with the keys at key_+16, key_+32 and key_+48; later keys are
# fetched relative to key using rax as the index.
3141 aesenc $rndkey1,$inout0
3142 aesenc $rndkey1,$inout1
3143 aesenc $rndkey1,$inout2
3144 aesenc $rndkey1,$inout3
3145 $movkey 48($key_),$rndkey1
3147 aesenc $rndkey0,$inout0
3148 aesenc $rndkey0,$inout1
3149 aesenc $rndkey0,$inout2
3150 aesenc $rndkey0,$inout3
3151 $movkey 64($key_),$rndkey0
3156 aesenc $rndkey1,$inout0
3157 aesenc $rndkey1,$inout1
3158 aesenc $rndkey1,$inout2
3159 aesenc $rndkey1,$inout3
3160 $movkey ($key,%rax),$rndkey1
3163 aesenc $rndkey0,$inout0
3164 aesenc $rndkey0,$inout1
3165 aesenc $rndkey0,$inout2
3166 aesenc $rndkey0,$inout3
3167 $movkey -16($key,%rax),$rndkey0
3170 aesenc $rndkey1,$inout0
3171 aesenc $rndkey1,$inout1
3172 aesenc $rndkey1,$inout2
3173 aesenc $rndkey1,$inout3
3174 $movkey 16($key_),$rndkey1
3175 mov %r10,%rax # restore twisted rounds
# Final round with offset ^ round[last] as the key: last round key and
# output whitening in one step.
3177 aesenclast @offset[0],$inout0
3178 aesenclast @offset[1],$inout1
3179 aesenclast @offset[2],$inout2
3180 aesenclast @offset[3],$inout3
3182 .size __ocb_encrypt4,.-__ocb_encrypt4
# __ocb_encrypt1 -- internal helper: OCB-encrypt the single block held in
# inout0, in place.
# On entry: inout5 carries the L_ table entry for this block (the callers
# mark that load as "borrow"), offset[5] = running offset ^ round[last],
# rndkey1 = round[1], rndkey0l = round[0] ^ round[last], rax = round-key
# index.
# The input block is xored into the checksum before it is whitened.
# On exit: inout5 holds the new offset ^ round[last] (callers copy it into
# offset[5]), rndkey1 is reloaded with round[1], rax is restored from r10.
3184 .type __ocb_encrypt1,\@abi-omnipotent
3187 pxor @offset[5],$inout5 # offset_i
3188 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3189 pxor $inout0,$checksum # accumulate checksum
3190 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3191 $movkey 32($key_),$rndkey0
3193 aesenc $rndkey1,$inout0
3194 $movkey 48($key_),$rndkey1
3195 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3197 aesenc $rndkey0,$inout0
3198 $movkey 64($key_),$rndkey0
# Later round keys are fetched relative to key using rax as the index.
3203 aesenc $rndkey1,$inout0
3204 $movkey ($key,%rax),$rndkey1
3207 aesenc $rndkey0,$inout0
3208 $movkey -16($key,%rax),$rndkey0
3211 aesenc $rndkey1,$inout0
3212 $movkey 16($key_),$rndkey1 # redundant in tail
3213 mov %r10,%rax # restore twisted rounds
# Final round keyed with offset ^ round[last]: applies the last round key
# and the output whitening together.
3215 aesenclast $inout5,$inout0
3217 .size __ocb_encrypt1,.-__ocb_encrypt1
# aesni_ocb_decrypt -- OCB-mode decryption of whole 16-byte blocks.
# Arguments, in order: inp, out, blocks, key, start_block_num,
# offset_i[16], L_[][16], checksum[16].
# Structure mirrors the encrypt routine, with one difference: the checksum
# is accumulated here, over the decrypted output, right after each block is
# stored. While in registers the running offset is kept xored with
# round[last]; round[last] is removed before the offset is written back.
3219 .globl aesni_ocb_decrypt
3220 .type aesni_ocb_decrypt,\@function,6
3236 $code.=<<___ if ($win64);
# Win64 only: reserve 0xa0 bytes of stack and save xmm6..xmm15 there.
3237 lea -0xa0(%rsp),%rsp
3238 movaps %xmm6,0x00(%rsp) # offload everything
3239 movaps %xmm7,0x10(%rsp)
3240 movaps %xmm8,0x20(%rsp)
3241 movaps %xmm9,0x30(%rsp)
3242 movaps %xmm10,0x40(%rsp)
3243 movaps %xmm11,0x50(%rsp)
3244 movaps %xmm12,0x60(%rsp)
3245 movaps %xmm13,0x70(%rsp)
3246 movaps %xmm14,0x80(%rsp)
3247 movaps %xmm15,0x90(%rsp)
# Fetch the two stack-passed arguments (L_ table and checksum pointers).
3251 mov $seventh_arg(%rax),$L_p # 7th argument
3252 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
# Round count lives at byte offset 240 of the key schedule.
3254 mov 240($key),$rnds_
# Precompute round[0] ^ round[last] and fold round[last] into the running
# offset, so that the final AES round can also apply the offset.
3257 $movkey ($key),$rndkey0l # round[0]
3258 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3260 movdqu ($offset_p),@offset[5] # load last offset_i
3261 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3262 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
# Point key near the end of the schedule; rax becomes the round-key index
# ("twisted rounds") and r10 keeps a copy for restoring it later.
3265 lea 32($key_,$rnds_),$key
3266 $movkey 16($key_),$rndkey1 # round[1]
3267 sub %r10,%rax # twisted $rounds
3268 mov %rax,%r10 # backup twisted $rounds
3270 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3271 movdqu ($checksum_p),$checksum # load checksum
# Parity check on the first block number selects whether a single leading
# block is processed before the bulk path.
3273 test \$1,$block_num # is first block number odd?
# Single leading block: the L_ entry is staged in inout5, and inout5 is
# adopted as the new running offset once the block has been processed.
3279 movdqu ($L_p,$i1),$inout5 # borrow
3280 movdqu ($inp),$inout0
3285 movdqa $inout5,@offset[5]
3286 movups $inout0,($out)
3287 xorps $inout0,$checksum # accumulate checksum
# Prepare L_ table indices for the next batch and advance the block number
# by six.
3293 lea 1($block_num),$i1 # even-numbered blocks
3294 lea 3($block_num),$i3
3295 lea 5($block_num),$i5
3296 lea 6($block_num),$block_num
3297 bsf $i1,$i1 # ntz(block)
3300 shl \$4,$i1 # ntz(block) -> table offset
3306 jmp .Locb_dec_grandloop
# Main loop: six blocks per iteration; each stored output block is folded
# into the checksum.
3309 .Locb_dec_grandloop:
3310 movdqu `16*0`($inp),$inout0 # load input
3311 movdqu `16*1`($inp),$inout1
3312 movdqu `16*2`($inp),$inout2
3313 movdqu `16*3`($inp),$inout3
3314 movdqu `16*4`($inp),$inout4
3315 movdqu `16*5`($inp),$inout5
3316 lea `16*6`($inp),$inp
3320 movups $inout0,`16*0`($out) # store output
3321 pxor $inout0,$checksum # accumulate checksum
3322 movups $inout1,`16*1`($out)
3323 pxor $inout1,$checksum
3324 movups $inout2,`16*2`($out)
3325 pxor $inout2,$checksum
3326 movups $inout3,`16*3`($out)
3327 pxor $inout3,$checksum
3328 movups $inout4,`16*4`($out)
3329 pxor $inout4,$checksum
3330 movups $inout5,`16*5`($out)
3331 pxor $inout5,$checksum
3332 lea `16*6`($out),$out
3334 jnc .Locb_dec_grandloop
# Tail of five blocks: the sixth lane is zeroed and offset[4] becomes the
# running offset; only the five real outputs enter the checksum.
3340 movdqu `16*0`($inp),$inout0
3343 movdqu `16*1`($inp),$inout1
3346 movdqu `16*2`($inp),$inout2
3349 movdqu `16*3`($inp),$inout3
3352 movdqu `16*4`($inp),$inout4
3353 pxor $inout5,$inout5
3357 movdqa @offset[4],@offset[5]
3358 movups $inout0,`16*0`($out) # store output
3359 pxor $inout0,$checksum # accumulate checksum
3360 movups $inout1,`16*1`($out)
3361 pxor $inout1,$checksum
3362 movups $inout2,`16*2`($out)
3363 pxor $inout2,$checksum
3364 movups $inout3,`16*3`($out)
3365 pxor $inout3,$checksum
3366 movups $inout4,`16*4`($out)
3367 pxor $inout4,$checksum
# Tail of one block: L_0 is staged in inout5, and inout5 is adopted as the
# running offset afterwards.
3373 movdqa @offset[0],$inout5 # borrow
3377 movdqa $inout5,@offset[5]
3378 movups $inout0,`16*0`($out) # store output
3379 xorps $inout0,$checksum # accumulate checksum
# Tail of two blocks: unused lanes 2 and 3 are zeroed; offset[1] becomes
# the running offset.
3384 pxor $inout2,$inout2
3385 pxor $inout3,$inout3
3389 movdqa @offset[1],@offset[5]
3390 movups $inout0,`16*0`($out) # store output
3391 xorps $inout0,$checksum # accumulate checksum
3392 movups $inout1,`16*1`($out)
3393 xorps $inout1,$checksum
# Tail of three blocks: unused lane 3 is zeroed; offset[2] becomes the
# running offset.
3399 pxor $inout3,$inout3
3403 movdqa @offset[2],@offset[5]
3404 movups $inout0,`16*0`($out) # store output
3405 xorps $inout0,$checksum # accumulate checksum
3406 movups $inout1,`16*1`($out)
3407 xorps $inout1,$checksum
3408 movups $inout2,`16*2`($out)
3409 xorps $inout2,$checksum
# Tail of four blocks: offset[3] becomes the running offset.
3417 movdqa @offset[3],@offset[5]
3418 movups $inout0,`16*0`($out) # store output
3419 pxor $inout0,$checksum # accumulate checksum
3420 movups $inout1,`16*1`($out)
3421 pxor $inout1,$checksum
3422 movups $inout2,`16*2`($out)
3423 pxor $inout2,$checksum
3424 movups $inout3,`16*3`($out)
3425 pxor $inout3,$checksum
# Done: strip round[last] from the running offset and write the checksum
# and offset back to the caller's buffers.
3428 pxor $rndkey0,@offset[5] # "remove" round[last]
3429 movdqu $checksum,($checksum_p) # store checksum
3430 movdqu @offset[5],($offset_p) # store last offset_i
3432 xorps %xmm0,%xmm0 # clear register bank
3439 $code.=<<___ if (!$win64);
3453 $code.=<<___ if ($win64);
# Win64 epilogue: restore xmm6..xmm15 and overwrite each save slot with the
# zeroed xmm0 so no register contents remain on the stack.
3454 movaps 0x00(%rsp),%xmm6
3455 movaps %xmm0,0x00(%rsp) # clear stack
3456 movaps 0x10(%rsp),%xmm7
3457 movaps %xmm0,0x10(%rsp)
3458 movaps 0x20(%rsp),%xmm8
3459 movaps %xmm0,0x20(%rsp)
3460 movaps 0x30(%rsp),%xmm9
3461 movaps %xmm0,0x30(%rsp)
3462 movaps 0x40(%rsp),%xmm10
3463 movaps %xmm0,0x40(%rsp)
3464 movaps 0x50(%rsp),%xmm11
3465 movaps %xmm0,0x50(%rsp)
3466 movaps 0x60(%rsp),%xmm12
3467 movaps %xmm0,0x60(%rsp)
3468 movaps 0x70(%rsp),%xmm13
3469 movaps %xmm0,0x70(%rsp)
3470 movaps 0x80(%rsp),%xmm14
3471 movaps %xmm0,0x80(%rsp)
3472 movaps 0x90(%rsp),%xmm15
3473 movaps %xmm0,0x90(%rsp)
# rax = address just past the 0xa0-byte save area plus a further 0x28 bytes
# (presumably the pushed general-purpose registers -- TODO confirm).
3474 lea 0xa0+0x28(%rsp),%rax
3489 .cfi_def_cfa_register %rsp
3493 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
# __ocb_decrypt6 -- internal helper: OCB-decrypt the six blocks held in
# inout0..inout5, in place.
# On entry: offset[5] = running offset ^ round[last], offset[0] = L_0,
# i1/i3/i5 = byte offsets into the L_ table, rndkey1 = round[1],
# rndkey0l = round[0] ^ round[last], rax = round-key index.
# The six per-block offsets are built as a xor chain:
#   offset[0] = running ^ L_0        offset[1] = offset[0] ^ L_[i1]
#   offset[2] = offset[1] ^ L_0      offset[3] = offset[2] ^ L_[i3]
#   offset[4] = offset[3] ^ L_0      offset[5] = offset[4] ^ L_[i5]
# Unlike the encrypt helper, nothing is added to the checksum here; the
# caller folds the decrypted output into the checksum after storing it.
# On exit: offset[0] is reloaded with L_0, rndkey1 with round[1], rax is
# restored from r10, and offset[5] holds the last block's offset in its
# "^ round[last]" form.
3495 .type __ocb_decrypt6,\@abi-omnipotent
3498 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3499 movdqu ($L_p,$i1),@offset[1]
3500 movdqa @offset[0],@offset[2]
3501 movdqu ($L_p,$i3),@offset[3]
3502 movdqa @offset[0],@offset[4]
3503 pxor @offset[5],@offset[0]
3504 movdqu ($L_p,$i5),@offset[5]
3505 pxor @offset[0],@offset[1]
3506 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3507 pxor @offset[1],@offset[2]
3508 pxor @offset[1],$inout1
3509 pxor @offset[2],@offset[3]
3510 pxor @offset[2],$inout2
3511 pxor @offset[3],@offset[4]
3512 pxor @offset[3],$inout3
3513 pxor @offset[4],@offset[5]
3514 pxor @offset[4],$inout4
3515 pxor @offset[5],$inout5
3516 $movkey 32($key_),$rndkey0
# Start computing the L_ table indices for the next batch of six blocks.
3518 lea 1($block_num),$i1 # even-numbered blocks
3519 lea 3($block_num),$i3
3520 lea 5($block_num),$i5
3522 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3523 bsf $i1,$i1 # ntz(block)
# Round with the key at key_+16, interleaved with switching the remaining
# offsets to their "^ round[last]" form.
3527 aesdec $rndkey1,$inout0
3528 aesdec $rndkey1,$inout1
3529 aesdec $rndkey1,$inout2
3530 aesdec $rndkey1,$inout3
3531 pxor $rndkey0l,@offset[1]
3532 pxor $rndkey0l,@offset[2]
3533 aesdec $rndkey1,$inout4
3534 pxor $rndkey0l,@offset[3]
3535 pxor $rndkey0l,@offset[4]
3536 aesdec $rndkey1,$inout5
3537 $movkey 48($key_),$rndkey1
3538 pxor $rndkey0l,@offset[5]
# Round with the key at key_+32.
3540 aesdec $rndkey0,$inout0
3541 aesdec $rndkey0,$inout1
3542 aesdec $rndkey0,$inout2
3543 aesdec $rndkey0,$inout3
3544 aesdec $rndkey0,$inout4
3545 aesdec $rndkey0,$inout5
3546 $movkey 64($key_),$rndkey0
3547 shl \$4,$i1 # ntz(block) -> table offset
# Round with the key at key_+48; later keys are fetched relative to key
# using rax as the index.
3553 aesdec $rndkey1,$inout0
3554 aesdec $rndkey1,$inout1
3555 aesdec $rndkey1,$inout2
3556 aesdec $rndkey1,$inout3
3557 aesdec $rndkey1,$inout4
3558 aesdec $rndkey1,$inout5
3559 $movkey ($key,%rax),$rndkey1
3562 aesdec $rndkey0,$inout0
3563 aesdec $rndkey0,$inout1
3564 aesdec $rndkey0,$inout2
3565 aesdec $rndkey0,$inout3
3566 aesdec $rndkey0,$inout4
3567 aesdec $rndkey0,$inout5
3568 $movkey -16($key,%rax),$rndkey0
3571 aesdec $rndkey1,$inout0
3572 aesdec $rndkey1,$inout1
3573 aesdec $rndkey1,$inout2
3574 aesdec $rndkey1,$inout3
3575 aesdec $rndkey1,$inout4
3576 aesdec $rndkey1,$inout5
3577 $movkey 16($key_),$rndkey1
# Final round: each offset register doubles as the last round key, since it
# holds offset ^ round[last]; this applies the last key and the output
# whitening in a single instruction.
3580 aesdeclast @offset[0],$inout0
3581 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3582 mov %r10,%rax # restore twisted rounds
3583 aesdeclast @offset[1],$inout1
3584 aesdeclast @offset[2],$inout2
3585 aesdeclast @offset[3],$inout3
3586 aesdeclast @offset[4],$inout4
3587 aesdeclast @offset[5],$inout5
3589 .size __ocb_decrypt6,.-__ocb_decrypt6
# __ocb_decrypt4 -- internal helper: OCB-decrypt the four blocks held in
# inout0..inout3, in place.
# On entry: offset[5] = running offset ^ round[last], offset[0] = L_0,
# i1/i3 = byte offsets into the L_ table, rndkey1 = round[1],
# rndkey0l = round[0] ^ round[last], rax = round-key index.
# Offsets are chained as
#   offset[0] = running ^ L_0        offset[1] = offset[0] ^ L_[i1]
#   offset[2] = offset[1] ^ L_0      offset[3] = offset[2] ^ L_[i3]
# The checksum is not touched here; the caller accumulates it over the
# decrypted output. offset[5] is left in its "^ round[0]" form.
# On exit: offset[0..3] hold the per-block offsets in "^ round[last]" form,
# rndkey1 is reloaded with round[1] and rax is restored from r10.
3591 .type __ocb_decrypt4,\@abi-omnipotent
3594 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3595 movdqu ($L_p,$i1),@offset[1]
3596 movdqa @offset[0],@offset[2]
3597 movdqu ($L_p,$i3),@offset[3]
3598 pxor @offset[5],@offset[0]
3599 pxor @offset[0],@offset[1]
3600 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3601 pxor @offset[1],@offset[2]
3602 pxor @offset[1],$inout1
3603 pxor @offset[2],@offset[3]
3604 pxor @offset[2],$inout2
3605 pxor @offset[3],$inout3
3606 $movkey 32($key_),$rndkey0
# Switch all four offsets to their "^ round[last]" form for the final round.
3608 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3609 pxor $rndkey0l,@offset[1]
3610 pxor $rndkey0l,@offset[2]
3611 pxor $rndkey0l,@offset[3]
# Rounds with the keys at key_+16, key_+32 and key_+48; later keys are
# fetched relative to key using rax as the index.
3613 aesdec $rndkey1,$inout0
3614 aesdec $rndkey1,$inout1
3615 aesdec $rndkey1,$inout2
3616 aesdec $rndkey1,$inout3
3617 $movkey 48($key_),$rndkey1
3619 aesdec $rndkey0,$inout0
3620 aesdec $rndkey0,$inout1
3621 aesdec $rndkey0,$inout2
3622 aesdec $rndkey0,$inout3
3623 $movkey 64($key_),$rndkey0
3628 aesdec $rndkey1,$inout0
3629 aesdec $rndkey1,$inout1
3630 aesdec $rndkey1,$inout2
3631 aesdec $rndkey1,$inout3
3632 $movkey ($key,%rax),$rndkey1
3635 aesdec $rndkey0,$inout0
3636 aesdec $rndkey0,$inout1
3637 aesdec $rndkey0,$inout2
3638 aesdec $rndkey0,$inout3
3639 $movkey -16($key,%rax),$rndkey0
3642 aesdec $rndkey1,$inout0
3643 aesdec $rndkey1,$inout1
3644 aesdec $rndkey1,$inout2
3645 aesdec $rndkey1,$inout3
3646 $movkey 16($key_),$rndkey1
3647 mov %r10,%rax # restore twisted rounds
# Final round with offset ^ round[last] as the key: last round key and
# output whitening in one step.
3649 aesdeclast @offset[0],$inout0
3650 aesdeclast @offset[1],$inout1
3651 aesdeclast @offset[2],$inout2
3652 aesdeclast @offset[3],$inout3
3654 .size __ocb_decrypt4,.-__ocb_decrypt4
# __ocb_decrypt1 -- internal helper: OCB-decrypt the single block held in
# inout0, in place.
# On entry: inout5 carries the L_ table entry for this block (the callers
# mark that load as "borrow"), offset[5] = running offset ^ round[last],
# rndkey1 = round[1], rndkey0l = round[0] ^ round[last], rax = round-key
# index.
# The checksum is not touched here; the caller accumulates it over the
# decrypted output.
# On exit: inout5 holds the new offset ^ round[last] (callers copy it into
# offset[5]), rndkey1 is reloaded with round[1], rax is restored from r10.
3656 .type __ocb_decrypt1,\@abi-omnipotent
3659 pxor @offset[5],$inout5 # offset_i
3660 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3661 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3662 $movkey 32($key_),$rndkey0
3664 aesdec $rndkey1,$inout0
3665 $movkey 48($key_),$rndkey1
3666 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3668 aesdec $rndkey0,$inout0
3669 $movkey 64($key_),$rndkey0
# Later round keys are fetched relative to key using rax as the index.
3674 aesdec $rndkey1,$inout0
3675 $movkey ($key,%rax),$rndkey1
3678 aesdec $rndkey0,$inout0
3679 $movkey -16($key,%rax),$rndkey0
3682 aesdec $rndkey1,$inout0
3683 $movkey 16($key_),$rndkey1 # redundant in tail
3684 mov %r10,%rax # restore twisted rounds
# Final round keyed with offset ^ round[last]: applies the last round key
# and the output whitening together.
3686 aesdeclast $inout5,$inout0
3688 .size __ocb_decrypt1,.-__ocb_decrypt1
3692 ########################################################################
3693 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3694 # size_t length, const AES_KEY *key,
3695 # unsigned char *ivp,const int enc);
3697 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3698 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3701 .globl ${PREFIX}_cbc_encrypt
3702 .type ${PREFIX}_cbc_encrypt,\@function,6
3704 ${PREFIX}_cbc_encrypt:
3706 test $len,$len # check length
3709 mov 240($key),$rnds_ # key->rounds
3710 mov $key,$key_ # backup $key
3711 test %r9d,%r9d # 6th argument
3713 #--------------------------- CBC ENCRYPT ------------------------------#
3714 movups ($ivp),$inout0 # load iv as initial state
3722 movups ($inp),$inout1 # load input
3724 #xorps $inout1,$inout0
3726 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3728 mov $rnds_,$rounds # restore $rounds
3729 mov $key_,$key # restore $key
3730 movups $inout0,0($out) # store output
3736 pxor $rndkey0,$rndkey0 # clear register bank
3737 pxor $rndkey1,$rndkey1
3738 movups $inout0,($ivp)
3739 pxor $inout0,$inout0
3740 pxor $inout1,$inout1
3744 mov $len,%rcx # zaps $key
3745 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3746 .long 0x9066A4F3 # rep movsb
3747 mov \$16,%ecx # zero tail
3750 .long 0x9066AAF3 # rep stosb
3751 lea -16(%rdi),%rdi # rewind $out by 1 block
3752 mov $rnds_,$rounds # restore $rounds
3753 mov %rdi,%rsi # $inp and $out are the same
3754 mov $key_,$key # restore $key
3755 xor $len,$len # len=16
3756 jmp .Lcbc_enc_loop # one more spin
3757 \f#--------------------------- CBC DECRYPT ------------------------------#
3761 jne .Lcbc_decrypt_bulk
3763 # handle single block without allocating stack frame,
3764 # useful in ciphertext stealing mode
3765 movdqu ($inp),$inout0 # load input
3766 movdqu ($ivp),$inout1 # load iv
3767 movdqa $inout0,$inout2 # future iv
3769 &aesni_generate1("dec",$key,$rnds_);
3771 pxor $rndkey0,$rndkey0 # clear register bank
3772 pxor $rndkey1,$rndkey1
3773 movdqu $inout2,($ivp) # store iv
3774 xorps $inout1,$inout0 # ^=iv
3775 pxor $inout1,$inout1
3776 movups $inout0,($out) # store output
3777 pxor $inout0,$inout0
3781 lea (%rsp),%r11 # frame pointer
3782 .cfi_def_cfa_register %r11
3785 sub \$$frame_size,%rsp
3786 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3788 $code.=<<___ if ($win64);
3789 movaps %xmm6,0x10(%rsp)
3790 movaps %xmm7,0x20(%rsp)
3791 movaps %xmm8,0x30(%rsp)
3792 movaps %xmm9,0x40(%rsp)
3793 movaps %xmm10,0x50(%rsp)
3794 movaps %xmm11,0x60(%rsp)
3795 movaps %xmm12,0x70(%rsp)
3796 movaps %xmm13,0x80(%rsp)
3797 movaps %xmm14,0x90(%rsp)
3798 movaps %xmm15,0xa0(%rsp)
3802 my $inp_=$key_="%rbp"; # reassign $key_
3805 mov $key,$key_ # [re-]backup $key [after reassignment]
3811 $movkey ($key),$rndkey0
3812 movdqu 0x00($inp),$inout0 # load input
3813 movdqu 0x10($inp),$inout1
3815 movdqu 0x20($inp),$inout2
3817 movdqu 0x30($inp),$inout3
3819 movdqu 0x40($inp),$inout4
3821 movdqu 0x50($inp),$inout5
3823 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3825 jbe .Lcbc_dec_six_or_seven
3827 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3828 sub \$0x50,$len # $len is biased by -5*16
3829 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3830 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3831 sub \$0x20,$len # $len is biased by -7*16
3832 lea 0x70($key),$key # size optimization
3833 jmp .Lcbc_dec_loop8_enter
3836 movups $inout7,($out)
3838 .Lcbc_dec_loop8_enter:
3839 movdqu 0x60($inp),$inout6
3840 pxor $rndkey0,$inout0
3841 movdqu 0x70($inp),$inout7
3842 pxor $rndkey0,$inout1
3843 $movkey 0x10-0x70($key),$rndkey1
3844 pxor $rndkey0,$inout2
3846 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3847 pxor $rndkey0,$inout3
3848 pxor $rndkey0,$inout4
3849 pxor $rndkey0,$inout5
3850 pxor $rndkey0,$inout6
3852 aesdec $rndkey1,$inout0
3853 pxor $rndkey0,$inout7
3854 $movkey 0x20-0x70($key),$rndkey0
3855 aesdec $rndkey1,$inout1
3856 aesdec $rndkey1,$inout2
3857 aesdec $rndkey1,$inout3
3858 aesdec $rndkey1,$inout4
3859 aesdec $rndkey1,$inout5
3860 aesdec $rndkey1,$inout6
3863 aesdec $rndkey1,$inout7
3865 $movkey 0x30-0x70($key),$rndkey1
3867 for($i=1;$i<12;$i++) {
3868 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3869 $code.=<<___ if ($i==7);
3873 aesdec $rndkeyx,$inout0
3874 aesdec $rndkeyx,$inout1
3875 aesdec $rndkeyx,$inout2
3876 aesdec $rndkeyx,$inout3
3877 aesdec $rndkeyx,$inout4
3878 aesdec $rndkeyx,$inout5
3879 aesdec $rndkeyx,$inout6
3880 aesdec $rndkeyx,$inout7
3881 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3883 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3886 $code.=<<___ if ($i==7);
3889 $code.=<<___ if ($i==9);
3892 $code.=<<___ if ($i==11);
3899 aesdec $rndkey1,$inout0
3900 aesdec $rndkey1,$inout1
3903 aesdec $rndkey1,$inout2
3904 aesdec $rndkey1,$inout3
3907 aesdec $rndkey1,$inout4
3908 aesdec $rndkey1,$inout5
3911 aesdec $rndkey1,$inout6
3912 aesdec $rndkey1,$inout7
3913 movdqu 0x50($inp),$rndkey1
3915 aesdeclast $iv,$inout0
3916 movdqu 0x60($inp),$iv # borrow $iv
3917 pxor $rndkey0,$rndkey1
3918 aesdeclast $in0,$inout1
3920 movdqu 0x70($inp),$rndkey0 # next IV
3921 aesdeclast $in1,$inout2
3923 movdqu 0x00($inp_),$in0
3924 aesdeclast $in2,$inout3
3925 aesdeclast $in3,$inout4
3926 movdqu 0x10($inp_),$in1
3927 movdqu 0x20($inp_),$in2
3928 aesdeclast $in4,$inout5
3929 aesdeclast $rndkey1,$inout6
3930 movdqu 0x30($inp_),$in3
3931 movdqu 0x40($inp_),$in4
3932 aesdeclast $iv,$inout7
3933 movdqa $rndkey0,$iv # return $iv
3934 movdqu 0x50($inp_),$rndkey1
3935 $movkey -0x70($key),$rndkey0
3937 movups $inout0,($out) # store output
3939 movups $inout1,0x10($out)
3941 movups $inout2,0x20($out)
3943 movups $inout3,0x30($out)
3945 movups $inout4,0x40($out)
3947 movups $inout5,0x50($out)
3948 movdqa $rndkey1,$inout5
3949 movups $inout6,0x60($out)
3955 movaps $inout7,$inout0
3956 lea -0x70($key),$key
3958 jle .Lcbc_dec_clear_tail_collected
3959 movups $inout7,($out)
3965 .Lcbc_dec_six_or_seven:
3969 movaps $inout5,$inout6
3970 call _aesni_decrypt6
3971 pxor $iv,$inout0 # ^= IV
3974 movdqu $inout0,($out)
3976 movdqu $inout1,0x10($out)
3977 pxor $inout1,$inout1 # clear register bank
3979 movdqu $inout2,0x20($out)
3980 pxor $inout2,$inout2
3982 movdqu $inout3,0x30($out)
3983 pxor $inout3,$inout3
3985 movdqu $inout4,0x40($out)
3986 pxor $inout4,$inout4
3988 movdqa $inout5,$inout0
3989 pxor $inout5,$inout5
3990 jmp .Lcbc_dec_tail_collected
3994 movups 0x60($inp),$inout6
3995 xorps $inout7,$inout7
3996 call _aesni_decrypt8
3997 movups 0x50($inp),$inout7
3998 pxor $iv,$inout0 # ^= IV
3999 movups 0x60($inp),$iv
4001 movdqu $inout0,($out)
4003 movdqu $inout1,0x10($out)
4004 pxor $inout1,$inout1 # clear register bank
4006 movdqu $inout2,0x20($out)
4007 pxor $inout2,$inout2
4009 movdqu $inout3,0x30($out)
4010 pxor $inout3,$inout3
4012 movdqu $inout4,0x40($out)
4013 pxor $inout4,$inout4
4014 pxor $inout7,$inout6
4015 movdqu $inout5,0x50($out)
4016 pxor $inout5,$inout5
4018 movdqa $inout6,$inout0
4019 pxor $inout6,$inout6
4020 pxor $inout7,$inout7
4021 jmp .Lcbc_dec_tail_collected
4025 movups $inout5,($out)
4027 movdqu 0x00($inp),$inout0 # load input
4028 movdqu 0x10($inp),$inout1
4030 movdqu 0x20($inp),$inout2
4032 movdqu 0x30($inp),$inout3
4034 movdqu 0x40($inp),$inout4
4036 movdqu 0x50($inp),$inout5
4038 .Lcbc_dec_loop6_enter:
4040 movdqa $inout5,$inout6
4042 call _aesni_decrypt6
4044 pxor $iv,$inout0 # ^= IV
4047 movdqu $inout0,($out)
4049 movdqu $inout1,0x10($out)
4051 movdqu $inout2,0x20($out)
4054 movdqu $inout3,0x30($out)
4057 movdqu $inout4,0x40($out)
4062 movdqa $inout5,$inout0
4064 jle .Lcbc_dec_clear_tail_collected
4065 movups $inout5,($out)
4069 movups ($inp),$inout0
4071 jbe .Lcbc_dec_one # $len is 1*16 or less
4073 movups 0x10($inp),$inout1
4076 jbe .Lcbc_dec_two # $len is 2*16 or less
4078 movups 0x20($inp),$inout2
4081 jbe .Lcbc_dec_three # $len is 3*16 or less
4083 movups 0x30($inp),$inout3
4086 jbe .Lcbc_dec_four # $len is 4*16 or less
4088 movups 0x40($inp),$inout4 # $len is 5*16 or less
4091 xorps $inout5,$inout5
4092 call _aesni_decrypt6
4096 movdqu $inout0,($out)
4098 movdqu $inout1,0x10($out)
4099 pxor $inout1,$inout1 # clear register bank
4101 movdqu $inout2,0x20($out)
4102 pxor $inout2,$inout2
4104 movdqu $inout3,0x30($out)
4105 pxor $inout3,$inout3
4107 movdqa $inout4,$inout0
4108 pxor $inout4,$inout4
4109 pxor $inout5,$inout5
4111 jmp .Lcbc_dec_tail_collected
4117 &aesni_generate1("dec",$key,$rounds);
4121 jmp .Lcbc_dec_tail_collected
4125 call _aesni_decrypt2
4129 movdqu $inout0,($out)
4130 movdqa $inout1,$inout0
4131 pxor $inout1,$inout1 # clear register bank
4133 jmp .Lcbc_dec_tail_collected
4137 call _aesni_decrypt3
4141 movdqu $inout0,($out)
4143 movdqu $inout1,0x10($out)
4144 pxor $inout1,$inout1 # clear register bank
4145 movdqa $inout2,$inout0
4146 pxor $inout2,$inout2
4148 jmp .Lcbc_dec_tail_collected
4152 call _aesni_decrypt4
4156 movdqu $inout0,($out)
4158 movdqu $inout1,0x10($out)
4159 pxor $inout1,$inout1 # clear register bank
4161 movdqu $inout2,0x20($out)
4162 pxor $inout2,$inout2
4163 movdqa $inout3,$inout0
4164 pxor $inout3,$inout3
4166 jmp .Lcbc_dec_tail_collected
4169 .Lcbc_dec_clear_tail_collected:
4170 pxor $inout1,$inout1 # clear register bank
4171 pxor $inout2,$inout2
4172 pxor $inout3,$inout3
4174 $code.=<<___ if (!$win64);
4175 pxor $inout4,$inout4 # %xmm6..9
4176 pxor $inout5,$inout5
4177 pxor $inout6,$inout6
4178 pxor $inout7,$inout7
4181 .Lcbc_dec_tail_collected:
4184 jnz .Lcbc_dec_tail_partial
4185 movups $inout0,($out)
4186 pxor $inout0,$inout0
4189 .Lcbc_dec_tail_partial:
4190 movaps $inout0,(%rsp)
4191 pxor $inout0,$inout0
4196 .long 0x9066A4F3 # rep movsb
4197 movdqa $inout0,(%rsp)
4200 xorps $rndkey0,$rndkey0 # %xmm0
4201 pxor $rndkey1,$rndkey1
4203 $code.=<<___ if ($win64);
4204 movaps 0x10(%rsp),%xmm6
4205 movaps %xmm0,0x10(%rsp) # clear stack
4206 movaps 0x20(%rsp),%xmm7
4207 movaps %xmm0,0x20(%rsp)
4208 movaps 0x30(%rsp),%xmm8
4209 movaps %xmm0,0x30(%rsp)
4210 movaps 0x40(%rsp),%xmm9
4211 movaps %xmm0,0x40(%rsp)
4212 movaps 0x50(%rsp),%xmm10
4213 movaps %xmm0,0x50(%rsp)
4214 movaps 0x60(%rsp),%xmm11
4215 movaps %xmm0,0x60(%rsp)
4216 movaps 0x70(%rsp),%xmm12
4217 movaps %xmm0,0x70(%rsp)
4218 movaps 0x80(%rsp),%xmm13
4219 movaps %xmm0,0x80(%rsp)
4220 movaps 0x90(%rsp),%xmm14
4221 movaps %xmm0,0x90(%rsp)
4222 movaps 0xa0(%rsp),%xmm15
4223 movaps %xmm0,0xa0(%rsp)
4229 .cfi_def_cfa_register %rsp
4233 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4236 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4237 # int bits, AES_KEY *key)
4239 # input: $inp user-supplied key
4240 # $bits $inp length in bits
4241 # $key pointer to key schedule
4242 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4243 # *$key key schedule
#
# The routine calls __aesni_set_encrypt_key to build the schedule and then
# rewrites it in place: the first and last round keys are exchanged as they
# are ("just swap"), the remaining pairs are exchanged by the
# .Ldec_key_inverse loop ("swap and inverse"), and the single middle key is
# handled last ("inverse middle").
4245 { my ($inp,$bits,$key) = @_4args;
4249 .globl ${PREFIX}_set_decrypt_key
4250 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4252 ${PREFIX}_set_decrypt_key:
4254 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4255 .cfi_adjust_cfa_offset 8
4256 call __aesni_set_encrypt_key # build the schedule first
4257 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4260 lea 16($key,$bits),$inp # points at the end of key schedule
4262 $movkey ($key),%xmm0 # just swap
4263 $movkey ($inp),%xmm1
4264 $movkey %xmm0,($inp)
4265 $movkey %xmm1,($key)
4270 $movkey ($key),%xmm0 # swap and inverse
4271 $movkey ($inp),%xmm1
4276 $movkey %xmm0,16($inp)
4277 $movkey %xmm1,-16($key)
4279 ja .Ldec_key_inverse
4281 $movkey ($key),%xmm0 # inverse middle
4284 $movkey %xmm0,($inp)
4288 .cfi_adjust_cfa_offset -8 # matches the +8 adjustment made on entry
4291 .LSEH_end_set_decrypt_key:
4292 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4295 # This is based on submission by
4297 # Huang Ying <ying.huang@intel.com>
4298 # Vinodh Gopal <vinodh.gopal@intel.com>
4301 # Aggressively optimized in respect to aeskeygenassist's critical path
4302 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
4304 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4305 # int bits, AES_KEY * const key);
4307 # input: $inp user-supplied key
4308 # $bits $inp length in bits
4309 # $key pointer to key schedule
4310 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4311 # $bits rounds-1 (used in aesni_set_decrypt_key)
4312 # *$key key schedule
4313 # $key pointer to key schedule (used in
4314 # aesni_set_decrypt_key)
4316 # Subroutine is frame-less, which means that only volatile registers
4317 # are used. Note that it's declared "abi-omnipotent", which means that
4318 # amount of volatile registers is smaller on Windows.
#
# For every key size two instruction sequences are visible below: one built
# from aeskeygenassist plus calls to the .Lkey_expansion_* helpers, and one
# built from aesenclast with the .Lkey_rotate / .Lkey_rcon constants.
# NOTE(review): the cmp against the AVX-without-XOP capability pattern
# presumably selects between the two; confirm against the branch that
# follows each cmp.
4321 .globl ${PREFIX}_set_encrypt_key
4322 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4324 ${PREFIX}_set_encrypt_key:
4325 __aesni_set_encrypt_key: # second entry label, called by the set_decrypt_key routine
4327 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4328 .cfi_adjust_cfa_offset 8
4335 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4336 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4337 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4338 and OPENSSL_ia32cap_P+4(%rip),%r10d # keep only those two capability bits
4339 lea 16($key),%rax # %rax is used as modifiable copy of $key
4348 mov \$9,$bits # 10 rounds for 128-bit key
4349 cmp \$`1<<28`,%r10d # AVX, but no XOP
4352 $movkey %xmm0,($key) # round 0
4353 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4354 call .Lkey_expansion_128_cold
4355 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4356 call .Lkey_expansion_128
4357 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4358 call .Lkey_expansion_128
4359 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4360 call .Lkey_expansion_128
4361 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4362 call .Lkey_expansion_128
4363 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4364 call .Lkey_expansion_128
4365 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4366 call .Lkey_expansion_128
4367 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4368 call .Lkey_expansion_128
4369 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4370 call .Lkey_expansion_128
4371 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4372 call .Lkey_expansion_128
4373 $movkey %xmm0,(%rax)
4374 mov $bits,80(%rax) # 240(%rdx)
4380 movdqa .Lkey_rotate(%rip),%xmm5
4382 movdqa .Lkey_rcon1(%rip),%xmm4
4390 aesenclast %xmm4,%xmm0
4403 movdqu %xmm0,-16(%rax)
4409 movdqa .Lkey_rcon1b(%rip),%xmm4
4412 aesenclast %xmm4,%xmm0
4428 aesenclast %xmm4,%xmm0
4439 movdqu %xmm0,16(%rax)
4441 mov $bits,96(%rax) # 240($key)
4447 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4448 mov \$11,$bits # 12 rounds for 192
4449 cmp \$`1<<28`,%r10d # AVX, but no XOP
4452 $movkey %xmm0,($key) # round 0
4453 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4454 call .Lkey_expansion_192a_cold
4455 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4456 call .Lkey_expansion_192b
4457 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4458 call .Lkey_expansion_192a
4459 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4460 call .Lkey_expansion_192b
4461 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4462 call .Lkey_expansion_192a
4463 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4464 call .Lkey_expansion_192b
4465 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4466 call .Lkey_expansion_192a
4467 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4468 call .Lkey_expansion_192b
4469 $movkey %xmm0,(%rax)
4470 mov $bits,48(%rax) # 240(%rdx)
4476 movdqa .Lkey_rotate192(%rip),%xmm5
4477 movdqa .Lkey_rcon1(%rip),%xmm4
4487 aesenclast %xmm4,%xmm2
4499 pshufd \$0xff,%xmm0,%xmm3
4506 movdqu %xmm0,-16(%rax)
4511 mov $bits,32(%rax) # 240($key)
4517 movups 16($inp),%xmm2 # remaining half of *userKey
4518 mov \$13,$bits # 14 rounds for 256
4520 cmp \$`1<<28`,%r10d # AVX, but no XOP
4523 $movkey %xmm0,($key) # round 0
4524 $movkey %xmm2,16($key) # round 1
4525 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4526 call .Lkey_expansion_256a_cold
4527 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4528 call .Lkey_expansion_256b
4529 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4530 call .Lkey_expansion_256a
4531 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4532 call .Lkey_expansion_256b
4533 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4534 call .Lkey_expansion_256a
4535 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4536 call .Lkey_expansion_256b
4537 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4538 call .Lkey_expansion_256a
4539 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4540 call .Lkey_expansion_256b
4541 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4542 call .Lkey_expansion_256a
4543 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4544 call .Lkey_expansion_256b
4545 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4546 call .Lkey_expansion_256a
4547 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4548 call .Lkey_expansion_256b
4549 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4550 call .Lkey_expansion_256a
4551 $movkey %xmm0,(%rax)
4552 mov $bits,16(%rax) # 240(%rdx)
4558 movdqa .Lkey_rotate(%rip),%xmm5
4559 movdqa .Lkey_rcon1(%rip),%xmm4
4561 movdqu %xmm0,0($key)
4563 movdqu %xmm2,16($key)
4569 aesenclast %xmm4,%xmm2
4586 pshufd \$0xff,%xmm0,%xmm2
4588 aesenclast %xmm3,%xmm2
4599 movdqu %xmm2,16(%rax)
4606 mov $bits,16(%rax) # 240($key)
4621 .cfi_adjust_cfa_offset -8 # matches the +8 adjustment made on entry
4624 .LSEH_end_set_encrypt_key:
# Helper entered by the 128-bit sequence above. This entry first stores
# %xmm0 at (%rax); the _cold entry below skips that store and is the one
# called for round 1.
4627 .Lkey_expansion_128:
4628 $movkey %xmm0,(%rax)
4630 .Lkey_expansion_128_cold:
4631 shufps \$0b00010000,%xmm0,%xmm4
4633 shufps \$0b10001100,%xmm0,%xmm4
4635 shufps \$0b11111111,%xmm1,%xmm1 # critical path
# Helpers entered by the 192-bit sequence above. The 192a entry stores %xmm0
# at (%rax) first; the _cold entry skips that store; the _warm label is the
# jump target used by 192b.
4640 .Lkey_expansion_192a:
4641 $movkey %xmm0,(%rax)
4643 .Lkey_expansion_192a_cold:
4645 .Lkey_expansion_192b_warm:
4646 shufps \$0b00010000,%xmm0,%xmm4
4649 shufps \$0b10001100,%xmm0,%xmm4
4652 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4655 pshufd \$0b11111111,%xmm0,%xmm3
# The 192b entry stores two 16-byte values, at (%rax) and 16(%rax), and then
# continues in the shared body at the _warm label.
4660 .Lkey_expansion_192b:
4662 shufps \$0b01000100,%xmm0,%xmm5
4663 $movkey %xmm5,(%rax)
4664 shufps \$0b01001110,%xmm2,%xmm3
4665 $movkey %xmm3,16(%rax)
4667 jmp .Lkey_expansion_192b_warm
# Helpers entered by the 256-bit sequence above. 256a stores %xmm2 at (%rax)
# first (its _cold entry skips that store); 256b stores %xmm0 at (%rax).
4670 .Lkey_expansion_256a:
4671 $movkey %xmm2,(%rax)
4673 .Lkey_expansion_256a_cold:
4674 shufps \$0b00010000,%xmm0,%xmm4
4676 shufps \$0b10001100,%xmm0,%xmm4
4678 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4683 .Lkey_expansion_256b:
4684 $movkey %xmm0,(%rax)
4687 shufps \$0b00010000,%xmm2,%xmm4
4689 shufps \$0b10001100,%xmm2,%xmm4
4691 shufps \$0b10101010,%xmm1,%xmm1 # critical path
4694 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4695 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
4702 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
4710 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4712 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4714 .long 0x04070605,0x04070605,0x04070605,0x04070605
4718 .long 0x1b,0x1b,0x1b,0x1b
4720 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4724 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4725 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4733 .extern __imp_RtlVirtualUnwind
4735 $code.=<<___ if ($PREFIX eq "aesni");
# Win64 exception handler referenced by the ECB and CCM64 unwind-info
# entries. HandlerData[0] and HandlerData[1] are image-relative prologue and
# epilogue labels. Only when context->Rip lies between them does the handler
# copy 8 quadwords from the stack into the context record starting at Xmm6
# and advance the recovered stack pointer by 0x58. Every path continues in
# .Lcommon_seh_tail with the recovered stack pointer in %rax.
4736 .type ecb_ccm64_se_handler,\@abi-omnipotent
4738 ecb_ccm64_se_handler:
4750 mov 120($context),%rax # pull context->Rax
4751 mov 248($context),%rbx # pull context->Rip
4753 mov 8($disp),%rsi # disp->ImageBase
4754 mov 56($disp),%r11 # disp->HandlerData
4756 mov 0(%r11),%r10d # HandlerData[0]
4757 lea (%rsi,%r10),%r10 # prologue label
4758 cmp %r10,%rbx # context->Rip<prologue label
4759 jb .Lcommon_seh_tail # taken with %rax still holding context->Rax
4761 mov 152($context),%rax # pull context->Rsp
4763 mov 4(%r11),%r10d # HandlerData[1]
4764 lea (%rsi,%r10),%r10 # epilogue label
4765 cmp %r10,%rbx # context->Rip>=epilogue label
4766 jae .Lcommon_seh_tail # taken with %rax holding context->Rsp
4768 lea 0(%rax),%rsi # %xmm save area
4769 lea 512($context),%rdi # &context.Xmm6
4770 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4771 .long 0xa548f3fc # cld; rep movsq
4772 lea 0x58(%rax),%rax # adjust stack pointer
4774 jmp .Lcommon_seh_tail
4775 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
# Win64 exception handler referenced by the CTR32 and XTS unwind-info
# entries. HandlerData[0] and HandlerData[1] are image-relative prologue and
# epilogue labels. Only when context->Rip lies between them does the handler
# take the frame base from context->R11, copy 20 quadwords from 0xa8 bytes
# below it into the context record starting at Xmm6, and reload the saved
# frame pointer from 8 bytes below it. Every path continues in
# .Lcommon_seh_tail.
4777 .type ctr_xts_se_handler,\@abi-omnipotent
4791 mov 120($context),%rax # pull context->Rax
4792 mov 248($context),%rbx # pull context->Rip
4794 mov 8($disp),%rsi # disp->ImageBase
4795 mov 56($disp),%r11 # disp->HandlerData
4797 mov 0(%r11),%r10d # HandlerData[0]
4798 lea (%rsi,%r10),%r10 # prologue label
4799 cmp %r10,%rbx # context->Rip<prologue label
4800 jb .Lcommon_seh_tail # taken with %rax still holding context->Rax
4802 mov 152($context),%rax # pull context->Rsp
4804 mov 4(%r11),%r10d # HandlerData[1]
4805 lea (%rsi,%r10),%r10 # epilogue label
4806 cmp %r10,%rbx # context->Rip>=epilogue label
4807 jae .Lcommon_seh_tail # taken with %rax holding context->Rsp
4809 mov 208($context),%rax # pull context->R11
4811 lea -0xa8(%rax),%rsi # %xmm save area
4812 lea 512($context),%rdi # & context.Xmm6
4813 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4814 .long 0xa548f3fc # cld; rep movsq
4816 mov -8(%rax),%rbp # restore saved %rbp
4817 mov %rbp,160($context) # restore context->Rbp
4818 jmp .Lcommon_seh_tail
4819 .size ctr_xts_se_handler,.-ctr_xts_se_handler
# Win64 exception handler for the OCB routines. HandlerData[0] and
# HandlerData[1] are image-relative prologue and epilogue labels, and
# HandlerData[2] is a third label described below as the pop label. On the
# path visible here, 20 quadwords are copied from context->Rsp into the
# context record starting at Xmm6, the recovered stack pointer is advanced
# by 0xa0+0x28, and five general-purpose registers are written back into the
# context record before continuing in .Lcommon_seh_tail.
4821 .type ocb_se_handler,\@abi-omnipotent
4835 mov 120($context),%rax # pull context->Rax
4836 mov 248($context),%rbx # pull context->Rip
4838 mov 8($disp),%rsi # disp->ImageBase
4839 mov 56($disp),%r11 # disp->HandlerData
4841 mov 0(%r11),%r10d # HandlerData[0]
4842 lea (%rsi,%r10),%r10 # prologue label
4843 cmp %r10,%rbx # context->Rip<prologue label
4844 jb .Lcommon_seh_tail
4846 mov 4(%r11),%r10d # HandlerData[1]
4847 lea (%rsi,%r10),%r10 # epilogue label
4848 cmp %r10,%rbx # context->Rip>=epilogue label
4849 jae .Lcommon_seh_tail
4851 mov 8(%r11),%r10d # HandlerData[2]
4852 lea (%rsi,%r10),%r10
4853 cmp %r10,%rbx # context->Rip>=pop label
4856 mov 152($context),%rax # pull context->Rsp
4858 lea (%rax),%rsi # %xmm save area
4859 lea 512($context),%rdi # & context.Xmm6
4860 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4861 .long 0xa548f3fc # cld; rep movsq
4862 lea 0xa0+0x28(%rax),%rax # step past the 0xa0-byte save area plus 0x28 more bytes
4871 mov %rbx,144($context) # restore context->Rbx
4872 mov %rbp,160($context) # restore context->Rbp
4873 mov %r12,216($context) # restore context->R12
4874 mov %r13,224($context) # restore context->R13
4875 mov %r14,232($context) # restore context->R14
4877 jmp .Lcommon_seh_tail
4878 .size ocb_se_handler,.-ocb_se_handler
# Win64 exception handler for the CBC routine. Unlike the handlers that read
# their labels from HandlerData, this one compares context->Rip directly
# against .Lcbc_decrypt_bulk, .Lcbc_decrypt_body and .Lcbc_ret. Inside the
# body it copies 20 quadwords from 16 bytes above context->Rsp into the
# context record starting at Xmm6 and reloads the saved frame pointer via
# context->R11.
#
# The code after that is the tail that the other handlers jump to as
# .Lcommon_seh_tail: it writes the recovered Rsp, Rsi and Rdi into the
# context record, copies the context record to disp->ContextRecord, calls
# RtlVirtualUnwind and returns ExceptionContinueSearch.
4881 .type cbc_se_handler,\@abi-omnipotent
4895 mov 152($context),%rax # pull context->Rsp
4896 mov 248($context),%rbx # pull context->Rip
4898 lea .Lcbc_decrypt_bulk(%rip),%r10
4899 cmp %r10,%rbx # context->Rip<"prologue" label
4900 jb .Lcommon_seh_tail # taken with %rax holding context->Rsp
4902 mov 120($context),%rax # pull context->Rax
4904 lea .Lcbc_decrypt_body(%rip),%r10
4905 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4906 jb .Lcommon_seh_tail # taken with %rax holding context->Rax
4908 mov 152($context),%rax # pull context->Rsp
4910 lea .Lcbc_ret(%rip),%r10
4911 cmp %r10,%rbx # context->Rip>="epilogue" label
4912 jae .Lcommon_seh_tail # taken with %rax holding context->Rsp
4914 lea 16(%rax),%rsi # %xmm save area
4915 lea 512($context),%rdi # &context.Xmm6
4916 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4917 .long 0xa548f3fc # cld; rep movsq
4919 mov 208($context),%rax # pull context->R11
4921 mov -8(%rax),%rbp # restore saved %rbp
4922 mov %rbp,160($context) # restore context->Rbp
4927 mov %rax,152($context) # restore context->Rsp
4928 mov %rsi,168($context) # restore context->Rsi
4929 mov %rdi,176($context) # restore context->Rdi
4931 mov 40($disp),%rdi # disp->ContextRecord
4932 mov $context,%rsi # context
4933 mov \$154,%ecx # sizeof(CONTEXT) in quadwords
4934 .long 0xa548f3fc # cld; rep movsq
4937 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4938 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4939 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4940 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4941 mov 40(%rsi),%r10 # disp->ContextRecord
4942 lea 56(%rsi),%r11 # &disp->HandlerData
4943 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4944 mov %r10,32(%rsp) # arg5
4945 mov %r11,40(%rsp) # arg6
4946 mov %r12,48(%rsp) # arg7
4947 mov %rcx,56(%rsp) # arg8, (NULL)
4948 call *__imp_RtlVirtualUnwind(%rip)
4950 mov \$1,%eax # ExceptionContinueSearch
4962 .size cbc_se_handler,.-cbc_se_handler
4967 $code.=<<___ if ($PREFIX eq "aesni");
4968 .rva .LSEH_begin_aesni_ecb_encrypt
4969 .rva .LSEH_end_aesni_ecb_encrypt
4972 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4973 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
4974 .rva .LSEH_info_ccm64_enc
4976 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4977 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
4978 .rva .LSEH_info_ccm64_dec
4980 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
4981 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
4982 .rva .LSEH_info_ctr32
4984 .rva .LSEH_begin_aesni_xts_encrypt
4985 .rva .LSEH_end_aesni_xts_encrypt
4986 .rva .LSEH_info_xts_enc
4988 .rva .LSEH_begin_aesni_xts_decrypt
4989 .rva .LSEH_end_aesni_xts_decrypt
4990 .rva .LSEH_info_xts_dec
4992 .rva .LSEH_begin_aesni_ocb_encrypt
4993 .rva .LSEH_end_aesni_ocb_encrypt
4994 .rva .LSEH_info_ocb_enc
4996 .rva .LSEH_begin_aesni_ocb_decrypt
4997 .rva .LSEH_end_aesni_ocb_decrypt
4998 .rva .LSEH_info_ocb_dec
5001 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5002 .rva .LSEH_end_${PREFIX}_cbc_encrypt
5005 .rva ${PREFIX}_set_decrypt_key
5006 .rva .LSEH_end_set_decrypt_key
5009 .rva ${PREFIX}_set_encrypt_key
5010 .rva .LSEH_end_set_encrypt_key
5015 $code.=<<___ if ($PREFIX eq "aesni");
5018 .rva ecb_ccm64_se_handler
5019 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5020 .LSEH_info_ccm64_enc:
5022 .rva ecb_ccm64_se_handler
5023 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5024 .LSEH_info_ccm64_dec:
5026 .rva ecb_ccm64_se_handler
5027 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5030 .rva ctr_xts_se_handler
5031 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5034 .rva ctr_xts_se_handler
5035 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5038 .rva ctr_xts_se_handler
5039 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5043 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5049 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
5058 .byte 0x01,0x04,0x01,0x00
5059 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
# Appends an x86-64 REX prefix byte to the caller's opcode array, but only
# when one of the two register numbers is 8 or higher. The first argument is
# a reference to the opcode array; it is aliased to the package variable
# @opcode via a localized glob so the push below modifies the caller's array.
5064 local *opcode=shift;
5068 $rex|=0x04 if($dst>=8); # REX.R: extends the ModR/M reg field
5069 $rex|=0x01 if($src>=8); # REX.B: extends the ModR/M r/m field
5070 push @opcode,$rex|0x40 if($rex); # 0x40 is the fixed high nibble of a REX byte
# Hand-encodes one AES-NI instruction line as a ".byte" directive.
# Three operand shapes are recognised, in this order:
#   1. aeskeygenassist with an immediate and two %xmm registers,
#   2. a register-to-register aes* instruction,
#   3. an aes* instruction whose source is an offset from %rsp.
# Forms 2 and 3 return undef when the mnemonic is not in their opcode table.
5077 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5078 rex(\@opcode,$4,$3); # $4 is the destination register, $3 the source
5079 push @opcode,0x0f,0x3a,0xdf;
5080 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
5082 push @opcode,$c=~/^0/?oct($c):$c; # immediate; a leading 0 sends it through oct()
5083 return ".byte\t".join(',',@opcode);
5085 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5088 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5089 "aesdec" => 0xde, "aesdeclast" => 0xdf
5091 return undef if (!defined($opcodelet{$1}));
5092 rex(\@opcode,$3,$2); # $3 is the destination register, $2 the source
5093 push @opcode,0x0f,0x38,$opcodelet{$1};
5094 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
5095 return ".byte\t".join(',',@opcode);
5097 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5099 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5100 "aesdec" => 0xde, "aesdeclast" => 0xdf
5102 return undef if (!defined($opcodelet{$1}));
5104 push @opcode,0x44 if ($3>=8); # REX prefix with only the R bit set
5105 push @opcode,0x0f,0x38,$opcodelet{$1};
5106 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M (mod=01, r/m=100) followed by SIB 0x24
5107 push @opcode,($off=~/^0/?oct($off):$off)&0xff; # displacement, truncated to 8 bits
5108 return ".byte\t".join(',',@opcode);
# Returns a ".byte" directive made of five fixed bytes followed by the
# argument, which the movbe substitution on the generated code supplies as
# the decimal %rsp offset captured from the instruction text.
5114 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
# Post-process the accumulated assembly text:
# first evaluate every backtick-quoted span as a Perl expression and splice
# in its value,
5117 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# then pass each aes* instruction (text up to its last %xmm operand) to
# aesni(); the rest of that line, including any trailing comment, is dropped,
5118 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5119 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# and finally replace "movbe %eax,N(%rsp)" with the output of movbe(N).
5120 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;