2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
24 # Given aes(enc|dec) instructions' latency asymptotic performance for
25 # non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
26 # processed with 128-bit key. And given their throughput asymptotic
27 # performance for parallelizable modes is 1.25 cycles per byte. Being
28 # asymptotic limit it's not something you commonly achieve in reality,
29 # but how close does one get? Below are results collected for
30 # different modes and block sizes. Pairs of numbers are for en-/
33 # 16-byte 64-byte 256-byte 1-KB 8-KB
34 # ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
35 # CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
36 # CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
37 # CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
38 # OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
39 # CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
41 # ECB, CTR, CBC and CCM results are free from EVP overhead. This means
42 # that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
43 # [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
44 # The results were collected with specially crafted speed.c benchmark
45 # in order to compare them with results reported in "Intel Advanced
46 # Encryption Standard (AES) New Instruction Set" White Paper Revision
47 # 3.0 dated May 2010. All above results are consistently better. This
48 # module also provides better performance for block sizes smaller than
49 # 128 bytes in points *not* represented in the above table.
51 # Looking at the results for 8-KB buffer.
53 # CFB and OFB results are far from the limit, because implementation
54 # uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
55 # single-block aesni_encrypt, which is not the most optimal way to go.
56 # CBC encrypt result is unexpectedly high and there is no documented
57 # explanation for it. Seemingly there is a small penalty for feeding
58 # the result back to AES unit the way it's done in CBC mode. There is
59 # nothing one can do and the result appears optimal. CCM result is
60 # identical to CBC, because CBC-MAC is essentially CBC encrypt without
61 # saving output. CCM CTR "stays invisible," because it's neatly
62 # interleaved with CBC-MAC. This provides ~30% improvement over
63 # "straightforward" CCM implementation with CTR and CBC-MAC performed
64 # disjointly. Parallelizable modes practically achieve the theoretical
67 # Looking at how results vary with buffer size.
69 # Curves are practically saturated at 1-KB buffer size. In most cases
70 # "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
71 # CTR curve doesn't follow this pattern and is "slowest" changing one
72 # with "256-byte" result being 87% of "8-KB." This is because overhead
73 # in CTR mode is most computationally intensive. Small-block CCM
74 # decrypt is slower than encrypt, because first CTR and last CBC-MAC
75 # iterations can't be interleaved.
77 # Results for 192- and 256-bit keys.
79 # EVP-free results were observed to scale perfectly with number of
80 # rounds for larger block sizes, i.e. 192-bit result being 10/12 times
81 # lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
82 # are a tad smaller, because the above mentioned penalty biases all
83 # results by same constant value. In similar way function call
84 # overhead affects small-block performance, as well as OFB and CFB
85 # results. Differences are not large, most common coefficients are
86 # 10/11.7 and 10/13.4 (as opposed to 10/12.0 and 10/14.0), but one
87 # can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
91 # While Westmere processor features 6 cycles latency for aes[enc|dec]
92 # instructions, which can be scheduled every second cycle, Sandy
93 # Bridge spends 8 cycles per instruction, but it can schedule them
94 # every cycle. This means that code targeting Westmere would perform
95 # suboptimally on Sandy Bridge. Therefore this update.
97 # In addition, non-parallelizable CBC encrypt (as well as CCM) is
98 # optimized. Relative improvement might appear modest, 8% on Westmere,
99 # but in absolute terms it's 3.77 cycles per byte encrypted with
100 # 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
101 # should be compared to asymptotic limits of 3.75 for Westmere and
102 # 5.00 for Sandy Bridge. Actually, the fact that they get this close
103 # to asymptotic limits is quite amazing. Indeed, the limit is
104 # calculated as latency times number of rounds, 10 for 128-bit key,
105 # and divided by 16, the number of bytes in block, or in other words
106 # it accounts *solely* for aesenc instructions. But there are extra
107 # instructions, and numbers so close to the asymptotic limits mean
108 # that it's as if it takes as little as *one* additional cycle to
109 # execute all of them. How is it possible? It is possible thanks to
110 # out-of-order execution logic, which manages to overlap post-
111 # processing of previous block, things like saving the output, with
112 # actual encryption of current block, as well as pre-processing of
113 # current block, things like fetching input and xor-ing it with
114 # 0-round element of the key schedule, with actual encryption of
115 # previous block. Keep this in mind...
117 # For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
118 # performance is achieved by interleaving instructions working on
119 # independent blocks. In which case asymptotic limit for such modes
120 # can be obtained by dividing above mentioned numbers by AES
121 # instructions' interleave factor. Westmere can execute at most 3
122 # instructions at a time, meaning that optimal interleave factor is 3,
123 # and that's where the "magic" number of 1.25 comes from. "Optimal
124 # interleave factor" means that increase of interleave factor does
125 # not improve performance. The formula has proven to reflect reality
126 # pretty well on Westmere... Sandy Bridge on the other hand can
127 # execute up to 8 AES instructions at a time, so how does varying
128 # interleave factor affect the performance? Here is a table for ECB
129 # (numbers are cycles per byte processed with 128-bit key):
131 # instruction interleave factor 3x 6x 8x
132 # theoretical asymptotic limit 1.67 0.83 0.625
133 # measured performance for 8KB block 1.05 0.86 0.84
135 # "as if" interleave factor 4.7x 5.8x 6.0x
137 # Further data for other parallelizable modes:
139 # CBC decrypt 1.16 0.93 0.74
142 # Well, given 3x column it's probably inappropriate to call the limit
143 # asymptotic, if it can be surpassed, isn't it? What happens there?
144 # Rewind to CBC paragraph for the answer. Yes, out-of-order execution
145 # magic is responsible for this. Processor overlaps not only the
146 # additional instructions with AES ones, but even AES instructions
147 # processing adjacent triplets of independent blocks. In the 6x case
148 # additional instructions still claim disproportionally small amount
149 # of additional cycles, but in 8x case number of instructions must be
150 # a tad too high for out-of-order logic to cope with, and AES unit
151 # remains underutilized... As you can see 8x interleave is hardly
152 # justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
153 # utilizes 6x interleave because of limited register bank capacity.
155 # Higher interleave factors do have negative impact on Westmere
156 # performance. While for ECB mode it's negligible ~1.5%, other
157 # parallelizables perform ~5% worse, which is outweighed by ~25%
158 # improvement on Sandy Bridge. To balance regression on Westmere
159 # CTR mode was implemented with 6x aesenc interleave factor.
163 # Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
164 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
165 # in CTR mode AES instruction interleave factor was chosen to be 6x.
169 # Add aesni_ocb_[en|de]crypt. AES instruction interleave factor was
172 ######################################################################
173 # Current large-block performance in cycles per byte processed with
174 # 128-bit key (less is better).
176 # CBC en-/decrypt CTR XTS ECB OCB
177 # Westmere 3.77/1.25 1.25 1.25 1.26
178 # * Bridge 5.07/0.74 0.75 0.90 0.85 0.98
179 # Haswell 4.44/0.63 0.63 0.73 0.63 0.70
180 # Skylake 2.62/0.63 0.63 0.63 0.63
181 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11
182 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50
183 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95
184 # Ryzen 2.71/0.35 0.35 0.44 ? ?
186 # (*) Atom Silvermont ECB result is suboptimal because of penalties
187 # incurred by operations on %xmm8-15. As ECB is not considered
188 # critical, nothing was done to mitigate the problem.
190 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
191 # generates drop-in replacement for
192 # crypto/aes/asm/aes-x86_64.pl:-)
196 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
198 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
200 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
201 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
202 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
203 die "can't locate x86_64-xlate.pl";
# Pipe all generated code through the perlasm translator ($xlate), run with
# the same perl interpreter ($^X). Abort if the pipe cannot be set up, so a
# failed spawn is reported instead of going unnoticed.
205 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
208 $movkey = $PREFIX eq "aesni" ? "movups" : "movups";
209 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
210 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
213 $code.=".extern OPENSSL_ia32cap_P\n";
215 $rounds="%eax"; # input to and changed by aesni_[en|de]cryptN !!!
216 # this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
220 $key="%rcx"; # input to and changed by aesni_[en|de]cryptN !!!
221 $ivp="%r8"; # cbc, ctr, ...
223 $rnds_="%r10d"; # backup copy for $rounds
224 $key_="%r11"; # backup copy for $key
226 # %xmm register layout
227 $rndkey0="%xmm0"; $rndkey1="%xmm1";
228 $inout0="%xmm2"; $inout1="%xmm3";
229 $inout2="%xmm4"; $inout3="%xmm5";
230 $inout4="%xmm6"; $inout5="%xmm7";
231 $inout6="%xmm8"; $inout7="%xmm9";
233 $in2="%xmm6"; $in1="%xmm7"; # used in CBC decrypt, CTR, ...
234 $in0="%xmm8"; $iv="%xmm9";
236 # Inline version of internal aesni_[en|de]crypt1.
238 # Why folded loop? Because aes[enc|dec] is slow enough to accommodate
239 # cycles which take care of loop variables...
241 sub aesni_generate1 {
242 my ($p,$key,$rounds,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
245 $movkey ($key),$rndkey0
246 $movkey 16($key),$rndkey1
248 $code.=<<___ if (defined($ivec));
253 $code.=<<___ if (!defined($ivec));
255 xorps $rndkey0,$inout
259 aes${p} $rndkey1,$inout
261 $movkey ($key),$rndkey1
263 jnz .Loop_${p}1_$sn # loop body is 16 bytes
264 aes${p}last $rndkey1,$inout
267 # void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
269 { my ($inp,$out,$key) = @_4args;
272 .globl ${PREFIX}_encrypt
273 .type ${PREFIX}_encrypt,\@abi-omnipotent
276 movups ($inp),$inout0 # load input
277 mov 240($key),$rounds # key->rounds
279 &aesni_generate1("enc",$key,$rounds);
281 pxor $rndkey0,$rndkey0 # clear register bank
282 pxor $rndkey1,$rndkey1
283 movups $inout0,($out) # output
286 .size ${PREFIX}_encrypt,.-${PREFIX}_encrypt
288 .globl ${PREFIX}_decrypt
289 .type ${PREFIX}_decrypt,\@abi-omnipotent
292 movups ($inp),$inout0 # load input
293 mov 240($key),$rounds # key->rounds
295 &aesni_generate1("dec",$key,$rounds);
297 pxor $rndkey0,$rndkey0 # clear register bank
298 pxor $rndkey1,$rndkey1
299 movups $inout0,($out) # output
302 .size ${PREFIX}_decrypt, .-${PREFIX}_decrypt
306 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
307 # factor. Why were 3x subroutines originally used in loops? Even though
308 # aes[enc|dec] latency was originally 6, it could be scheduled only
309 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
310 # utilization, i.e. when subroutine's throughput is virtually same as
311 # of non-interleaved subroutine [for number of input blocks up to 3].
312 # This is why it originally made no sense to implement 2x subroutine.
313 # But times change and it became appropriate to spend extra 192 bytes
314 # on 2x subroutine on Atom Silvermont account. For processors that
315 # can schedule aes[enc|dec] every cycle optimal interleave factor
316 # equals the corresponding instruction's latency. 8x is optimal for
317 # * Bridge and "super-optimal" for other Intel CPUs...
319 sub aesni_generate2 {
321 # As already mentioned it takes in $key and $rounds, which are *not*
322 # preserved. $inout[0-1] is cipher/clear text...
324 .type _aesni_${dir}rypt2,\@abi-omnipotent
327 $movkey ($key),$rndkey0
329 $movkey 16($key),$rndkey1
330 xorps $rndkey0,$inout0
331 xorps $rndkey0,$inout1
332 $movkey 32($key),$rndkey0
333 lea 32($key,$rounds),$key
338 aes${dir} $rndkey1,$inout0
339 aes${dir} $rndkey1,$inout1
340 $movkey ($key,%rax),$rndkey1
342 aes${dir} $rndkey0,$inout0
343 aes${dir} $rndkey0,$inout1
344 $movkey -16($key,%rax),$rndkey0
347 aes${dir} $rndkey1,$inout0
348 aes${dir} $rndkey1,$inout1
349 aes${dir}last $rndkey0,$inout0
350 aes${dir}last $rndkey0,$inout1
352 .size _aesni_${dir}rypt2,.-_aesni_${dir}rypt2
355 sub aesni_generate3 {
357 # As already mentioned it takes in $key and $rounds, which are *not*
358 # preserved. $inout[0-2] is cipher/clear text...
360 .type _aesni_${dir}rypt3,\@abi-omnipotent
363 $movkey ($key),$rndkey0
365 $movkey 16($key),$rndkey1
366 xorps $rndkey0,$inout0
367 xorps $rndkey0,$inout1
368 xorps $rndkey0,$inout2
369 $movkey 32($key),$rndkey0
370 lea 32($key,$rounds),$key
375 aes${dir} $rndkey1,$inout0
376 aes${dir} $rndkey1,$inout1
377 aes${dir} $rndkey1,$inout2
378 $movkey ($key,%rax),$rndkey1
380 aes${dir} $rndkey0,$inout0
381 aes${dir} $rndkey0,$inout1
382 aes${dir} $rndkey0,$inout2
383 $movkey -16($key,%rax),$rndkey0
386 aes${dir} $rndkey1,$inout0
387 aes${dir} $rndkey1,$inout1
388 aes${dir} $rndkey1,$inout2
389 aes${dir}last $rndkey0,$inout0
390 aes${dir}last $rndkey0,$inout1
391 aes${dir}last $rndkey0,$inout2
393 .size _aesni_${dir}rypt3,.-_aesni_${dir}rypt3
396 # 4x interleave is implemented to improve small block performance,
397 # most notably [and naturally] 4 block by ~30%. One can argue that one
398 # should have implemented 5x as well, but improvement would be <20%,
399 # so it's not worth it...
400 sub aesni_generate4 {
402 # As already mentioned it takes in $key and $rounds, which are *not*
403 # preserved. $inout[0-3] is cipher/clear text...
405 .type _aesni_${dir}rypt4,\@abi-omnipotent
408 $movkey ($key),$rndkey0
410 $movkey 16($key),$rndkey1
411 xorps $rndkey0,$inout0
412 xorps $rndkey0,$inout1
413 xorps $rndkey0,$inout2
414 xorps $rndkey0,$inout3
415 $movkey 32($key),$rndkey0
416 lea 32($key,$rounds),$key
422 aes${dir} $rndkey1,$inout0
423 aes${dir} $rndkey1,$inout1
424 aes${dir} $rndkey1,$inout2
425 aes${dir} $rndkey1,$inout3
426 $movkey ($key,%rax),$rndkey1
428 aes${dir} $rndkey0,$inout0
429 aes${dir} $rndkey0,$inout1
430 aes${dir} $rndkey0,$inout2
431 aes${dir} $rndkey0,$inout3
432 $movkey -16($key,%rax),$rndkey0
435 aes${dir} $rndkey1,$inout0
436 aes${dir} $rndkey1,$inout1
437 aes${dir} $rndkey1,$inout2
438 aes${dir} $rndkey1,$inout3
439 aes${dir}last $rndkey0,$inout0
440 aes${dir}last $rndkey0,$inout1
441 aes${dir}last $rndkey0,$inout2
442 aes${dir}last $rndkey0,$inout3
444 .size _aesni_${dir}rypt4,.-_aesni_${dir}rypt4
447 sub aesni_generate6 {
449 # As already mentioned it takes in $key and $rounds, which are *not*
450 # preserved. $inout[0-5] is cipher/clear text...
452 .type _aesni_${dir}rypt6,\@abi-omnipotent
455 $movkey ($key),$rndkey0
457 $movkey 16($key),$rndkey1
458 xorps $rndkey0,$inout0
459 pxor $rndkey0,$inout1
460 pxor $rndkey0,$inout2
461 aes${dir} $rndkey1,$inout0
462 lea 32($key,$rounds),$key
464 aes${dir} $rndkey1,$inout1
465 pxor $rndkey0,$inout3
466 pxor $rndkey0,$inout4
467 aes${dir} $rndkey1,$inout2
468 pxor $rndkey0,$inout5
469 $movkey ($key,%rax),$rndkey0
471 jmp .L${dir}_loop6_enter
474 aes${dir} $rndkey1,$inout0
475 aes${dir} $rndkey1,$inout1
476 aes${dir} $rndkey1,$inout2
477 .L${dir}_loop6_enter:
478 aes${dir} $rndkey1,$inout3
479 aes${dir} $rndkey1,$inout4
480 aes${dir} $rndkey1,$inout5
481 $movkey ($key,%rax),$rndkey1
483 aes${dir} $rndkey0,$inout0
484 aes${dir} $rndkey0,$inout1
485 aes${dir} $rndkey0,$inout2
486 aes${dir} $rndkey0,$inout3
487 aes${dir} $rndkey0,$inout4
488 aes${dir} $rndkey0,$inout5
489 $movkey -16($key,%rax),$rndkey0
492 aes${dir} $rndkey1,$inout0
493 aes${dir} $rndkey1,$inout1
494 aes${dir} $rndkey1,$inout2
495 aes${dir} $rndkey1,$inout3
496 aes${dir} $rndkey1,$inout4
497 aes${dir} $rndkey1,$inout5
498 aes${dir}last $rndkey0,$inout0
499 aes${dir}last $rndkey0,$inout1
500 aes${dir}last $rndkey0,$inout2
501 aes${dir}last $rndkey0,$inout3
502 aes${dir}last $rndkey0,$inout4
503 aes${dir}last $rndkey0,$inout5
505 .size _aesni_${dir}rypt6,.-_aesni_${dir}rypt6
508 sub aesni_generate8 {
510 # As already mentioned it takes in $key and $rounds, which are *not*
511 # preserved. $inout[0-7] is cipher/clear text...
513 .type _aesni_${dir}rypt8,\@abi-omnipotent
516 $movkey ($key),$rndkey0
518 $movkey 16($key),$rndkey1
519 xorps $rndkey0,$inout0
520 xorps $rndkey0,$inout1
521 pxor $rndkey0,$inout2
522 pxor $rndkey0,$inout3
523 pxor $rndkey0,$inout4
524 lea 32($key,$rounds),$key
526 aes${dir} $rndkey1,$inout0
527 pxor $rndkey0,$inout5
528 pxor $rndkey0,$inout6
529 aes${dir} $rndkey1,$inout1
530 pxor $rndkey0,$inout7
531 $movkey ($key,%rax),$rndkey0
533 jmp .L${dir}_loop8_inner
536 aes${dir} $rndkey1,$inout0
537 aes${dir} $rndkey1,$inout1
538 .L${dir}_loop8_inner:
539 aes${dir} $rndkey1,$inout2
540 aes${dir} $rndkey1,$inout3
541 aes${dir} $rndkey1,$inout4
542 aes${dir} $rndkey1,$inout5
543 aes${dir} $rndkey1,$inout6
544 aes${dir} $rndkey1,$inout7
545 .L${dir}_loop8_enter:
546 $movkey ($key,%rax),$rndkey1
548 aes${dir} $rndkey0,$inout0
549 aes${dir} $rndkey0,$inout1
550 aes${dir} $rndkey0,$inout2
551 aes${dir} $rndkey0,$inout3
552 aes${dir} $rndkey0,$inout4
553 aes${dir} $rndkey0,$inout5
554 aes${dir} $rndkey0,$inout6
555 aes${dir} $rndkey0,$inout7
556 $movkey -16($key,%rax),$rndkey0
559 aes${dir} $rndkey1,$inout0
560 aes${dir} $rndkey1,$inout1
561 aes${dir} $rndkey1,$inout2
562 aes${dir} $rndkey1,$inout3
563 aes${dir} $rndkey1,$inout4
564 aes${dir} $rndkey1,$inout5
565 aes${dir} $rndkey1,$inout6
566 aes${dir} $rndkey1,$inout7
567 aes${dir}last $rndkey0,$inout0
568 aes${dir}last $rndkey0,$inout1
569 aes${dir}last $rndkey0,$inout2
570 aes${dir}last $rndkey0,$inout3
571 aes${dir}last $rndkey0,$inout4
572 aes${dir}last $rndkey0,$inout5
573 aes${dir}last $rndkey0,$inout6
574 aes${dir}last $rndkey0,$inout7
576 .size _aesni_${dir}rypt8,.-_aesni_${dir}rypt8
579 &aesni_generate2("enc") if ($PREFIX eq "aesni");
580 &aesni_generate2("dec");
581 &aesni_generate3("enc") if ($PREFIX eq "aesni");
582 &aesni_generate3("dec");
583 &aesni_generate4("enc") if ($PREFIX eq "aesni");
584 &aesni_generate4("dec");
585 &aesni_generate6("enc") if ($PREFIX eq "aesni");
586 &aesni_generate6("dec");
587 &aesni_generate8("enc") if ($PREFIX eq "aesni");
588 &aesni_generate8("dec");
590 if ($PREFIX eq "aesni") {
591 ########################################################################
592 # void aesni_ecb_encrypt (const void *in, void *out,
593 # size_t length, const AES_KEY *key,
596 .globl aesni_ecb_encrypt
597 .type aesni_ecb_encrypt,\@function,5
601 $code.=<<___ if ($win64);
603 movaps %xmm6,(%rsp) # offload $inout4..7
604 movaps %xmm7,0x10(%rsp)
605 movaps %xmm8,0x20(%rsp)
606 movaps %xmm9,0x30(%rsp)
610 and \$-16,$len # if ($len<16)
611 jz .Lecb_ret # return
613 mov 240($key),$rounds # key->rounds
614 $movkey ($key),$rndkey0
615 mov $key,$key_ # backup $key
616 mov $rounds,$rnds_ # backup $rounds
617 test %r8d,%r8d # 5th argument
619 #--------------------------- ECB ENCRYPT ------------------------------#
620 cmp \$0x80,$len # if ($len<8*16)
621 jb .Lecb_enc_tail # short input
623 movdqu ($inp),$inout0 # load 8 input blocks
624 movdqu 0x10($inp),$inout1
625 movdqu 0x20($inp),$inout2
626 movdqu 0x30($inp),$inout3
627 movdqu 0x40($inp),$inout4
628 movdqu 0x50($inp),$inout5
629 movdqu 0x60($inp),$inout6
630 movdqu 0x70($inp),$inout7
631 lea 0x80($inp),$inp # $inp+=8*16
632 sub \$0x80,$len # $len-=8*16 (can be zero)
633 jmp .Lecb_enc_loop8_enter
636 movups $inout0,($out) # store 8 output blocks
637 mov $key_,$key # restore $key
638 movdqu ($inp),$inout0 # load 8 input blocks
639 mov $rnds_,$rounds # restore $rounds
640 movups $inout1,0x10($out)
641 movdqu 0x10($inp),$inout1
642 movups $inout2,0x20($out)
643 movdqu 0x20($inp),$inout2
644 movups $inout3,0x30($out)
645 movdqu 0x30($inp),$inout3
646 movups $inout4,0x40($out)
647 movdqu 0x40($inp),$inout4
648 movups $inout5,0x50($out)
649 movdqu 0x50($inp),$inout5
650 movups $inout6,0x60($out)
651 movdqu 0x60($inp),$inout6
652 movups $inout7,0x70($out)
653 lea 0x80($out),$out # $out+=8*16
654 movdqu 0x70($inp),$inout7
655 lea 0x80($inp),$inp # $inp+=8*16
656 .Lecb_enc_loop8_enter:
661 jnc .Lecb_enc_loop8 # loop if $len-=8*16 didn't borrow
663 movups $inout0,($out) # store 8 output blocks
664 mov $key_,$key # restore $key
665 movups $inout1,0x10($out)
666 mov $rnds_,$rounds # restore $rounds
667 movups $inout2,0x20($out)
668 movups $inout3,0x30($out)
669 movups $inout4,0x40($out)
670 movups $inout5,0x50($out)
671 movups $inout6,0x60($out)
672 movups $inout7,0x70($out)
673 lea 0x80($out),$out # $out+=8*16
674 add \$0x80,$len # restore real remaining $len
675 jz .Lecb_ret # done if ($len==0)
677 .Lecb_enc_tail: # $len is less than 8*16
678 movups ($inp),$inout0
681 movups 0x10($inp),$inout1
683 movups 0x20($inp),$inout2
686 movups 0x30($inp),$inout3
688 movups 0x40($inp),$inout4
691 movups 0x50($inp),$inout5
693 movdqu 0x60($inp),$inout6
694 xorps $inout7,$inout7
696 movups $inout0,($out) # store 7 output blocks
697 movups $inout1,0x10($out)
698 movups $inout2,0x20($out)
699 movups $inout3,0x30($out)
700 movups $inout4,0x40($out)
701 movups $inout5,0x50($out)
702 movups $inout6,0x60($out)
707 &aesni_generate1("enc",$key,$rounds);
709 movups $inout0,($out) # store one output block
714 movups $inout0,($out) # store 2 output blocks
715 movups $inout1,0x10($out)
720 movups $inout0,($out) # store 3 output blocks
721 movups $inout1,0x10($out)
722 movups $inout2,0x20($out)
727 movups $inout0,($out) # store 4 output blocks
728 movups $inout1,0x10($out)
729 movups $inout2,0x20($out)
730 movups $inout3,0x30($out)
734 xorps $inout5,$inout5
736 movups $inout0,($out) # store 5 output blocks
737 movups $inout1,0x10($out)
738 movups $inout2,0x20($out)
739 movups $inout3,0x30($out)
740 movups $inout4,0x40($out)
745 movups $inout0,($out) # store 6 output blocks
746 movups $inout1,0x10($out)
747 movups $inout2,0x20($out)
748 movups $inout3,0x30($out)
749 movups $inout4,0x40($out)
750 movups $inout5,0x50($out)
752 \f#--------------------------- ECB DECRYPT ------------------------------#
755 cmp \$0x80,$len # if ($len<8*16)
756 jb .Lecb_dec_tail # short input
758 movdqu ($inp),$inout0 # load 8 input blocks
759 movdqu 0x10($inp),$inout1
760 movdqu 0x20($inp),$inout2
761 movdqu 0x30($inp),$inout3
762 movdqu 0x40($inp),$inout4
763 movdqu 0x50($inp),$inout5
764 movdqu 0x60($inp),$inout6
765 movdqu 0x70($inp),$inout7
766 lea 0x80($inp),$inp # $inp+=8*16
767 sub \$0x80,$len # $len-=8*16 (can be zero)
768 jmp .Lecb_dec_loop8_enter
771 movups $inout0,($out) # store 8 output blocks
772 mov $key_,$key # restore $key
773 movdqu ($inp),$inout0 # load 8 input blocks
774 mov $rnds_,$rounds # restore $rounds
775 movups $inout1,0x10($out)
776 movdqu 0x10($inp),$inout1
777 movups $inout2,0x20($out)
778 movdqu 0x20($inp),$inout2
779 movups $inout3,0x30($out)
780 movdqu 0x30($inp),$inout3
781 movups $inout4,0x40($out)
782 movdqu 0x40($inp),$inout4
783 movups $inout5,0x50($out)
784 movdqu 0x50($inp),$inout5
785 movups $inout6,0x60($out)
786 movdqu 0x60($inp),$inout6
787 movups $inout7,0x70($out)
788 lea 0x80($out),$out # $out+=8*16
789 movdqu 0x70($inp),$inout7
790 lea 0x80($inp),$inp # $inp+=8*16
791 .Lecb_dec_loop8_enter:
795 $movkey ($key_),$rndkey0
797 jnc .Lecb_dec_loop8 # loop if $len-=8*16 didn't borrow
799 movups $inout0,($out) # store 8 output blocks
800 pxor $inout0,$inout0 # clear register bank
801 mov $key_,$key # restore $key
802 movups $inout1,0x10($out)
804 mov $rnds_,$rounds # restore $rounds
805 movups $inout2,0x20($out)
807 movups $inout3,0x30($out)
809 movups $inout4,0x40($out)
811 movups $inout5,0x50($out)
813 movups $inout6,0x60($out)
815 movups $inout7,0x70($out)
817 lea 0x80($out),$out # $out+=8*16
818 add \$0x80,$len # restore real remaining $len
819 jz .Lecb_ret # done if ($len==0)
822 movups ($inp),$inout0
825 movups 0x10($inp),$inout1
827 movups 0x20($inp),$inout2
830 movups 0x30($inp),$inout3
832 movups 0x40($inp),$inout4
835 movups 0x50($inp),$inout5
837 movups 0x60($inp),$inout6
838 $movkey ($key),$rndkey0
839 xorps $inout7,$inout7
841 movups $inout0,($out) # store 7 output blocks
842 pxor $inout0,$inout0 # clear register bank
843 movups $inout1,0x10($out)
845 movups $inout2,0x20($out)
847 movups $inout3,0x30($out)
849 movups $inout4,0x40($out)
851 movups $inout5,0x50($out)
853 movups $inout6,0x60($out)
860 &aesni_generate1("dec",$key,$rounds);
862 movups $inout0,($out) # store one output block
863 pxor $inout0,$inout0 # clear register bank
868 movups $inout0,($out) # store 2 output blocks
869 pxor $inout0,$inout0 # clear register bank
870 movups $inout1,0x10($out)
876 movups $inout0,($out) # store 3 output blocks
877 pxor $inout0,$inout0 # clear register bank
878 movups $inout1,0x10($out)
880 movups $inout2,0x20($out)
886 movups $inout0,($out) # store 4 output blocks
887 pxor $inout0,$inout0 # clear register bank
888 movups $inout1,0x10($out)
890 movups $inout2,0x20($out)
892 movups $inout3,0x30($out)
897 xorps $inout5,$inout5
899 movups $inout0,($out) # store 5 output blocks
900 pxor $inout0,$inout0 # clear register bank
901 movups $inout1,0x10($out)
903 movups $inout2,0x20($out)
905 movups $inout3,0x30($out)
907 movups $inout4,0x40($out)
914 movups $inout0,($out) # store 6 output blocks
915 pxor $inout0,$inout0 # clear register bank
916 movups $inout1,0x10($out)
918 movups $inout2,0x20($out)
920 movups $inout3,0x30($out)
922 movups $inout4,0x40($out)
924 movups $inout5,0x50($out)
928 xorps $rndkey0,$rndkey0 # %xmm0
929 pxor $rndkey1,$rndkey1
931 $code.=<<___ if ($win64);
933 movaps %xmm0,(%rsp) # clear stack
934 movaps 0x10(%rsp),%xmm7
935 movaps %xmm0,0x10(%rsp)
936 movaps 0x20(%rsp),%xmm8
937 movaps %xmm0,0x20(%rsp)
938 movaps 0x30(%rsp),%xmm9
939 movaps %xmm0,0x30(%rsp)
945 .size aesni_ecb_encrypt,.-aesni_ecb_encrypt
949 ######################################################################
950 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
951 # size_t blocks, const AES_KEY *key,
952 # const char *ivec,char *cmac);
954 # Handles only complete blocks, operates on 64-bit counter and
955 # does not update *ivec! Nor does it finalize CMAC value
956 # (see engine/eng_aesni.c for details)
959 my $cmac="%r9"; # 6th argument
961 my $increment="%xmm9";
963 my $bswap_mask="%xmm7";
966 .globl aesni_ccm64_encrypt_blocks
967 .type aesni_ccm64_encrypt_blocks,\@function,6
969 aesni_ccm64_encrypt_blocks:
971 $code.=<<___ if ($win64);
973 movaps %xmm6,(%rsp) # $iv
974 movaps %xmm7,0x10(%rsp) # $bswap_mask
975 movaps %xmm8,0x20(%rsp) # $in0
976 movaps %xmm9,0x30(%rsp) # $increment
980 mov 240($key),$rounds # key->rounds
982 movdqa .Lincrement64(%rip),$increment
983 movdqa .Lbswap_mask(%rip),$bswap_mask
988 movdqu ($cmac),$inout1
990 lea 32($key,$rounds),$key # end of key schedule
991 pshufb $bswap_mask,$iv
992 sub %rax,%r10 # twisted $rounds
993 jmp .Lccm64_enc_outer
996 $movkey ($key_),$rndkey0
998 movups ($inp),$in0 # load inp
1000 xorps $rndkey0,$inout0 # counter
1001 $movkey 16($key_),$rndkey1
1003 xorps $rndkey0,$inout1 # cmac^=inp
1004 $movkey 32($key_),$rndkey0
1007 aesenc $rndkey1,$inout0
1008 aesenc $rndkey1,$inout1
1009 $movkey ($key,%rax),$rndkey1
1011 aesenc $rndkey0,$inout0
1012 aesenc $rndkey0,$inout1
1013 $movkey -16($key,%rax),$rndkey0
1014 jnz .Lccm64_enc2_loop
1015 aesenc $rndkey1,$inout0
1016 aesenc $rndkey1,$inout1
1017 paddq $increment,$iv
1018 dec $len # $len-- ($len is in blocks)
1019 aesenclast $rndkey0,$inout0
1020 aesenclast $rndkey0,$inout1
1023 xorps $inout0,$in0 # inp ^= E(iv)
1025 movups $in0,($out) # save output
1026 pshufb $bswap_mask,$inout0
1027 lea 16($out),$out # $out+=16
1028 jnz .Lccm64_enc_outer # loop if ($len!=0)
1030 pxor $rndkey0,$rndkey0 # clear register bank
1031 pxor $rndkey1,$rndkey1
1032 pxor $inout0,$inout0
1033 movups $inout1,($cmac) # store resulting mac
1034 pxor $inout1,$inout1
1038 $code.=<<___ if ($win64);
1040 movaps %xmm0,(%rsp) # clear stack
1041 movaps 0x10(%rsp),%xmm7
1042 movaps %xmm0,0x10(%rsp)
1043 movaps 0x20(%rsp),%xmm8
1044 movaps %xmm0,0x20(%rsp)
1045 movaps 0x30(%rsp),%xmm9
1046 movaps %xmm0,0x30(%rsp)
1052 .size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
1054 ######################################################################
# aesni_ccm64_decrypt_blocks: CCM decryption over whole blocks.
# Visible flow: each input block is xor-ed with the encrypted counter
# to produce the output block, that output is folded into the running
# cmac value, and the final cmac is written back on exit.
1056 .globl aesni_ccm64_decrypt_blocks
1057 .type aesni_ccm64_decrypt_blocks,\@function,6
1059 aesni_ccm64_decrypt_blocks:
1061 $code.=<<___ if ($win64);
1062 lea -0x58(%rsp),%rsp # reserve stack room for the xmm saves below
1063 movaps %xmm6,(%rsp) # $iv
1064 movaps %xmm7,0x10(%rsp) # $bswap_mask
1065 movaps %xmm8,0x20(%rsp) # $in8
1066 movaps %xmm9,0x30(%rsp) # $increment
1070 mov 240($key),$rounds # key->rounds
1072 movdqu ($cmac),$inout1 # load running cmac value
1073 movdqa .Lincrement64(%rip),$increment
1074 movdqa .Lbswap_mask(%rip),$bswap_mask
1079 pshufb $bswap_mask,$iv
1081 &aesni_generate1("enc",$key,$rounds);
1085 movups ($inp),$in0 # load inp
1086 paddq $increment,$iv # advance counter by the increment constant
1087 lea 16($inp),$inp # $inp+=16
1088 sub %r10,%rax # twisted $rounds
1089 lea 32($key_,$rnds_),$key # end of key schedule
1091 jmp .Lccm64_dec_outer
1094 xorps $inout0,$in0 # inp ^= E(iv)
1096 movups $in0,($out) # save output
1097 lea 16($out),$out # $out+=16
1098 pshufb $bswap_mask,$inout0
1100 sub \$1,$len # $len-- ($len is in blocks)
1101 jz .Lccm64_dec_break # if ($len==0) break
1103 $movkey ($key_),$rndkey0
1105 $movkey 16($key_),$rndkey1
1107 xorps $rndkey0,$inout0
1108 xorps $in0,$inout1 # cmac^=out
1109 $movkey 32($key_),$rndkey0
1110 jmp .Lccm64_dec2_loop
1113 aesenc $rndkey1,$inout0
1114 aesenc $rndkey1,$inout1
1115 $movkey ($key,%rax),$rndkey1
1117 aesenc $rndkey0,$inout0
1118 aesenc $rndkey0,$inout1
1119 $movkey -16($key,%rax),$rndkey0
1120 jnz .Lccm64_dec2_loop
1121 movups ($inp),$in0 # load input
1122 paddq $increment,$iv # advance counter by the increment constant
1123 aesenc $rndkey1,$inout0
1124 aesenc $rndkey1,$inout1
1125 aesenclast $rndkey0,$inout0
1126 aesenclast $rndkey0,$inout1
1127 lea 16($inp),$inp # $inp+=16
1128 jmp .Lccm64_dec_outer
1132 #xorps $in0,$inout1 # cmac^=out
1133 mov 240($key_),$rounds # reload key->rounds from the saved key pointer
1135 &aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
1137 pxor $rndkey0,$rndkey0 # clear register bank
1138 pxor $rndkey1,$rndkey1
1139 pxor $inout0,$inout0
1140 movups $inout1,($cmac) # store resulting mac
1141 pxor $inout1,$inout1
1145 $code.=<<___ if ($win64);
1147 movaps %xmm0,(%rsp) # clear stack
1148 movaps 0x10(%rsp),%xmm7 # restore saved xmm7, then wipe its slot
1149 movaps %xmm0,0x10(%rsp)
1150 movaps 0x20(%rsp),%xmm8 # restore saved xmm8, then wipe its slot
1151 movaps %xmm0,0x20(%rsp)
1152 movaps 0x30(%rsp),%xmm9 # restore saved xmm9, then wipe its slot
1153 movaps %xmm0,0x30(%rsp)
1159 .size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
1162 ######################################################################
1163 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
1164 # size_t blocks, const AES_KEY *key,
1165 # const char *ivec);
1167 # Handles only complete blocks, operates on 32-bit counter and
1168 # does not update *ivec! (see crypto/modes/ctr128.c for details)
1170 # Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
1171 # http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
1172 # Keywords are full unroll and modulo-schedule counter calculations
1173 # with zero-round key xor.
1175 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
1176 my ($key0,$ctr)=("%ebp","${ivp}d"); # ctr is presumably the 32-bit alias of the ivp register (d suffix) - confirm
1177 my $frame_size = 0x80 + ($win64?160:0); # 8 counter slots of 16 bytes, plus 10 xmm save slots on Win64
1180 .globl aesni_ctr32_encrypt_blocks
1181 .type aesni_ctr32_encrypt_blocks,\@function,5
1183 aesni_ctr32_encrypt_blocks:
1188 # handle single block without allocating stack frame,
1189 # useful when handling edges
1190 movups ($ivp),$inout0
1191 movups ($inp),$inout1
1192 mov 240($key),%edx # key->rounds
1194 &aesni_generate1("enc",$key,"%edx");
1196 pxor $rndkey0,$rndkey0 # clear register bank
1197 pxor $rndkey1,$rndkey1
1198 xorps $inout1,$inout0
1199 pxor $inout1,$inout1
1200 movups $inout0,($out)
1201 xorps $inout0,$inout0
1202 jmp .Lctr32_epilogue
1206 lea (%rsp),$key_ # use $key_ as frame pointer
1207 .cfi_def_cfa_register $key_
1210 sub \$$frame_size,%rsp
1211 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1213 $code.=<<___ if ($win64);
1214 movaps %xmm6,-0xa8($key_) # offload everything
1215 movaps %xmm7,-0x98($key_)
1216 movaps %xmm8,-0x88($key_)
1217 movaps %xmm9,-0x78($key_)
1218 movaps %xmm10,-0x68($key_)
1219 movaps %xmm11,-0x58($key_)
1220 movaps %xmm12,-0x48($key_)
1221 movaps %xmm13,-0x38($key_)
1222 movaps %xmm14,-0x28($key_)
1223 movaps %xmm15,-0x18($key_)
1228 # 8 16-byte words on top of stack are counter values
1229 # xor-ed with zero-round key
1231 movdqu ($ivp),$inout0
1232 movdqu ($key),$rndkey0
1233 mov 12($ivp),$ctr # counter LSB
1234 pxor $rndkey0,$inout0
1235 mov 12($key),$key0 # 0-round key LSB
1236 movdqa $inout0,0x00(%rsp) # populate counter block
1238 movdqa $inout0,$inout1
1239 movdqa $inout0,$inout2
1240 movdqa $inout0,$inout3
1241 movdqa $inout0,0x40(%rsp)
1242 movdqa $inout0,0x50(%rsp)
1243 movdqa $inout0,0x60(%rsp)
1244 mov %rdx,%r10 # about to borrow %rdx
1245 movdqa $inout0,0x70(%rsp)
1253 pinsrd \$3,%eax,$inout1
1255 movdqa $inout1,0x10(%rsp)
1256 pinsrd \$3,%edx,$inout2
1258 mov %r10,%rdx # restore %rdx
1260 movdqa $inout2,0x20(%rsp)
1263 pinsrd \$3,%eax,$inout3
1265 movdqa $inout3,0x30(%rsp)
1267 mov %r10d,0x40+12(%rsp)
1270 mov 240($key),$rounds # key->rounds
1273 mov %r9d,0x50+12(%rsp)
1276 mov %r10d,0x60+12(%rsp)
1278 mov OPENSSL_ia32cap_P+4(%rip),%r10d
1280 and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
1281 mov %r9d,0x70+12(%rsp)
1283 $movkey 0x10($key),$rndkey1
1285 movdqa 0x40(%rsp),$inout4
1286 movdqa 0x50(%rsp),$inout5
1288 cmp \$8,$len # $len is in blocks
1289 jb .Lctr32_tail # short input if ($len<8)
1291 sub \$6,$len # $len is biased by -6
1292 cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
1293 je .Lctr32_6x # [which denotes Atom Silvermont]
1295 lea 0x80($key),$key # size optimization
1296 sub \$2,$len # $len is biased by -8
1304 lea 32($key,$rounds),$key # end of key schedule
1305 sub %rax,%r10 # twisted $rounds
1310 add \$6,$ctr # next counter value
1311 $movkey -48($key,$rnds_),$rndkey0
1312 aesenc $rndkey1,$inout0
1315 aesenc $rndkey1,$inout1
1316 movbe %eax,`0x00+12`(%rsp) # store next counter value
1318 aesenc $rndkey1,$inout2
1320 movbe %eax,`0x10+12`(%rsp)
1321 aesenc $rndkey1,$inout3
1324 aesenc $rndkey1,$inout4
1325 movbe %eax,`0x20+12`(%rsp)
1327 aesenc $rndkey1,$inout5
1328 $movkey -32($key,$rnds_),$rndkey1
1331 aesenc $rndkey0,$inout0
1332 movbe %eax,`0x30+12`(%rsp)
1334 aesenc $rndkey0,$inout1
1336 movbe %eax,`0x40+12`(%rsp)
1337 aesenc $rndkey0,$inout2
1340 aesenc $rndkey0,$inout3
1341 movbe %eax,`0x50+12`(%rsp)
1342 mov %r10,%rax # mov $rnds_,$rounds
1343 aesenc $rndkey0,$inout4
1344 aesenc $rndkey0,$inout5
1345 $movkey -16($key,$rnds_),$rndkey0
1349 movdqu ($inp),$inout6 # load 6 input blocks
1350 movdqu 0x10($inp),$inout7
1351 movdqu 0x20($inp),$in0
1352 movdqu 0x30($inp),$in1
1353 movdqu 0x40($inp),$in2
1354 movdqu 0x50($inp),$in3
1355 lea 0x60($inp),$inp # $inp+=6*16
1356 $movkey -64($key,$rnds_),$rndkey1
1357 pxor $inout0,$inout6 # inp^=E(ctr)
1358 movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
1359 pxor $inout1,$inout7
1360 movaps 0x10(%rsp),$inout1
1362 movaps 0x20(%rsp),$inout2
1364 movaps 0x30(%rsp),$inout3
1366 movaps 0x40(%rsp),$inout4
1368 movaps 0x50(%rsp),$inout5
1369 movdqu $inout6,($out) # store 6 output blocks
1370 movdqu $inout7,0x10($out)
1371 movdqu $in0,0x20($out)
1372 movdqu $in1,0x30($out)
1373 movdqu $in2,0x40($out)
1374 movdqu $in3,0x50($out)
1375 lea 0x60($out),$out # $out+=6*16
1378 jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
1380 add \$6,$len # restore real remaining $len
1381 jz .Lctr32_done # done if ($len==0)
1383 lea -48($rnds_),$rounds
1384 lea -80($key,$rnds_),$key # restore $key
1386 shr \$4,$rounds # restore $rounds
1391 add \$8,$ctr # next counter value
1392 movdqa 0x60(%rsp),$inout6
1393 aesenc $rndkey1,$inout0
1395 movdqa 0x70(%rsp),$inout7
1396 aesenc $rndkey1,$inout1
1398 $movkey 0x20-0x80($key),$rndkey0
1399 aesenc $rndkey1,$inout2
1402 aesenc $rndkey1,$inout3
1403 mov %r9d,0x00+12(%rsp) # store next counter value
1405 aesenc $rndkey1,$inout4
1406 aesenc $rndkey1,$inout5
1407 aesenc $rndkey1,$inout6
1408 aesenc $rndkey1,$inout7
1409 $movkey 0x30-0x80($key),$rndkey1
1411 for($i=2;$i<8;$i++) {
1412 my $rndkeyx = ($i&1)?$rndkey1:$rndkey0; # alternate between the two round-key registers
1415 aesenc $rndkeyx,$inout0
1416 aesenc $rndkeyx,$inout1
1419 aesenc $rndkeyx,$inout2
1420 aesenc $rndkeyx,$inout3
1421 mov %r9d,`0x10*($i-1)`+12(%rsp)
1423 aesenc $rndkeyx,$inout4
1424 aesenc $rndkeyx,$inout5
1425 aesenc $rndkeyx,$inout6
1426 aesenc $rndkeyx,$inout7
1427 $movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
1432 aesenc $rndkey0,$inout0
1433 aesenc $rndkey0,$inout1
1434 aesenc $rndkey0,$inout2
1436 movdqu 0x00($inp),$in0 # start loading input
1437 aesenc $rndkey0,$inout3
1438 mov %r9d,0x70+12(%rsp)
1440 aesenc $rndkey0,$inout4
1441 aesenc $rndkey0,$inout5
1442 aesenc $rndkey0,$inout6
1443 aesenc $rndkey0,$inout7
1444 $movkey 0xa0-0x80($key),$rndkey0
1448 aesenc $rndkey1,$inout0
1449 aesenc $rndkey1,$inout1
1450 aesenc $rndkey1,$inout2
1451 aesenc $rndkey1,$inout3
1452 aesenc $rndkey1,$inout4
1453 aesenc $rndkey1,$inout5
1454 aesenc $rndkey1,$inout6
1455 aesenc $rndkey1,$inout7
1456 $movkey 0xb0-0x80($key),$rndkey1
1458 aesenc $rndkey0,$inout0
1459 aesenc $rndkey0,$inout1
1460 aesenc $rndkey0,$inout2
1461 aesenc $rndkey0,$inout3
1462 aesenc $rndkey0,$inout4
1463 aesenc $rndkey0,$inout5
1464 aesenc $rndkey0,$inout6
1465 aesenc $rndkey0,$inout7
1466 $movkey 0xc0-0x80($key),$rndkey0
1469 aesenc $rndkey1,$inout0
1470 aesenc $rndkey1,$inout1
1471 aesenc $rndkey1,$inout2
1472 aesenc $rndkey1,$inout3
1473 aesenc $rndkey1,$inout4
1474 aesenc $rndkey1,$inout5
1475 aesenc $rndkey1,$inout6
1476 aesenc $rndkey1,$inout7
1477 $movkey 0xd0-0x80($key),$rndkey1
1479 aesenc $rndkey0,$inout0
1480 aesenc $rndkey0,$inout1
1481 aesenc $rndkey0,$inout2
1482 aesenc $rndkey0,$inout3
1483 aesenc $rndkey0,$inout4
1484 aesenc $rndkey0,$inout5
1485 aesenc $rndkey0,$inout6
1486 aesenc $rndkey0,$inout7
1487 $movkey 0xe0-0x80($key),$rndkey0
1488 jmp .Lctr32_enc_done
1492 movdqu 0x10($inp),$in1
1493 pxor $rndkey0,$in0 # input^=round[last]
1494 movdqu 0x20($inp),$in2
1496 movdqu 0x30($inp),$in3
1498 movdqu 0x40($inp),$in4
1500 movdqu 0x50($inp),$in5
1503 aesenc $rndkey1,$inout0
1504 aesenc $rndkey1,$inout1
1505 aesenc $rndkey1,$inout2
1506 aesenc $rndkey1,$inout3
1507 aesenc $rndkey1,$inout4
1508 aesenc $rndkey1,$inout5
1509 aesenc $rndkey1,$inout6
1510 aesenc $rndkey1,$inout7
1511 movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
1512 lea 0x80($inp),$inp # $inp+=8*16
1514 aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
1515 pxor $rndkey0,$rndkey1 # borrowed $rndkey
1516 movdqu 0x70-0x80($inp),$in0
1517 aesenclast $in1,$inout1
1519 movdqa 0x00(%rsp),$in1 # load next counter block
1520 aesenclast $in2,$inout2
1521 aesenclast $in3,$inout3
1522 movdqa 0x10(%rsp),$in2
1523 movdqa 0x20(%rsp),$in3
1524 aesenclast $in4,$inout4
1525 aesenclast $in5,$inout5
1526 movdqa 0x30(%rsp),$in4
1527 movdqa 0x40(%rsp),$in5
1528 aesenclast $rndkey1,$inout6
1529 movdqa 0x50(%rsp),$rndkey0
1530 $movkey 0x10-0x80($key),$rndkey1#real 1st-round key
1531 aesenclast $in0,$inout7
1533 movups $inout0,($out) # store 8 output blocks
1535 movups $inout1,0x10($out)
1537 movups $inout2,0x20($out)
1539 movups $inout3,0x30($out)
1541 movups $inout4,0x40($out)
1543 movups $inout5,0x50($out)
1544 movdqa $rndkey0,$inout5
1545 movups $inout6,0x60($out)
1546 movups $inout7,0x70($out)
1547 lea 0x80($out),$out # $out+=8*16
1550 jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
1552 add \$8,$len # restore real remaining $len
1553 jz .Lctr32_done # done if ($len==0)
1554 lea -0x80($key),$key
1557 # note that at this point $inout0..5 are populated with
1558 # counter values xor-ed with 0-round key
1564 # if ($len>4) compute 7 E(counter)
1566 movdqa 0x60(%rsp),$inout6
1567 pxor $inout7,$inout7
1569 $movkey 16($key),$rndkey0
1570 aesenc $rndkey1,$inout0
1571 aesenc $rndkey1,$inout1
1572 lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
1574 aesenc $rndkey1,$inout2
1575 add \$16,%rax # prepare for .Lenc_loop8_enter
1577 aesenc $rndkey1,$inout3
1578 aesenc $rndkey1,$inout4
1579 movups 0x10($inp),$in1 # pre-load input
1580 movups 0x20($inp),$in2
1581 aesenc $rndkey1,$inout5
1582 aesenc $rndkey1,$inout6
1584 call .Lenc_loop8_enter
1586 movdqu 0x30($inp),$in3
1588 movdqu 0x40($inp),$in0
1590 movdqu $inout0,($out) # store output
1592 movdqu $inout1,0x10($out)
1594 movdqu $inout2,0x20($out)
1596 movdqu $inout3,0x30($out)
1597 movdqu $inout4,0x40($out)
1599 jb .Lctr32_done # $len was 5, stop store
1601 movups 0x50($inp),$in1
1603 movups $inout5,0x50($out)
1604 je .Lctr32_done # $len was 6, stop store
1606 movups 0x60($inp),$in2
1608 movups $inout6,0x60($out)
1609 jmp .Lctr32_done # $len was 7, stop store
1613 aesenc $rndkey1,$inout0
1616 aesenc $rndkey1,$inout1
1617 aesenc $rndkey1,$inout2
1618 aesenc $rndkey1,$inout3
1619 $movkey ($key),$rndkey1
1621 aesenclast $rndkey1,$inout0
1622 aesenclast $rndkey1,$inout1
1623 movups ($inp),$in0 # load input
1624 movups 0x10($inp),$in1
1625 aesenclast $rndkey1,$inout2
1626 aesenclast $rndkey1,$inout3
1627 movups 0x20($inp),$in2
1628 movups 0x30($inp),$in3
1631 movups $inout0,($out) # store output
1633 movups $inout1,0x10($out)
1635 movdqu $inout2,0x20($out)
1637 movdqu $inout3,0x30($out)
1638 jmp .Lctr32_done # $len was 4, stop store
1642 aesenc $rndkey1,$inout0
1645 aesenc $rndkey1,$inout1
1646 aesenc $rndkey1,$inout2
1647 $movkey ($key),$rndkey1
1649 aesenclast $rndkey1,$inout0
1650 aesenclast $rndkey1,$inout1
1651 aesenclast $rndkey1,$inout2
1653 movups ($inp),$in0 # load input
1655 movups $inout0,($out) # store output
1657 jb .Lctr32_done # $len was 1, stop store
1659 movups 0x10($inp),$in1
1661 movups $inout1,0x10($out)
1662 je .Lctr32_done # $len was 2, stop store
1664 movups 0x20($inp),$in2
1666 movups $inout2,0x20($out) # $len was 3, stop store
1669 xorps %xmm0,%xmm0 # clear register bank
1677 $code.=<<___ if (!$win64);
1680 movaps %xmm0,0x00(%rsp) # clear stack
1682 movaps %xmm0,0x10(%rsp)
1684 movaps %xmm0,0x20(%rsp)
1686 movaps %xmm0,0x30(%rsp)
1688 movaps %xmm0,0x40(%rsp)
1690 movaps %xmm0,0x50(%rsp)
1692 movaps %xmm0,0x60(%rsp)
1694 movaps %xmm0,0x70(%rsp)
1697 $code.=<<___ if ($win64);
1698 movaps -0xa8($key_),%xmm6 # restore each saved xmm register, then wipe its slot
1699 movaps %xmm0,-0xa8($key_) # clear stack
1700 movaps -0x98($key_),%xmm7
1701 movaps %xmm0,-0x98($key_)
1702 movaps -0x88($key_),%xmm8
1703 movaps %xmm0,-0x88($key_)
1704 movaps -0x78($key_),%xmm9
1705 movaps %xmm0,-0x78($key_)
1706 movaps -0x68($key_),%xmm10
1707 movaps %xmm0,-0x68($key_)
1708 movaps -0x58($key_),%xmm11
1709 movaps %xmm0,-0x58($key_)
1710 movaps -0x48($key_),%xmm12
1711 movaps %xmm0,-0x48($key_)
1712 movaps -0x38($key_),%xmm13
1713 movaps %xmm0,-0x38($key_)
1714 movaps -0x28($key_),%xmm14
1715 movaps %xmm0,-0x28($key_)
1716 movaps -0x18($key_),%xmm15
1717 movaps %xmm0,-0x18($key_)
1718 movaps %xmm0,0x00(%rsp) # wipe the eight 16-byte counter slots
1719 movaps %xmm0,0x10(%rsp)
1720 movaps %xmm0,0x20(%rsp)
1721 movaps %xmm0,0x30(%rsp)
1722 movaps %xmm0,0x40(%rsp)
1723 movaps %xmm0,0x50(%rsp)
1724 movaps %xmm0,0x60(%rsp)
1725 movaps %xmm0,0x70(%rsp)
1731 .cfi_def_cfa_register %rsp
1735 .size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
1739 ######################################################################
1740 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1741 # const AES_KEY *key1, const AES_KEY *key2,
1742 # const unsigned char iv[16]);
# Register and frame layout shared by the XTS routines that follow.
1745 my @tweak=map("%xmm$_",(10..15)); # six tweak registers, xmm10..xmm15
1746 my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",$tweak[4]); # twtmp shares its register with the tweak slot at index 4
1747 my ($key2,$ivp,$len_)=("%r8","%r9","%r9"); # ivp and len_ name the same register
1748 my $frame_size = 0x70 + ($win64?160:0); # 0x70 bytes of scratch, plus 160 bytes of xmm saves on Win64
1749 my $key_ = "%rbp"; # override so that we can use %r11 as FP
# aesni_xts_encrypt: the iv is encrypted under key2 to form the initial
# tweak, then data is processed under key1, six blocks per main-loop
# pass with shorter paths for one to five remaining blocks.
1752 .globl aesni_xts_encrypt
1753 .type aesni_xts_encrypt,\@function,6
1757 lea (%rsp),%r11 # frame pointer
1758 .cfi_def_cfa_register %r11
1761 sub \$$frame_size,%rsp
1762 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
1764 $code.=<<___ if ($win64);
1765 movaps %xmm6,-0xa8(%r11) # offload everything
1766 movaps %xmm7,-0x98(%r11)
1767 movaps %xmm8,-0x88(%r11)
1768 movaps %xmm9,-0x78(%r11)
1769 movaps %xmm10,-0x68(%r11)
1770 movaps %xmm11,-0x58(%r11)
1771 movaps %xmm12,-0x48(%r11)
1772 movaps %xmm13,-0x38(%r11)
1773 movaps %xmm14,-0x28(%r11)
1774 movaps %xmm15,-0x18(%r11)
1778 movups ($ivp),$inout0 # load clear-text tweak
1779 mov 240($key2),$rounds # key2->rounds
1780 mov 240($key),$rnds_ # key1->rounds
1782 # generate the tweak
1783 &aesni_generate1("enc",$key2,$rounds,$inout0);
1785 $movkey ($key),$rndkey0 # zero round key
1786 mov $key,$key_ # backup $key
1787 mov $rnds_,$rounds # backup $rounds
1789 mov $len,$len_ # backup $len
1792 $movkey 16($key,$rnds_),$rndkey1 # last round key
1794 movdqa .Lxts_magic(%rip),$twmask
1795 movdqa $inout0,@tweak[5]
1796 pshufd \$0x5f,$inout0,$twres
1797 pxor $rndkey0,$rndkey1
1799 # alternative tweak calculation algorithm is based on suggestions
1800 # by Shay Gueron. psrad doesn't conflict with AES-NI instructions
1801 # and should help in the future...
1802 for ($i=0;$i<4;$i++) {
1804 movdqa $twres,$twtmp
1806 movdqa @tweak[5],@tweak[$i]
1807 psrad \$31,$twtmp # broadcast upper bits
1808 paddq @tweak[5],@tweak[5]
1810 pxor $rndkey0,@tweak[$i]
1811 pxor $twtmp,@tweak[5]
1815 movdqa @tweak[5],@tweak[4]
1817 paddq @tweak[5],@tweak[5]
1819 pxor $rndkey0,@tweak[4]
1820 pxor $twres,@tweak[5]
1821 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
1824 jc .Lxts_enc_short # if $len-=6*16 borrowed
1827 lea 32($key_,$rnds_),$key # end of key schedule
1828 sub %r10,%rax # twisted $rounds
1829 $movkey 16($key_),$rndkey1
1830 mov %rax,%r10 # backup twisted $rounds
1831 lea .Lxts_magic(%rip),%r8
1832 jmp .Lxts_enc_grandloop
1835 .Lxts_enc_grandloop:
1836 movdqu `16*0`($inp),$inout0 # load input
1837 movdqa $rndkey0,$twmask
1838 movdqu `16*1`($inp),$inout1
1839 pxor @tweak[0],$inout0 # input^=tweak^round[0]
1840 movdqu `16*2`($inp),$inout2
1841 pxor @tweak[1],$inout1
1842 aesenc $rndkey1,$inout0
1843 movdqu `16*3`($inp),$inout3
1844 pxor @tweak[2],$inout2
1845 aesenc $rndkey1,$inout1
1846 movdqu `16*4`($inp),$inout4
1847 pxor @tweak[3],$inout3
1848 aesenc $rndkey1,$inout2
1849 movdqu `16*5`($inp),$inout5
1850 pxor @tweak[5],$twmask # round[0]^=tweak[5]
1851 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
1852 pxor @tweak[4],$inout4
1853 aesenc $rndkey1,$inout3
1854 $movkey 32($key_),$rndkey0
1855 lea `16*6`($inp),$inp
1856 pxor $twmask,$inout5
1858 pxor $twres,@tweak[0] # calculate tweaks^round[last]
1859 aesenc $rndkey1,$inout4
1860 pxor $twres,@tweak[1]
1861 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
1862 aesenc $rndkey1,$inout5
1863 $movkey 48($key_),$rndkey1
1864 pxor $twres,@tweak[2]
1866 aesenc $rndkey0,$inout0
1867 pxor $twres,@tweak[3]
1868 movdqa @tweak[1],`16*1`(%rsp)
1869 aesenc $rndkey0,$inout1
1870 pxor $twres,@tweak[4]
1871 movdqa @tweak[2],`16*2`(%rsp)
1872 aesenc $rndkey0,$inout2
1873 aesenc $rndkey0,$inout3
1875 movdqa @tweak[4],`16*4`(%rsp)
1876 aesenc $rndkey0,$inout4
1877 aesenc $rndkey0,$inout5
1878 $movkey 64($key_),$rndkey0
1879 movdqa $twmask,`16*5`(%rsp)
1880 pshufd \$0x5f,@tweak[5],$twres
1884 aesenc $rndkey1,$inout0
1885 aesenc $rndkey1,$inout1
1886 aesenc $rndkey1,$inout2
1887 aesenc $rndkey1,$inout3
1888 aesenc $rndkey1,$inout4
1889 aesenc $rndkey1,$inout5
1890 $movkey -64($key,%rax),$rndkey1
1893 aesenc $rndkey0,$inout0
1894 aesenc $rndkey0,$inout1
1895 aesenc $rndkey0,$inout2
1896 aesenc $rndkey0,$inout3
1897 aesenc $rndkey0,$inout4
1898 aesenc $rndkey0,$inout5
1899 $movkey -80($key,%rax),$rndkey0
1902 movdqa (%r8),$twmask # start calculating next tweak
1903 movdqa $twres,$twtmp
1905 aesenc $rndkey1,$inout0
1906 paddq @tweak[5],@tweak[5]
1908 aesenc $rndkey1,$inout1
1910 $movkey ($key_),@tweak[0] # load round[0]
1911 aesenc $rndkey1,$inout2
1912 aesenc $rndkey1,$inout3
1913 aesenc $rndkey1,$inout4
1914 pxor $twtmp,@tweak[5]
1915 movaps @tweak[0],@tweak[1] # copy round[0]
1916 aesenc $rndkey1,$inout5
1917 $movkey -64($key),$rndkey1
1919 movdqa $twres,$twtmp
1920 aesenc $rndkey0,$inout0
1922 pxor @tweak[5],@tweak[0]
1923 aesenc $rndkey0,$inout1
1925 paddq @tweak[5],@tweak[5]
1926 aesenc $rndkey0,$inout2
1927 aesenc $rndkey0,$inout3
1929 movaps @tweak[1],@tweak[2]
1930 aesenc $rndkey0,$inout4
1931 pxor $twtmp,@tweak[5]
1932 movdqa $twres,$twtmp
1933 aesenc $rndkey0,$inout5
1934 $movkey -48($key),$rndkey0
1937 aesenc $rndkey1,$inout0
1938 pxor @tweak[5],@tweak[1]
1940 aesenc $rndkey1,$inout1
1941 paddq @tweak[5],@tweak[5]
1943 aesenc $rndkey1,$inout2
1944 aesenc $rndkey1,$inout3
1945 movdqa @tweak[3],`16*3`(%rsp)
1946 pxor $twtmp,@tweak[5]
1947 aesenc $rndkey1,$inout4
1948 movaps @tweak[2],@tweak[3]
1949 movdqa $twres,$twtmp
1950 aesenc $rndkey1,$inout5
1951 $movkey -32($key),$rndkey1
1954 aesenc $rndkey0,$inout0
1955 pxor @tweak[5],@tweak[2]
1957 aesenc $rndkey0,$inout1
1958 paddq @tweak[5],@tweak[5]
1960 aesenc $rndkey0,$inout2
1961 aesenc $rndkey0,$inout3
1962 aesenc $rndkey0,$inout4
1963 pxor $twtmp,@tweak[5]
1964 movaps @tweak[3],@tweak[4]
1965 aesenc $rndkey0,$inout5
1967 movdqa $twres,$rndkey0
1969 aesenc $rndkey1,$inout0
1970 pxor @tweak[5],@tweak[3]
1972 aesenc $rndkey1,$inout1
1973 paddq @tweak[5],@tweak[5]
1974 pand $twmask,$rndkey0
1975 aesenc $rndkey1,$inout2
1976 aesenc $rndkey1,$inout3
1977 pxor $rndkey0,@tweak[5]
1978 $movkey ($key_),$rndkey0
1979 aesenc $rndkey1,$inout4
1980 aesenc $rndkey1,$inout5
1981 $movkey 16($key_),$rndkey1
1983 pxor @tweak[5],@tweak[4]
1984 aesenclast `16*0`(%rsp),$inout0
1986 paddq @tweak[5],@tweak[5]
1987 aesenclast `16*1`(%rsp),$inout1
1988 aesenclast `16*2`(%rsp),$inout2
1990 mov %r10,%rax # restore $rounds
1991 aesenclast `16*3`(%rsp),$inout3
1992 aesenclast `16*4`(%rsp),$inout4
1993 aesenclast `16*5`(%rsp),$inout5
1994 pxor $twres,@tweak[5]
1996 lea `16*6`($out),$out # $out+=6*16
1997 movups $inout0,`-16*6`($out) # store 6 output blocks
1998 movups $inout1,`-16*5`($out)
1999 movups $inout2,`-16*4`($out)
2000 movups $inout3,`-16*3`($out)
2001 movups $inout4,`-16*2`($out)
2002 movups $inout5,`-16*1`($out)
2004 jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
2008 mov $key_,$key # restore $key
2009 shr \$4,$rounds # restore original value
2012 # at this point @tweak[0..5] are populated with tweak values
2013 mov $rounds,$rnds_ # backup $rounds
2014 pxor $rndkey0,@tweak[0]
2015 add \$16*6,$len # restore real remaining $len
2016 jz .Lxts_enc_done # done if ($len==0)
2018 pxor $rndkey0,@tweak[1]
2020 jb .Lxts_enc_one # $len is 1*16
2021 pxor $rndkey0,@tweak[2]
2022 je .Lxts_enc_two # $len is 2*16
2024 pxor $rndkey0,@tweak[3]
2026 jb .Lxts_enc_three # $len is 3*16
2027 pxor $rndkey0,@tweak[4]
2028 je .Lxts_enc_four # $len is 4*16
2030 movdqu ($inp),$inout0 # $len is 5*16
2031 movdqu 16*1($inp),$inout1
2032 movdqu 16*2($inp),$inout2
2033 pxor @tweak[0],$inout0
2034 movdqu 16*3($inp),$inout3
2035 pxor @tweak[1],$inout1
2036 movdqu 16*4($inp),$inout4
2037 lea 16*5($inp),$inp # $inp+=5*16
2038 pxor @tweak[2],$inout2
2039 pxor @tweak[3],$inout3
2040 pxor @tweak[4],$inout4
2041 pxor $inout5,$inout5
2043 call _aesni_encrypt6
2045 xorps @tweak[0],$inout0
2046 movdqa @tweak[5],@tweak[0]
2047 xorps @tweak[1],$inout1
2048 xorps @tweak[2],$inout2
2049 movdqu $inout0,($out) # store 5 output blocks
2050 xorps @tweak[3],$inout3
2051 movdqu $inout1,16*1($out)
2052 xorps @tweak[4],$inout4
2053 movdqu $inout2,16*2($out)
2054 movdqu $inout3,16*3($out)
2055 movdqu $inout4,16*4($out)
2056 lea 16*5($out),$out # $out+=5*16
2061 movups ($inp),$inout0
2062 lea 16*1($inp),$inp # inp+=1*16
2063 xorps @tweak[0],$inout0
2065 &aesni_generate1("enc",$key,$rounds);
2067 xorps @tweak[0],$inout0
2068 movdqa @tweak[1],@tweak[0]
2069 movups $inout0,($out) # store one output block
2070 lea 16*1($out),$out # $out+=1*16
2075 movups ($inp),$inout0
2076 movups 16($inp),$inout1
2077 lea 32($inp),$inp # $inp+=2*16
2078 xorps @tweak[0],$inout0
2079 xorps @tweak[1],$inout1
2081 call _aesni_encrypt2
2083 xorps @tweak[0],$inout0
2084 movdqa @tweak[2],@tweak[0]
2085 xorps @tweak[1],$inout1
2086 movups $inout0,($out) # store 2 output blocks
2087 movups $inout1,16*1($out)
2088 lea 16*2($out),$out # $out+=2*16
2093 movups ($inp),$inout0
2094 movups 16*1($inp),$inout1
2095 movups 16*2($inp),$inout2
2096 lea 16*3($inp),$inp # $inp+=3*16
2097 xorps @tweak[0],$inout0
2098 xorps @tweak[1],$inout1
2099 xorps @tweak[2],$inout2
2101 call _aesni_encrypt3
2103 xorps @tweak[0],$inout0
2104 movdqa @tweak[3],@tweak[0]
2105 xorps @tweak[1],$inout1
2106 xorps @tweak[2],$inout2
2107 movups $inout0,($out) # store 3 output blocks
2108 movups $inout1,16*1($out)
2109 movups $inout2,16*2($out)
2110 lea 16*3($out),$out # $out+=3*16
2115 movups ($inp),$inout0
2116 movups 16*1($inp),$inout1
2117 movups 16*2($inp),$inout2
2118 xorps @tweak[0],$inout0
2119 movups 16*3($inp),$inout3
2120 lea 16*4($inp),$inp # $inp+=4*16
2121 xorps @tweak[1],$inout1
2122 xorps @tweak[2],$inout2
2123 xorps @tweak[3],$inout3
2125 call _aesni_encrypt4
2127 pxor @tweak[0],$inout0
2128 movdqa @tweak[4],@tweak[0]
2129 pxor @tweak[1],$inout1
2130 pxor @tweak[2],$inout2
2131 movdqu $inout0,($out) # store 4 output blocks
2132 pxor @tweak[3],$inout3
2133 movdqu $inout1,16*1($out)
2134 movdqu $inout2,16*2($out)
2135 movdqu $inout3,16*3($out)
2136 lea 16*4($out),$out # $out+=4*16
2141 and \$15,$len_ # see if $len%16 is 0
2146 movzb ($inp),%eax # borrow $rounds ...
2147 movzb -16($out),%ecx # ... and $key
2155 sub $len_,$out # rewind $out
2156 mov $key_,$key # restore $key
2157 mov $rnds_,$rounds # restore $rounds
2159 movups -16($out),$inout0
2160 xorps @tweak[0],$inout0
2162 &aesni_generate1("enc",$key,$rounds);
2164 xorps @tweak[0],$inout0
2165 movups $inout0,-16($out)
2168 xorps %xmm0,%xmm0 # clear register bank
2175 $code.=<<___ if (!$win64);
2178 movaps %xmm0,0x00(%rsp) # clear stack
2180 movaps %xmm0,0x10(%rsp)
2182 movaps %xmm0,0x20(%rsp)
2184 movaps %xmm0,0x30(%rsp)
2186 movaps %xmm0,0x40(%rsp)
2188 movaps %xmm0,0x50(%rsp)
2190 movaps %xmm0,0x60(%rsp)
2194 $code.=<<___ if ($win64);
2195 movaps -0xa8(%r11),%xmm6
2196 movaps %xmm0,-0xa8(%r11) # clear stack
2197 movaps -0x98(%r11),%xmm7
2198 movaps %xmm0,-0x98(%r11)
2199 movaps -0x88(%r11),%xmm8
2200 movaps %xmm0,-0x88(%r11)
2201 movaps -0x78(%r11),%xmm9
2202 movaps %xmm0,-0x78(%r11)
2203 movaps -0x68(%r11),%xmm10
2204 movaps %xmm0,-0x68(%r11)
2205 movaps -0x58(%r11),%xmm11
2206 movaps %xmm0,-0x58(%r11)
2207 movaps -0x48(%r11),%xmm12
2208 movaps %xmm0,-0x48(%r11)
2209 movaps -0x38(%r11),%xmm13
2210 movaps %xmm0,-0x38(%r11)
2211 movaps -0x28(%r11),%xmm14
2212 movaps %xmm0,-0x28(%r11)
2213 movaps -0x18(%r11),%xmm15
2214 movaps %xmm0,-0x18(%r11)
2215 movaps %xmm0,0x00(%rsp)
2216 movaps %xmm0,0x10(%rsp)
2217 movaps %xmm0,0x20(%rsp)
2218 movaps %xmm0,0x30(%rsp)
2219 movaps %xmm0,0x40(%rsp)
2220 movaps %xmm0,0x50(%rsp)
2221 movaps %xmm0,0x60(%rsp)
2227 .cfi_def_cfa_register %rsp
2231 .size aesni_xts_encrypt,.-aesni_xts_encrypt
2235 .globl aesni_xts_decrypt
2236 .type aesni_xts_decrypt,\@function,6
2240 lea (%rsp),%r11 # frame pointer
2241 .cfi_def_cfa_register %r11
2244 sub \$$frame_size,%rsp
2245 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
2247 $code.=<<___ if ($win64);
2248 movaps %xmm6,-0xa8(%r11) # offload everything
2249 movaps %xmm7,-0x98(%r11)
2250 movaps %xmm8,-0x88(%r11)
2251 movaps %xmm9,-0x78(%r11)
2252 movaps %xmm10,-0x68(%r11)
2253 movaps %xmm11,-0x58(%r11)
2254 movaps %xmm12,-0x48(%r11)
2255 movaps %xmm13,-0x38(%r11)
2256 movaps %xmm14,-0x28(%r11)
2257 movaps %xmm15,-0x18(%r11)
2261 movups ($ivp),$inout0 # load clear-text tweak
2262 mov 240($key2),$rounds # key2->rounds
2263 mov 240($key),$rnds_ # key1->rounds
2265 # generate the tweak
2266 &aesni_generate1("enc",$key2,$rounds,$inout0);
2268 xor %eax,%eax # if ($len%16) len-=16;
2274 $movkey ($key),$rndkey0 # zero round key
2275 mov $key,$key_ # backup $key
2276 mov $rnds_,$rounds # backup $rounds
2278 mov $len,$len_ # backup $len
2281 $movkey 16($key,$rnds_),$rndkey1 # last round key
2283 movdqa .Lxts_magic(%rip),$twmask
2284 movdqa $inout0,@tweak[5]
2285 pshufd \$0x5f,$inout0,$twres
2286 pxor $rndkey0,$rndkey1
2288 for ($i=0;$i<4;$i++) {
2290 movdqa $twres,$twtmp
2292 movdqa @tweak[5],@tweak[$i]
2293 psrad \$31,$twtmp # broadcast upper bits
2294 paddq @tweak[5],@tweak[5]
2296 pxor $rndkey0,@tweak[$i]
2297 pxor $twtmp,@tweak[5]
2301 movdqa @tweak[5],@tweak[4]
2303 paddq @tweak[5],@tweak[5]
2305 pxor $rndkey0,@tweak[4]
2306 pxor $twres,@tweak[5]
2307 movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
2310 jc .Lxts_dec_short # if $len-=6*16 borrowed
2313 lea 32($key_,$rnds_),$key # end of key schedule
2314 sub %r10,%rax # twisted $rounds
2315 $movkey 16($key_),$rndkey1
2316 mov %rax,%r10 # backup twisted $rounds
2317 lea .Lxts_magic(%rip),%r8
2318 jmp .Lxts_dec_grandloop
2321 .Lxts_dec_grandloop:
2322 movdqu `16*0`($inp),$inout0 # load input
2323 movdqa $rndkey0,$twmask
2324 movdqu `16*1`($inp),$inout1
2325 pxor @tweak[0],$inout0 # intput^=tweak^round[0]
2326 movdqu `16*2`($inp),$inout2
2327 pxor @tweak[1],$inout1
2328 aesdec $rndkey1,$inout0
2329 movdqu `16*3`($inp),$inout3
2330 pxor @tweak[2],$inout2
2331 aesdec $rndkey1,$inout1
2332 movdqu `16*4`($inp),$inout4
2333 pxor @tweak[3],$inout3
2334 aesdec $rndkey1,$inout2
2335 movdqu `16*5`($inp),$inout5
2336 pxor @tweak[5],$twmask # round[0]^=tweak[5]
2337 movdqa 0x60(%rsp),$twres # load round[0]^round[last]
2338 pxor @tweak[4],$inout4
2339 aesdec $rndkey1,$inout3
2340 $movkey 32($key_),$rndkey0
2341 lea `16*6`($inp),$inp
2342 pxor $twmask,$inout5
2344 pxor $twres,@tweak[0] # calclulate tweaks^round[last]
2345 aesdec $rndkey1,$inout4
2346 pxor $twres,@tweak[1]
2347 movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
2348 aesdec $rndkey1,$inout5
2349 $movkey 48($key_),$rndkey1
2350 pxor $twres,@tweak[2]
2352 aesdec $rndkey0,$inout0
2353 pxor $twres,@tweak[3]
2354 movdqa @tweak[1],`16*1`(%rsp)
2355 aesdec $rndkey0,$inout1
2356 pxor $twres,@tweak[4]
2357 movdqa @tweak[2],`16*2`(%rsp)
2358 aesdec $rndkey0,$inout2
2359 aesdec $rndkey0,$inout3
2361 movdqa @tweak[4],`16*4`(%rsp)
2362 aesdec $rndkey0,$inout4
2363 aesdec $rndkey0,$inout5
2364 $movkey 64($key_),$rndkey0
2365 movdqa $twmask,`16*5`(%rsp)
2366 pshufd \$0x5f,@tweak[5],$twres
2370 aesdec $rndkey1,$inout0
2371 aesdec $rndkey1,$inout1
2372 aesdec $rndkey1,$inout2
2373 aesdec $rndkey1,$inout3
2374 aesdec $rndkey1,$inout4
2375 aesdec $rndkey1,$inout5
2376 $movkey -64($key,%rax),$rndkey1
2379 aesdec $rndkey0,$inout0
2380 aesdec $rndkey0,$inout1
2381 aesdec $rndkey0,$inout2
2382 aesdec $rndkey0,$inout3
2383 aesdec $rndkey0,$inout4
2384 aesdec $rndkey0,$inout5
2385 $movkey -80($key,%rax),$rndkey0
2388 movdqa (%r8),$twmask # start calculating next tweak
2389 movdqa $twres,$twtmp
2391 aesdec $rndkey1,$inout0
2392 paddq @tweak[5],@tweak[5]
2394 aesdec $rndkey1,$inout1
2396 $movkey ($key_),@tweak[0] # load round[0]
2397 aesdec $rndkey1,$inout2
2398 aesdec $rndkey1,$inout3
2399 aesdec $rndkey1,$inout4
2400 pxor $twtmp,@tweak[5]
2401 movaps @tweak[0],@tweak[1] # copy round[0]
2402 aesdec $rndkey1,$inout5
2403 $movkey -64($key),$rndkey1
2405 movdqa $twres,$twtmp
2406 aesdec $rndkey0,$inout0
2408 pxor @tweak[5],@tweak[0]
2409 aesdec $rndkey0,$inout1
2411 paddq @tweak[5],@tweak[5]
2412 aesdec $rndkey0,$inout2
2413 aesdec $rndkey0,$inout3
2415 movaps @tweak[1],@tweak[2]
2416 aesdec $rndkey0,$inout4
2417 pxor $twtmp,@tweak[5]
2418 movdqa $twres,$twtmp
2419 aesdec $rndkey0,$inout5
2420 $movkey -48($key),$rndkey0
2423 aesdec $rndkey1,$inout0
2424 pxor @tweak[5],@tweak[1]
2426 aesdec $rndkey1,$inout1
2427 paddq @tweak[5],@tweak[5]
2429 aesdec $rndkey1,$inout2
2430 aesdec $rndkey1,$inout3
2431 movdqa @tweak[3],`16*3`(%rsp)
2432 pxor $twtmp,@tweak[5]
2433 aesdec $rndkey1,$inout4
2434 movaps @tweak[2],@tweak[3]
2435 movdqa $twres,$twtmp
2436 aesdec $rndkey1,$inout5
2437 $movkey -32($key),$rndkey1
2440 aesdec $rndkey0,$inout0
2441 pxor @tweak[5],@tweak[2]
2443 aesdec $rndkey0,$inout1
2444 paddq @tweak[5],@tweak[5]
2446 aesdec $rndkey0,$inout2
2447 aesdec $rndkey0,$inout3
2448 aesdec $rndkey0,$inout4
2449 pxor $twtmp,@tweak[5]
2450 movaps @tweak[3],@tweak[4]
2451 aesdec $rndkey0,$inout5
2453 movdqa $twres,$rndkey0
2455 aesdec $rndkey1,$inout0
2456 pxor @tweak[5],@tweak[3]
2458 aesdec $rndkey1,$inout1
2459 paddq @tweak[5],@tweak[5]
2460 pand $twmask,$rndkey0
2461 aesdec $rndkey1,$inout2
2462 aesdec $rndkey1,$inout3
2463 pxor $rndkey0,@tweak[5]
2464 $movkey ($key_),$rndkey0
2465 aesdec $rndkey1,$inout4
2466 aesdec $rndkey1,$inout5
2467 $movkey 16($key_),$rndkey1
2469 pxor @tweak[5],@tweak[4]
2470 aesdeclast `16*0`(%rsp),$inout0
2472 paddq @tweak[5],@tweak[5]
2473 aesdeclast `16*1`(%rsp),$inout1
2474 aesdeclast `16*2`(%rsp),$inout2
2476 mov %r10,%rax # restore $rounds
2477 aesdeclast `16*3`(%rsp),$inout3
2478 aesdeclast `16*4`(%rsp),$inout4
2479 aesdeclast `16*5`(%rsp),$inout5
2480 pxor $twres,@tweak[5]
2482 lea `16*6`($out),$out # $out+=6*16
2483 movups $inout0,`-16*6`($out) # store 6 output blocks
2484 movups $inout1,`-16*5`($out)
2485 movups $inout2,`-16*4`($out)
2486 movups $inout3,`-16*3`($out)
2487 movups $inout4,`-16*2`($out)
2488 movups $inout5,`-16*1`($out)
2490 jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
2494 mov $key_,$key # restore $key
2495 shr \$4,$rounds # restore original value
2498 # at the point @tweak[0..5] are populated with tweak values
2499 mov $rounds,$rnds_ # backup $rounds
2500 pxor $rndkey0,@tweak[0]
2501 pxor $rndkey0,@tweak[1]
2502 add \$16*6,$len # restore real remaining $len
2503 jz .Lxts_dec_done # done if ($len==0)
2505 pxor $rndkey0,@tweak[2]
2507 jb .Lxts_dec_one # $len is 1*16
2508 pxor $rndkey0,@tweak[3]
2509 je .Lxts_dec_two # $len is 2*16
2511 pxor $rndkey0,@tweak[4]
2513 jb .Lxts_dec_three # $len is 3*16
2514 je .Lxts_dec_four # $len is 4*16
2516 movdqu ($inp),$inout0 # $len is 5*16
2517 movdqu 16*1($inp),$inout1
2518 movdqu 16*2($inp),$inout2
2519 pxor @tweak[0],$inout0
2520 movdqu 16*3($inp),$inout3
2521 pxor @tweak[1],$inout1
2522 movdqu 16*4($inp),$inout4
2523 lea 16*5($inp),$inp # $inp+=5*16
2524 pxor @tweak[2],$inout2
2525 pxor @tweak[3],$inout3
2526 pxor @tweak[4],$inout4
2528 call _aesni_decrypt6
2530 xorps @tweak[0],$inout0
2531 xorps @tweak[1],$inout1
2532 xorps @tweak[2],$inout2
2533 movdqu $inout0,($out) # store 5 output blocks
2534 xorps @tweak[3],$inout3
2535 movdqu $inout1,16*1($out)
2536 xorps @tweak[4],$inout4
2537 movdqu $inout2,16*2($out)
2539 movdqu $inout3,16*3($out)
2540 pcmpgtd @tweak[5],$twtmp
2541 movdqu $inout4,16*4($out)
2542 lea 16*5($out),$out # $out+=5*16
2543 pshufd \$0x13,$twtmp,@tweak[1] # $twres
2547 movdqa @tweak[5],@tweak[0]
2548 paddq @tweak[5],@tweak[5] # psllq 1,$tweak
2549 pand $twmask,@tweak[1] # isolate carry and residue
2550 pxor @tweak[5],@tweak[1]
2555 movups ($inp),$inout0
2556 lea 16*1($inp),$inp # $inp+=1*16
2557 xorps @tweak[0],$inout0
2559 &aesni_generate1("dec",$key,$rounds);
2561 xorps @tweak[0],$inout0
2562 movdqa @tweak[1],@tweak[0]
2563 movups $inout0,($out) # store one output block
2564 movdqa @tweak[2],@tweak[1]
2565 lea 16*1($out),$out # $out+=1*16
2570 movups ($inp),$inout0
2571 movups 16($inp),$inout1
2572 lea 32($inp),$inp # $inp+=2*16
2573 xorps @tweak[0],$inout0
2574 xorps @tweak[1],$inout1
2576 call _aesni_decrypt2
2578 xorps @tweak[0],$inout0
2579 movdqa @tweak[2],@tweak[0]
2580 xorps @tweak[1],$inout1
2581 movdqa @tweak[3],@tweak[1]
2582 movups $inout0,($out) # store 2 output blocks
2583 movups $inout1,16*1($out)
2584 lea 16*2($out),$out # $out+=2*16
2589 movups ($inp),$inout0
2590 movups 16*1($inp),$inout1
2591 movups 16*2($inp),$inout2
2592 lea 16*3($inp),$inp # $inp+=3*16
2593 xorps @tweak[0],$inout0
2594 xorps @tweak[1],$inout1
2595 xorps @tweak[2],$inout2
2597 call _aesni_decrypt3
2599 xorps @tweak[0],$inout0
2600 movdqa @tweak[3],@tweak[0]
2601 xorps @tweak[1],$inout1
2602 movdqa @tweak[4],@tweak[1]
2603 xorps @tweak[2],$inout2
2604 movups $inout0,($out) # store 3 output blocks
2605 movups $inout1,16*1($out)
2606 movups $inout2,16*2($out)
2607 lea 16*3($out),$out # $out+=3*16
2612 movups ($inp),$inout0
2613 movups 16*1($inp),$inout1
2614 movups 16*2($inp),$inout2
2615 xorps @tweak[0],$inout0
2616 movups 16*3($inp),$inout3
2617 lea 16*4($inp),$inp # $inp+=4*16
2618 xorps @tweak[1],$inout1
2619 xorps @tweak[2],$inout2
2620 xorps @tweak[3],$inout3
2622 call _aesni_decrypt4
2624 pxor @tweak[0],$inout0
2625 movdqa @tweak[4],@tweak[0]
2626 pxor @tweak[1],$inout1
2627 movdqa @tweak[5],@tweak[1]
2628 pxor @tweak[2],$inout2
2629 movdqu $inout0,($out) # store 4 output blocks
2630 pxor @tweak[3],$inout3
2631 movdqu $inout1,16*1($out)
2632 movdqu $inout2,16*2($out)
2633 movdqu $inout3,16*3($out)
2634 lea 16*4($out),$out # $out+=4*16
2639 and \$15,$len_ # see if $len%16 is 0
2643 mov $key_,$key # restore $key
2644 mov $rnds_,$rounds # restore $rounds
2646 movups ($inp),$inout0
2647 xorps @tweak[1],$inout0
2649 &aesni_generate1("dec",$key,$rounds);
2651 xorps @tweak[1],$inout0
2652 movups $inout0,($out)
2655 movzb 16($inp),%eax # borrow $rounds ...
2656 movzb ($out),%ecx # ... and $key
2664 sub $len_,$out # rewind $out
2665 mov $key_,$key # restore $key
2666 mov $rnds_,$rounds # restore $rounds
2668 movups ($out),$inout0
2669 xorps @tweak[0],$inout0
2671 &aesni_generate1("dec",$key,$rounds);
2673 xorps @tweak[0],$inout0
2674 movups $inout0,($out)
2677 xorps %xmm0,%xmm0 # clear register bank
2684 $code.=<<___ if (!$win64);
2687 movaps %xmm0,0x00(%rsp) # clear stack
2689 movaps %xmm0,0x10(%rsp)
2691 movaps %xmm0,0x20(%rsp)
2693 movaps %xmm0,0x30(%rsp)
2695 movaps %xmm0,0x40(%rsp)
2697 movaps %xmm0,0x50(%rsp)
2699 movaps %xmm0,0x60(%rsp)
2703 $code.=<<___ if ($win64);
2704 movaps -0xa8(%r11),%xmm6
2705 movaps %xmm0,-0xa8(%r11) # clear stack
2706 movaps -0x98(%r11),%xmm7
2707 movaps %xmm0,-0x98(%r11)
2708 movaps -0x88(%r11),%xmm8
2709 movaps %xmm0,-0x88(%r11)
2710 movaps -0x78(%r11),%xmm9
2711 movaps %xmm0,-0x78(%r11)
2712 movaps -0x68(%r11),%xmm10
2713 movaps %xmm0,-0x68(%r11)
2714 movaps -0x58(%r11),%xmm11
2715 movaps %xmm0,-0x58(%r11)
2716 movaps -0x48(%r11),%xmm12
2717 movaps %xmm0,-0x48(%r11)
2718 movaps -0x38(%r11),%xmm13
2719 movaps %xmm0,-0x38(%r11)
2720 movaps -0x28(%r11),%xmm14
2721 movaps %xmm0,-0x28(%r11)
2722 movaps -0x18(%r11),%xmm15
2723 movaps %xmm0,-0x18(%r11)
2724 movaps %xmm0,0x00(%rsp)
2725 movaps %xmm0,0x10(%rsp)
2726 movaps %xmm0,0x20(%rsp)
2727 movaps %xmm0,0x30(%rsp)
2728 movaps %xmm0,0x40(%rsp)
2729 movaps %xmm0,0x50(%rsp)
2730 movaps %xmm0,0x60(%rsp)
2736 .cfi_def_cfa_register %rsp
2740 .size aesni_xts_decrypt,.-aesni_xts_decrypt
2744 ######################################################################
2745 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
2746 # const AES_KEY *key, unsigned int start_block_num,
2747 # unsigned char offset_i[16], const unsigned char L_[][16],
2748 # unsigned char checksum[16]);
# Register and operand assignments used by the aesni_ocb_[en|de]crypt code
# and its __ocb_* helpers that follow.
2751 my @offset=map("%xmm$_",(10..15)); # six offset registers: %xmm10..%xmm15
2752 my ($checksum,$rndkey0l)=("%xmm8","%xmm9"); # running checksum; round[0] ^ round[last]
2753 my ($block_num,$offset_p)=("%r8","%r9"); # 5th and 6th arguments
2754 my ($L_p,$checksum_p) = ("%rbx","%rbp"); # receive the 7th and 8th arguments
2755 my ($i1,$i3,$i5) = ("%r12","%r13","%r14"); # ntz(block)<<4, used as byte offsets into the L_ table
2756 my $seventh_arg = $win64 ? 56 : 8; # displacement from %rax at which the 7th argument is read
# aesni_ocb_encrypt: exported OCB encryption routine (C prototype in the
# section comment above). The visible code handles one leading block on its
# own (selected by the parity test of $block_num), then groups of six blocks
# in .Locb_enc_grandloop, then a tail of one to five blocks.
# Throughout, @offset[5] holds "last offset_i ^ round[last]"; round[last] is
# removed again just before the offset is written back through $offset_p.
# NOTE(review): labels, helper calls and most branches are absent from this
# listing, so remarks about which path runs when are inferred from the
# visible instructions -- confirm against the complete source.
2760 .globl aesni_ocb_encrypt
2761 .type aesni_ocb_encrypt,\@function,6
2777 $code.=<<___ if ($win64);
2778 lea -0xa0(%rsp),%rsp # reserve ten 16-byte save slots
2779 movaps %xmm6,0x00(%rsp) # offload everything
2780 movaps %xmm7,0x10(%rsp)
2781 movaps %xmm8,0x20(%rsp)
2782 movaps %xmm9,0x30(%rsp)
2783 movaps %xmm10,0x40(%rsp)
2784 movaps %xmm11,0x50(%rsp)
2785 movaps %xmm12,0x60(%rsp)
2786 movaps %xmm13,0x70(%rsp)
2787 movaps %xmm14,0x80(%rsp)
2788 movaps %xmm15,0x90(%rsp)
2792 mov $seventh_arg(%rax),$L_p # 7th argument
2793 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
2795 mov 240($key),$rnds_ # key->rounds
2798 $movkey ($key),$rndkey0l # round[0]
2799 $movkey 16($key,$rnds_),$rndkey1 # round[last]
2801 movdqu ($offset_p),@offset[5] # load last offset_i
2802 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
2803 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
2806 lea 32($key_,$rnds_),$key # $key = $key_ + $rnds_ + 32
2807 $movkey 16($key_),$rndkey1 # round[1]
2808 sub %r10,%rax # twisted $rounds
2809 mov %rax,%r10 # backup twisted $rounds
2811 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
2812 movdqu ($checksum_p),$checksum # load checksum
2814 test \$1,$block_num # is first block number odd?
# Single leading block: $inout5 carries the L_ table entry in and the
# resulting offset out.
2820 movdqu ($L_p,$i1),$inout5 # borrow
2821 movdqu ($inp),$inout0 # load one input block
2826 movdqa $inout5,@offset[5] # borrowed register now holds the last offset
2827 movups $inout0,($out) # store one output block
2833 lea 1($block_num),$i1 # even-numbered blocks
2834 lea 3($block_num),$i3
2835 lea 5($block_num),$i5
2836 lea 6($block_num),$block_num # advance the block counter by six
2837 bsf $i1,$i1 # ntz(block)
2840 shl \$4,$i1 # ntz(block) -> table offset
2846 jmp .Locb_enc_grandloop
# Main loop: six blocks per iteration.
2849 .Locb_enc_grandloop:
2850 movdqu `16*0`($inp),$inout0 # load input
2851 movdqu `16*1`($inp),$inout1
2852 movdqu `16*2`($inp),$inout2
2853 movdqu `16*3`($inp),$inout3
2854 movdqu `16*4`($inp),$inout4
2855 movdqu `16*5`($inp),$inout5
2856 lea `16*6`($inp),$inp # $inp+=6*16
2860 movups $inout0,`16*0`($out) # store output
2861 movups $inout1,`16*1`($out)
2862 movups $inout2,`16*2`($out)
2863 movups $inout3,`16*3`($out)
2864 movups $inout4,`16*4`($out)
2865 movups $inout5,`16*5`($out)
2866 lea `16*6`($out),$out # $out+=6*16
2868 jnc .Locb_enc_grandloop # repeat while CF is clear (flag-setting instruction not in this listing)
# Tail: up to five remaining blocks are loaded below.
2874 movdqu `16*0`($inp),$inout0
2877 movdqu `16*1`($inp),$inout1
2880 movdqu `16*2`($inp),$inout2
2883 movdqu `16*3`($inp),$inout3
2886 movdqu `16*4`($inp),$inout4
2887 pxor $inout5,$inout5 # five-block tail: zero the unused sixth lane
2891 movdqa @offset[4],@offset[5] # fifth block's offset becomes the last offset
2892 movups $inout0,`16*0`($out)
2893 movups $inout1,`16*1`($out)
2894 movups $inout2,`16*2`($out)
2895 movups $inout3,`16*3`($out)
2896 movups $inout4,`16*4`($out)
# One-block tail.
2902 movdqa @offset[0],$inout5 # borrow
2906 movdqa $inout5,@offset[5] # borrowed register now holds the last offset
2907 movups $inout0,`16*0`($out)
# Two-block tail: lanes 2 and 3 are zeroed.
2912 pxor $inout2,$inout2
2913 pxor $inout3,$inout3
2917 movdqa @offset[1],@offset[5] # second block's offset becomes the last offset
2918 movups $inout0,`16*0`($out)
2919 movups $inout1,`16*1`($out)
# Three-block tail: lane 3 is zeroed.
2925 pxor $inout3,$inout3
2929 movdqa @offset[2],@offset[5] # third block's offset becomes the last offset
2930 movups $inout0,`16*0`($out)
2931 movups $inout1,`16*1`($out)
2932 movups $inout2,`16*2`($out)
# Four-block tail.
2940 movdqa @offset[3],@offset[5] # fourth block's offset becomes the last offset
2941 movups $inout0,`16*0`($out)
2942 movups $inout1,`16*1`($out)
2943 movups $inout2,`16*2`($out)
2944 movups $inout3,`16*3`($out)
# Common exit: write the updated state back to the caller's buffers.
2947 pxor $rndkey0,@offset[5] # "remove" round[last]
2948 movdqu $checksum,($checksum_p) # store checksum
2949 movdqu @offset[5],($offset_p) # store last offset_i
2951 xorps %xmm0,%xmm0 # clear register bank
2958 $code.=<<___ if (!$win64);
2972 $code.=<<___ if ($win64);
2973 movaps 0x00(%rsp),%xmm6 # restore the register saved on entry ...
2974 movaps %xmm0,0x00(%rsp) # clear stack
2975 movaps 0x10(%rsp),%xmm7
2976 movaps %xmm0,0x10(%rsp)
2977 movaps 0x20(%rsp),%xmm8
2978 movaps %xmm0,0x20(%rsp)
2979 movaps 0x30(%rsp),%xmm9
2980 movaps %xmm0,0x30(%rsp)
2981 movaps 0x40(%rsp),%xmm10
2982 movaps %xmm0,0x40(%rsp)
2983 movaps 0x50(%rsp),%xmm11
2984 movaps %xmm0,0x50(%rsp)
2985 movaps 0x60(%rsp),%xmm12
2986 movaps %xmm0,0x60(%rsp)
2987 movaps 0x70(%rsp),%xmm13
2988 movaps %xmm0,0x70(%rsp)
2989 movaps 0x80(%rsp),%xmm14
2990 movaps %xmm0,0x80(%rsp)
2991 movaps 0x90(%rsp),%xmm15
2992 movaps %xmm0,0x90(%rsp)
2993 lea 0xa0+0x28(%rsp),%rax # step over the 0xa0-byte XMM area plus 0x28 more (presumably pushed GPRs -- TODO confirm)
3008 .cfi_def_cfa_register %rsp
3012 .size aesni_ocb_encrypt,.-aesni_ocb_encrypt
# __ocb_encrypt6: internal helper operating on the six blocks held in
# $inout0..$inout5. Each block's offset is the previous offset XOR-ed with
# an L_ table entry (L_0, or the entry at table offset $i1/$i3/$i5). Each
# input block is folded into $checksum and then whitened with its offset.
# The offsets are afterwards converted from the "^ round[0]" form to the
# "^ round[last]" form (XOR with $rndkey0l) and used as the key operands of
# aesenclast. On return @offset[0] again holds L_0 and %rax is reloaded from
# %r10. $i1/$i3/$i5 are recomputed from $block_num along the way.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3014 .type __ocb_encrypt6,\@abi-omnipotent
3017 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3018 movdqu ($L_p,$i1),@offset[1] # L_ entry at table offset $i1
3019 movdqa @offset[0],@offset[2] # copy of L_0
3020 movdqu ($L_p,$i3),@offset[3] # L_ entry at table offset $i3
3021 movdqa @offset[0],@offset[4] # copy of L_0
3022 pxor @offset[5],@offset[0] # offset for the first block
3023 movdqu ($L_p,$i5),@offset[5] # L_ entry at table offset $i5
3024 pxor @offset[0],@offset[1] # offset for the second block
3025 pxor $inout0,$checksum # accumulate checksum
3026 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3027 pxor @offset[1],@offset[2] # offset for the third block
3028 pxor $inout1,$checksum
3029 pxor @offset[1],$inout1
3030 pxor @offset[2],@offset[3] # offset for the fourth block
3031 pxor $inout2,$checksum
3032 pxor @offset[2],$inout2
3033 pxor @offset[3],@offset[4] # offset for the fifth block
3034 pxor $inout3,$checksum
3035 pxor @offset[3],$inout3
3036 pxor @offset[4],@offset[5] # offset for the sixth block
3037 pxor $inout4,$checksum
3038 pxor @offset[4],$inout4
3039 pxor $inout5,$checksum
3040 pxor @offset[5],$inout5
3041 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3043 lea 1($block_num),$i1 # even-numbered blocks
3044 lea 3($block_num),$i3
3045 lea 5($block_num),$i5
3047 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3048 bsf $i1,$i1 # ntz(block)
3052 aesenc $rndkey1,$inout0 # AES round with $rndkey1 on all six blocks
3053 aesenc $rndkey1,$inout1
3054 aesenc $rndkey1,$inout2
3055 aesenc $rndkey1,$inout3
3056 pxor $rndkey0l,@offset[1] # convert the remaining offsets to the round[last] form
3057 pxor $rndkey0l,@offset[2]
3058 aesenc $rndkey1,$inout4
3059 pxor $rndkey0l,@offset[3]
3060 pxor $rndkey0l,@offset[4]
3061 aesenc $rndkey1,$inout5
3062 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3063 pxor $rndkey0l,@offset[5]
3065 aesenc $rndkey0,$inout0 # AES round with $rndkey0 on all six blocks
3066 aesenc $rndkey0,$inout1
3067 aesenc $rndkey0,$inout2
3068 aesenc $rndkey0,$inout3
3069 aesenc $rndkey0,$inout4
3070 aesenc $rndkey0,$inout5
3071 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3072 shl \$4,$i1 # ntz(block) -> table offset
3078 aesenc $rndkey1,$inout0
3079 aesenc $rndkey1,$inout1
3080 aesenc $rndkey1,$inout2
3081 aesenc $rndkey1,$inout3
3082 aesenc $rndkey1,$inout4
3083 aesenc $rndkey1,$inout5
3084 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3087 aesenc $rndkey0,$inout0
3088 aesenc $rndkey0,$inout1
3089 aesenc $rndkey0,$inout2
3090 aesenc $rndkey0,$inout3
3091 aesenc $rndkey0,$inout4
3092 aesenc $rndkey0,$inout5
3093 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3096 aesenc $rndkey1,$inout0
3097 aesenc $rndkey1,$inout1
3098 aesenc $rndkey1,$inout2
3099 aesenc $rndkey1,$inout3
3100 aesenc $rndkey1,$inout4
3101 aesenc $rndkey1,$inout5
3102 $movkey 16($key_),$rndkey1 # reload round[1]
3105 aesenclast @offset[0],$inout0 # last round; key operand is offset_i ^ round[last]
3106 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3107 mov %r10,%rax # restore twisted rounds
3108 aesenclast @offset[1],$inout1
3109 aesenclast @offset[2],$inout2
3110 aesenclast @offset[3],$inout3
3111 aesenclast @offset[4],$inout4
3112 aesenclast @offset[5],$inout5
3114 .size __ocb_encrypt6,.-__ocb_encrypt6
# __ocb_encrypt4: four-lane variant of the helper, operating on
# $inout0..$inout3 with @offset[0..3]. Offsets are chained the same way
# (previous offset XOR an L_ entry), inputs are folded into $checksum and
# whitened, and the offsets in "^ round[last]" form serve as the aesenclast
# key operands. Unlike the six-lane helper, the visible code neither reloads
# @offset[0] with L_0 nor recomputes $i1/$i3.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3116 .type __ocb_encrypt4,\@abi-omnipotent
3119 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3120 movdqu ($L_p,$i1),@offset[1] # L_ entry at table offset $i1
3121 movdqa @offset[0],@offset[2] # copy of L_0
3122 movdqu ($L_p,$i3),@offset[3] # L_ entry at table offset $i3
3123 pxor @offset[5],@offset[0] # offset for the first block
3124 pxor @offset[0],@offset[1] # offset for the second block
3125 pxor $inout0,$checksum # accumulate checksum
3126 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3127 pxor @offset[1],@offset[2] # offset for the third block
3128 pxor $inout1,$checksum
3129 pxor @offset[1],$inout1
3130 pxor @offset[2],@offset[3] # offset for the fourth block
3131 pxor $inout2,$checksum
3132 pxor @offset[2],$inout2
3133 pxor $inout3,$checksum
3134 pxor @offset[3],$inout3
3135 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3137 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3138 pxor $rndkey0l,@offset[1]
3139 pxor $rndkey0l,@offset[2]
3140 pxor $rndkey0l,@offset[3]
3142 aesenc $rndkey1,$inout0 # AES round with $rndkey1 on all four blocks
3143 aesenc $rndkey1,$inout1
3144 aesenc $rndkey1,$inout2
3145 aesenc $rndkey1,$inout3
3146 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3148 aesenc $rndkey0,$inout0 # AES round with $rndkey0 on all four blocks
3149 aesenc $rndkey0,$inout1
3150 aesenc $rndkey0,$inout2
3151 aesenc $rndkey0,$inout3
3152 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3157 aesenc $rndkey1,$inout0
3158 aesenc $rndkey1,$inout1
3159 aesenc $rndkey1,$inout2
3160 aesenc $rndkey1,$inout3
3161 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3164 aesenc $rndkey0,$inout0
3165 aesenc $rndkey0,$inout1
3166 aesenc $rndkey0,$inout2
3167 aesenc $rndkey0,$inout3
3168 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3171 aesenc $rndkey1,$inout0
3172 aesenc $rndkey1,$inout1
3173 aesenc $rndkey1,$inout2
3174 aesenc $rndkey1,$inout3
3175 $movkey 16($key_),$rndkey1 # reload round[1]
3176 mov %r10,%rax # restore twisted rounds
3178 aesenclast @offset[0],$inout0 # last round; key operand is offset_i ^ round[last]
3179 aesenclast @offset[1],$inout1
3180 aesenclast @offset[2],$inout2
3181 aesenclast @offset[3],$inout3
3183 .size __ocb_encrypt4,.-__ocb_encrypt4
# __ocb_encrypt1: single-block helper. On entry $inout5 holds an L_ value
# (callers mark that register "borrow") and @offset[5] the running offset in
# "^ round[last]" form. The helper folds $inout0 into $checksum, encrypts it,
# and leaves the new offset (in "^ round[last]" form) in $inout5; the callers
# copy it into @offset[5] afterwards.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3185 .type __ocb_encrypt1,\@abi-omnipotent
3188 pxor @offset[5],$inout5 # offset_i
3189 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3190 pxor $inout0,$checksum # accumulate checksum
3191 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3192 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3194 aesenc $rndkey1,$inout0 # AES round with $rndkey1
3195 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3196 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3198 aesenc $rndkey0,$inout0 # AES round with $rndkey0
3199 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3204 aesenc $rndkey1,$inout0
3205 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3208 aesenc $rndkey0,$inout0
3209 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3212 aesenc $rndkey1,$inout0
3213 $movkey 16($key_),$rndkey1 # redundant in tail
3214 mov %r10,%rax # restore twisted rounds
3216 aesenclast $inout5,$inout0 # last round; key operand is offset_i ^ round[last]
3218 .size __ocb_encrypt1,.-__ocb_encrypt1
# aesni_ocb_decrypt: exported OCB decryption routine; structure mirrors
# aesni_ocb_encrypt (leading single block, six-block .Locb_dec_grandloop,
# one-to-five-block tail). The visible difference is the checksum: here it
# is accumulated in this routine from the output blocks, right after each
# store, rather than inside the __ocb_decrypt* helpers.
# NOTE(review): labels, helper calls and most branches are absent from this
# listing, so remarks about which path runs when are inferred from the
# visible instructions -- confirm against the complete source.
3220 .globl aesni_ocb_decrypt
3221 .type aesni_ocb_decrypt,\@function,6
3237 $code.=<<___ if ($win64);
3238 lea -0xa0(%rsp),%rsp # reserve ten 16-byte save slots
3239 movaps %xmm6,0x00(%rsp) # offload everything
3240 movaps %xmm7,0x10(%rsp)
3241 movaps %xmm8,0x20(%rsp)
3242 movaps %xmm9,0x30(%rsp)
3243 movaps %xmm10,0x40(%rsp)
3244 movaps %xmm11,0x50(%rsp)
3245 movaps %xmm12,0x60(%rsp)
3246 movaps %xmm13,0x70(%rsp)
3247 movaps %xmm14,0x80(%rsp)
3248 movaps %xmm15,0x90(%rsp)
3252 mov $seventh_arg(%rax),$L_p # 7th argument
3253 mov $seventh_arg+8(%rax),$checksum_p# 8th argument
3255 mov 240($key),$rnds_ # key->rounds
3258 $movkey ($key),$rndkey0l # round[0]
3259 $movkey 16($key,$rnds_),$rndkey1 # round[last]
3261 movdqu ($offset_p),@offset[5] # load last offset_i
3262 pxor $rndkey1,$rndkey0l # round[0] ^ round[last]
3263 pxor $rndkey1,@offset[5] # offset_i ^ round[last]
3266 lea 32($key_,$rnds_),$key # $key = $key_ + $rnds_ + 32
3267 $movkey 16($key_),$rndkey1 # round[1]
3268 sub %r10,%rax # twisted $rounds
3269 mov %rax,%r10 # backup twisted $rounds
3271 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3272 movdqu ($checksum_p),$checksum # load checksum
3274 test \$1,$block_num # is first block number odd?
# Single leading block: $inout5 carries the L_ table entry in and the
# resulting offset out.
3280 movdqu ($L_p,$i1),$inout5 # borrow
3281 movdqu ($inp),$inout0 # load one input block
3286 movdqa $inout5,@offset[5] # borrowed register now holds the last offset
3287 movups $inout0,($out) # store one output block
3288 xorps $inout0,$checksum # accumulate checksum
3294 lea 1($block_num),$i1 # even-numbered blocks
3295 lea 3($block_num),$i3
3296 lea 5($block_num),$i5
3297 lea 6($block_num),$block_num # advance the block counter by six
3298 bsf $i1,$i1 # ntz(block)
3301 shl \$4,$i1 # ntz(block) -> table offset
3307 jmp .Locb_dec_grandloop
# Main loop: six blocks per iteration.
3310 .Locb_dec_grandloop:
3311 movdqu `16*0`($inp),$inout0 # load input
3312 movdqu `16*1`($inp),$inout1
3313 movdqu `16*2`($inp),$inout2
3314 movdqu `16*3`($inp),$inout3
3315 movdqu `16*4`($inp),$inout4
3316 movdqu `16*5`($inp),$inout5
3317 lea `16*6`($inp),$inp # $inp+=6*16
3321 movups $inout0,`16*0`($out) # store output
3322 pxor $inout0,$checksum # accumulate checksum
3323 movups $inout1,`16*1`($out)
3324 pxor $inout1,$checksum
3325 movups $inout2,`16*2`($out)
3326 pxor $inout2,$checksum
3327 movups $inout3,`16*3`($out)
3328 pxor $inout3,$checksum
3329 movups $inout4,`16*4`($out)
3330 pxor $inout4,$checksum
3331 movups $inout5,`16*5`($out)
3332 pxor $inout5,$checksum
3333 lea `16*6`($out),$out # $out+=6*16
3335 jnc .Locb_dec_grandloop # repeat while CF is clear (flag-setting instruction not in this listing)
# Tail: up to five remaining blocks are loaded below.
3341 movdqu `16*0`($inp),$inout0
3344 movdqu `16*1`($inp),$inout1
3347 movdqu `16*2`($inp),$inout2
3350 movdqu `16*3`($inp),$inout3
3353 movdqu `16*4`($inp),$inout4
3354 pxor $inout5,$inout5 # five-block tail: zero the unused sixth lane
3358 movdqa @offset[4],@offset[5] # fifth block's offset becomes the last offset
3359 movups $inout0,`16*0`($out) # store output
3360 pxor $inout0,$checksum # accumulate checksum
3361 movups $inout1,`16*1`($out)
3362 pxor $inout1,$checksum
3363 movups $inout2,`16*2`($out)
3364 pxor $inout2,$checksum
3365 movups $inout3,`16*3`($out)
3366 pxor $inout3,$checksum
3367 movups $inout4,`16*4`($out)
3368 pxor $inout4,$checksum
# One-block tail.
3374 movdqa @offset[0],$inout5 # borrow
3378 movdqa $inout5,@offset[5] # borrowed register now holds the last offset
3379 movups $inout0,`16*0`($out) # store output
3380 xorps $inout0,$checksum # accumulate checksum
# Two-block tail: lanes 2 and 3 are zeroed.
3385 pxor $inout2,$inout2
3386 pxor $inout3,$inout3
3390 movdqa @offset[1],@offset[5] # second block's offset becomes the last offset
3391 movups $inout0,`16*0`($out) # store output
3392 xorps $inout0,$checksum # accumulate checksum
3393 movups $inout1,`16*1`($out)
3394 xorps $inout1,$checksum
# Three-block tail: lane 3 is zeroed.
3400 pxor $inout3,$inout3
3404 movdqa @offset[2],@offset[5] # third block's offset becomes the last offset
3405 movups $inout0,`16*0`($out) # store output
3406 xorps $inout0,$checksum # accumulate checksum
3407 movups $inout1,`16*1`($out)
3408 xorps $inout1,$checksum
3409 movups $inout2,`16*2`($out)
3410 xorps $inout2,$checksum
# Four-block tail.
3418 movdqa @offset[3],@offset[5] # fourth block's offset becomes the last offset
3419 movups $inout0,`16*0`($out) # store output
3420 pxor $inout0,$checksum # accumulate checksum
3421 movups $inout1,`16*1`($out)
3422 pxor $inout1,$checksum
3423 movups $inout2,`16*2`($out)
3424 pxor $inout2,$checksum
3425 movups $inout3,`16*3`($out)
3426 pxor $inout3,$checksum
# Common exit: write the updated state back to the caller's buffers.
3429 pxor $rndkey0,@offset[5] # "remove" round[last]
3430 movdqu $checksum,($checksum_p) # store checksum
3431 movdqu @offset[5],($offset_p) # store last offset_i
3433 xorps %xmm0,%xmm0 # clear register bank
3440 $code.=<<___ if (!$win64);
3454 $code.=<<___ if ($win64);
3455 movaps 0x00(%rsp),%xmm6 # restore the register saved on entry ...
3456 movaps %xmm0,0x00(%rsp) # clear stack
3457 movaps 0x10(%rsp),%xmm7
3458 movaps %xmm0,0x10(%rsp)
3459 movaps 0x20(%rsp),%xmm8
3460 movaps %xmm0,0x20(%rsp)
3461 movaps 0x30(%rsp),%xmm9
3462 movaps %xmm0,0x30(%rsp)
3463 movaps 0x40(%rsp),%xmm10
3464 movaps %xmm0,0x40(%rsp)
3465 movaps 0x50(%rsp),%xmm11
3466 movaps %xmm0,0x50(%rsp)
3467 movaps 0x60(%rsp),%xmm12
3468 movaps %xmm0,0x60(%rsp)
3469 movaps 0x70(%rsp),%xmm13
3470 movaps %xmm0,0x70(%rsp)
3471 movaps 0x80(%rsp),%xmm14
3472 movaps %xmm0,0x80(%rsp)
3473 movaps 0x90(%rsp),%xmm15
3474 movaps %xmm0,0x90(%rsp)
3475 lea 0xa0+0x28(%rsp),%rax # step over the 0xa0-byte XMM area plus 0x28 more (presumably pushed GPRs -- TODO confirm)
3490 .cfi_def_cfa_register %rsp
3494 .size aesni_ocb_decrypt,.-aesni_ocb_decrypt
# __ocb_decrypt6: internal helper operating on the six blocks held in
# $inout0..$inout5. Offsets are derived as in the encrypt helper (previous
# offset XOR an L_ entry) and XOR-ed into the inputs; the offsets in
# "^ round[last]" form then serve as the aesdeclast key operands.
# No checksum is accumulated here -- aesni_ocb_decrypt does that from the
# output blocks. On return @offset[0] again holds L_0 and %rax is reloaded
# from %r10. $i1/$i3/$i5 are recomputed from $block_num along the way.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3496 .type __ocb_decrypt6,\@abi-omnipotent
3499 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3500 movdqu ($L_p,$i1),@offset[1] # L_ entry at table offset $i1
3501 movdqa @offset[0],@offset[2] # copy of L_0
3502 movdqu ($L_p,$i3),@offset[3] # L_ entry at table offset $i3
3503 movdqa @offset[0],@offset[4] # copy of L_0
3504 pxor @offset[5],@offset[0] # offset for the first block
3505 movdqu ($L_p,$i5),@offset[5] # L_ entry at table offset $i5
3506 pxor @offset[0],@offset[1] # offset for the second block
3507 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3508 pxor @offset[1],@offset[2] # offset for the third block
3509 pxor @offset[1],$inout1
3510 pxor @offset[2],@offset[3] # offset for the fourth block
3511 pxor @offset[2],$inout2
3512 pxor @offset[3],@offset[4] # offset for the fifth block
3513 pxor @offset[3],$inout3
3514 pxor @offset[4],@offset[5] # offset for the sixth block
3515 pxor @offset[4],$inout4
3516 pxor @offset[5],$inout5
3517 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3519 lea 1($block_num),$i1 # even-numbered blocks
3520 lea 3($block_num),$i3
3521 lea 5($block_num),$i5
3523 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3524 bsf $i1,$i1 # ntz(block)
3528 aesdec $rndkey1,$inout0 # AES decryption round with $rndkey1 on all six blocks
3529 aesdec $rndkey1,$inout1
3530 aesdec $rndkey1,$inout2
3531 aesdec $rndkey1,$inout3
3532 pxor $rndkey0l,@offset[1] # convert the remaining offsets to the round[last] form
3533 pxor $rndkey0l,@offset[2]
3534 aesdec $rndkey1,$inout4
3535 pxor $rndkey0l,@offset[3]
3536 pxor $rndkey0l,@offset[4]
3537 aesdec $rndkey1,$inout5
3538 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3539 pxor $rndkey0l,@offset[5]
3541 aesdec $rndkey0,$inout0 # AES decryption round with $rndkey0 on all six blocks
3542 aesdec $rndkey0,$inout1
3543 aesdec $rndkey0,$inout2
3544 aesdec $rndkey0,$inout3
3545 aesdec $rndkey0,$inout4
3546 aesdec $rndkey0,$inout5
3547 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3548 shl \$4,$i1 # ntz(block) -> table offset
3554 aesdec $rndkey1,$inout0
3555 aesdec $rndkey1,$inout1
3556 aesdec $rndkey1,$inout2
3557 aesdec $rndkey1,$inout3
3558 aesdec $rndkey1,$inout4
3559 aesdec $rndkey1,$inout5
3560 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3563 aesdec $rndkey0,$inout0
3564 aesdec $rndkey0,$inout1
3565 aesdec $rndkey0,$inout2
3566 aesdec $rndkey0,$inout3
3567 aesdec $rndkey0,$inout4
3568 aesdec $rndkey0,$inout5
3569 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3572 aesdec $rndkey1,$inout0
3573 aesdec $rndkey1,$inout1
3574 aesdec $rndkey1,$inout2
3575 aesdec $rndkey1,$inout3
3576 aesdec $rndkey1,$inout4
3577 aesdec $rndkey1,$inout5
3578 $movkey 16($key_),$rndkey1 # reload round[1]
3581 aesdeclast @offset[0],$inout0 # last round; key operand is offset_i ^ round[last]
3582 movdqu ($L_p),@offset[0] # L_0 for all odd-numbered blocks
3583 mov %r10,%rax # restore twisted rounds
3584 aesdeclast @offset[1],$inout1
3585 aesdeclast @offset[2],$inout2
3586 aesdeclast @offset[3],$inout3
3587 aesdeclast @offset[4],$inout4
3588 aesdeclast @offset[5],$inout5
3590 .size __ocb_decrypt6,.-__ocb_decrypt6
# __ocb_decrypt4: four-lane variant of the decrypt helper, operating on
# $inout0..$inout3 with @offset[0..3]. Offsets are chained (previous offset
# XOR an L_ entry), XOR-ed into the inputs, converted to the "^ round[last]"
# form and used as the aesdeclast key operands. No checksum is touched here.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3592 .type __ocb_decrypt4,\@abi-omnipotent
3595 pxor $rndkey0l,@offset[5] # offset_i ^ round[0]
3596 movdqu ($L_p,$i1),@offset[1] # L_ entry at table offset $i1
3597 movdqa @offset[0],@offset[2] # copy of L_0
3598 movdqu ($L_p,$i3),@offset[3] # L_ entry at table offset $i3
3599 pxor @offset[5],@offset[0] # offset for the first block
3600 pxor @offset[0],@offset[1] # offset for the second block
3601 pxor @offset[0],$inout0 # input ^ round[0] ^ offset_i
3602 pxor @offset[1],@offset[2] # offset for the third block
3603 pxor @offset[1],$inout1
3604 pxor @offset[2],@offset[3] # offset for the fourth block
3605 pxor @offset[2],$inout2
3606 pxor @offset[3],$inout3
3607 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3609 pxor $rndkey0l,@offset[0] # offset_i ^ round[last]
3610 pxor $rndkey0l,@offset[1]
3611 pxor $rndkey0l,@offset[2]
3612 pxor $rndkey0l,@offset[3]
3614 aesdec $rndkey1,$inout0 # AES decryption round with $rndkey1 on all four blocks
3615 aesdec $rndkey1,$inout1
3616 aesdec $rndkey1,$inout2
3617 aesdec $rndkey1,$inout3
3618 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3620 aesdec $rndkey0,$inout0 # AES decryption round with $rndkey0 on all four blocks
3621 aesdec $rndkey0,$inout1
3622 aesdec $rndkey0,$inout2
3623 aesdec $rndkey0,$inout3
3624 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3629 aesdec $rndkey1,$inout0
3630 aesdec $rndkey1,$inout1
3631 aesdec $rndkey1,$inout2
3632 aesdec $rndkey1,$inout3
3633 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3636 aesdec $rndkey0,$inout0
3637 aesdec $rndkey0,$inout1
3638 aesdec $rndkey0,$inout2
3639 aesdec $rndkey0,$inout3
3640 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3643 aesdec $rndkey1,$inout0
3644 aesdec $rndkey1,$inout1
3645 aesdec $rndkey1,$inout2
3646 aesdec $rndkey1,$inout3
3647 $movkey 16($key_),$rndkey1 # reload round[1]
3648 mov %r10,%rax # restore twisted rounds
3650 aesdeclast @offset[0],$inout0 # last round; key operand is offset_i ^ round[last]
3651 aesdeclast @offset[1],$inout1
3652 aesdeclast @offset[2],$inout2
3653 aesdeclast @offset[3],$inout3
3655 .size __ocb_decrypt4,.-__ocb_decrypt4
# __ocb_decrypt1: single-block decrypt helper. On entry $inout5 holds an L_
# value (callers mark that register "borrow") and @offset[5] the running
# offset in "^ round[last]" form. The helper decrypts $inout0 and leaves the
# new offset (in "^ round[last]" form) in $inout5; the callers copy it into
# @offset[5] and accumulate the checksum themselves.
# NOTE(review): the label, the round-loop branch and the return are not
# part of this listing.
3657 .type __ocb_decrypt1,\@abi-omnipotent
3660 pxor @offset[5],$inout5 # offset_i
3661 pxor $rndkey0l,$inout5 # offset_i ^ round[0]
3662 pxor $inout5,$inout0 # input ^ round[0] ^ offset_i
3663 $movkey 32($key_),$rndkey0 # round key at byte offset 32
3665 aesdec $rndkey1,$inout0 # AES decryption round with $rndkey1
3666 $movkey 48($key_),$rndkey1 # round key at byte offset 48
3667 pxor $rndkey0l,$inout5 # offset_i ^ round[last]
3669 aesdec $rndkey0,$inout0 # AES decryption round with $rndkey0
3670 $movkey 64($key_),$rndkey0 # round key at byte offset 64
3675 aesdec $rndkey1,$inout0
3676 $movkey ($key,%rax),$rndkey1 # next round key, indexed by the twisted round counter in %rax
3679 aesdec $rndkey0,$inout0
3680 $movkey -16($key,%rax),$rndkey0 # round key at -16($key,%rax)
3683 aesdec $rndkey1,$inout0
3684 $movkey 16($key_),$rndkey1 # redundant in tail
3685 mov %r10,%rax # restore twisted rounds
3687 aesdeclast $inout5,$inout0 # last round; key operand is offset_i ^ round[last]
3689 .size __ocb_decrypt1,.-__ocb_decrypt1
3693 ########################################################################
3694 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
3695 # size_t length, const AES_KEY *key,
3696 # unsigned char *ivp,const int enc);
3698 my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
3699 my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
3702 .globl ${PREFIX}_cbc_encrypt
3703 .type ${PREFIX}_cbc_encrypt,\@function,6
3705 ${PREFIX}_cbc_encrypt:
3707 test $len,$len # check length
3710 mov 240($key),$rnds_ # key->rounds
3711 mov $key,$key_ # backup $key
3712 test %r9d,%r9d # 6th argument
3714 #--------------------------- CBC ENCRYPT ------------------------------#
3715 movups ($ivp),$inout0 # load iv as initial state
3723 movups ($inp),$inout1 # load input
3725 #xorps $inout1,$inout0
3727 &aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
3729 mov $rnds_,$rounds # restore $rounds
3730 mov $key_,$key # restore $key
3731 movups $inout0,0($out) # store output
3737 pxor $rndkey0,$rndkey0 # clear register bank
3738 pxor $rndkey1,$rndkey1
3739 movups $inout0,($ivp)
3740 pxor $inout0,$inout0
3741 pxor $inout1,$inout1
3745 mov $len,%rcx # zaps $key
3746 xchg $inp,$out # $inp is %rsi and $out is %rdi now
3747 .long 0x9066A4F3 # rep movsb
3748 mov \$16,%ecx # zero tail
3751 .long 0x9066AAF3 # rep stosb
3752 lea -16(%rdi),%rdi # rewind $out by 1 block
3753 mov $rnds_,$rounds # restore $rounds
3754 mov %rdi,%rsi # $inp and $out are the same
3755 mov $key_,$key # restore $key
3756 xor $len,$len # len=16
3757 jmp .Lcbc_enc_loop # one more spin
3758 \f#--------------------------- CBC DECRYPT ------------------------------#
3762 jne .Lcbc_decrypt_bulk
3764 # handle single block without allocating stack frame,
3765 # useful in ciphertext stealing mode
3766 movdqu ($inp),$inout0 # load input
3767 movdqu ($ivp),$inout1 # load iv
3768 movdqa $inout0,$inout2 # future iv
3770 &aesni_generate1("dec",$key,$rnds_);
3772 pxor $rndkey0,$rndkey0 # clear register bank
3773 pxor $rndkey1,$rndkey1
3774 movdqu $inout2,($ivp) # store iv
3775 xorps $inout1,$inout0 # ^=iv
3776 pxor $inout1,$inout1
3777 movups $inout0,($out) # store output
3778 pxor $inout0,$inout0
3782 lea (%rsp),%r11 # frame pointer
3783 .cfi_def_cfa_register %r11
3786 sub \$$frame_size,%rsp
3787 and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
3789 $code.=<<___ if ($win64);
3790 movaps %xmm6,0x10(%rsp)
3791 movaps %xmm7,0x20(%rsp)
3792 movaps %xmm8,0x30(%rsp)
3793 movaps %xmm9,0x40(%rsp)
3794 movaps %xmm10,0x50(%rsp)
3795 movaps %xmm11,0x60(%rsp)
3796 movaps %xmm12,0x70(%rsp)
3797 movaps %xmm13,0x80(%rsp)
3798 movaps %xmm14,0x90(%rsp)
3799 movaps %xmm15,0xa0(%rsp)
3803 my $inp_=$key_="%rbp"; # reassign $key_
3806 mov $key,$key_ # [re-]backup $key [after reassignment]
3812 $movkey ($key),$rndkey0
3813 movdqu 0x00($inp),$inout0 # load input
3814 movdqu 0x10($inp),$inout1
3816 movdqu 0x20($inp),$inout2
3818 movdqu 0x30($inp),$inout3
3820 movdqu 0x40($inp),$inout4
3822 movdqu 0x50($inp),$inout5
3824 mov OPENSSL_ia32cap_P+4(%rip),%r9d
3826 jbe .Lcbc_dec_six_or_seven
3828 and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
3829 sub \$0x50,$len # $len is biased by -5*16
3830 cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
3831 je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
3832 sub \$0x20,$len # $len is biased by -7*16
3833 lea 0x70($key),$key # size optimization
3834 jmp .Lcbc_dec_loop8_enter
3837 movups $inout7,($out)
3839 .Lcbc_dec_loop8_enter:
3840 movdqu 0x60($inp),$inout6
3841 pxor $rndkey0,$inout0
3842 movdqu 0x70($inp),$inout7
3843 pxor $rndkey0,$inout1
3844 $movkey 0x10-0x70($key),$rndkey1
3845 pxor $rndkey0,$inout2
3847 cmp \$0x70,$len # is there at least 0x60 bytes ahead?
3848 pxor $rndkey0,$inout3
3849 pxor $rndkey0,$inout4
3850 pxor $rndkey0,$inout5
3851 pxor $rndkey0,$inout6
3853 aesdec $rndkey1,$inout0
3854 pxor $rndkey0,$inout7
3855 $movkey 0x20-0x70($key),$rndkey0
3856 aesdec $rndkey1,$inout1
3857 aesdec $rndkey1,$inout2
3858 aesdec $rndkey1,$inout3
3859 aesdec $rndkey1,$inout4
3860 aesdec $rndkey1,$inout5
3861 aesdec $rndkey1,$inout6
3864 aesdec $rndkey1,$inout7
3866 $movkey 0x30-0x70($key),$rndkey1
3868 for($i=1;$i<12;$i++) {
3869 my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
3870 $code.=<<___ if ($i==7);
3874 aesdec $rndkeyx,$inout0
3875 aesdec $rndkeyx,$inout1
3876 aesdec $rndkeyx,$inout2
3877 aesdec $rndkeyx,$inout3
3878 aesdec $rndkeyx,$inout4
3879 aesdec $rndkeyx,$inout5
3880 aesdec $rndkeyx,$inout6
3881 aesdec $rndkeyx,$inout7
3882 $movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
3884 $code.=<<___ if ($i<6 || (!($i&1) && $i>7));
3887 $code.=<<___ if ($i==7);
3890 $code.=<<___ if ($i==9);
3893 $code.=<<___ if ($i==11);
3900 aesdec $rndkey1,$inout0
3901 aesdec $rndkey1,$inout1
3904 aesdec $rndkey1,$inout2
3905 aesdec $rndkey1,$inout3
3908 aesdec $rndkey1,$inout4
3909 aesdec $rndkey1,$inout5
3912 aesdec $rndkey1,$inout6
3913 aesdec $rndkey1,$inout7
3914 movdqu 0x50($inp),$rndkey1
3916 aesdeclast $iv,$inout0
3917 movdqu 0x60($inp),$iv # borrow $iv
3918 pxor $rndkey0,$rndkey1
3919 aesdeclast $in0,$inout1
3921 movdqu 0x70($inp),$rndkey0 # next IV
3922 aesdeclast $in1,$inout2
3924 movdqu 0x00($inp_),$in0
3925 aesdeclast $in2,$inout3
3926 aesdeclast $in3,$inout4
3927 movdqu 0x10($inp_),$in1
3928 movdqu 0x20($inp_),$in2
3929 aesdeclast $in4,$inout5
3930 aesdeclast $rndkey1,$inout6
3931 movdqu 0x30($inp_),$in3
3932 movdqu 0x40($inp_),$in4
3933 aesdeclast $iv,$inout7
3934 movdqa $rndkey0,$iv # return $iv
3935 movdqu 0x50($inp_),$rndkey1
3936 $movkey -0x70($key),$rndkey0
3938 movups $inout0,($out) # store output
3940 movups $inout1,0x10($out)
3942 movups $inout2,0x20($out)
3944 movups $inout3,0x30($out)
3946 movups $inout4,0x40($out)
3948 movups $inout5,0x50($out)
3949 movdqa $rndkey1,$inout5
3950 movups $inout6,0x60($out)
3956 movaps $inout7,$inout0
3957 lea -0x70($key),$key
3959 jle .Lcbc_dec_clear_tail_collected
3960 movups $inout7,($out)
3966 .Lcbc_dec_six_or_seven:
3970 movaps $inout5,$inout6
3971 call _aesni_decrypt6
3972 pxor $iv,$inout0 # ^= IV
3975 movdqu $inout0,($out)
3977 movdqu $inout1,0x10($out)
3978 pxor $inout1,$inout1 # clear register bank
3980 movdqu $inout2,0x20($out)
3981 pxor $inout2,$inout2
3983 movdqu $inout3,0x30($out)
3984 pxor $inout3,$inout3
3986 movdqu $inout4,0x40($out)
3987 pxor $inout4,$inout4
3989 movdqa $inout5,$inout0
3990 pxor $inout5,$inout5
3991 jmp .Lcbc_dec_tail_collected
3995 movups 0x60($inp),$inout6
3996 xorps $inout7,$inout7
3997 call _aesni_decrypt8
3998 movups 0x50($inp),$inout7
3999 pxor $iv,$inout0 # ^= IV
4000 movups 0x60($inp),$iv
4002 movdqu $inout0,($out)
4004 movdqu $inout1,0x10($out)
4005 pxor $inout1,$inout1 # clear register bank
4007 movdqu $inout2,0x20($out)
4008 pxor $inout2,$inout2
4010 movdqu $inout3,0x30($out)
4011 pxor $inout3,$inout3
4013 movdqu $inout4,0x40($out)
4014 pxor $inout4,$inout4
4015 pxor $inout7,$inout6
4016 movdqu $inout5,0x50($out)
4017 pxor $inout5,$inout5
4019 movdqa $inout6,$inout0
4020 pxor $inout6,$inout6
4021 pxor $inout7,$inout7
4022 jmp .Lcbc_dec_tail_collected
4026 movups $inout5,($out)
4028 movdqu 0x00($inp),$inout0 # load input
4029 movdqu 0x10($inp),$inout1
4031 movdqu 0x20($inp),$inout2
4033 movdqu 0x30($inp),$inout3
4035 movdqu 0x40($inp),$inout4
4037 movdqu 0x50($inp),$inout5
4039 .Lcbc_dec_loop6_enter:
4041 movdqa $inout5,$inout6
4043 call _aesni_decrypt6
4045 pxor $iv,$inout0 # ^= IV
4048 movdqu $inout0,($out)
4050 movdqu $inout1,0x10($out)
4052 movdqu $inout2,0x20($out)
4055 movdqu $inout3,0x30($out)
4058 movdqu $inout4,0x40($out)
4063 movdqa $inout5,$inout0
4065 jle .Lcbc_dec_clear_tail_collected
4066 movups $inout5,($out)
4070 movups ($inp),$inout0
4072 jbe .Lcbc_dec_one # $len is 1*16 or less
4074 movups 0x10($inp),$inout1
4077 jbe .Lcbc_dec_two # $len is 2*16 or less
4079 movups 0x20($inp),$inout2
4082 jbe .Lcbc_dec_three # $len is 3*16 or less
4084 movups 0x30($inp),$inout3
4087 jbe .Lcbc_dec_four # $len is 4*16 or less
4089 movups 0x40($inp),$inout4 # $len is 5*16 or less
4092 xorps $inout5,$inout5
4093 call _aesni_decrypt6
4097 movdqu $inout0,($out)
4099 movdqu $inout1,0x10($out)
4100 pxor $inout1,$inout1 # clear register bank
4102 movdqu $inout2,0x20($out)
4103 pxor $inout2,$inout2
4105 movdqu $inout3,0x30($out)
4106 pxor $inout3,$inout3
4108 movdqa $inout4,$inout0
4109 pxor $inout4,$inout4
4110 pxor $inout5,$inout5
4112 jmp .Lcbc_dec_tail_collected
4118 &aesni_generate1("dec",$key,$rounds);
4122 jmp .Lcbc_dec_tail_collected
4126 call _aesni_decrypt2
4130 movdqu $inout0,($out)
4131 movdqa $inout1,$inout0
4132 pxor $inout1,$inout1 # clear register bank
4134 jmp .Lcbc_dec_tail_collected
4138 call _aesni_decrypt3
4142 movdqu $inout0,($out)
4144 movdqu $inout1,0x10($out)
4145 pxor $inout1,$inout1 # clear register bank
4146 movdqa $inout2,$inout0
4147 pxor $inout2,$inout2
4149 jmp .Lcbc_dec_tail_collected
4153 call _aesni_decrypt4
4157 movdqu $inout0,($out)
4159 movdqu $inout1,0x10($out)
4160 pxor $inout1,$inout1 # clear register bank
4162 movdqu $inout2,0x20($out)
4163 pxor $inout2,$inout2
4164 movdqa $inout3,$inout0
4165 pxor $inout3,$inout3
4167 jmp .Lcbc_dec_tail_collected
4170 .Lcbc_dec_clear_tail_collected:
4171 pxor $inout1,$inout1 # clear register bank
4172 pxor $inout2,$inout2
4173 pxor $inout3,$inout3
4175 $code.=<<___ if (!$win64);
4176 pxor $inout4,$inout4 # %xmm6..9
4177 pxor $inout5,$inout5
4178 pxor $inout6,$inout6
4179 pxor $inout7,$inout7
4182 .Lcbc_dec_tail_collected:
4185 jnz .Lcbc_dec_tail_partial
4186 movups $inout0,($out)
4187 pxor $inout0,$inout0
4190 .Lcbc_dec_tail_partial:
4191 movaps $inout0,(%rsp)
4192 pxor $inout0,$inout0
4197 .long 0x9066A4F3 # rep movsb
4198 movdqa $inout0,(%rsp)
4201 xorps $rndkey0,$rndkey0 # %xmm0
4202 pxor $rndkey1,$rndkey1
4204 $code.=<<___ if ($win64);
4205 movaps 0x10(%rsp),%xmm6
4206 movaps %xmm0,0x10(%rsp) # clear stack
4207 movaps 0x20(%rsp),%xmm7
4208 movaps %xmm0,0x20(%rsp)
4209 movaps 0x30(%rsp),%xmm8
4210 movaps %xmm0,0x30(%rsp)
4211 movaps 0x40(%rsp),%xmm9
4212 movaps %xmm0,0x40(%rsp)
4213 movaps 0x50(%rsp),%xmm10
4214 movaps %xmm0,0x50(%rsp)
4215 movaps 0x60(%rsp),%xmm11
4216 movaps %xmm0,0x60(%rsp)
4217 movaps 0x70(%rsp),%xmm12
4218 movaps %xmm0,0x70(%rsp)
4219 movaps 0x80(%rsp),%xmm13
4220 movaps %xmm0,0x80(%rsp)
4221 movaps 0x90(%rsp),%xmm14
4222 movaps %xmm0,0x90(%rsp)
4223 movaps 0xa0(%rsp),%xmm15
4224 movaps %xmm0,0xa0(%rsp)
4230 .cfi_def_cfa_register %rsp
4234 .size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
4237 # int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
4238 # int bits, AES_KEY *key)
4240 # input: $inp user-supplied key
4241 # $bits $inp length in bits
4242 # $key pointer to key schedule
4243 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4244 # *$key key schedule
# Decryption key setup: runs the encryption key expansion and then reorders
# the resulting round keys in place, walking one pointer up from the start of
# the schedule and one down from its end.
4246 { my ($inp,$bits,$key) = @_4args;
4250 .globl ${PREFIX}_set_decrypt_key
4251 .type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
4253 ${PREFIX}_set_decrypt_key:
# Reserve 8 bytes of stack (hand-encoded instruction) around the nested call.
4255 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4256 .cfi_adjust_cfa_offset 8
4257 call __aesni_set_encrypt_key
# Scale the returned rounds-1 count by 16 so it becomes a byte offset into
# the array of 16-byte round keys.
4258 shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
4261 lea 16($key,$bits),$inp # points at the end of key schedule
# Outermost pair: first and last round keys are exchanged unchanged.
4263 $movkey ($key),%xmm0 # just swap
4264 $movkey ($inp),%xmm1
4265 $movkey %xmm0,($inp)
4266 $movkey %xmm1,($key)
# Inner pairs: each key is loaded from one end and stored at the other.
# NOTE(review): the +16/-16 store displacements suggest both pointers are
# stepped between the loads and the stores; the stepping and the "inverse"
# transform named in the comment are not visible in this excerpt.
4271 $movkey ($key),%xmm0 # swap and inverse
4272 $movkey ($inp),%xmm1
4277 $movkey %xmm0,16($inp)
4278 $movkey %xmm1,-16($key)
4280 ja .Ldec_key_inverse
# The single key left in the middle is processed on its own and written back.
4282 $movkey ($key),%xmm0 # inverse middle
4285 $movkey %xmm0,($inp)
4289 .cfi_adjust_cfa_offset -8
4292 .LSEH_end_set_decrypt_key:
4293 .size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
4296 # This is based on submission by
4298 # Huang Ying <ying.huang@intel.com>
4299 # Vinodh Gopal <vinodh.gopal@intel.com>
4302 # Aggressively optimized with respect to aeskeygenassist's critical path
4303 # and is contained in %xmm0-5 to meet Win64 ABI requirement.
4305 # int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
4306 # int bits, AES_KEY * const key);
4308 # input: $inp user-supplied key
4309 # $bits $inp length in bits
4310 # $key pointer to key schedule
4311 # output: %eax 0 denoting success, -1 or -2 - failure (see C)
4312 # $bits rounds-1 (used in aesni_set_decrypt_key)
4313 # *$key key schedule
4314 # $key pointer to key schedule (used in
4315 # aesni_set_decrypt_key)
4317 # Subroutine is frame-less, which means that only volatile registers
4318 # are used. Note that it's declared "abi-omnipotent", which means that
4319 # the number of volatile registers is smaller on Windows.
# Encryption key setup.  The public symbol and the internal
# __aesni_set_encrypt_key alias label the same address, so the decrypt-key
# routine can call the alias directly.
# Visible flow: load the first 128 bits of the user key into xmm0, sample the
# AVX/XOP capability bits, then expand the schedule for a 128-, 192- or
# 256-bit key.  Every key size appears twice below: once using
# aeskeygenassist plus the .Lkey_expansion_* helpers, and once using the
# .Lkey_rotate/.Lkey_rcon constants with aesenclast.
# NOTE(review): the branches that pick a key size and choose between the two
# variants are not visible in this excerpt.
4322 .globl ${PREFIX}_set_encrypt_key
4323 .type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
4325 ${PREFIX}_set_encrypt_key:
4326 __aesni_set_encrypt_key:
4328 .byte 0x48,0x83,0xEC,0x08 # sub rsp,8
4329 .cfi_adjust_cfa_offset 8
# Keep only bit 28 (AVX) and bit 11 (XOP) of the capability word; the later
# compares against bit 28 alone are equal only when AVX is set and XOP clear.
4336 mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
4337 movups ($inp),%xmm0 # pull first 128 bits of *userKey
4338 xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
4339 and OPENSSL_ia32cap_P+4(%rip),%r10d
4340 lea 16($key),%rax # %rax is used as modifiable copy of $key
# ---- 128-bit key: round constants 0x01..0x36, one helper call per round ----
4349 mov \$9,$bits # 10 rounds for 128-bit key
4350 cmp \$`1<<28`,%r10d # AVX, but no XOP
4353 $movkey %xmm0,($key) # round 0
4354 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
4355 call .Lkey_expansion_128_cold
4356 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
4357 call .Lkey_expansion_128
4358 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
4359 call .Lkey_expansion_128
4360 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
4361 call .Lkey_expansion_128
4362 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
4363 call .Lkey_expansion_128
4364 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
4365 call .Lkey_expansion_128
4366 aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
4367 call .Lkey_expansion_128
4368 aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
4369 call .Lkey_expansion_128
4370 aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
4371 call .Lkey_expansion_128
4372 aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
4373 call .Lkey_expansion_128
# Store the final round key, then the rounds value at the offset the existing
# comment equates with byte 240 of the key structure.
4374 $movkey %xmm0,(%rax)
4375 mov $bits,80(%rax) # 240(%rdx)
# ---- 128-bit key, constant-table variant ----
4381 movdqa .Lkey_rotate(%rip),%xmm5
4383 movdqa .Lkey_rcon1(%rip),%xmm4
4391 aesenclast %xmm4,%xmm0
4404 movdqu %xmm0,-16(%rax)
4410 movdqa .Lkey_rcon1b(%rip),%xmm4
4413 aesenclast %xmm4,%xmm0
4429 aesenclast %xmm4,%xmm0
4440 movdqu %xmm0,16(%rax)
4442 mov $bits,96(%rax) # 240($key)
# ---- 192-bit key: each helper call covers parts of two round keys, hence
# the paired round numbers in the comments ----
4448 movq 16($inp),%xmm2 # remaining 1/3 of *userKey
4449 mov \$11,$bits # 12 rounds for 192
4450 cmp \$`1<<28`,%r10d # AVX, but no XOP
4453 $movkey %xmm0,($key) # round 0
4454 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
4455 call .Lkey_expansion_192a_cold
4456 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
4457 call .Lkey_expansion_192b
4458 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
4459 call .Lkey_expansion_192a
4460 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
4461 call .Lkey_expansion_192b
4462 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
4463 call .Lkey_expansion_192a
4464 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
4465 call .Lkey_expansion_192b
4466 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
4467 call .Lkey_expansion_192a
4468 aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
4469 call .Lkey_expansion_192b
4470 $movkey %xmm0,(%rax)
4471 mov $bits,48(%rax) # 240(%rdx)
# ---- 192-bit key, constant-table variant ----
4477 movdqa .Lkey_rotate192(%rip),%xmm5
4478 movdqa .Lkey_rcon1(%rip),%xmm4
4488 aesenclast %xmm4,%xmm2
4500 pshufd \$0xff,%xmm0,%xmm3
4507 movdqu %xmm0,-16(%rax)
4512 mov $bits,32(%rax) # 240($key)
# ---- 256-bit key: the two key halves (xmm0 and xmm2) alternate as the
# aeskeygenassist source, using the 256a and 256b helpers in turn ----
4518 movups 16($inp),%xmm2 # remaining half of *userKey
4519 mov \$13,$bits # 14 rounds for 256
4521 cmp \$`1<<28`,%r10d # AVX, but no XOP
4524 $movkey %xmm0,($key) # round 0
4525 $movkey %xmm2,16($key) # round 1
4526 aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
4527 call .Lkey_expansion_256a_cold
4528 aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
4529 call .Lkey_expansion_256b
4530 aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
4531 call .Lkey_expansion_256a
4532 aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
4533 call .Lkey_expansion_256b
4534 aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
4535 call .Lkey_expansion_256a
4536 aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
4537 call .Lkey_expansion_256b
4538 aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
4539 call .Lkey_expansion_256a
4540 aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
4541 call .Lkey_expansion_256b
4542 aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
4543 call .Lkey_expansion_256a
4544 aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
4545 call .Lkey_expansion_256b
4546 aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
4547 call .Lkey_expansion_256a
4548 aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
4549 call .Lkey_expansion_256b
4550 aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
4551 call .Lkey_expansion_256a
4552 $movkey %xmm0,(%rax)
4553 mov $bits,16(%rax) # 240(%rdx)
# ---- 256-bit key, constant-table variant ----
4559 movdqa .Lkey_rotate(%rip),%xmm5
4560 movdqa .Lkey_rcon1(%rip),%xmm4
4562 movdqu %xmm0,0($key)
4564 movdqu %xmm2,16($key)
4570 aesenclast %xmm4,%xmm2
4587 pshufd \$0xff,%xmm0,%xmm2
4589 aesenclast %xmm3,%xmm2
4600 movdqu %xmm2,16(%rax)
4607 mov $bits,16(%rax) # 240($key)
4622 .cfi_adjust_cfa_offset -8
4625 .LSEH_end_set_encrypt_key:
# Local key-expansion helpers, reached by call from the key setup code.
# The plain entry points first store the previously computed round key at
# (rax); the _cold entry points (used for the first round of each key size)
# skip that store because round 0 was already written by the caller.
# NOTE(review): the xor/advance/ret instructions of each helper are not
# visible in this excerpt.
4628 .Lkey_expansion_128:
4629 $movkey %xmm0,(%rax)
4631 .Lkey_expansion_128_cold:
4632 shufps \$0b00010000,%xmm0,%xmm4
4634 shufps \$0b10001100,%xmm0,%xmm4
# Immediate 0xff replicates dword 3 of xmm1 into all four lanes.
4636 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4641 .Lkey_expansion_192a:
4642 $movkey %xmm0,(%rax)
4644 .Lkey_expansion_192a_cold:
# Shared tail: .Lkey_expansion_192b jumps here after its own stores.
4646 .Lkey_expansion_192b_warm:
4647 shufps \$0b00010000,%xmm0,%xmm4
4650 shufps \$0b10001100,%xmm0,%xmm4
# Immediate 0x55 replicates dword 1 of xmm1 into all four lanes.
4653 pshufd \$0b01010101,%xmm1,%xmm1 # critical path
4656 pshufd \$0b11111111,%xmm0,%xmm3
# 192b writes two 16-byte values assembled from xmm0/xmm2 via shuffles, then
# reuses the 192a tail.
4661 .Lkey_expansion_192b:
4663 shufps \$0b01000100,%xmm0,%xmm5
4664 $movkey %xmm5,(%rax)
4665 shufps \$0b01001110,%xmm2,%xmm3
4666 $movkey %xmm3,16(%rax)
4668 jmp .Lkey_expansion_192b_warm
# 256a stores xmm2 and works on xmm0; 256b stores xmm0 and works on xmm2.
4671 .Lkey_expansion_256a:
4672 $movkey %xmm2,(%rax)
4674 .Lkey_expansion_256a_cold:
4675 shufps \$0b00010000,%xmm0,%xmm4
4677 shufps \$0b10001100,%xmm0,%xmm4
4679 shufps \$0b11111111,%xmm1,%xmm1 # critical path
4684 .Lkey_expansion_256b:
4685 $movkey %xmm0,(%rax)
4688 shufps \$0b00010000,%xmm2,%xmm4
4690 shufps \$0b10001100,%xmm2,%xmm4
# Immediate 0xaa replicates dword 2 of xmm1 (not dword 3 as in 256a).
4692 shufps \$0b10101010,%xmm1,%xmm1 # critical path
# Both the public symbol and the internal alias get the same size.
4695 .size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
4696 .size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
# Read-only constants.  NOTE(review): the labels attached to each entry are
# not visible in this excerpt; confirm the mapping before relying on the
# notes below.
# Byte indices 15..0, i.e. a byte-reversal shuffle pattern.
4703 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
# Sixteen bytes with only the last byte set to 1.
4711 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
# Two byte-shuffle patterns, each repeated in all four dwords; presumably the
# .Lkey_rotate and .Lkey_rotate192 masks loaded by the key setup code.
4713 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
4715 .long 0x04070605,0x04070605,0x04070605,0x04070605
# 0x1b in every dword; the same value is the round-9 constant of the 128-bit
# expansion, so presumably this is .Lkey_rcon1b.
4719 .long 0x1b,0x1b,0x1b,0x1b
# Identification string embedded in the object file.
4721 .asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
4725 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
4726 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
4734 .extern __imp_RtlVirtualUnwind
4736 $code.=<<___ if ($PREFIX eq "aesni");
# Win64 exception handler referenced by the ECB and CCM64 unwind records.
# HandlerData holds two RVAs (body label, return label).  If the faulting
# Rip lies in [body, ret), the xmm registers saved at the top of the stack
# are copied back into the CONTEXT record and the stack pointer value is
# moved past the save area before the shared tail runs.
4737 .type ecb_ccm64_se_handler,\@abi-omnipotent
4739 ecb_ccm64_se_handler:
4751 mov 120($context),%rax # pull context->Rax
4752 mov 248($context),%rbx # pull context->Rip
4754 mov 8($disp),%rsi # disp->ImageBase
4755 mov 56($disp),%r11 # disp->HandlerData
# RVA + ImageBase gives the absolute address of the body label.
4757 mov 0(%r11),%r10d # HandlerData[0]
4758 lea (%rsi,%r10),%r10 # prologue label
4759 cmp %r10,%rbx # context->Rip<prologue label
4760 jb .Lcommon_seh_tail
4762 mov 152($context),%rax # pull context->Rsp
4764 mov 4(%r11),%r10d # HandlerData[1]
4765 lea (%rsi,%r10),%r10 # epilogue label
4766 cmp %r10,%rbx # context->Rip>=epilogue label
4767 jae .Lcommon_seh_tail
# Copy 8 quadwords (four xmm registers) from the stack into the context.
4769 lea 0(%rax),%rsi # %xmm save area
4770 lea 512($context),%rdi # &context.Xmm6
4771 mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
4772 .long 0xa548f3fc # cld; rep movsq
4773 lea 0x58(%rax),%rax # adjust stack pointer
4775 jmp .Lcommon_seh_tail
4776 .size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
# Win64 exception handler referenced by the CTR32 and XTS unwind records.
# Same Rip range test as the ECB/CCM64 handler, but the frame is located via
# the value saved in context->R11: ten xmm registers sit 0xa8 bytes below it
# and the saved rbp sits 8 bytes below it.
4778 .type ctr_xts_se_handler,\@abi-omnipotent
4792 mov 120($context),%rax # pull context->Rax
4793 mov 248($context),%rbx # pull context->Rip
4795 mov 8($disp),%rsi # disp->ImageBase
4796 mov 56($disp),%r11 # disp->HandlerData
4798 mov 0(%r11),%r10d # HandlerData[0]
4799 lea (%rsi,%r10),%r10 # prologue label
4800 cmp %r10,%rbx # context->Rip<prologue label
4801 jb .Lcommon_seh_tail
4803 mov 152($context),%rax # pull context->Rsp
4805 mov 4(%r11),%r10d # HandlerData[1]
4806 lea (%rsi,%r10),%r10 # epilogue label
4807 cmp %r10,%rbx # context->Rip>=epilogue label
4808 jae .Lcommon_seh_tail
4810 mov 208($context),%rax # pull context->R11
# Copy 20 quadwords (ten xmm registers) into the context.
4812 lea -0xa8(%rax),%rsi # %xmm save area
4813 lea 512($context),%rdi # & context.Xmm6
4814 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4815 .long 0xa548f3fc # cld; rep movsq
4817 mov -8(%rax),%rbp # restore saved %rbp
4818 mov %rbp,160($context) # restore context->Rbp
4819 jmp .Lcommon_seh_tail
4820 .size ctr_xts_se_handler,.-ctr_xts_se_handler
# Win64 exception handler referenced by the OCB unwind records.
# HandlerData carries three RVAs here: body label, epilogue label and a
# third "pop" label that is compared against Rip after the first two tests.
# Inside the protected range, ten xmm registers are copied from the stack
# into the context and the general-purpose registers listed below are
# written back.
# NOTE(review): the branch taken on the pop-label compare and the loads that
# fill rbx/rbp/r12-r14 before they are stored are not visible in this excerpt.
4822 .type ocb_se_handler,\@abi-omnipotent
4836 mov 120($context),%rax # pull context->Rax
4837 mov 248($context),%rbx # pull context->Rip
4839 mov 8($disp),%rsi # disp->ImageBase
4840 mov 56($disp),%r11 # disp->HandlerData
4842 mov 0(%r11),%r10d # HandlerData[0]
4843 lea (%rsi,%r10),%r10 # prologue label
4844 cmp %r10,%rbx # context->Rip<prologue label
4845 jb .Lcommon_seh_tail
4847 mov 4(%r11),%r10d # HandlerData[1]
4848 lea (%rsi,%r10),%r10 # epilogue label
4849 cmp %r10,%rbx # context->Rip>=epilogue label
4850 jae .Lcommon_seh_tail
4852 mov 8(%r11),%r10d # HandlerData[2]
4853 lea (%rsi,%r10),%r10
4854 cmp %r10,%rbx # context->Rip>=pop label
4857 mov 152($context),%rax # pull context->Rsp
4859 lea (%rax),%rsi # %xmm save area
4860 lea 512($context),%rdi # & context.Xmm6
4861 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4862 .long 0xa548f3fc # cld; rep movsq
# Step over the 0xa0-byte xmm save area plus a further 0x28 bytes.
4863 lea 0xa0+0x28(%rax),%rax
4872 mov %rbx,144($context) # restore context->Rbx
4873 mov %rbp,160($context) # restore context->Rbp
4874 mov %r12,216($context) # restore context->R12
4875 mov %r13,224($context) # restore context->R13
4876 mov %r14,232($context) # restore context->R14
4878 jmp .Lcommon_seh_tail
4879 .size ocb_se_handler,.-ocb_se_handler
# Win64 exception handler for the CBC routine, followed by the tail shared
# with the other handlers (they reach it by jumping to .Lcommon_seh_tail).
# Unlike the other handlers it compares Rip against labels inside the CBC
# code directly instead of reading RVAs from HandlerData.
# NOTE(review): the position of the .Lcommon_seh_tail label and the
# prologue/epilogue of this routine are not visible in this excerpt.
4882 .type cbc_se_handler,\@abi-omnipotent
4896 mov 152($context),%rax # pull context->Rsp
4897 mov 248($context),%rbx # pull context->Rip
# Before the bulk-decrypt code: the Rsp value loaded above is used as is.
4899 lea .Lcbc_decrypt_bulk(%rip),%r10
4900 cmp %r10,%rbx # context->Rip<"prologue" label
4901 jb .Lcommon_seh_tail
# Between the bulk label and the body label: use context->Rax instead.
4903 mov 120($context),%rax # pull context->Rax
4905 lea .Lcbc_decrypt_body(%rip),%r10
4906 cmp %r10,%rbx # context->Rip<cbc_decrypt_body
4907 jb .Lcommon_seh_tail
4909 mov 152($context),%rax # pull context->Rsp
4911 lea .Lcbc_ret(%rip),%r10
4912 cmp %r10,%rbx # context->Rip>="epilogue" label
4913 jae .Lcommon_seh_tail
# Inside the body: copy ten saved xmm registers (20 quadwords) starting
# 16 bytes above Rsp, then recover rbp through the value kept in R11.
4915 lea 16(%rax),%rsi # %xmm save area
4916 lea 512($context),%rdi # &context.Xmm6
4917 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
4918 .long 0xa548f3fc # cld; rep movsq
4920 mov 208($context),%rax # pull context->R11
4922 mov -8(%rax),%rbp # restore saved %rbp
4923 mov %rbp,160($context) # restore context->Rbp
# Write the recovered register values into the context record.
4928 mov %rax,152($context) # restore context->Rsp
4929 mov %rsi,168($context) # restore context->Rsi
4930 mov %rdi,176($context) # restore context->Rdi
# Copy the updated context over disp->ContextRecord; ecx counts quadwords.
4932 mov 40($disp),%rdi # disp->ContextRecord
4933 mov $context,%rsi # context
4934 mov \$154,%ecx # sizeof(CONTEXT) in quadwords (rep movsq count)
4935 .long 0xa548f3fc # cld; rep movsq
# Set up the eight RtlVirtualUnwind arguments: four in registers, four on
# the stack at offsets 32..56.
4938 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
4939 mov 8(%rsi),%rdx # arg2, disp->ImageBase
4940 mov 0(%rsi),%r8 # arg3, disp->ControlPc
4941 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
4942 mov 40(%rsi),%r10 # disp->ContextRecord
4943 lea 56(%rsi),%r11 # &disp->HandlerData
4944 lea 24(%rsi),%r12 # &disp->EstablisherFrame
4945 mov %r10,32(%rsp) # arg5
4946 mov %r11,40(%rsp) # arg6
4947 mov %r12,48(%rsp) # arg7
4948 mov %rcx,56(%rsp) # arg8, (NULL)
4949 call *__imp_RtlVirtualUnwind(%rip)
4951 mov \$1,%eax # ExceptionContinueSearch
4963 .size cbc_se_handler,.-cbc_se_handler
# Function table entries: each entry lists image-relative addresses (.rva) of
# a function's begin label, end label and unwind-info record.  The mode
# entries up to ocb_decrypt are emitted only when PREFIX is "aesni"; the CBC
# and key-setup entries use the PREFIX-derived names.
# NOTE(review): some third (info) fields and the section directive are not
# visible in this excerpt.
4968 $code.=<<___ if ($PREFIX eq "aesni");
4969 .rva .LSEH_begin_aesni_ecb_encrypt
4970 .rva .LSEH_end_aesni_ecb_encrypt
4973 .rva .LSEH_begin_aesni_ccm64_encrypt_blocks
4974 .rva .LSEH_end_aesni_ccm64_encrypt_blocks
4975 .rva .LSEH_info_ccm64_enc
4977 .rva .LSEH_begin_aesni_ccm64_decrypt_blocks
4978 .rva .LSEH_end_aesni_ccm64_decrypt_blocks
4979 .rva .LSEH_info_ccm64_dec
4981 .rva .LSEH_begin_aesni_ctr32_encrypt_blocks
4982 .rva .LSEH_end_aesni_ctr32_encrypt_blocks
4983 .rva .LSEH_info_ctr32
4985 .rva .LSEH_begin_aesni_xts_encrypt
4986 .rva .LSEH_end_aesni_xts_encrypt
4987 .rva .LSEH_info_xts_enc
4989 .rva .LSEH_begin_aesni_xts_decrypt
4990 .rva .LSEH_end_aesni_xts_decrypt
4991 .rva .LSEH_info_xts_dec
4993 .rva .LSEH_begin_aesni_ocb_encrypt
4994 .rva .LSEH_end_aesni_ocb_encrypt
4995 .rva .LSEH_info_ocb_enc
4997 .rva .LSEH_begin_aesni_ocb_decrypt
4998 .rva .LSEH_end_aesni_ocb_decrypt
4999 .rva .LSEH_info_ocb_dec
5002 .rva .LSEH_begin_${PREFIX}_cbc_encrypt
5003 .rva .LSEH_end_${PREFIX}_cbc_encrypt
# The key-setup entries use the function symbol itself as the begin address
# rather than a separate .LSEH_begin label.
5006 .rva ${PREFIX}_set_decrypt_key
5007 .rva .LSEH_end_set_decrypt_key
5010 .rva ${PREFIX}_set_encrypt_key
5011 .rva .LSEH_end_set_encrypt_key
# Unwind-info records: each names its language-specific handler and the
# HandlerData RVAs (body label, epilogue/return label) that the handler reads
# back through disp->HandlerData.
# NOTE(review): several record labels and header bytes are not visible in
# this excerpt.
5016 $code.=<<___ if ($PREFIX eq "aesni");
5019 .rva ecb_ccm64_se_handler
5020 .rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
5021 .LSEH_info_ccm64_enc:
5023 .rva ecb_ccm64_se_handler
5024 .rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
5025 .LSEH_info_ccm64_dec:
5027 .rva ecb_ccm64_se_handler
5028 .rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
5031 .rva ctr_xts_se_handler
5032 .rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
5035 .rva ctr_xts_se_handler
5036 .rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
5039 .rva ctr_xts_se_handler
5040 .rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
5044 .rva .Locb_enc_body,.Locb_enc_epilogue # HandlerData[]
5050 .rva .Locb_dec_body,.Locb_dec_epilogue # HandlerData[]
# Handler-less record: a four-byte header followed by one unwind code
# describing the 8-byte stack adjustment made by the key-setup routines.
5059 .byte 0x01,0x04,0x01,0x00
5060 .byte 0x04,0x02,0x00,0x00 # sub rsp,8
# rex helper (fragment): aliases the caller's opcode array through a glob,
# then appends a REX prefix byte when either register number is 8 or above.
# Bit 0x04 is set for a high $dst, bit 0x01 for a high $src, and 0x40 is OR-ed
# in as the fixed REX base; nothing is pushed when both registers are below 8.
# NOTE(review): the sub header and the initialisation of $dst, $src and $rex
# are not visible in this excerpt.
5065 local *opcode=shift;
5069 $rex|=0x04 if($dst>=8);
5070 $rex|=0x01 if($src>=8);
5071 push @opcode,$rex|0x40 if($rex);
# aesni encoder (fragment): turns one textual AES instruction in $line into a
# ".byte" directive holding its machine encoding.  Three operand forms are
# recognised; a mnemonic missing from the opcode table yields undef.
# NOTE(review): the sub header, the declarations of @opcode, %opcodelet, $c
# and $off, and the closing braces are not visible in this excerpt.

# Form 1: aeskeygenassist $imm,%xmmSRC,%xmmDST (opcode 0f 3a df, ModR/M, imm).
5078 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5079 rex(\@opcode,$4,$3);
5080 push @opcode,0x0f,0x3a,0xdf;
5081 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
# A leading 0 (e.g. 0x1b) makes oct() parse the immediate as hex/octal.
5083 push @opcode,$c=~/^0/?oct($c):$c;
5084 return ".byte\t".join(',',@opcode);
# Form 2: register-to-register aesenc/aesenclast/aesdec/aesdeclast
# (opcode 0f 38 dc..df, register-direct ModR/M).
5086 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
5089 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5090 "aesdec" => 0xde, "aesdeclast" => 0xdf
5092 return undef if (!defined($opcodelet{$1}));
5093 rex(\@opcode,$3,$2);
5094 push @opcode,0x0f,0x38,$opcodelet{$1};
5095 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
5096 return ".byte\t".join(',',@opcode);
# Form 3: same four mnemonics with an off(%rsp) memory source; encoded with
# a one-byte displacement (offset masked to 8 bits) and SIB byte 0x24.
5098 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
5100 "aesenc" => 0xdc, "aesenclast" => 0xdd,
5101 "aesdec" => 0xde, "aesdeclast" => 0xdf
5103 return undef if (!defined($opcodelet{$1}));
# 0x44 is the REX prefix needed when the destination is xmm8 or above.
5105 push @opcode,0x44 if ($3>=8);
5106 push @opcode,0x0f,0x38,$opcodelet{$1};
5107 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
5108 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
5109 return ".byte\t".join(',',@opcode);
# movbe encoder (fragment): emits opcode 0f 38 f1 with ModR/M 0x44 and SIB
# 0x24, followed by the argument as the displacement byte; it is invoked below
# for lines of the form "movbe %eax,N(%rsp)".
5115 ".byte 0x0f,0x38,0xf1,0x44,0x24,".shift;
# Post-process the accumulated assembly text before it is emitted.
# 1. Evaluate every backtick-quoted Perl expression embedded in the code.
5118 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# 2. Hand each aes* instruction with an xmm operand to aesni(); the match
#    runs to end of line, so any trailing comment is dropped along with it.
#    Presumably this lets assemblers without AES-NI support build the
#    module - confirm.
5119 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
5120 #$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm; # debugging artefact
# 3. Replace "movbe %eax,N(%rsp)" with its byte encoding via movbe().
5121 $code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;