Following the license change, modify the boilerplates in crypto/aes/
[openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20 # details].
21 #
22 # Performance.
23 #
24 # To start with see corresponding paragraph in aesni-x86_64.pl...
25 # Instead of filling table similar to one found there I've chosen to
26 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27 # The simplified table below represents 32-bit performance relative
28 # to 64-bit one in every given point. Ratios vary for different
29 # encryption modes, therefore interval values.
30 #
31 #       16-byte     64-byte     256-byte    1-KB        8-KB
32 #       53-67%      67-84%      91-94%      95-98%      97-99.5%
33 #
34 # Lower ratios for smaller block sizes are perfectly understandable,
35 # because function call overhead is higher in 32-bit mode. Largest
36 # 8-KB block performance is virtually same: 32-bit code is less than
37 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39 # January 2011
40 #
41 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
42 # interleaves at most 6 aes[enc|dec] instructions, because there are
43 # not enough registers for 8x interleave [which should be optimal for
44 # Sandy Bridge]. Actually, performance results for 6x interleave
45 # factor presented in aesni-x86_64.pl (except for CTR) are for this
46 # module.
47
48 # April 2011
49 #
50 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53 # November 2015
54 #
55 # Add aesni_ocb_[en|de]crypt.
56
57 ######################################################################
58 # Current large-block performance in cycles per byte processed with
59 # 128-bit key (less is better).
60 #
61 #               CBC en-/decrypt CTR     XTS     ECB     OCB
62 # Westmere      3.77/1.37       1.37    1.52    1.27
63 # * Bridge      5.07/0.98       0.99    1.09    0.91    1.10
64 # Haswell       4.44/0.80       0.97    1.03    0.72    0.76
65 # Skylake       2.68/0.65       0.65    0.66    0.64    0.66
66 # Silvermont    5.77/3.56       3.67    4.03    3.46    4.03
67 # Goldmont      3.84/1.39       1.39    1.63    1.31    1.70
68 # Bulldozer     5.80/0.98       1.05    1.24    0.93    1.23
69
$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

# Add the script's own directory and ../../perlasm relative to it to
# @INC, so that x86asm.pl is found regardless of the current directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument names the output file; everything the
# script prints is redirected there. Fail loudly instead of silently
# generating nothing when the file cannot be opened (the 2-argument
# open is kept so existing invocations are interpreted the same way).
$output = pop;
open OUT,">$output" or die "$0: can't open output file '$output': $!\n";
*STDOUT=*OUT;

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Instruction used to load round keys; the same one is used whatever
# $PREFIX is set to.
$movekey=\&movups;
90
# General-purpose register allocation.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");         # backup copies for $rounds and $key

# XMM register allocation: two round-key registers followed by six
# data registers. The last three data registers double as $in1, $in0
# and $ivec respectively.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=map("xmm$_",2..7);
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);
107
108 # AESNI extension
# Emit AESKEYGENASSIST as raw bytes: 66 0F 3A DF /r ib.
#   $dst, $src - operands; bytes are emitted only when both are
#                xmm0..xmm7, anything else emits nothing
#   $imm       - immediate byte appended after the ModR/M byte
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
    if (my($d,$s) = "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   my $modrm = 0xc0|($d<<3)|$s;    # register-direct ModR/M
        &data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
    }
}
# Emit a two-operand AES-NI instruction as raw bytes: 66 0F 38 <op> /r.
#   $opcodelet - the opcode byte selecting the instruction
#   $dst, $src - operands; bytes are emitted only when both are
#                xmm0..xmm7, anything else emits nothing
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
    if (my($d,$s) = "$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
    {   my $modrm = 0xc0|($d<<3)|$s;    # register-direct ModR/M
        &data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
    }
}
# Thin wrappers, one per instruction, taking ($dst,$src).
sub aesimc      { &aescommon(0xdb,@_); }
sub aesenc      { &aescommon(0xdc,@_); }
sub aesenclast  { &aescommon(0xdd,@_); }
sub aesdec      { &aescommon(0xde,@_); }
sub aesdeclast  { &aescommon(0xdf,@_); }
124 \f
# Inline version of internal aesni_[en|de]crypt1: emits a one-block
# en-/decryption loop directly into the function being generated.
#   $p     - "enc" or "dec", selects aes${p}/aes${p}last
#   $inout - register holding the block, defaults to $inout0
#   $ivec  - optional register; when given it is xor-ed with round
#            key 0 and then into $inout, instead of xor-ing round
#            key 0 into $inout directly
# Advances $key past the round keys it consumes and counts $rounds
# down to zero.
{ my $sn;       # serial number, keeps loop labels unique per expansion
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction
  my $loop;

    $inout=$inout0 unless defined($inout);
    $loop="${p}1_loop_".(++$sn);

    &$movekey           ($rndkey0,&QWP(0,$key));
    &$movekey           ($rndkey1,&QWP(16,$key));
    if (defined($ivec))
    {   &xorps          ($ivec,$rndkey0);
        &lea            ($key,&DWP(32,$key));
        &xorps          ($inout,$ivec);
    }
    else
    {   &lea            ($key,&DWP(32,$key));
        &xorps          ($inout,$rndkey0);
    }
    &set_label($loop);
        &$aes           ($inout,$rndkey1);
        &dec            ($rounds);
        &$movekey       ($rndkey1,&QWP(0,$key));
        &lea            ($key,&DWP(16,$key));
    &jnz                (&label($loop));
    &$aeslast           ($inout,$rndkey1);
}}
145
# Emit the private one-block routine _aesni_${p}rypt1 as a fully
# unrolled loop.
#   $p     - "enc" or "dec", selects aes${p}/aes${p}last
#   $inout - register holding the block, defaults to $inout0
# The generated code compares $rounds with 11 and enters the unrolled
# sequence at the ${p}128 label (below 11), the ${p}192 label (equal)
# or at its very start (above).
sub aesni_generate1     # fully unrolled loop
{ my ($p,$inout)=@_;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction

    $inout=$inout0 unless defined($inout);

    # Two rounds: apply $rndkey1, then $rndkey0, reloading each right
    # after use from $off and $off+0x10 relative to $key.
    my $two_rounds=sub { my $off=shift;
        &$aes           ($inout,$rndkey1);
        &$movekey       ($rndkey1,&QWP($off,$key));
        &$aes           ($inout,$rndkey0);
        &$movekey       ($rndkey0,&QWP($off+0x10,$key));
    };

    &function_begin_B("_aesni_${p}rypt1");
        &movups         ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &xorps          ($inout,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &cmp            ($rounds,11);
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key));
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        &$two_rounds    (-0x40);
    &set_label("${p}192");
        &$two_rounds    (-0x20);
    &set_label("${p}128");
        for my $off (0,0x20,0x40,0x60)
        {   &$two_rounds($off);         }
        &$aes           ($inout,$rndkey1);
    &$aeslast           ($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
191 \f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Encrypts the single block at inp and stores the result at out. The
# out-of-line helper is only generated when inlining is disabled.
&aesni_generate1("enc") unless ($inline);
&function_begin_B("${PREFIX}_encrypt");
        &mov    ("eax",&wparam(0));             # eax = inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # load the block
        &mov    ($rounds,&DWP(240,$key));       # rounds live at key+240
        &mov    ("eax",&wparam(1));             # eax = out
        $inline ? &aesni_inline_generate1("enc")
                : &call("_aesni_encrypt1");
        for my $reg ($rndkey0,$rndkey1)         # clear register bank
        {   &pxor       ($reg,$reg);    }
        &movups (&QWP(0,"eax"),$inout0);        # store the result
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_encrypt");
210
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Decrypts the single block at inp and stores the result at out. The
# out-of-line helper is only generated when inlining is disabled.
&aesni_generate1("dec") unless ($inline);
&function_begin_B("${PREFIX}_decrypt");
        &mov    ("eax",&wparam(0));             # eax = inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # load the block
        &mov    ($rounds,&DWP(240,$key));       # rounds live at key+240
        &mov    ("eax",&wparam(1));             # eax = out
        $inline ? &aesni_inline_generate1("dec")
                : &call("_aesni_decrypt1");
        for my $reg ($rndkey0,$rndkey1)         # clear register bank
        {   &pxor       ($reg,$reg);    }
        &movups (&QWP(0,"eax"),$inout0);        # store the result
        &pxor   ($inout0,$inout0);
        &ret    ();
&function_end_B("${PREFIX}_decrypt");
229
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was the 3x subroutine originally used in loops? Even
# though aes[enc|dec] latency was originally 6, the instruction could
# be scheduled only every *2nd* cycle. Thus 3x interleave was the one
# providing optimal utilization, i.e. when the subroutine's throughput
# is virtually the same as that of the non-interleaved subroutine [for
# number of input blocks up to 3]. This is why it originally made no
# sense to implement a 2x subroutine. But times change and it became
# appropriate to spend an extra 192 bytes on a 2x subroutine on account
# of Atom Silvermont. For processors that can schedule aes[enc|dec]
# every cycle the optimal interleave factor equals the corresponding
# instruction's latency. 8x is optimal for * Bridge, but it's
# infeasible to accommodate such an implementation in the XMM registers
# addressable in 32-bit mode and therefore a maximum of 6x is used
# instead...
244
# Emit the private routine _aesni_${p}rypt2, which processes the two
# blocks in $inout0 and $inout1 with interleaved aes${p} instructions.
#   $p - "enc" or "dec", selects aes${p}/aes${p}last
# The generated code scales $rounds by 16, points $key past the
# schedule and uses the negated $rounds as an index that counts up to
# zero, two round keys per loop iteration.
sub aesni_generate2
{ my $p=shift;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction
  my @blocks=($inout0,$inout1);

    &function_begin_B("_aesni_${p}rypt2");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}2_loop");
        for my $blk (@blocks) { &$aes($blk,$rndkey1); }
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        for my $blk (@blocks) { &$aes($blk,$rndkey0); }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}2_loop"));
    for my $blk (@blocks) { &$aes($blk,$rndkey1); }
    for my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
275
# Emit the private routine _aesni_${p}rypt3, which processes the three
# blocks in $inout0..$inout2 with interleaved aes${p} instructions.
#   $p - "enc" or "dec", selects aes${p}/aes${p}last
# The generated code scales $rounds by 16, points $key past the
# schedule and uses the negated $rounds as an index that counts up to
# zero, two round keys per loop iteration.
sub aesni_generate3
{ my $p=shift;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction
  my @blocks=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}3_loop");
        for my $blk (@blocks) { &$aes($blk,$rndkey1); }
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        for my $blk (@blocks) { &$aes($blk,$rndkey0); }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}3_loop"));
    for my $blk (@blocks) { &$aes($blk,$rndkey1); }
    for my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
311
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits the private routine _aesni_${p}rypt4, which processes the four
# blocks in $inout0..$inout3 with interleaved aes${p} instructions.
#   $p - "enc" or "dec", selects aes${p}/aes${p}last
sub aesni_generate4
{ my $p=shift;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction
  my @blocks=($inout0,$inout1,$inout2,$inout3);

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shl            ($rounds,4);
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &data_byte      (0x0f,0x1f,0x40,0x00);  # raw 4-byte filler
        &add            ($rounds,16);

    &set_label("${p}4_loop");
        for my $blk (@blocks) { &$aes($blk,$rndkey1); }
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        for my $blk (@blocks) { &$aes($blk,$rndkey0); }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}4_loop"));

    for my $blk (@blocks) { &$aes($blk,$rndkey1); }
    for my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
358
# Emit the private routine _aesni_${p}rypt6, which processes the six
# blocks in $inout0..$inout5 with interleaved aes${p} instructions.
#   $p - "enc" or "dec", selects aes${p}/aes${p}last
# The first round on $inout0..$inout2 is folded into the prologue, so
# the prologue jumps into the middle of the loop body at the
# _aesni_${p}rypt6_inner label. _aesni_${p}rypt6_enter is additionally
# declared as a static label.
sub aesni_generate6
{ my $p=shift;
  my $aes=\&{"aes${p}"};                # resolved once instead of
  my $aeslast=\&{"aes${p}last"};        # string-eval per instruction
  my @lower=($inout0,$inout1,$inout2);
  my @upper=($inout3,$inout4,$inout5);
  my @blocks=(@lower,@upper);

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);     # pxor does better here
        &pxor           ($inout2,$rndkey0);
        &$aes           ($inout0,$rndkey1);
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);
        &$aes           ($inout1,$rndkey1);
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &$aes           ($inout2,$rndkey1);
        &pxor           ($inout5,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0,$key,$rounds));
        &add            ($rounds,16);
        &jmp            (&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
        for my $blk (@lower) { &$aes($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_inner");
        for my $blk (@upper) { &$aes($blk,$rndkey1); }
    &set_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        for my $blk (@blocks) { &$aes($blk,$rndkey0); }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}6_loop"));

    for my $blk (@blocks) { &$aes($blk,$rndkey1); }
    for my $blk (@blocks) { &$aeslast($blk,$rndkey0); }
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Emit the 2x, 3x, 4x and 6x interleaved routines, in that order. The
# decrypt flavour is always generated, the encrypt flavour only when
# $PREFIX is "aesni".
for my $generate (\&aesni_generate2,\&aesni_generate3,
                  \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
425 \f
426 if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16; nothing is processed if
# the result is zero. A zero enc selects the decrypt path. Each path
# handles six blocks per iteration while at least 0x60 bytes remain
# and then dispatches the 1..5 remaining blocks to the matching
# _aesni_[en|de]cryptN routine. All XMM registers are cleared on exit.
{ my @blocks=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);
  my @tail=("","one","two","three","four");     # label suffix per count

  # Store the first $n data registers to consecutive 16-byte slots
  # at $out.
  my $store=sub { my $n=shift;
        for my $i (0..$n-1)
        {   &movups     (&QWP(16*$i,$out),$blocks[$i]); }
  };

  # Emit one direction of the function body.
  #   $p    - "enc" or "dec"
  #   $last - true if ecb_ret follows immediately, in which case the
  #           final jump to it is omitted
  my $ecb_half=sub { my ($p,$last)=@_;
        my $crypt="_aesni_${p}rypt";

        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &cmp    ($len,0x60);
        &jb     (&label("ecb_${p}_tail"));

        for my $i (0..5)
        {   &movdqu     ($blocks[$i],&QWP(16*$i,$inp)); }
        &lea    ($inp,&DWP(0x60,$inp));
        &sub    ($len,0x60);
        &jmp    (&label("ecb_${p}_loop6_enter"));

    &set_label("ecb_${p}_loop6",16);
        # store the previous six results while loading the next six
        for my $i (0..4)
        {   &movups     (&QWP(16*$i,$out),$blocks[$i]);
            &movdqu     ($blocks[$i],&QWP(16*$i,$inp));
        }
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
    &set_label("ecb_${p}_loop6_enter");

        &call   ("${crypt}6");

        &mov    ($key,$key_);           # restore $key
        &mov    ($rounds,$rounds_);     # restore $rounds
        &sub    ($len,0x60);
        &jnc    (&label("ecb_${p}_loop6"));

        &$store (6);
        &lea    ($out,&DWP(0x60,$out));
        &add    ($len,0x60);
        &jz     (&label("ecb_ret"));

    &set_label("ecb_${p}_tail");
        # load blocks one at a time, branching off as soon as the
        # remaining length is accounted for
        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x20);
        &jb     (&label("ecb_${p}_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_${p}_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x40);
        &jb     (&label("ecb_${p}_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &je     (&label("ecb_${p}_four"));
        &movups ($inout4,&QWP(0x40,$inp));
        &xorps  ($inout5,$inout5);      # five blocks: zero the sixth
        &call   ("${crypt}6");
        &$store (5);
        &jmp    (&label("ecb_ret"));

    &set_label("ecb_${p}_one",16);
        if ($inline)
        {   &aesni_inline_generate1($p);        }
        else
        {   &call       ("${crypt}1");          }
        &$store (1);
        &jmp    (&label("ecb_ret"));

        for my $n (2..4)
        {
    &set_label("ecb_${p}_$tail[$n]",16);
        &call   ("${crypt}$n");
        &$store ($n);
        &jmp    (&label("ecb_ret"))     unless ($last && $n==4);
        }
  };

&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));  # enc flag, until tested below
        &and    ($len,-16);
        &jz     (&label("ecb_ret"));
        &mov    ($rounds,&DWP(240,$key));
        &test   ($rounds_,$rounds_);
        &jz     (&label("ecb_decrypt"));

        &$ecb_half("enc",0);
######################################################################
&set_label("ecb_decrypt",16);
        &$ecb_half("dec",1);

&set_label("ecb_ret");
        for my $i (0..7)                        # clear register bank
        {   &pxor       ("xmm$i","xmm$i");      }
&function_end("aesni_ecb_encrypt");
}
651 \f
652 ######################################################################
653 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
654 #                         size_t blocks, const AES_KEY *key,
655 #                         const char *ivec,char *cmac);
656 #
657 # Handles only complete blocks, operates on 64-bit counter and
658 # does not update *ivec! Nor does it finalize CMAC value
659 # (see engine/eng_aesni.c for details)
660 #
661 { my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
        # arguments 0..5 in order; the ivec and cmac pointers are held
        # in $rounds_ and $rounds only until they are dereferenced
        { my $argno=0;
          for my $reg ($inp,$out,$len,$key,$rounds_,$rounds)
          {     &mov    ($reg,&wparam($argno++));       }
        }
        &mov    ($key_,"esp");
        &sub    ("esp",60);
        &and    ("esp",-16);                    # align stack
        &mov    (&DWP(48,"esp"),$key_);         # keep the original esp

        &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
        &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
        &mov    ($rounds,&DWP(240,$key));

        # compose byte-swap control mask for pshufb on stack
        { my $off=0;
          for my $dword (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)
          {     &mov    (&DWP($off,"esp"),$dword);
                $off+=4;
          }
        }

        # compose counter increment vector on stack
        &mov    ($rounds_,1);
        &xor    ($key_,$key_);
        &mov    (&DWP(16,"esp"),$rounds_);
        for my $off (20,24,28)
        {       &mov    (&DWP($off,"esp"),$key_);       }

        &shl    ($rounds,4);
        &mov    ($rounds_,16);
        &lea    ($key_,&DWP(0,$key));
        &movdqa ($inout3,&QWP(0,"esp"));        # byte-swap mask
        &movdqa ($inout0,$ivec);
        &lea    ($key,&DWP(32,$key,$rounds));
        &sub    ($rounds_,$rounds);
        &pshufb ($ivec,$inout3);

&set_label("ccm64_enc_outer");
        &$movekey       ($rndkey0,&QWP(0,$key_));
        &mov            ($rounds,$rounds_);
        &movups         ($in0,&QWP(0,$inp));

        &xorps          ($inout0,$rndkey0);
        &$movekey       ($rndkey1,&QWP(16,$key_));
        &xorps          ($rndkey0,$in0);
        &xorps          ($cmac,$rndkey0);               # cmac^=inp
        &$movekey       ($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
        # the counter block and the cmac are encrypted side by side
        for my $blk ($inout0,$cmac)
        {       &aesenc ($blk,$rndkey1);        }
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        for my $blk ($inout0,$cmac)
        {       &aesenc ($blk,$rndkey0);        }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("ccm64_enc2_loop"));
        for my $blk ($inout0,$cmac)
        {       &aesenc ($blk,$rndkey1);        }
        &paddq          ($ivec,&QWP(16,"esp"));
        &dec            ($len);
        for my $blk ($inout0,$cmac)
        {       &aesenclast     ($blk,$rndkey0);        }

        &lea    ($inp,&DWP(16,$inp));
        &xorps  ($in0,$inout0);                 # inp^=E(ivec)
        &movdqa ($inout0,$ivec);
        &movups (&QWP(0,$out),$in0);            # save output
        &pshufb ($inout0,$inout3);
        &lea    ($out,&DWP(16,$out));
        &jnz    (&label("ccm64_enc_outer"));

        &mov    ("esp",&DWP(48,"esp"));         # restore the original esp
        &mov    ($out,&wparam(5));
        &movups (&QWP(0,$out),$cmac);           # write cmac back

        for my $i (0..7)                        # clear register bank
        {       &pxor   ("xmm$i","xmm$i");      }
&function_end("aesni_ccm64_encrypt_blocks");
749
# Emit aesni_ccm64_decrypt_blocks.
#
# Arguments, as consumed below:
#   wparam(0)  input pointer
#   wparam(1)  output pointer
#   wparam(2)  number of 16-byte blocks (decremented once per block)
#   wparam(3)  key schedule; the round count is read from offset 240
#   wparam(4)  ivec (only read)
#   wparam(5)  cmac (read on entry, written back on exit)
#
# Stack frame (60 bytes reserved, then aligned down to 16):
#   0..15   pshufb control mask that reverses all 16 bytes
#   16..31  counter increment vector {1,0,0,0} (dword order)
#   48      caller's %esp
#
# NOTE(review): the first block is processed before $len is tested, so a
# block count of 0 is not handled here -- confirm callers never pass 0.
750 &function_begin("aesni_ccm64_decrypt_blocks");
751         &mov    ($inp,&wparam(0));
752         &mov    ($out,&wparam(1));
753         &mov    ($len,&wparam(2));
754         &mov    ($key,&wparam(3));
755         &mov    ($rounds_,&wparam(4));
756         &mov    ($rounds,&wparam(5));
            # $rounds_/$rounds temporarily carry the ivec/cmac pointers
757         &mov    ($key_,"esp");
758         &sub    ("esp",60);
759         &and    ("esp",-16);                    # align stack
760         &mov    (&DWP(48,"esp"),$key_);
761
762         &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
763         &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
764         &mov    ($rounds,&DWP(240,$key));
765
766         # compose byte-swap control mask for pshufb on stack
            # (little-endian bytes 0f,0e,...,00: a full 16-byte reversal)
767         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
768         &mov    (&DWP(4,"esp"),0x08090a0b);
769         &mov    (&DWP(8,"esp"),0x04050607);
770         &mov    (&DWP(12,"esp"),0x00010203);
771
772         # compose counter increment vector on stack
773         &mov    ($rounds_,1);
774         &xor    ($key_,$key_);
775         &mov    (&DWP(16,"esp"),$rounds_);
776         &mov    (&DWP(20,"esp"),$key_);
777         &mov    (&DWP(24,"esp"),$key_);
778         &mov    (&DWP(28,"esp"),$key_);
779
780         &movdqa ($inout3,&QWP(0,"esp"));        # bswap mask
781         &movdqa ($inout0,$ivec);
            # $inout0 keeps ivec in its original byte order for encryption
782
783         &mov    ($key_,$key);
784         &mov    ($rounds_,$rounds);
785
            # $ivec is kept byte-reversed from here on so that paddq on the
            # low qword advances the counter
786         &pshufb ($ivec,$inout3);
            # encrypt the first counter block; presumably the helper works
            # on $inout0 with $key/$rounds -- it is defined outside this section
787         if ($inline)
788         {   &aesni_inline_generate1("enc");     }
789         else
790         {   &call       ("_aesni_encrypt1");    }
            # set up the "twisted" round counter used by ccm64_dec2_loop:
            #   $rounds = 16 - 16*rounds   (negative byte offset)
            #   $key    = $key_ + 32 + 16*rounds
            # so that ($key,$rounds) walks the schedule as $rounds climbs to 0
791         &shl    ($rounds_,4);
792         &mov    ($rounds,16);
793         &movups ($in0,&QWP(0,$inp));            # load inp
794         &paddq  ($ivec,&QWP(16,"esp"));
795         &lea    ($inp,&QWP(16,$inp));
796         &sub    ($rounds,$rounds_);
797         &lea    ($key,&DWP(32,$key_,$rounds_));
798         &mov    ($rounds_,$rounds);
            # $rounds_ now backs up the twisted round counter
799         &jmp    (&label("ccm64_dec_outer"));
800
801 &set_label("ccm64_dec_outer",16);
802         &xorps  ($in0,$inout0);                 # inp ^= E(ivec)
803         &movdqa ($inout0,$ivec);
804         &movups (&QWP(0,$out),$in0);            # save output
805         &lea    ($out,&DWP(16,$out));
            # swap the next counter back to its original byte order
806         &pshufb ($inout0,$inout3);
807
808         &sub    ($len,1);
809         &jz     (&label("ccm64_dec_break"));
810
            # more blocks follow: run two AES streams in parallel, the next
            # counter block in $inout0 and the running MAC in $cmac
811         &$movekey       ($rndkey0,&QWP(0,$key_));
812         &mov            ($rounds,$rounds_);
813         &$movekey       ($rndkey1,&QWP(16,$key_));
            # $in0 (the block just written) is xored with key[0] first, so
            # the xor into $cmac also applies the initial round key
814         &xorps          ($in0,$rndkey0);
815         &xorps          ($inout0,$rndkey0);
816         &xorps          ($cmac,$in0);           # cmac^=out
817         &$movekey       ($rndkey0,&QWP(32,$key_));
818
819 &set_label("ccm64_dec2_loop");
820         &aesenc         ($inout0,$rndkey1);
821         &aesenc         ($cmac,$rndkey1);
822         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
823         &add            ($rounds,32);
824         &aesenc         ($inout0,$rndkey0);
825         &aesenc         ($cmac,$rndkey0);
826         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
            # the branch consumes ZF from the add above; the instructions in
            # between are SSE/AES operations that leave EFLAGS alone
827         &jnz            (&label("ccm64_dec2_loop"));
828         &movups         ($in0,&QWP(0,$inp));    # load inp
829         &paddq          ($ivec,&QWP(16,"esp"));
830         &aesenc         ($inout0,$rndkey1);
831         &aesenc         ($cmac,$rndkey1);
832         &aesenclast     ($inout0,$rndkey0);
833         &aesenclast     ($cmac,$rndkey0);
834         &lea            ($inp,&QWP(16,$inp));
835         &jmp    (&label("ccm64_dec_outer"));
836
837 &set_label("ccm64_dec_break",16);
            # last block: restore the plain round count and key pointer, then
            # fold the final output block ($in0) into $cmac
838         &mov    ($rounds,&DWP(240,$key_));
839         &mov    ($key,$key_);
            # presumably the inline form xors $in0 into $cmac and encrypts it.
            # NOTE(review): the non-inline branch only passes $cmac as an extra
            # argument to &call; whether "_aesni_encrypt1" then operates on
            # $cmac cannot be confirmed from this section -- verify.
840         if ($inline)
841         {   &aesni_inline_generate1("enc",$cmac,$in0);  }
842         else
843         {   &call       ("_aesni_encrypt1",$cmac);      }
844
            # restore caller's %esp, then store the updated MAC via wparam(5)
845         &mov    ("esp",&DWP(48,"esp"));
846         &mov    ($out,&wparam(5));
847         &movups (&QWP(0,$out),$cmac);
848
849         &pxor   ("xmm0","xmm0");                # clear register bank
850         &pxor   ("xmm1","xmm1");
851         &pxor   ("xmm2","xmm2");
852         &pxor   ("xmm3","xmm3");
853         &pxor   ("xmm4","xmm4");
854         &pxor   ("xmm5","xmm5");
855         &pxor   ("xmm6","xmm6");
856         &pxor   ("xmm7","xmm7");
857 &function_end("aesni_ccm64_decrypt_blocks");
858 }
859 \f
860 ######################################################################
861 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
862 #                         size_t blocks, const AES_KEY *key,
863 #                         const char *ivec);
864 #
865 # Handles only complete blocks, operates on 32-bit counter and
866 # does not update *ivec! (see crypto/modes/ctr128.c for details)
867 #
868 # stack layout:
869 #       0       pshufb mask
870 #       16      vector addend: 0,6,6,6
871 #       32      counter-less ivec
872 #       48      1st triplet of counter vector
873 #       64      2nd triplet of counter vector
874 #       80      saved %esp
875
# Emit aesni_ctr32_encrypt_blocks.
#
# Arguments, as consumed below: wparam(0) input, wparam(1) output,
# wparam(2) block count, wparam(3) key schedule, wparam(4) ivec.
# 88 bytes of scratch are reserved and aligned down to 16; the caller's
# %esp is kept at offset 80 and restored at ctr32_ret.
#
# Flow: a single block takes ctr32_one_shortcut; otherwise counters are
# generated six at a time (ctr32_loop6) and the remaining 1..5 blocks
# are dispatched from ctr32_tail.
#
# NOTE(review): a block count of 0 fails the ==1 test, takes the "below 6"
# and "below 2" branches and ends up in ctr32_one, i.e. one block is
# processed -- confirm callers never pass 0.
876 &function_begin("aesni_ctr32_encrypt_blocks");
877         &mov    ($inp,&wparam(0));
878         &mov    ($out,&wparam(1));
879         &mov    ($len,&wparam(2));
880         &mov    ($key,&wparam(3));
881         &mov    ($rounds_,&wparam(4));
            # $rounds_ temporarily carries the ivec pointer
882         &mov    ($key_,"esp");
883         &sub    ("esp",88);
884         &and    ("esp",-16);                    # align stack
885         &mov    (&DWP(80,"esp"),$key_);
886
887         &cmp    ($len,1);
888         &je     (&label("ctr32_one_shortcut"));
889
890         &movdqu ($inout5,&QWP(0,$rounds_));     # load ivec
891
892         # compose byte-swap control mask for pshufb on stack
            # (little-endian bytes 0f,0e,...,00: a full 16-byte reversal)
893         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
894         &mov    (&DWP(4,"esp"),0x08090a0b);
895         &mov    (&DWP(8,"esp"),0x04050607);
896         &mov    (&DWP(12,"esp"),0x00010203);
897
898         # compose counter increment vector on stack
            # (dwords 0..2 = 6, dword 3 = 0: each triplet advances by 6 per pass)
899         &mov    ($rounds,6);
900         &xor    ($key_,$key_);
901         &mov    (&DWP(16,"esp"),$rounds);
902         &mov    (&DWP(20,"esp"),$rounds);
903         &mov    (&DWP(24,"esp"),$rounds);
904         &mov    (&DWP(28,"esp"),$key_);
905
906         &pextrd ($rounds_,$inout5,3);           # pull 32-bit counter
907         &pinsrd ($inout5,$key_,3);              # wipe 32-bit counter
908
909         &mov    ($rounds,&DWP(240,$key));       # key->rounds
910
911         # compose 2 vectors of 3x32-bit counters
            # $rndkey0 = {ctr, ctr+1, ctr+2, 0}, $rndkey1 = {ctr+3, ctr+4, ctr+5, 0}
            # in host byte order (hence the bswap of the extracted counter)
912         &bswap  ($rounds_);
913         &pxor   ($rndkey0,$rndkey0);
914         &pxor   ($rndkey1,$rndkey1);
915         &movdqa ($inout0,&QWP(0,"esp"));        # load byte-swap mask
916         &pinsrd ($rndkey0,$rounds_,0);
917         &lea    ($key_,&DWP(3,$rounds_));
918         &pinsrd ($rndkey1,$key_,0);
919         &inc    ($rounds_);
920         &pinsrd ($rndkey0,$rounds_,1);
921         &inc    ($key_);
922         &pinsrd ($rndkey1,$key_,1);
923         &inc    ($rounds_);
924         &pinsrd ($rndkey0,$rounds_,2);
925         &inc    ($key_);
926         &pinsrd ($rndkey1,$key_,2);
927         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
928         &pshufb ($rndkey0,$inout0);             # byte swap
929         &movdqu ($inout4,&QWP(0,$key));         # key[0]
930         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
931         &pshufb ($rndkey1,$inout0);             # byte swap
932
            # after the 16-byte reversal the first counter of a triplet sits
            # in dword 3, the second in dword 2, the third in dword 1 and
            # dword 0 is zero; pshufd with k<<6 copies dword k into the top
            # dword and dword 0 (zero) into the other three
933         &pshufd ($inout0,$rndkey0,3<<6);        # place counter to upper dword
934         &pshufd ($inout1,$rndkey0,2<<6);
935         &cmp    ($len,6);
936         &jb     (&label("ctr32_tail"));
937         &pxor   ($inout5,$inout4);              # counter-less ivec^key[0]
            # twisted round counter for the 6x loop:
            #   $rounds_ = 16 - 16*rounds, $key = $key + 32 + 16*rounds
938         &shl    ($rounds,4);
939         &mov    ($rounds_,16);
940         &movdqa (&QWP(32,"esp"),$inout5);       # save counter-less ivec^key[0]
941         &mov    ($key_,$key);                   # backup $key
942         &sub    ($rounds_,$rounds);             # backup twisted $rounds
943         &lea    ($key,&DWP(32,$key,$rounds));
            # bias $len by -6 so the loop can branch on the borrow of "sub 6"
944         &sub    ($len,6);
945         &jmp    (&label("ctr32_loop6"));
946
947 &set_label("ctr32_loop6",16);
948         # inlining _aesni_encrypt6's prologue gives ~6% improvement...
949         &pshufd ($inout2,$rndkey0,1<<6);
950         &movdqa ($rndkey0,&QWP(32,"esp"));      # pull counter-less ivec
951         &pshufd ($inout3,$rndkey1,3<<6);
952         &pxor           ($inout0,$rndkey0);     # merge counter-less ivec
953         &pshufd ($inout4,$rndkey1,2<<6);
954         &pxor           ($inout1,$rndkey0);
955         &pshufd ($inout5,$rndkey1,1<<6);
956         &$movekey       ($rndkey1,&QWP(16,$key_));
957         &pxor           ($inout2,$rndkey0);
958         &pxor           ($inout3,$rndkey0);
959         &aesenc         ($inout0,$rndkey1);
960         &pxor           ($inout4,$rndkey0);
961         &pxor           ($inout5,$rndkey0);
962         &aesenc         ($inout1,$rndkey1);
963         &$movekey       ($rndkey0,&QWP(32,$key_));
964         &mov            ($rounds,$rounds_);
965         &aesenc         ($inout2,$rndkey1);
966         &aesenc         ($inout3,$rndkey1);
967         &aesenc         ($inout4,$rndkey1);
968         &aesenc         ($inout5,$rndkey1);
969
            # the first round was issued above; presumably this entry point
            # finishes the remaining rounds for all six blocks -- it is
            # defined outside this section
970         &call           (&label("_aesni_encrypt6_enter"));
971
            # xor the key stream into the input; counter maintenance for the
            # next pass is interleaved with the loads and stores
972         &movups ($rndkey1,&QWP(0,$inp));
973         &movups ($rndkey0,&QWP(0x10,$inp));
974         &xorps  ($inout0,$rndkey1);
975         &movups ($rndkey1,&QWP(0x20,$inp));
976         &xorps  ($inout1,$rndkey0);
977         &movups (&QWP(0,$out),$inout0);
978         &movdqa ($rndkey0,&QWP(16,"esp"));      # load increment
979         &xorps  ($inout2,$rndkey1);
980         &movdqa ($rndkey1,&QWP(64,"esp"));      # load 2nd triplet
981         &movups (&QWP(0x10,$out),$inout1);
982         &movups (&QWP(0x20,$out),$inout2);
983
984         &paddd  ($rndkey1,$rndkey0);            # 2nd triplet increment
985         &paddd  ($rndkey0,&QWP(48,"esp"));      # 1st triplet increment
986         &movdqa ($inout0,&QWP(0,"esp"));        # load byte swap mask
987
988         &movups ($inout1,&QWP(0x30,$inp));
989         &movups ($inout2,&QWP(0x40,$inp));
990         &xorps  ($inout3,$inout1);
991         &movups ($inout1,&QWP(0x50,$inp));
992         &lea    ($inp,&DWP(0x60,$inp));
993         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
994         &pshufb ($rndkey0,$inout0);             # byte swap
995         &xorps  ($inout4,$inout2);
996         &movups (&QWP(0x30,$out),$inout3);
997         &xorps  ($inout5,$inout1);
998         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
999         &pshufb ($rndkey1,$inout0);             # byte swap
1000         &movups (&QWP(0x40,$out),$inout4);
1001         &pshufd ($inout0,$rndkey0,3<<6);
1002         &movups (&QWP(0x50,$out),$inout5);
1003         &lea    ($out,&DWP(0x60,$out));
1004
1005         &pshufd ($inout1,$rndkey0,2<<6);
1006         &sub    ($len,6);
1007         &jnc    (&label("ctr32_loop6"));
1008
             # undo the -6 bias; zero means the input was a multiple of 6 blocks
1009         &add    ($len,6);
1010         &jz     (&label("ctr32_ret"));
             # slot 32 holds ivec^key[0]; xoring key[0] back in recovers the
             # plain counter-less ivec expected by the tail
1011         &movdqu ($inout5,&QWP(0,$key_));
1012         &mov    ($key,$key_);
1013         &pxor   ($inout5,&QWP(32,"esp"));       # restore count-less ivec
1014         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1015
1016 &set_label("ctr32_tail");
             # 1..5 blocks left. $inout0/$inout1 already hold the first two
             # counters; each further block is built only when needed. Every
             # je below reuses the flags of the preceding cmp -- pshufd and
             # por do not modify EFLAGS.
1017         &por    ($inout0,$inout5);
1018         &cmp    ($len,2);
1019         &jb     (&label("ctr32_one"));
1020
1021         &pshufd ($inout2,$rndkey0,1<<6);
1022         &por    ($inout1,$inout5);
1023         &je     (&label("ctr32_two"));
1024
1025         &pshufd ($inout3,$rndkey1,3<<6);
1026         &por    ($inout2,$inout5);
1027         &cmp    ($len,4);
1028         &jb     (&label("ctr32_three"));
1029
1030         &pshufd ($inout4,$rndkey1,2<<6);
1031         &por    ($inout3,$inout5);
1032         &je     (&label("ctr32_four"));
1033
             # five blocks: $inout5 still holds the counter-less ivec (dword 3
             # wiped above) when the 6-block routine is called, and only five
             # results are used
1034         &por    ($inout4,$inout5);
1035         &call   ("_aesni_encrypt6");
1036         &movups ($rndkey1,&QWP(0,$inp));
1037         &movups ($rndkey0,&QWP(0x10,$inp));
1038         &xorps  ($inout0,$rndkey1);
1039         &movups ($rndkey1,&QWP(0x20,$inp));
1040         &xorps  ($inout1,$rndkey0);
1041         &movups ($rndkey0,&QWP(0x30,$inp));
1042         &xorps  ($inout2,$rndkey1);
1043         &movups ($rndkey1,&QWP(0x40,$inp));
1044         &xorps  ($inout3,$rndkey0);
1045         &movups (&QWP(0,$out),$inout0);
1046         &xorps  ($inout4,$rndkey1);
1047         &movups (&QWP(0x10,$out),$inout1);
1048         &movups (&QWP(0x20,$out),$inout2);
1049         &movups (&QWP(0x30,$out),$inout3);
1050         &movups (&QWP(0x40,$out),$inout4);
1051         &jmp    (&label("ctr32_ret"));
1052
1053 &set_label("ctr32_one_shortcut",16);
             # exactly one block: the ivec is used as the counter block as-is,
             # no mask or counter vectors are built
1054         &movups ($inout0,&QWP(0,$rounds_));     # load ivec
1055         &mov    ($rounds,&DWP(240,$key));
1056
1057 &set_label("ctr32_one");
1058         if ($inline)
1059         {   &aesni_inline_generate1("enc");     }
1060         else
1061         {   &call       ("_aesni_encrypt1");    }
1062         &movups ($in0,&QWP(0,$inp));
1063         &xorps  ($in0,$inout0);
1064         &movups (&QWP(0,$out),$in0);
1065         &jmp    (&label("ctr32_ret"));
1066
1067 &set_label("ctr32_two",16);
1068         &call   ("_aesni_encrypt2");
1069         &movups ($inout3,&QWP(0,$inp));
1070         &movups ($inout4,&QWP(0x10,$inp));
1071         &xorps  ($inout0,$inout3);
1072         &xorps  ($inout1,$inout4);
1073         &movups (&QWP(0,$out),$inout0);
1074         &movups (&QWP(0x10,$out),$inout1);
1075         &jmp    (&label("ctr32_ret"));
1076
1077 &set_label("ctr32_three",16);
1078         &call   ("_aesni_encrypt3");
1079         &movups ($inout3,&QWP(0,$inp));
1080         &movups ($inout4,&QWP(0x10,$inp));
1081         &xorps  ($inout0,$inout3);
1082         &movups ($inout5,&QWP(0x20,$inp));
1083         &xorps  ($inout1,$inout4);
1084         &movups (&QWP(0,$out),$inout0);
1085         &xorps  ($inout2,$inout5);
1086         &movups (&QWP(0x10,$out),$inout1);
1087         &movups (&QWP(0x20,$out),$inout2);
1088         &jmp    (&label("ctr32_ret"));
1089
1090 &set_label("ctr32_four",16);
1091         &call   ("_aesni_encrypt4");
1092         &movups ($inout4,&QWP(0,$inp));
1093         &movups ($inout5,&QWP(0x10,$inp));
1094         &movups ($rndkey1,&QWP(0x20,$inp));
1095         &xorps  ($inout0,$inout4);
1096         &movups ($rndkey0,&QWP(0x30,$inp));
1097         &xorps  ($inout1,$inout5);
1098         &movups (&QWP(0,$out),$inout0);
1099         &xorps  ($inout2,$rndkey1);
1100         &movups (&QWP(0x10,$out),$inout1);
1101         &xorps  ($inout3,$rndkey0);
1102         &movups (&QWP(0x20,$out),$inout2);
1103         &movups (&QWP(0x30,$out),$inout3);
1104
1105 &set_label("ctr32_ret");
             # zero all xmm registers and the ivec/counter stack slots
             # (offsets 32, 48, 64) before restoring the caller's %esp
1106         &pxor   ("xmm0","xmm0");                # clear register bank
1107         &pxor   ("xmm1","xmm1");
1108         &pxor   ("xmm2","xmm2");
1109         &pxor   ("xmm3","xmm3");
1110         &pxor   ("xmm4","xmm4");
1111         &movdqa (&QWP(32,"esp"),"xmm0");        # clear stack
1112         &pxor   ("xmm5","xmm5");
1113         &movdqa (&QWP(48,"esp"),"xmm0");
1114         &pxor   ("xmm6","xmm6");
1115         &movdqa (&QWP(64,"esp"),"xmm0");
1116         &pxor   ("xmm7","xmm7");
1117         &mov    ("esp",&DWP(80,"esp"));
1118 &function_end("aesni_ctr32_encrypt_blocks");
1119 \f
1120 ######################################################################
1121 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1122 #       const AES_KEY *key1, const AES_KEY *key2,
1123 #       const unsigned char iv[16]);
1124 #
1125 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1126
# Emit aesni_xts_encrypt.
#
# Arguments, as consumed below: wparam(0) input, wparam(1) output,
# wparam(2) length in bytes, wparam(3) key1, wparam(4) key2,
# wparam(5) 16-byte clear-text tweak.
#
# Stack frame (16*7+8 bytes reserved, then aligned down to 16):
#   16*0 .. 16*5   per-block tweaks for the current group
#   16*6           magic constant, dwords {0x87, 0, 1, 0}
#   16*7+0         original $len (later reused for $len%16)
#   16*7+4         caller's %esp
#
# Tweak update idiom used throughout (one doubling of the 128-bit tweak):
#   pcmpgtd $twtmp,$tweak   with $twtmp==0 sets each dword to all-ones
#                           where the matching tweak dword has bit 31 set
#   pshufd  ...,0x13        moves dword 3 to dword 0 and dword 1 to dword 2
#   pand    ...,$twmask     keeps 0x87 (when bit 127 was set) and 1 (carry
#                           from bit 63 into bit 64)
#   paddq   $tweak,$tweak   shifts both 64-bit halves left by one
#   pxor                    merges carry and reduction into the new tweak
#
# Register aliases are declared with the enclosing "my": $tweak=$rndkey1,
# $twtmp=$rndkey0, $twres=$inout0, $twmask=$inout1.
1127 &function_begin("aesni_xts_encrypt");
1128         &mov    ($key,&wparam(4));              # key2
1129         &mov    ($inp,&wparam(5));              # clear-text tweak
1130
1131         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1132         &movups ($inout0,&QWP(0,$inp));
             # the initial tweak is the clear-text tweak run through key2;
             # presumably the helper encrypts $inout0 in place -- it is
             # defined outside this section
1133         if ($inline)
1134         {   &aesni_inline_generate1("enc");     }
1135         else
1136         {   &call       ("_aesni_encrypt1");    }
1137
1138         &mov    ($inp,&wparam(0));
1139         &mov    ($out,&wparam(1));
1140         &mov    ($len,&wparam(2));
1141         &mov    ($key,&wparam(3));              # key1
1142
1143         &mov    ($key_,"esp");
1144         &sub    ("esp",16*7+8);
1145         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1146         &and    ("esp",-16);                    # align stack
1147
1148         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1149         &mov    (&DWP(16*6+4,"esp"),0);
1150         &mov    (&DWP(16*6+8,"esp"),1);
1151         &mov    (&DWP(16*6+12,"esp"),0);
1152         &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
1153         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1154
1155         &movdqa ($tweak,$inout0);
1156         &pxor   ($twtmp,$twtmp);
1157         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1158         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1159
             # work on whole blocks only; the len%16 remainder is handled by
             # the stealing code at the end. The -96 bias lets the loop branch
             # on the borrow of "sub 96".
1160         &and    ($len,-16);
1161         &mov    ($key_,$key);                   # backup $key
1162         &mov    ($rounds_,$rounds);             # backup $rounds
1163         &sub    ($len,16*6);
1164         &jc     (&label("xts_enc_short"));
1165
             # twisted round counter for the 6x loop:
             #   $rounds_ = 16 - 16*rounds, $key = $key + 32 + 16*rounds
1166         &shl    ($rounds,4);
1167         &mov    ($rounds_,16);
1168         &sub    ($rounds_,$rounds);
1169         &lea    ($key,&DWP(32,$key,$rounds));
1170         &jmp    (&label("xts_enc_loop6"));
1171
1172 &set_label("xts_enc_loop6",16);
             # generator-time loop: emits four copies of "store the current
             # tweak in slot $i, then double it"
1173         for ($i=0;$i<4;$i++) {
1174             &pshufd     ($twres,$twtmp,0x13);
1175             &pxor       ($twtmp,$twtmp);
1176             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1177             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1178             &pand       ($twres,$twmask);       # isolate carry and residue
1179             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1180             &pxor       ($tweak,$twres);
1181         }
             # $i is 4 here: the fifth tweak goes to slot 4 (post-increment
             # leaves $i at 5) and the sixth tweak is formed in $inout5
1182         &pshufd ($inout5,$twtmp,0x13);
1183         &movdqa (&QWP(16*$i++,"esp"),$tweak);
1184         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1185          &$movekey      ($rndkey0,&QWP(0,$key_));
1186         &pand   ($inout5,$twmask);              # isolate carry and residue
             # loading $inout0 overwrites $twres, which is no longer needed
1187          &movups        ($inout0,&QWP(0,$inp)); # load input
1188         &pxor   ($inout5,$tweak);
1189
1190         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1191         &mov    ($rounds,$rounds_);             # restore $rounds
             # loading $inout1 overwrites $twmask; it is reloaded from the
             # stack after the six blocks have been written
1192         &movdqu ($inout1,&QWP(16*1,$inp));
1193          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1194         &movdqu ($inout2,&QWP(16*2,$inp));
1195          &pxor          ($inout1,$rndkey0);
1196         &movdqu ($inout3,&QWP(16*3,$inp));
1197          &pxor          ($inout2,$rndkey0);
1198         &movdqu ($inout4,&QWP(16*4,$inp));
1199          &pxor          ($inout3,$rndkey0);
             # $rndkey1 aliases $tweak; the sixth tweak already lives in
             # $inout5, so the register can carry the sixth input block
1200         &movdqu ($rndkey1,&QWP(16*5,$inp));
1201          &pxor          ($inout4,$rndkey0);
1202         &lea    ($inp,&DWP(16*6,$inp));
1203         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1204         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1205         &pxor   ($inout5,$rndkey1);
1206
1207          &$movekey      ($rndkey1,&QWP(16,$key_));
1208         &pxor   ($inout1,&QWP(16*1,"esp"));
1209         &pxor   ($inout2,&QWP(16*2,"esp"));
1210          &aesenc        ($inout0,$rndkey1);
1211         &pxor   ($inout3,&QWP(16*3,"esp"));
1212         &pxor   ($inout4,&QWP(16*4,"esp"));
1213          &aesenc        ($inout1,$rndkey1);
1214         &pxor           ($inout5,$rndkey0);
1215          &$movekey      ($rndkey0,&QWP(32,$key_));
1216          &aesenc        ($inout2,$rndkey1);
1217          &aesenc        ($inout3,$rndkey1);
1218          &aesenc        ($inout4,$rndkey1);
1219          &aesenc        ($inout5,$rndkey1);
             # the first round was issued above; presumably this entry point
             # finishes the remaining rounds -- defined outside this section
1220         &call           (&label("_aesni_encrypt6_enter"));
1221
             # xor the tweaks back in and store; the start of the next tweak
             # doubling is interleaved with the stores
1222         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1223        &pxor    ($twtmp,$twtmp);
1224         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1225        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1226         &xorps  ($inout1,&QWP(16*1,"esp"));
1227         &movups (&QWP(16*0,$out),$inout0);      # write output
1228         &xorps  ($inout2,&QWP(16*2,"esp"));
1229         &movups (&QWP(16*1,$out),$inout1);
1230         &xorps  ($inout3,&QWP(16*3,"esp"));
1231         &movups (&QWP(16*2,$out),$inout2);
1232         &xorps  ($inout4,&QWP(16*4,"esp"));
1233         &movups (&QWP(16*3,$out),$inout3);
1234         &xorps  ($inout5,$tweak);
1235         &movups (&QWP(16*4,$out),$inout4);
1236        &pshufd  ($twres,$twtmp,0x13);
1237         &movups (&QWP(16*5,$out),$inout5);
1238         &lea    ($out,&DWP(16*6,$out));
1239        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1240
1241         &pxor   ($twtmp,$twtmp);
1242         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1243         &pand   ($twres,$twmask);               # isolate carry and residue
1244         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1245         &pxor   ($tweak,$twres);
1246
1247         &sub    ($len,16*6);
1248         &jnc    (&label("xts_enc_loop6"));
1249
1250         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1251         &mov    ($key,$key_);                   # restore $key
1252         &mov    ($rounds_,$rounds);
1253
1254 &set_label("xts_enc_short");
             # undo the -96 bias: $len is now 0..80 bytes of whole blocks
1255         &add    ($len,16*6);
1256         &jz     (&label("xts_enc_done6x"));
1257
             # tweaks for up to four blocks are parked in $inout3, $inout4 and
             # $inout5 as they are produced; every je below reuses the flags of
             # the preceding cmp, which the SSE instructions do not modify
1258         &movdqa ($inout3,$tweak);               # put aside previous tweak
1259         &cmp    ($len,0x20);
1260         &jb     (&label("xts_enc_one"));
1261
1262         &pshufd ($twres,$twtmp,0x13);
1263         &pxor   ($twtmp,$twtmp);
1264         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1265         &pand   ($twres,$twmask);               # isolate carry and residue
1266         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1267         &pxor   ($tweak,$twres);
1268         &je     (&label("xts_enc_two"));
1269
1270         &pshufd ($twres,$twtmp,0x13);
1271         &pxor   ($twtmp,$twtmp);
1272         &movdqa ($inout4,$tweak);               # put aside previous tweak
1273         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1274         &pand   ($twres,$twmask);               # isolate carry and residue
1275         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1276         &pxor   ($tweak,$twres);
1277         &cmp    ($len,0x40);
1278         &jb     (&label("xts_enc_three"));
1279
1280         &pshufd ($twres,$twtmp,0x13);
1281         &pxor   ($twtmp,$twtmp);
1282         &movdqa ($inout5,$tweak);               # put aside previous tweak
1283         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1284         &pand   ($twres,$twmask);               # isolate carry and residue
1285         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1286         &pxor   ($tweak,$twres);
1287         &movdqa (&QWP(16*0,"esp"),$inout3);
1288         &movdqa (&QWP(16*1,"esp"),$inout4);
1289         &je     (&label("xts_enc_four"));
1290
             # five blocks: tweaks 0..3 go to stack slots 0..3, the fifth is
             # formed in $inout5 and saved in slot 4
1291         &movdqa (&QWP(16*2,"esp"),$inout5);
1292         &pshufd ($inout5,$twtmp,0x13);
1293         &movdqa (&QWP(16*3,"esp"),$tweak);
1294         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1295         &pand   ($inout5,$twmask);              # isolate carry and residue
1296         &pxor   ($inout5,$tweak);
1297
1298         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1299         &movdqu ($inout1,&QWP(16*1,$inp));
1300         &movdqu ($inout2,&QWP(16*2,$inp));
1301         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1302         &movdqu ($inout3,&QWP(16*3,$inp));
1303         &pxor   ($inout1,&QWP(16*1,"esp"));
1304         &movdqu ($inout4,&QWP(16*4,$inp));
1305         &pxor   ($inout2,&QWP(16*2,"esp"));
1306         &lea    ($inp,&DWP(16*5,$inp));
1307         &pxor   ($inout3,&QWP(16*3,"esp"));
1308         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1309         &pxor   ($inout4,$inout5);
1310
             # the 6-block routine is used for five blocks; only $inout0..4
             # are stored afterwards
1311         &call   ("_aesni_encrypt6");
1312
1313         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1314         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1315         &xorps  ($inout1,&QWP(16*1,"esp"));
1316         &xorps  ($inout2,&QWP(16*2,"esp"));
1317         &movups (&QWP(16*0,$out),$inout0);      # write output
1318         &xorps  ($inout3,&QWP(16*3,"esp"));
1319         &movups (&QWP(16*1,$out),$inout1);
1320         &xorps  ($inout4,$tweak);
1321         &movups (&QWP(16*2,$out),$inout2);
1322         &movups (&QWP(16*3,$out),$inout3);
1323         &movups (&QWP(16*4,$out),$inout4);
1324         &lea    ($out,&DWP(16*5,$out));
1325         &jmp    (&label("xts_enc_done"));
1326
1327 &set_label("xts_enc_one",16);
1328         &movups ($inout0,&QWP(16*0,$inp));      # load input
1329         &lea    ($inp,&DWP(16*1,$inp));
1330         &xorps  ($inout0,$inout3);              # input^=tweak
1331         if ($inline)
1332         {   &aesni_inline_generate1("enc");     }
1333         else
1334         {   &call       ("_aesni_encrypt1");    }
1335         &xorps  ($inout0,$inout3);              # output^=tweak
1336         &movups (&QWP(16*0,$out),$inout0);      # write output
1337         &lea    ($out,&DWP(16*1,$out));
1338
1339         &movdqa ($tweak,$inout3);               # last tweak
1340         &jmp    (&label("xts_enc_done"));
1341
1342 &set_label("xts_enc_two",16);
1343         &movaps ($inout4,$tweak);               # put aside last tweak
1344
1345         &movups ($inout0,&QWP(16*0,$inp));      # load input
1346         &movups ($inout1,&QWP(16*1,$inp));
1347         &lea    ($inp,&DWP(16*2,$inp));
1348         &xorps  ($inout0,$inout3);              # input^=tweak
1349         &xorps  ($inout1,$inout4);
1350
1351         &call   ("_aesni_encrypt2");
1352
1353         &xorps  ($inout0,$inout3);              # output^=tweak
1354         &xorps  ($inout1,$inout4);
1355         &movups (&QWP(16*0,$out),$inout0);      # write output
1356         &movups (&QWP(16*1,$out),$inout1);
1357         &lea    ($out,&DWP(16*2,$out));
1358
1359         &movdqa ($tweak,$inout4);               # last tweak
1360         &jmp    (&label("xts_enc_done"));
1361
1362 &set_label("xts_enc_three",16);
1363         &movaps ($inout5,$tweak);               # put aside last tweak
1364         &movups ($inout0,&QWP(16*0,$inp));      # load input
1365         &movups ($inout1,&QWP(16*1,$inp));
1366         &movups ($inout2,&QWP(16*2,$inp));
1367         &lea    ($inp,&DWP(16*3,$inp));
1368         &xorps  ($inout0,$inout3);              # input^=tweak
1369         &xorps  ($inout1,$inout4);
1370         &xorps  ($inout2,$inout5);
1371
1372         &call   ("_aesni_encrypt3");
1373
1374         &xorps  ($inout0,$inout3);              # output^=tweak
1375         &xorps  ($inout1,$inout4);
1376         &xorps  ($inout2,$inout5);
1377         &movups (&QWP(16*0,$out),$inout0);      # write output
1378         &movups (&QWP(16*1,$out),$inout1);
1379         &movups (&QWP(16*2,$out),$inout2);
1380         &lea    ($out,&DWP(16*3,$out));
1381
1382         &movdqa ($tweak,$inout5);               # last tweak
1383         &jmp    (&label("xts_enc_done"));
1384
1385 &set_label("xts_enc_four",16);
             # tweaks 0 and 1 were stored in stack slots 0 and 1 before the
             # branch; tweak 2 is in $inout5 and tweak 3 is the current $tweak
1386         &movaps ($inout4,$tweak);               # put aside last tweak
1387
1388         &movups ($inout0,&QWP(16*0,$inp));      # load input
1389         &movups ($inout1,&QWP(16*1,$inp));
1390         &movups ($inout2,&QWP(16*2,$inp));
1391         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1392         &movups ($inout3,&QWP(16*3,$inp));
1393         &lea    ($inp,&DWP(16*4,$inp));
1394         &xorps  ($inout1,&QWP(16*1,"esp"));
1395         &xorps  ($inout2,$inout5);
1396         &xorps  ($inout3,$inout4);
1397
1398         &call   ("_aesni_encrypt4");
1399
1400         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1401         &xorps  ($inout1,&QWP(16*1,"esp"));
1402         &xorps  ($inout2,$inout5);
1403         &movups (&QWP(16*0,$out),$inout0);      # write output
1404         &xorps  ($inout3,$inout4);
1405         &movups (&QWP(16*1,$out),$inout1);
1406         &movups (&QWP(16*2,$out),$inout2);
1407         &movups (&QWP(16*3,$out),$inout3);
1408         &lea    ($out,&DWP(16*4,$out));
1409
1410         &movdqa ($tweak,$inout4);               # last tweak
1411         &jmp    (&label("xts_enc_done"));
1412
1413 &set_label("xts_enc_done6x",16);                # $tweak is pre-calculated
             # reached when the 6x loop consumed every whole block; the loop
             # already advanced $tweak, so it is used as-is for stealing
1414         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1415         &and    ($len,15);
1416         &jz     (&label("xts_enc_ret"));
1417         &movdqa ($inout3,$tweak);
1418         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1419         &jmp    (&label("xts_enc_steal"));
1420
1421 &set_label("xts_enc_done",16);
             # reached from the short paths with $tweak = last tweak used; one
             # more doubling yields the tweak for the stolen block in $inout3
1422         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1423         &pxor   ($twtmp,$twtmp);
1424         &and    ($len,15);
1425         &jz     (&label("xts_enc_ret"));
1426
1427         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1428         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1429         &pshufd ($inout3,$twtmp,0x13);
1430         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1431         &pand   ($inout3,&QWP(16*6,"esp"));     # isolate carry and residue
1432         &pxor   ($inout3,$tweak);
1433
1434 &set_label("xts_enc_steal");
             # byte loop over the $len%16 trailing bytes: each input byte
             # replaces a byte of the last full output block (at $out-16) and
             # the displaced byte becomes the corresponding trailing output
             # byte; $rounds and $key serve as byte temporaries here
1435         &movz   ($rounds,&BP(0,$inp));
1436         &movz   ($key,&BP(-16,$out));
1437         &lea    ($inp,&DWP(1,$inp));
1438         &mov    (&BP(-16,$out),&LB($rounds));
1439         &mov    (&BP(0,$out),&LB($key));
1440         &lea    ($out,&DWP(1,$out));
1441         &sub    ($len,1);
1442         &jnz    (&label("xts_enc_steal"));
1443
1444         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1445         &mov    ($key,$key_);                   # restore $key
1446         &mov    ($rounds,$rounds_);             # restore $rounds
1447
             # re-encrypt the patched block at $out-16 under the tweak in $inout3
1448         &movups ($inout0,&QWP(-16,$out));       # load input
1449         &xorps  ($inout0,$inout3);              # input^=tweak
1450         if ($inline)
1451         {   &aesni_inline_generate1("enc");     }
1452         else
1453         {   &call       ("_aesni_encrypt1");    }
1454         &xorps  ($inout0,$inout3);              # output^=tweak
1455         &movups (&QWP(-16,$out),$inout0);       # write output
1456
1457 &set_label("xts_enc_ret");
             # zero all xmm registers and the six tweak slots on the stack
             # before restoring the caller's %esp
1458         &pxor   ("xmm0","xmm0");                # clear register bank
1459         &pxor   ("xmm1","xmm1");
1460         &pxor   ("xmm2","xmm2");
1461         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1462         &pxor   ("xmm3","xmm3");
1463         &movdqa (&QWP(16*1,"esp"),"xmm0");
1464         &pxor   ("xmm4","xmm4");
1465         &movdqa (&QWP(16*2,"esp"),"xmm0");
1466         &pxor   ("xmm5","xmm5");
1467         &movdqa (&QWP(16*3,"esp"),"xmm0");
1468         &pxor   ("xmm6","xmm6");
1469         &movdqa (&QWP(16*4,"esp"),"xmm0");
1470         &pxor   ("xmm7","xmm7");
1471         &movdqa (&QWP(16*5,"esp"),"xmm0");
1472         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1473 &function_end("aesni_xts_encrypt");
1474
# aesni_xts_decrypt -- emits the XTS-mode decryption routine.
#
# Stack arguments, as loaded through wparam() below:
#   0: input pointer ($inp)        1: output pointer ($out)
#   2: length ($len)               3: key1 (used for the data blocks)
#   4: key2 (used for the tweak)   5: clear-text tweak
# The round count of either key is read from offset 240 of the key.
#
# Aligned scratch frame set up on %esp:
#   16*0 .. 16*5   per-block tweaks
#   16*6           mask constant, dwords 0x87,0,1,0
#   16*7+0         saved $len (after the len%16 adjustment), later
#                  overwritten with $len%16
#   16*7+4         caller's %esp
#
# Flow: encrypt the tweak with key2, decrypt six blocks per iteration
# in xts_dec_loop6, handle one to five leftover whole blocks in the
# xts_dec_{one,two,three,four} / five-block paths, and, when the
# length is not a multiple of 16, finish through
# xts_dec_only_one_more and xts_dec_steal.
1475 &function_begin("aesni_xts_decrypt");
1476         &mov    ($key,&wparam(4));              # key2
1477         &mov    ($inp,&wparam(5));              # clear-text tweak
1478
             # encrypt the clear-text tweak with key2; result is in $inout0
1479         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1480         &movups ($inout0,&QWP(0,$inp));
1481         if ($inline)
1482         {   &aesni_inline_generate1("enc");     }
1483         else
1484         {   &call       ("_aesni_encrypt1");    }
1485
1486         &mov    ($inp,&wparam(0));
1487         &mov    ($out,&wparam(1));
1488         &mov    ($len,&wparam(2));
1489         &mov    ($key,&wparam(3));              # key1
1490
             # allocate the scratch frame; $key_ carries the caller's %esp
             # until it is stored at 16*7+4 below
1491         &mov    ($key_,"esp");
1492         &sub    ("esp",16*7+8);
1493         &and    ("esp",-16);                    # align stack
1494
1495         &xor    ($rounds_,$rounds_);            # if(len%16) len-=16;
1496         &test   ($len,15);
1497         &setnz  (&LB($rounds_));
1498         &shl    ($rounds_,4);
1499         &sub    ($len,$rounds_);
1500
1501         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1502         &mov    (&DWP(16*6+4,"esp"),0);
1503         &mov    (&DWP(16*6+8,"esp"),1);
1504         &mov    (&DWP(16*6+12,"esp"),0);
1505         &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
1506         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1507
1508         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1509         &mov    ($key_,$key);                   # backup $key
1510         &mov    ($rounds_,$rounds);             # backup $rounds
1511
             # the encrypted tweak becomes the first block's tweak
1512         &movdqa ($tweak,$inout0);
1513         &pxor   ($twtmp,$twtmp);
1514         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1515         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1516
             # fewer than six whole blocks: skip the 6x loop entirely
1517         &and    ($len,-16);
1518         &sub    ($len,16*6);
1519         &jc     (&label("xts_dec_short"));
1520
             # $rounds_ = 16-16*rounds and $key = key1+32+16*rounds; this is
             # the form in which $rounds/$key are handed to
             # _aesni_decrypt6_enter below (helper is not visible here)
1521         &shl    ($rounds,4);
1522         &mov    ($rounds_,16);
1523         &sub    ($rounds_,$rounds);
1524         &lea    ($key,&DWP(32,$key,$rounds));
1525         &jmp    (&label("xts_dec_loop6"));
1526
1527 &set_label("xts_dec_loop6",16);
             # store tweaks 0..3 at 16*0..16*3, advancing $tweak after each
1528         for ($i=0;$i<4;$i++) {
1529             &pshufd     ($twres,$twtmp,0x13);
1530             &pxor       ($twtmp,$twtmp);
1531             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1532             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1533             &pand       ($twres,$twmask);       # isolate carry and residue
1534             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1535             &pxor       ($tweak,$twres);
1536         }
             # tweak 4 goes to 16*4; tweak 5 is assembled in $inout5
1537         &pshufd ($inout5,$twtmp,0x13);
1538         &movdqa (&QWP(16*$i++,"esp"),$tweak);
1539         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1540          &$movekey      ($rndkey0,&QWP(0,$key_));
1541         &pand   ($inout5,$twmask);              # isolate carry and residue
1542          &movups        ($inout0,&QWP(0,$inp)); # load input
1543         &pxor   ($inout5,$tweak);
1544
1545         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1546         &mov    ($rounds,$rounds_);
1547         &movdqu ($inout1,&QWP(16*1,$inp));
1548          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1549         &movdqu ($inout2,&QWP(16*2,$inp));
1550          &pxor          ($inout1,$rndkey0);
1551         &movdqu ($inout3,&QWP(16*3,$inp));
1552          &pxor          ($inout2,$rndkey0);
1553         &movdqu ($inout4,&QWP(16*4,$inp));
1554          &pxor          ($inout3,$rndkey0);
             # sixth input block is parked in $rndkey1 because $inout5
             # still holds the sixth tweak
1555         &movdqu ($rndkey1,&QWP(16*5,$inp));
1556          &pxor          ($inout4,$rndkey0);
1557         &lea    ($inp,&DWP(16*6,$inp));
1558         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1559         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1560         &pxor   ($inout5,$rndkey1);
1561
             # first decryption round is issued here, interleaved with the
             # remaining tweak xors, before entering the shared 6x body
1562          &$movekey      ($rndkey1,&QWP(16,$key_));
1563         &pxor   ($inout1,&QWP(16*1,"esp"));
1564         &pxor   ($inout2,&QWP(16*2,"esp"));
1565          &aesdec        ($inout0,$rndkey1);
1566         &pxor   ($inout3,&QWP(16*3,"esp"));
1567         &pxor   ($inout4,&QWP(16*4,"esp"));
1568          &aesdec        ($inout1,$rndkey1);
1569         &pxor           ($inout5,$rndkey0);
1570          &$movekey      ($rndkey0,&QWP(32,$key_));
1571          &aesdec        ($inout2,$rndkey1);
1572          &aesdec        ($inout3,$rndkey1);
1573          &aesdec        ($inout4,$rndkey1);
1574          &aesdec        ($inout5,$rndkey1);
1575         &call           (&label("_aesni_decrypt6_enter"));
1576
             # xor the six results with their tweaks and store them, while
             # starting the tweak update for the next iteration
1577         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1578        &pxor    ($twtmp,$twtmp);
1579         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1580        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1581         &xorps  ($inout1,&QWP(16*1,"esp"));
1582         &movups (&QWP(16*0,$out),$inout0);      # write output
1583         &xorps  ($inout2,&QWP(16*2,"esp"));
1584         &movups (&QWP(16*1,$out),$inout1);
1585         &xorps  ($inout3,&QWP(16*3,"esp"));
1586         &movups (&QWP(16*2,$out),$inout2);
1587         &xorps  ($inout4,&QWP(16*4,"esp"));
1588         &movups (&QWP(16*3,$out),$inout3);
1589         &xorps  ($inout5,$tweak);
1590         &movups (&QWP(16*4,$out),$inout4);
1591        &pshufd  ($twres,$twtmp,0x13);
1592         &movups (&QWP(16*5,$out),$inout5);
1593         &lea    ($out,&DWP(16*6,$out));
1594        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1595
1596         &pxor   ($twtmp,$twtmp);
1597         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1598         &pand   ($twres,$twmask);               # isolate carry and residue
1599         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1600         &pxor   ($tweak,$twres);
1601
1602         &sub    ($len,16*6);
1603         &jnc    (&label("xts_dec_loop6"));
1604
             # leaving the 6x loop: undo the $rounds/$key transformation
1605         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1606         &mov    ($key,$key_);                   # restore $key
1607         &mov    ($rounds_,$rounds);
1608
1609 &set_label("xts_dec_short");
             # $len is now the number of bytes in the remaining whole blocks
             # (0..16*5); dispatch on it
1610         &add    ($len,16*6);
1611         &jz     (&label("xts_dec_done6x"));
1612
1613         &movdqa ($inout3,$tweak);               # put aside previous tweak
1614         &cmp    ($len,0x20);
1615         &jb     (&label("xts_dec_one"));
1616
             # the je below still tests the cmp above; only SSE instructions
             # are issued in between
1617         &pshufd ($twres,$twtmp,0x13);
1618         &pxor   ($twtmp,$twtmp);
1619         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1620         &pand   ($twres,$twmask);               # isolate carry and residue
1621         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1622         &pxor   ($tweak,$twres);
1623         &je     (&label("xts_dec_two"));
1624
1625         &pshufd ($twres,$twtmp,0x13);
1626         &pxor   ($twtmp,$twtmp);
1627         &movdqa ($inout4,$tweak);               # put aside previous tweak
1628         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1629         &pand   ($twres,$twmask);               # isolate carry and residue
1630         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1631         &pxor   ($tweak,$twres);
1632         &cmp    ($len,0x40);
1633         &jb     (&label("xts_dec_three"));
1634
             # as above, the je further down tests the cmp with 0x40
1635         &pshufd ($twres,$twtmp,0x13);
1636         &pxor   ($twtmp,$twtmp);
1637         &movdqa ($inout5,$tweak);               # put aside previous tweak
1638         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1639         &pand   ($twres,$twmask);               # isolate carry and residue
1640         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1641         &pxor   ($tweak,$twres);
1642         &movdqa (&QWP(16*0,"esp"),$inout3);
1643         &movdqa (&QWP(16*1,"esp"),$inout4);
1644         &je     (&label("xts_dec_four"));
1645
             # five blocks left: tweaks 0..3 live at 16*0..16*3, the fifth
             # is built in $inout5 and saved at 16*4
1646         &movdqa (&QWP(16*2,"esp"),$inout5);
1647         &pshufd ($inout5,$twtmp,0x13);
1648         &movdqa (&QWP(16*3,"esp"),$tweak);
1649         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1650         &pand   ($inout5,$twmask);              # isolate carry and residue
1651         &pxor   ($inout5,$tweak);
1652
1653         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1654         &movdqu ($inout1,&QWP(16*1,$inp));
1655         &movdqu ($inout2,&QWP(16*2,$inp));
1656         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1657         &movdqu ($inout3,&QWP(16*3,$inp));
1658         &pxor   ($inout1,&QWP(16*1,"esp"));
1659         &movdqu ($inout4,&QWP(16*4,$inp));
1660         &pxor   ($inout2,&QWP(16*2,"esp"));
1661         &lea    ($inp,&DWP(16*5,$inp));
1662         &pxor   ($inout3,&QWP(16*3,"esp"));
1663         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1664         &pxor   ($inout4,$inout5);
1665
             # $inout5 carries no input here; whatever the 6x helper leaves
             # in it is never stored
1666         &call   ("_aesni_decrypt6");
1667
1668         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1669         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1670         &xorps  ($inout1,&QWP(16*1,"esp"));
1671         &xorps  ($inout2,&QWP(16*2,"esp"));
1672         &movups (&QWP(16*0,$out),$inout0);      # write output
1673         &xorps  ($inout3,&QWP(16*3,"esp"));
1674         &movups (&QWP(16*1,$out),$inout1);
1675         &xorps  ($inout4,$tweak);
1676         &movups (&QWP(16*2,$out),$inout2);
1677         &movups (&QWP(16*3,$out),$inout3);
1678         &movups (&QWP(16*4,$out),$inout4);
1679         &lea    ($out,&DWP(16*5,$out));
1680         &jmp    (&label("xts_dec_done"));
1681
             # one block left; its tweak was put aside in $inout3
1682 &set_label("xts_dec_one",16);
1683         &movups ($inout0,&QWP(16*0,$inp));      # load input
1684         &lea    ($inp,&DWP(16*1,$inp));
1685         &xorps  ($inout0,$inout3);              # input^=tweak
1686         if ($inline)
1687         {   &aesni_inline_generate1("dec");     }
1688         else
1689         {   &call       ("_aesni_decrypt1");    }
1690         &xorps  ($inout0,$inout3);              # output^=tweak
1691         &movups (&QWP(16*0,$out),$inout0);      # write output
1692         &lea    ($out,&DWP(16*1,$out));
1693
1694         &movdqa ($tweak,$inout3);               # last tweak
1695         &jmp    (&label("xts_dec_done"));
1696
             # two blocks left; tweaks are $inout3 and $inout4
1697 &set_label("xts_dec_two",16);
1698         &movaps ($inout4,$tweak);               # put aside last tweak
1699
1700         &movups ($inout0,&QWP(16*0,$inp));      # load input
1701         &movups ($inout1,&QWP(16*1,$inp));
1702         &lea    ($inp,&DWP(16*2,$inp));
1703         &xorps  ($inout0,$inout3);              # input^=tweak
1704         &xorps  ($inout1,$inout4);
1705
1706         &call   ("_aesni_decrypt2");
1707
1708         &xorps  ($inout0,$inout3);              # output^=tweak
1709         &xorps  ($inout1,$inout4);
1710         &movups (&QWP(16*0,$out),$inout0);      # write output
1711         &movups (&QWP(16*1,$out),$inout1);
1712         &lea    ($out,&DWP(16*2,$out));
1713
1714         &movdqa ($tweak,$inout4);               # last tweak
1715         &jmp    (&label("xts_dec_done"));
1716
             # three blocks left; tweaks are $inout3, $inout4 and $inout5
1717 &set_label("xts_dec_three",16);
1718         &movaps ($inout5,$tweak);               # put aside last tweak
1719         &movups ($inout0,&QWP(16*0,$inp));      # load input
1720         &movups ($inout1,&QWP(16*1,$inp));
1721         &movups ($inout2,&QWP(16*2,$inp));
1722         &lea    ($inp,&DWP(16*3,$inp));
1723         &xorps  ($inout0,$inout3);              # input^=tweak
1724         &xorps  ($inout1,$inout4);
1725         &xorps  ($inout2,$inout5);
1726
1727         &call   ("_aesni_decrypt3");
1728
1729         &xorps  ($inout0,$inout3);              # output^=tweak
1730         &xorps  ($inout1,$inout4);
1731         &xorps  ($inout2,$inout5);
1732         &movups (&QWP(16*0,$out),$inout0);      # write output
1733         &movups (&QWP(16*1,$out),$inout1);
1734         &movups (&QWP(16*2,$out),$inout2);
1735         &lea    ($out,&DWP(16*3,$out));
1736
1737         &movdqa ($tweak,$inout5);               # last tweak
1738         &jmp    (&label("xts_dec_done"));
1739
             # four blocks left; tweaks 0 and 1 were stored at 16*0/16*1
             # before the jump, tweak 2 is in $inout5, tweak 3 in $inout4
1740 &set_label("xts_dec_four",16);
1741         &movaps ($inout4,$tweak);               # put aside last tweak
1742
1743         &movups ($inout0,&QWP(16*0,$inp));      # load input
1744         &movups ($inout1,&QWP(16*1,$inp));
1745         &movups ($inout2,&QWP(16*2,$inp));
1746         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1747         &movups ($inout3,&QWP(16*3,$inp));
1748         &lea    ($inp,&DWP(16*4,$inp));
1749         &xorps  ($inout1,&QWP(16*1,"esp"));
1750         &xorps  ($inout2,$inout5);
1751         &xorps  ($inout3,$inout4);
1752
1753         &call   ("_aesni_decrypt4");
1754
1755         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1756         &xorps  ($inout1,&QWP(16*1,"esp"));
1757         &xorps  ($inout2,$inout5);
1758         &movups (&QWP(16*0,$out),$inout0);      # write output
1759         &xorps  ($inout3,$inout4);
1760         &movups (&QWP(16*1,$out),$inout1);
1761         &movups (&QWP(16*2,$out),$inout2);
1762         &movups (&QWP(16*3,$out),$inout3);
1763         &lea    ($out,&DWP(16*4,$out));
1764
1765         &movdqa ($tweak,$inout4);               # last tweak
1766         &jmp    (&label("xts_dec_done"));
1767
             # reached when no whole blocks were left for the short paths:
             # $tweak and $twtmp are already prepared, so skip the tweak
             # update done in xts_dec_done
1768 &set_label("xts_dec_done6x",16);                # $tweak is pre-calculated
1769         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1770         &and    ($len,15);
1771         &jz     (&label("xts_dec_ret"));
1772         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1773         &jmp    (&label("xts_dec_only_one_more"));
1774
             # reached from the short paths with $tweak = last tweak used;
             # if a partial block remains, advance $tweak once first
1775 &set_label("xts_dec_done",16);
1776         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1777         &pxor   ($twtmp,$twtmp);
1778         &and    ($len,15);
1779         &jz     (&label("xts_dec_ret"));
1780
1781         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1782         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1783         &pshufd ($twres,$twtmp,0x13);
1784         &pxor   ($twtmp,$twtmp);
1785         &movdqa ($twmask,&QWP(16*6,"esp"));
1786         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1787         &pand   ($twres,$twmask);               # isolate carry and residue
1788         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1789         &pxor   ($tweak,$twres);
1790
             # keep the current tweak in $inout4 and build the following
             # one in $inout3; the block decrypted here uses $inout3, the
             # block re-decrypted after xts_dec_steal uses $inout4
1791 &set_label("xts_dec_only_one_more");
1792         &pshufd ($inout3,$twtmp,0x13);
1793         &movdqa ($inout4,$tweak);               # put aside previous tweak
1794         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1795         &pand   ($inout3,$twmask);              # isolate carry and residue
1796         &pxor   ($inout3,$tweak);
1797
1798         &mov    ($key,$key_);                   # restore $key
1799         &mov    ($rounds,$rounds_);             # restore $rounds
1800
             # $inp and $out are deliberately not advanced past this block
1801         &movups ($inout0,&QWP(0,$inp));         # load input
1802         &xorps  ($inout0,$inout3);              # input^=tweak
1803         if ($inline)
1804         {   &aesni_inline_generate1("dec");     }
1805         else
1806         {   &call       ("_aesni_decrypt1");    }
1807         &xorps  ($inout0,$inout3);              # output^=tweak
1808         &movups (&QWP(0,$out),$inout0);         # write output
1809
             # byte loop run $len%16 times: the byte just written at out[i]
             # moves to out[16+i] and the input byte in[16+i] takes its
             # place; $rounds and $key serve as byte scratch registers
1810 &set_label("xts_dec_steal");
1811         &movz   ($rounds,&BP(16,$inp));
1812         &movz   ($key,&BP(0,$out));
1813         &lea    ($inp,&DWP(1,$inp));
1814         &mov    (&BP(0,$out),&LB($rounds));
1815         &mov    (&BP(16,$out),&LB($key));
1816         &lea    ($out,&DWP(1,$out));
1817         &sub    ($len,1);
1818         &jnz    (&label("xts_dec_steal"));
1819
1820         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1821         &mov    ($key,$key_);                   # restore $key
1822         &mov    ($rounds,$rounds_);             # restore $rounds
1823
             # decrypt the reassembled block in place with the tweak that
             # was put aside in $inout4
1824         &movups ($inout0,&QWP(0,$out));         # load input
1825         &xorps  ($inout0,$inout4);              # input^=tweak
1826         if ($inline)
1827         {   &aesni_inline_generate1("dec");     }
1828         else
1829         {   &call       ("_aesni_decrypt1");    }
1830         &xorps  ($inout0,$inout4);              # output^=tweak
1831         &movups (&QWP(0,$out),$inout0);         # write output
1832
             # common exit: zero xmm0-xmm7 and the six tweak slots, then
             # switch back to the caller's stack pointer
1833 &set_label("xts_dec_ret");
1834         &pxor   ("xmm0","xmm0");                # clear register bank
1835         &pxor   ("xmm1","xmm1");
1836         &pxor   ("xmm2","xmm2");
1837         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1838         &pxor   ("xmm3","xmm3");
1839         &movdqa (&QWP(16*1,"esp"),"xmm0");
1840         &pxor   ("xmm4","xmm4");
1841         &movdqa (&QWP(16*2,"esp"),"xmm0");
1842         &pxor   ("xmm5","xmm5");
1843         &movdqa (&QWP(16*3,"esp"),"xmm0");
1844         &pxor   ("xmm6","xmm6");
1845         &movdqa (&QWP(16*4,"esp"),"xmm0");
1846         &pxor   ("xmm7","xmm7");
1847         &movdqa (&QWP(16*5,"esp"),"xmm0");
1848         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1849 &function_end("aesni_xts_decrypt");
1850 }
1851 \f
1852 ######################################################################
1853 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1854 #       const AES_KEY *key, unsigned int start_block_num,
1855 #       unsigned char offset_i[16], const unsigned char L_[][16],
1856 #       unsigned char checksum[16]);
1857 #
1858 {
1859 # offsets within stack frame
# $checksum is the 16-byte slot at 16*6; the five names below are
# consecutive 4-byte slots at 16*7+0, +4, +8, +12 and +16.
1860 my $checksum = 16*6;
1861 my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1862
1863 # reassigned registers
# each new name takes the value of an existing register variable, so
# e.g. $i1 and $rounds refer to the same register
1864 my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1865 # $l_, $block, $inp and $key are permanently allocated in registers;
1866 # the remaining non-volatile values are offloaded to the stack frame
1867 # and stay invariant once written there.
1868
1869 &function_begin("aesni_ocb_encrypt");
1870         &mov    ($rounds,&wparam(5));           # &offset_i
1871         &mov    ($rounds_,&wparam(7));          # &checksum
1872
1873         &mov    ($inp,&wparam(0));
1874         &mov    ($out,&wparam(1));
1875         &mov    ($len,&wparam(2));
1876         &mov    ($key,&wparam(3));
1877         &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
1878         &mov    ($block,&wparam(4));            # start_block_num
1879         &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
1880         &mov    ($l_,&wparam(6));               # L_
1881
1882         &mov    ($rounds,"esp");
1883         &sub    ("esp",$esp_off+4);             # alloca
1884         &and    ("esp",-16);                    # align stack
1885
1886         &sub    ($out,$inp);
1887         &shl    ($len,4);
1888         &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
1889         &mov    (&DWP($out_off,"esp"),$out);
1890         &mov    (&DWP($end_off,"esp"),$len);
1891         &mov    (&DWP($esp_off,"esp"),$rounds);
1892
1893         &mov    ($rounds,&DWP(240,$key));
1894
1895         &test   ($block,1);
1896         &jnz    (&label("odd"));
1897
1898         &bsf            ($i3,$block);
1899         &add            ($block,1);
1900         &shl            ($i3,4);
1901         &movdqu         ($inout5,&QWP(0,$l_,$i3));
1902         &mov            ($i3,$key);                     # put aside key
1903
1904         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1905         &lea            ($inp,&DWP(16,$inp));
1906
1907         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
1908         &pxor           ($rndkey1,$inout0);             # checksum
1909         &pxor           ($inout0,$inout5);              # ^ offset_i
1910
1911         &movdqa         ($inout4,$rndkey1);
1912         if ($inline)
1913         {   &aesni_inline_generate1("enc");     }
1914         else
1915         {   &call       ("_aesni_encrypt1");    }
1916
1917         &xorps          ($inout0,$inout5);              # ^ offset_i
1918         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
1919         &movdqa         ($rndkey1,$inout4);             # pass the checksum
1920
1921         &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
1922
1923         &mov            ($rounds,&DWP(240,$i3));
1924         &mov            ($key,$i3);                     # restore key
1925         &mov            ($len,&DWP($end_off,"esp"));
1926
1927 &set_label("odd");
1928         &shl            ($rounds,4);
1929         &mov            ($out,16);
1930         &sub            ($out,$rounds);                 # twisted rounds
1931         &mov            (&DWP($key_off,"esp"),$key);
1932         &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
1933         &mov            (&DWP($rounds_off,"esp"),$out);
1934
1935         &cmp            ($inp,$len);
1936         &ja             (&label("short"));
1937         &jmp            (&label("grandloop"));
1938
1939 &set_label("grandloop",32);
1940         &lea            ($i1,&DWP(1,$block));
1941         &lea            ($i3,&DWP(3,$block));
1942         &lea            ($i5,&DWP(5,$block));
1943         &add            ($block,6);
1944         &bsf            ($i1,$i1);
1945         &bsf            ($i3,$i3);
1946         &bsf            ($i5,$i5);
1947         &shl            ($i1,4);
1948         &shl            ($i3,4);
1949         &shl            ($i5,4);
1950         &movdqu         ($inout0,&QWP(0,$l_));
1951         &movdqu         ($inout1,&QWP(0,$l_,$i1));
1952         &mov            ($rounds,&DWP($rounds_off,"esp"));
1953         &movdqa         ($inout2,$inout0);
1954         &movdqu         ($inout3,&QWP(0,$l_,$i3));
1955         &movdqa         ($inout4,$inout0);
1956         &movdqu         ($inout5,&QWP(0,$l_,$i5));
1957
1958         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
1959         &pxor           ($inout1,$inout0);
1960         &movdqa         (&QWP(16*0,"esp"),$inout0);
1961         &pxor           ($inout2,$inout1);
1962         &movdqa         (&QWP(16*1,"esp"),$inout1);
1963         &pxor           ($inout3,$inout2);
1964         &movdqa         (&QWP(16*2,"esp"),$inout2);
1965         &pxor           ($inout4,$inout3);
1966         &movdqa         (&QWP(16*3,"esp"),$inout3);
1967         &pxor           ($inout5,$inout4);
1968         &movdqa         (&QWP(16*4,"esp"),$inout4);
1969         &movdqa         (&QWP(16*5,"esp"),$inout5);
1970
1971         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
1972         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1973         &movdqu         ($inout1,&QWP(16*1,$inp));
1974         &movdqu         ($inout2,&QWP(16*2,$inp));
1975         &movdqu         ($inout3,&QWP(16*3,$inp));
1976         &movdqu         ($inout4,&QWP(16*4,$inp));
1977         &movdqu         ($inout5,&QWP(16*5,$inp));
1978         &lea            ($inp,&DWP(16*6,$inp));
1979
1980         &pxor           ($rndkey1,$inout0);             # checksum
1981         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
1982         &pxor           ($rndkey1,$inout1);
1983         &pxor           ($inout1,$rndkey0);
1984         &pxor           ($rndkey1,$inout2);
1985         &pxor           ($inout2,$rndkey0);
1986         &pxor           ($rndkey1,$inout3);
1987         &pxor           ($inout3,$rndkey0);
1988         &pxor           ($rndkey1,$inout4);
1989         &pxor           ($inout4,$rndkey0);
1990         &pxor           ($rndkey1,$inout5);
1991         &pxor           ($inout5,$rndkey0);
1992         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
1993
1994         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
1995         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
1996         &pxor           ($inout1,&QWP(16*1,"esp"));
1997         &pxor           ($inout2,&QWP(16*2,"esp"));
1998         &pxor           ($inout3,&QWP(16*3,"esp"));
1999         &pxor           ($inout4,&QWP(16*4,"esp"));
2000         &pxor           ($inout5,&QWP(16*5,"esp"));
2001
2002         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2003         &aesenc         ($inout0,$rndkey1);
2004         &aesenc         ($inout1,$rndkey1);
2005         &aesenc         ($inout2,$rndkey1);
2006         &aesenc         ($inout3,$rndkey1);
2007         &aesenc         ($inout4,$rndkey1);
2008         &aesenc         ($inout5,$rndkey1);
2009
2010         &mov            ($out,&DWP($out_off,"esp"));
2011         &mov            ($len,&DWP($end_off,"esp"));
2012         &call           ("_aesni_encrypt6_enter");
2013
2014         &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
2015         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2016         &pxor           ($inout1,&QWP(16*1,"esp"));
2017         &pxor           ($inout2,&QWP(16*2,"esp"));
2018         &pxor           ($inout3,&QWP(16*3,"esp"));
2019         &pxor           ($inout4,&QWP(16*4,"esp"));
2020         &pxor           ($inout5,$rndkey0);
2021         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2022
2023         &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
2024         &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
2025         &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
2026         &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
2027         &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
2028         &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
2029         &cmp            ($inp,$len);                    # done yet?
2030         &jb             (&label("grandloop"));
2031
2032 &set_label("short");
2033         &add            ($len,16*6);
2034         &sub            ($len,$inp);
2035         &jz             (&label("done"));
2036
2037         &cmp            ($len,16*2);
2038         &jb             (&label("one"));
2039         &je             (&label("two"));
2040
2041         &cmp            ($len,16*4);
2042         &jb             (&label("three"));
2043         &je             (&label("four"));
2044
2045         &lea            ($i1,&DWP(1,$block));
2046         &lea            ($i3,&DWP(3,$block));
2047         &bsf            ($i1,$i1);
2048         &bsf            ($i3,$i3);
2049         &shl            ($i1,4);
2050         &shl            ($i3,4);
2051         &movdqu         ($inout0,&QWP(0,$l_));
2052         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2053         &mov            ($rounds,&DWP($rounds_off,"esp"));
2054         &movdqa         ($inout2,$inout0);
2055         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2056         &movdqa         ($inout4,$inout0);
2057
2058         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2059         &pxor           ($inout1,$inout0);
2060         &movdqa         (&QWP(16*0,"esp"),$inout0);
2061         &pxor           ($inout2,$inout1);
2062         &movdqa         (&QWP(16*1,"esp"),$inout1);
2063         &pxor           ($inout3,$inout2);
2064         &movdqa         (&QWP(16*2,"esp"),$inout2);
2065         &pxor           ($inout4,$inout3);
2066         &movdqa         (&QWP(16*3,"esp"),$inout3);
2067         &pxor           ($inout5,$inout4);
2068         &movdqa         (&QWP(16*4,"esp"),$inout4);
2069
2070         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2071         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2072         &movdqu         ($inout1,&QWP(16*1,$inp));
2073         &movdqu         ($inout2,&QWP(16*2,$inp));
2074         &movdqu         ($inout3,&QWP(16*3,$inp));
2075         &movdqu         ($inout4,&QWP(16*4,$inp));
2076         &pxor           ($inout5,$inout5);
2077
2078         &pxor           ($rndkey1,$inout0);             # checksum
2079         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2080         &pxor           ($rndkey1,$inout1);
2081         &pxor           ($inout1,$rndkey0);
2082         &pxor           ($rndkey1,$inout2);
2083         &pxor           ($inout2,$rndkey0);
2084         &pxor           ($rndkey1,$inout3);
2085         &pxor           ($inout3,$rndkey0);
2086         &pxor           ($rndkey1,$inout4);
2087         &pxor           ($inout4,$rndkey0);
2088         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2089
2090         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2091         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2092         &pxor           ($inout1,&QWP(16*1,"esp"));
2093         &pxor           ($inout2,&QWP(16*2,"esp"));
2094         &pxor           ($inout3,&QWP(16*3,"esp"));
2095         &pxor           ($inout4,&QWP(16*4,"esp"));
2096
2097         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2098         &aesenc         ($inout0,$rndkey1);
2099         &aesenc         ($inout1,$rndkey1);
2100         &aesenc         ($inout2,$rndkey1);
2101         &aesenc         ($inout3,$rndkey1);
2102         &aesenc         ($inout4,$rndkey1);
2103         &aesenc         ($inout5,$rndkey1);
2104
2105         &mov            ($out,&DWP($out_off,"esp"));
2106         &call           ("_aesni_encrypt6_enter");
2107
2108         &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
2109         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2110         &pxor           ($inout1,&QWP(16*1,"esp"));
2111         &pxor           ($inout2,&QWP(16*2,"esp"));
2112         &pxor           ($inout3,&QWP(16*3,"esp"));
2113         &pxor           ($inout4,$rndkey0);
2114         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2115
2116         &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
2117         &movdqu         (&QWP(16*1,$out,$inp),$inout1);
2118         &movdqu         (&QWP(16*2,$out,$inp),$inout2);
2119         &movdqu         (&QWP(16*3,$out,$inp),$inout3);
2120         &movdqu         (&QWP(16*4,$out,$inp),$inout4);
2121
2122         &jmp            (&label("done"));
2123
2124 &set_label("one",16);
2125         &movdqu         ($inout5,&QWP(0,$l_));
2126         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2127
2128         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2129         &mov            ($rounds,&DWP(240,$key));
2130
2131         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2132         &pxor           ($rndkey1,$inout0);             # checksum
2133         &pxor           ($inout0,$inout5);              # ^ offset_i
2134
2135         &movdqa         ($inout4,$rndkey1);
2136         &mov            ($out,&DWP($out_off,"esp"));
2137         if ($inline)
2138         {   &aesni_inline_generate1("enc");     }
2139         else
2140         {   &call       ("_aesni_encrypt1");    }
2141
2142         &xorps          ($inout0,$inout5);              # ^ offset_i
2143         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2144         &movdqa         ($rndkey1,$inout4);             # pass the checksum
2145         &movups         (&QWP(0,$out,$inp),$inout0);
2146
2147         &jmp            (&label("done"));
2148
2149 &set_label("two",16);
2150         &lea            ($i1,&DWP(1,$block));
2151         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2152         &bsf            ($i1,$i1);
2153         &shl            ($i1,4);
2154         &movdqu         ($inout4,&QWP(0,$l_));
2155         &movdqu         ($inout5,&QWP(0,$l_,$i1));
2156
2157         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2158         &movdqu         ($inout1,&QWP(16*1,$inp));
2159         &mov            ($rounds,&DWP(240,$key));
2160
2161         &pxor           ($inout4,$rndkey0);             # ^ last offset_i
2162         &pxor           ($inout5,$inout4);
2163
2164         &pxor           ($rndkey1,$inout0);             # checksum
2165         &pxor           ($inout0,$inout4);              # ^ offset_i
2166         &pxor           ($rndkey1,$inout1);
2167         &pxor           ($inout1,$inout5);
2168
2169         &movdqa         ($inout3,$rndkey1)
2170         &mov            ($out,&DWP($out_off,"esp"));
2171         &call           ("_aesni_encrypt2");
2172
2173         &xorps          ($inout0,$inout4);              # ^ offset_i
2174         &xorps          ($inout1,$inout5);
2175         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2176         &movdqa         ($rndkey1,$inout3);             # pass the checksum
2177         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2178         &movups         (&QWP(16*1,$out,$inp),$inout1);
2179
2180         &jmp            (&label("done"));
2181
2182 &set_label("three",16);
2183         &lea            ($i1,&DWP(1,$block));
2184         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2185         &bsf            ($i1,$i1);
2186         &shl            ($i1,4);
2187         &movdqu         ($inout3,&QWP(0,$l_));
2188         &movdqu         ($inout4,&QWP(0,$l_,$i1));
2189         &movdqa         ($inout5,$inout3);
2190
2191         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2192         &movdqu         ($inout1,&QWP(16*1,$inp));
2193         &movdqu         ($inout2,&QWP(16*2,$inp));
2194         &mov            ($rounds,&DWP(240,$key));
2195
2196         &pxor           ($inout3,$rndkey0);             # ^ last offset_i
2197         &pxor           ($inout4,$inout3);
2198         &pxor           ($inout5,$inout4);
2199
2200         &pxor           ($rndkey1,$inout0);             # checksum
2201         &pxor           ($inout0,$inout3);              # ^ offset_i
2202         &pxor           ($rndkey1,$inout1);
2203         &pxor           ($inout1,$inout4);
2204         &pxor           ($rndkey1,$inout2);
2205         &pxor           ($inout2,$inout5);
2206
2207         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2208         &mov            ($out,&DWP($out_off,"esp"));
2209         &call           ("_aesni_encrypt3");
2210
2211         &xorps          ($inout0,$inout3);              # ^ offset_i
2212         &xorps          ($inout1,$inout4);
2213         &xorps          ($inout2,$inout5);
2214         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2215         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2216         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2217         &movups         (&QWP(16*1,$out,$inp),$inout1);
2218         &movups         (&QWP(16*2,$out,$inp),$inout2);
2219
2220         &jmp            (&label("done"));
2221
2222 &set_label("four",16);
2223         &lea            ($i1,&DWP(1,$block));
2224         &lea            ($i3,&DWP(3,$block));
2225         &bsf            ($i1,$i1);
2226         &bsf            ($i3,$i3);
2227         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2228         &shl            ($i1,4);
2229         &shl            ($i3,4);
2230         &movdqu         ($inout2,&QWP(0,$l_));
2231         &movdqu         ($inout3,&QWP(0,$l_,$i1));
2232         &movdqa         ($inout4,$inout2);
2233         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2234
2235         &pxor           ($inout2,$rndkey0);             # ^ last offset_i
2236         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2237         &pxor           ($inout3,$inout2);
2238         &movdqu         ($inout1,&QWP(16*1,$inp));
2239         &pxor           ($inout4,$inout3);
2240         &movdqa         (&QWP(16*0,"esp"),$inout2);
2241         &pxor           ($inout5,$inout4);
2242         &movdqa         (&QWP(16*1,"esp"),$inout3);
2243         &movdqu         ($inout2,&QWP(16*2,$inp));
2244         &movdqu         ($inout3,&QWP(16*3,$inp));
2245         &mov            ($rounds,&DWP(240,$key));
2246
2247         &pxor           ($rndkey1,$inout0);             # checksum
2248         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2249         &pxor           ($rndkey1,$inout1);
2250         &pxor           ($inout1,&QWP(16*1,"esp"));
2251         &pxor           ($rndkey1,$inout2);
2252         &pxor           ($inout2,$inout4);
2253         &pxor           ($rndkey1,$inout3);
2254         &pxor           ($inout3,$inout5);
2255
2256         &movdqa         (&QWP($checksum,"esp"),$rndkey1)
2257         &mov            ($out,&DWP($out_off,"esp"));
2258         &call           ("_aesni_encrypt4");
2259
2260         &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2261         &xorps          ($inout1,&QWP(16*1,"esp"));
2262         &xorps          ($inout2,$inout4);
2263         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2264         &xorps          ($inout3,$inout5);
2265         &movups         (&QWP(16*1,$out,$inp),$inout1);
2266         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2267         &movups         (&QWP(16*2,$out,$inp),$inout2);
2268         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2269         &movups         (&QWP(16*3,$out,$inp),$inout3);
2270
2271 &set_label("done");
2272         &mov    ($key,&DWP($esp_off,"esp"));
2273         &pxor   ($inout0,$inout0);              # clear register bank
2274         &pxor   ($inout1,$inout1);
2275         &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
2276         &pxor   ($inout2,$inout2);
2277         &movdqa (&QWP(16*1,"esp"),$inout0);
2278         &pxor   ($inout3,$inout3);
2279         &movdqa (&QWP(16*2,"esp"),$inout0);
2280         &pxor   ($inout4,$inout4);
2281         &movdqa (&QWP(16*3,"esp"),$inout0);
2282         &pxor   ($inout5,$inout5);
2283         &movdqa (&QWP(16*4,"esp"),$inout0);
2284         &movdqa (&QWP(16*5,"esp"),$inout0);
2285         &movdqa (&QWP(16*6,"esp"),$inout0);
2286
2287         &lea    ("esp",&DWP(0,$key));
2288         &mov    ($rounds,&wparam(5));           # &offset_i
2289         &mov    ($rounds_,&wparam(7));          # &checksum
2290         &movdqu (&QWP(0,$rounds),$rndkey0);
2291         &pxor   ($rndkey0,$rndkey0);
2292         &movdqu (&QWP(0,$rounds_),$rndkey1);
2293         &pxor   ($rndkey1,$rndkey1);
2294 &function_end("aesni_ocb_encrypt");
2295
######################################################################
# aesni_ocb_decrypt -- bulk OCB decryption, up to six blocks per pass.
#
# Arguments, as fetched through &wparam() below:
#   0  input pointer
#   1  output pointer
#   2  length, counted in 16-byte blocks (shifted left by 4 on entry)
#   3  key schedule; the round count is read from offset 240
#   4  start_block_num, the running block counter
#   5  &offset_i, loaded on entry and written back on exit
#   6  L_ table, indexed by 16 * (lowest set bit of the block counter)
#   7  &checksum, loaded on entry and written back on exit
#
# Register conventions inside the body:
#   $rndkey0 carries the last offset_i between stages,
#   $rndkey1 carries the running checksum between stages,
#   $out holds (out - inp), so every store is addressed as
#   &QWP(disp,$out,$inp).
#
# The stack frame is 16-byte aligned and is wiped before returning.
&function_begin("aesni_ocb_decrypt");
        &mov    ($rounds,&wparam(5));           # &offset_i
        &mov    ($rounds_,&wparam(7));          # &checksum

        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
        &mov    ($block,&wparam(4));            # start_block_num
        &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
        &mov    ($l_,&wparam(6));               # L_

        # Build the aligned frame; $rounds temporarily keeps the caller's
        # %esp until it is parked in the $esp_off slot.
        &mov    ($rounds,"esp");
        &sub    ("esp",$esp_off+4);             # alloca
        &and    ("esp",-16);                    # align stack

        &sub    ($out,$inp);                    # $out becomes out-inp displacement
        &shl    ($len,4);                       # blocks -> bytes
        &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
        &mov    (&DWP($out_off,"esp"),$out);
        &mov    (&DWP($end_off,"esp"),$len);
        &mov    (&DWP($esp_off,"esp"),$rounds);

        &mov    ($rounds,&DWP(240,$key));

        # The six-block loop assumes an odd block counter, so that the
        # 1st, 3rd and 5th block of each batch all use L_[0].  When the
        # counter is even, peel off a single block first.
        &test   ($block,1);
        &jnz    (&label("odd"));

        &bsf            ($i3,$block);
        &add            ($block,1);
        &shl            ($i3,4);
        &movdqu         ($inout5,&QWP(0,$l_,$i3));
        &mov            ($i3,$key);                     # put aside key

        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &lea            ($inp,&DWP(16,$inp));

        &pxor           ($inout5,$rndkey0);             # ^ last offset_i
        &pxor           ($inout0,$inout5);              # ^ offset_i

        &movdqa         ($inout4,$rndkey1);             # park checksum across the cipher call
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }

        &xorps          ($inout0,$inout5);              # ^ offset_i
        &movaps         ($rndkey1,$inout4);             # pass the checksum
        &movdqa         ($rndkey0,$inout5);             # pass last offset_i
        &xorps          ($rndkey1,$inout0);             # checksum
        &movups         (&QWP(-16,$out,$inp),$inout0);  # store output

        &mov            ($rounds,&DWP(240,$i3));
        &mov            ($key,$i3);                     # restore key
        &mov            ($len,&DWP($end_off,"esp"));

&set_label("odd");
        # Switch to the "twisted" key addressing used by the six-block
        # code: $key points past the schedule and the saved round value
        # is the negative byte offset 16-16*rounds.
        &shl            ($rounds,4);
        &mov            ($out,16);
        &sub            ($out,$rounds);                 # twisted rounds
        &mov            (&DWP($key_off,"esp"),$key);
        &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
        &mov            (&DWP($rounds_off,"esp"),$out);

        # $len is "end - 16*6": inp > len means fewer than six blocks left.
        &cmp            ($inp,$len);
        &ja             (&label("short"));
        &jmp            (&label("grandloop"));

&set_label("grandloop",32);
        # Table indices for the even-numbered counters block+1, +3, +5;
        # the odd ones (block+0, +2, +4) always select L_[0].
        &lea            ($i1,&DWP(1,$block));
        &lea            ($i3,&DWP(3,$block));
        &lea            ($i5,&DWP(5,$block));
        &add            ($block,6);
        &bsf            ($i1,$i1);
        &bsf            ($i3,$i3);
        &bsf            ($i5,$i5);
        &shl            ($i1,4);
        &shl            ($i3,4);
        &shl            ($i5,4);
        &movdqu         ($inout0,&QWP(0,$l_));
        &movdqu         ($inout1,&QWP(0,$l_,$i1));
        &mov            ($rounds,&DWP($rounds_off,"esp"));
        &movdqa         ($inout2,$inout0);
        &movdqu         ($inout3,&QWP(0,$l_,$i3));
        &movdqa         ($inout4,$inout0);
        &movdqu         ($inout5,&QWP(0,$l_,$i5));

        # Chain the six offsets and keep them in stack slots 0..5.
        &pxor           ($inout0,$rndkey0);             # ^ last offset_i
        &pxor           ($inout1,$inout0);
        &movdqa         (&QWP(16*0,"esp"),$inout0);
        &pxor           ($inout2,$inout1);
        &movdqa         (&QWP(16*1,"esp"),$inout1);
        &pxor           ($inout3,$inout2);
        &movdqa         (&QWP(16*2,"esp"),$inout2);
        &pxor           ($inout4,$inout3);
        &movdqa         (&QWP(16*3,"esp"),$inout3);
        &pxor           ($inout5,$inout4);
        &movdqa         (&QWP(16*4,"esp"),$inout4);
        &movdqa         (&QWP(16*5,"esp"),$inout5);

        &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &movdqu         ($inout1,&QWP(16*1,$inp));
        &movdqu         ($inout2,&QWP(16*2,$inp));
        &movdqu         ($inout3,&QWP(16*3,$inp));
        &movdqu         ($inout4,&QWP(16*4,$inp));
        &movdqu         ($inout5,&QWP(16*5,$inp));
        &lea            ($inp,&DWP(16*6,$inp));

        # $rndkey1 is needed for round keys, so the checksum moves to
        # its stack slot for the duration of the cipher rounds.
        &movdqa         (&QWP($checksum,"esp"),$rndkey1);
        &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);
        &pxor           ($inout5,$rndkey0);

        &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
        &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &pxor           ($inout1,&QWP(16*1,"esp"));
        &pxor           ($inout2,&QWP(16*2,"esp"));
        &pxor           ($inout3,&QWP(16*3,"esp"));
        &pxor           ($inout4,&QWP(16*4,"esp"));
        &pxor           ($inout5,&QWP(16*5,"esp"));

        # First round is issued here; the remaining rounds run inside
        # _aesni_decrypt6_enter.
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &aesdec         ($inout0,$rndkey1);
        &aesdec         ($inout1,$rndkey1);
        &aesdec         ($inout2,$rndkey1);
        &aesdec         ($inout3,$rndkey1);
        &aesdec         ($inout4,$rndkey1);
        &aesdec         ($inout5,$rndkey1);

        &mov            ($out,&DWP($out_off,"esp"));
        &mov            ($len,&DWP($end_off,"esp"));
        &call           ("_aesni_decrypt6_enter");

        &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
        &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &movdqa         ($rndkey1,&QWP($checksum,"esp"));
        &pxor           ($inout1,&QWP(16*1,"esp"));
        &pxor           ($inout2,&QWP(16*2,"esp"));
        &pxor           ($inout3,&QWP(16*3,"esp"));
        &pxor           ($inout4,&QWP(16*4,"esp"));
        &pxor           ($inout5,$rndkey0);

        # Fold the recovered plaintext into the checksum while storing;
        # $inp was already advanced, hence the negative displacements.
        &pxor           ($rndkey1,$inout0);             # checksum
        &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
        &pxor           ($rndkey1,$inout1);
        &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
        &pxor           ($rndkey1,$inout2);
        &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
        &pxor           ($rndkey1,$inout3);
        &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
        &pxor           ($rndkey1,$inout4);
        &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
        &pxor           ($rndkey1,$inout5);
        &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
        # inp == len means exactly six blocks are still pending.  The
        # "short" tail below handles at most five, so the loop has to
        # continue on equality, mirroring the "ja short" entry test.
        &cmp            ($inp,$len);                    # done yet?
        &jbe            (&label("grandloop"));

&set_label("short");
        # Remaining byte count = (len + 16*6) - inp; it is 0..5 blocks.
        &add            ($len,16*6);
        &sub            ($len,$inp);
        &jz             (&label("done"));

        &cmp            ($len,16*2);
        &jb             (&label("one"));
        &je             (&label("two"));

        &cmp            ($len,16*4);
        &jb             (&label("three"));
        &je             (&label("four"));

        # Five blocks left: reuse the six-lane code with a dummy lane.
        &lea            ($i1,&DWP(1,$block));
        &lea            ($i3,&DWP(3,$block));
        &bsf            ($i1,$i1);
        &bsf            ($i3,$i3);
        &shl            ($i1,4);
        &shl            ($i3,4);
        &movdqu         ($inout0,&QWP(0,$l_));
        &movdqu         ($inout1,&QWP(0,$l_,$i1));
        &mov            ($rounds,&DWP($rounds_off,"esp"));
        &movdqa         ($inout2,$inout0);
        &movdqu         ($inout3,&QWP(0,$l_,$i3));
        &movdqa         ($inout4,$inout0);

        &pxor           ($inout0,$rndkey0);             # ^ last offset_i
        &pxor           ($inout1,$inout0);
        &movdqa         (&QWP(16*0,"esp"),$inout0);
        &pxor           ($inout2,$inout1);
        &movdqa         (&QWP(16*1,"esp"),$inout1);
        &pxor           ($inout3,$inout2);
        &movdqa         (&QWP(16*2,"esp"),$inout2);
        &pxor           ($inout4,$inout3);
        &movdqa         (&QWP(16*3,"esp"),$inout3);
        &pxor           ($inout5,$inout4);              # result unused; lane is zeroed below
        &movdqa         (&QWP(16*4,"esp"),$inout4);

        &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &movdqu         ($inout1,&QWP(16*1,$inp));
        &movdqu         ($inout2,&QWP(16*2,$inp));
        &movdqu         ($inout3,&QWP(16*3,$inp));
        &movdqu         ($inout4,&QWP(16*4,$inp));
        &pxor           ($inout5,$inout5);              # dummy sixth lane, never stored

        &movdqa         (&QWP($checksum,"esp"),$rndkey1);
        &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);

        &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
        &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &pxor           ($inout1,&QWP(16*1,"esp"));
        &pxor           ($inout2,&QWP(16*2,"esp"));
        &pxor           ($inout3,&QWP(16*3,"esp"));
        &pxor           ($inout4,&QWP(16*4,"esp"));

        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &aesdec         ($inout0,$rndkey1);
        &aesdec         ($inout1,$rndkey1);
        &aesdec         ($inout2,$rndkey1);
        &aesdec         ($inout3,$rndkey1);
        &aesdec         ($inout4,$rndkey1);
        &aesdec         ($inout5,$rndkey1);

        &mov            ($out,&DWP($out_off,"esp"));
        &call           ("_aesni_decrypt6_enter");

        &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
        &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &movdqa         ($rndkey1,&QWP($checksum,"esp"));
        &pxor           ($inout1,&QWP(16*1,"esp"));
        &pxor           ($inout2,&QWP(16*2,"esp"));
        &pxor           ($inout3,&QWP(16*3,"esp"));
        &pxor           ($inout4,$rndkey0);

        &pxor           ($rndkey1,$inout0);             # checksum
        &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
        &pxor           ($rndkey1,$inout1);
        &movdqu         (&QWP(16*1,$out,$inp),$inout1);
        &pxor           ($rndkey1,$inout2);
        &movdqu         (&QWP(16*2,$out,$inp),$inout2);
        &pxor           ($rndkey1,$inout3);
        &movdqu         (&QWP(16*3,$out,$inp),$inout3);
        &pxor           ($rndkey1,$inout4);
        &movdqu         (&QWP(16*4,$out,$inp),$inout4);

        &jmp            (&label("done"));

&set_label("one",16);
        # One block left.  The 1..4-block tails call the plain
        # _aesni_decryptN helpers, so they reload the untwisted key
        # pointer and round count first.
        &movdqu         ($inout5,&QWP(0,$l_));
        &mov            ($key,&DWP($key_off,"esp"));    # restore key

        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &mov            ($rounds,&DWP(240,$key));

        &pxor           ($inout5,$rndkey0);             # ^ last offset_i
        &pxor           ($inout0,$inout5);              # ^ offset_i

        &movdqa         ($inout4,$rndkey1);             # park checksum across the cipher call
        &mov            ($out,&DWP($out_off,"esp"));
        if ($inline)
        {   &aesni_inline_generate1("dec");     }
        else
        {   &call       ("_aesni_decrypt1");    }

        &xorps          ($inout0,$inout5);              # ^ offset_i
        &movaps         ($rndkey1,$inout4);             # pass the checksum
        &movdqa         ($rndkey0,$inout5);             # pass last offset_i
        &xorps          ($rndkey1,$inout0);             # checksum
        &movups         (&QWP(0,$out,$inp),$inout0);

        &jmp            (&label("done"));

&set_label("two",16);
        # Two blocks left; offsets live in $inout4/$inout5 and the
        # checksum is parked in $inout3.
        &lea            ($i1,&DWP(1,$block));
        &mov            ($key,&DWP($key_off,"esp"));    # restore key
        &bsf            ($i1,$i1);
        &shl            ($i1,4);
        &movdqu         ($inout4,&QWP(0,$l_));
        &movdqu         ($inout5,&QWP(0,$l_,$i1));

        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &movdqu         ($inout1,&QWP(16*1,$inp));
        &mov            ($rounds,&DWP(240,$key));

        &movdqa         ($inout3,$rndkey1);
        &pxor           ($inout4,$rndkey0);             # ^ last offset_i
        &pxor           ($inout5,$inout4);

        &pxor           ($inout0,$inout4);              # ^ offset_i
        &pxor           ($inout1,$inout5);

        &mov            ($out,&DWP($out_off,"esp"));
        &call           ("_aesni_decrypt2");

        &xorps          ($inout0,$inout4);              # ^ offset_i
        &xorps          ($inout1,$inout5);
        &movdqa         ($rndkey0,$inout5);             # pass last offset_i
        &xorps          ($inout3,$inout0);              # checksum
        &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
        &xorps          ($inout3,$inout1);
        &movups         (&QWP(16*1,$out,$inp),$inout1);
        &movaps         ($rndkey1,$inout3);             # pass the checksum

        &jmp            (&label("done"));

&set_label("three",16);
        # Three blocks left; offsets live in $inout3..$inout5 and the
        # checksum goes to its stack slot.
        &lea            ($i1,&DWP(1,$block));
        &mov            ($key,&DWP($key_off,"esp"));    # restore key
        &bsf            ($i1,$i1);
        &shl            ($i1,4);
        &movdqu         ($inout3,&QWP(0,$l_));
        &movdqu         ($inout4,&QWP(0,$l_,$i1));
        &movdqa         ($inout5,$inout3);

        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &movdqu         ($inout1,&QWP(16*1,$inp));
        &movdqu         ($inout2,&QWP(16*2,$inp));
        &mov            ($rounds,&DWP(240,$key));

        &movdqa         (&QWP($checksum,"esp"),$rndkey1);
        &pxor           ($inout3,$rndkey0);             # ^ last offset_i
        &pxor           ($inout4,$inout3);
        &pxor           ($inout5,$inout4);

        &pxor           ($inout0,$inout3);              # ^ offset_i
        &pxor           ($inout1,$inout4);
        &pxor           ($inout2,$inout5);

        &mov            ($out,&DWP($out_off,"esp"));
        &call           ("_aesni_decrypt3");

        &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
        &xorps          ($inout0,$inout3);              # ^ offset_i
        &xorps          ($inout1,$inout4);
        &xorps          ($inout2,$inout5);
        &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
        &pxor           ($rndkey1,$inout0);             # checksum
        &movdqa         ($rndkey0,$inout5);             # pass last offset_i
        &movups         (&QWP(16*1,$out,$inp),$inout1);
        &pxor           ($rndkey1,$inout1);
        &movups         (&QWP(16*2,$out,$inp),$inout2);
        &pxor           ($rndkey1,$inout2);

        &jmp            (&label("done"));

&set_label("four",16);
        # Four blocks left; the first two offsets are spilled to stack
        # slots 0 and 1 because $inout2/$inout3 are reused for data.
        &lea            ($i1,&DWP(1,$block));
        &lea            ($i3,&DWP(3,$block));
        &bsf            ($i1,$i1);
        &bsf            ($i3,$i3);
        &mov            ($key,&DWP($key_off,"esp"));    # restore key
        &shl            ($i1,4);
        &shl            ($i3,4);
        &movdqu         ($inout2,&QWP(0,$l_));
        &movdqu         ($inout3,&QWP(0,$l_,$i1));
        &movdqa         ($inout4,$inout2);
        &movdqu         ($inout5,&QWP(0,$l_,$i3));

        &pxor           ($inout2,$rndkey0);             # ^ last offset_i
        &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
        &pxor           ($inout3,$inout2);
        &movdqu         ($inout1,&QWP(16*1,$inp));
        &pxor           ($inout4,$inout3);
        &movdqa         (&QWP(16*0,"esp"),$inout2);
        &pxor           ($inout5,$inout4);
        &movdqa         (&QWP(16*1,"esp"),$inout3);
        &movdqu         ($inout2,&QWP(16*2,$inp));
        &movdqu         ($inout3,&QWP(16*3,$inp));
        &mov            ($rounds,&DWP(240,$key));

        &movdqa         (&QWP($checksum,"esp"),$rndkey1);
        &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &pxor           ($inout1,&QWP(16*1,"esp"));
        &pxor           ($inout2,$inout4);
        &pxor           ($inout3,$inout5);

        &mov            ($out,&DWP($out_off,"esp"));
        &call           ("_aesni_decrypt4");

        &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
        &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
        &xorps          ($inout1,&QWP(16*1,"esp"));
        &xorps          ($inout2,$inout4);
        &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
        &pxor           ($rndkey1,$inout0);             # checksum
        &xorps          ($inout3,$inout5);
        &movups         (&QWP(16*1,$out,$inp),$inout1);
        &pxor           ($rndkey1,$inout1);
        &movdqa         ($rndkey0,$inout5);             # pass last offset_i
        &movups         (&QWP(16*2,$out,$inp),$inout2);
        &pxor           ($rndkey1,$inout2);
        &movups         (&QWP(16*3,$out,$inp),$inout3);
        &pxor           ($rndkey1,$inout3);

&set_label("done");
        # Scrub data registers and the stack scratch area, then restore
        # the caller's %esp and hand back the updated offset/checksum.
        &mov    ($key,&DWP($esp_off,"esp"));
        &pxor   ($inout0,$inout0);              # clear register bank
        &pxor   ($inout1,$inout1);
        &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
        &pxor   ($inout2,$inout2);
        &movdqa (&QWP(16*1,"esp"),$inout0);
        &pxor   ($inout3,$inout3);
        &movdqa (&QWP(16*2,"esp"),$inout0);
        &pxor   ($inout4,$inout4);
        &movdqa (&QWP(16*3,"esp"),$inout0);
        &pxor   ($inout5,$inout5);
        &movdqa (&QWP(16*4,"esp"),$inout0);
        &movdqa (&QWP(16*5,"esp"),$inout0);
        &movdqa (&QWP(16*6,"esp"),$inout0);

        &lea    ("esp",&DWP(0,$key));
        &mov    ($rounds,&wparam(5));           # &offset_i
        &mov    ($rounds_,&wparam(7));          # &checksum
        &movdqu (&QWP(0,$rounds),$rndkey0);
        &pxor   ($rndkey0,$rndkey0);
        &movdqu (&QWP(0,$rounds_),$rndkey1);
        &pxor   ($rndkey1,$rndkey1);
&function_end("aesni_ocb_decrypt");
2721 }
2722 }
2723 \f
2724 ######################################################################
2725 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
2726 #                           size_t length, const AES_KEY *key,
2727 #                           unsigned char *ivp,const int enc);
2728 &function_begin("${PREFIX}_cbc_encrypt");
2729         &mov    ($inp,&wparam(0));
2730         &mov    ($rounds_,"esp");
2731         &mov    ($out,&wparam(1));
2732         &sub    ($rounds_,24);
2733         &mov    ($len,&wparam(2));
2734         &and    ($rounds_,-16);
2735         &mov    ($key,&wparam(3));
2736         &mov    ($key_,&wparam(4));
2737         &test   ($len,$len);
2738         &jz     (&label("cbc_abort"));
2739
2740         &cmp    (&wparam(5),0);
2741         &xchg   ($rounds_,"esp");               # alloca
2742         &movups ($ivec,&QWP(0,$key_));          # load IV
2743         &mov    ($rounds,&DWP(240,$key));
2744         &mov    ($key_,$key);                   # backup $key
2745         &mov    (&DWP(16,"esp"),$rounds_);      # save original %esp
2746         &mov    ($rounds_,$rounds);             # backup $rounds
2747         &je     (&label("cbc_decrypt"));
2748
2749         &movaps ($inout0,$ivec);
2750         &cmp    ($len,16);
2751         &jb     (&label("cbc_enc_tail"));
2752         &sub    ($len,16);
2753         &jmp    (&label("cbc_enc_loop"));
2754
2755 &set_label("cbc_enc_loop",16);
2756         &movups ($ivec,&QWP(0,$inp));           # input actually
2757         &lea    ($inp,&DWP(16,$inp));
2758         if ($inline)
2759         {   &aesni_inline_generate1("enc",$inout0,$ivec);       }
2760         else
2761         {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
2762         &mov    ($rounds,$rounds_);     # restore $rounds
2763         &mov    ($key,$key_);           # restore $key
2764         &movups (&QWP(0,$out),$inout0); # store output
2765         &lea    ($out,&DWP(16,$out));
2766         &sub    ($len,16);
2767         &jnc    (&label("cbc_enc_loop"));
2768         &add    ($len,16);
2769         &jnz    (&label("cbc_enc_tail"));
2770         &movaps ($ivec,$inout0);
2771         &pxor   ($inout0,$inout0);
2772         &jmp    (&label("cbc_ret"));
2773
2774 &set_label("cbc_enc_tail");
2775         &mov    ("ecx",$len);           # zaps $rounds
2776         &data_word(0xA4F3F689);         # rep movsb
2777         &mov    ("ecx",16);             # zero tail
2778         &sub    ("ecx",$len);
2779         &xor    ("eax","eax");          # zaps $len
2780         &data_word(0xAAF3F689);         # rep stosb
2781         &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
2782         &mov    ($rounds,$rounds_);     # restore $rounds
2783         &mov    ($inp,$out);            # $inp and $out are the same
2784         &mov    ($key,$key_);           # restore $key
2785         &jmp    (&label("cbc_enc_loop"));
2786 ######################################################################
2787 &set_label("cbc_decrypt",16);
2788         &cmp    ($len,0x50);
2789         &jbe    (&label("cbc_dec_tail"));
2790         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2791         &sub    ($len,0x50);
2792         &jmp    (&label("cbc_dec_loop6_enter"));
2793
2794 &set_label("cbc_dec_loop6",16);
2795         &movaps (&QWP(0,"esp"),$rndkey0);       # save IV
2796         &movups (&QWP(0,$out),$inout5);
2797         &lea    ($out,&DWP(0x10,$out));
2798 &set_label("cbc_dec_loop6_enter");
2799         &movdqu ($inout0,&QWP(0,$inp));
2800         &movdqu ($inout1,&QWP(0x10,$inp));
2801         &movdqu ($inout2,&QWP(0x20,$inp));
2802         &movdqu ($inout3,&QWP(0x30,$inp));
2803         &movdqu ($inout4,&QWP(0x40,$inp));
2804         &movdqu ($inout5,&QWP(0x50,$inp));
2805
2806         &call   ("_aesni_decrypt6");
2807
2808         &movups ($rndkey1,&QWP(0,$inp));
2809         &movups ($rndkey0,&QWP(0x10,$inp));
2810         &xorps  ($inout0,&QWP(0,"esp"));        # ^=IV
2811         &xorps  ($inout1,$rndkey1);
2812         &movups ($rndkey1,&QWP(0x20,$inp));
2813         &xorps  ($inout2,$rndkey0);
2814         &movups ($rndkey0,&QWP(0x30,$inp));
2815         &xorps  ($inout3,$rndkey1);
2816         &movups ($rndkey1,&QWP(0x40,$inp));
2817         &xorps  ($inout4,$rndkey0);
2818         &movups ($rndkey0,&QWP(0x50,$inp));     # IV
2819         &xorps  ($inout5,$rndkey1);
2820         &movups (&QWP(0,$out),$inout0);
2821         &movups (&QWP(0x10,$out),$inout1);
2822         &lea    ($inp,&DWP(0x60,$inp));
2823         &movups (&QWP(0x20,$out),$inout2);
2824         &mov    ($rounds,$rounds_);             # restore $rounds
2825         &movups (&QWP(0x30,$out),$inout3);
2826         &mov    ($key,$key_);                   # restore $key
2827         &movups (&QWP(0x40,$out),$inout4);
2828         &lea    ($out,&DWP(0x50,$out));
2829         &sub    ($len,0x60);
2830         &ja     (&label("cbc_dec_loop6"));
2831
2832         &movaps ($inout0,$inout5);
2833         &movaps ($ivec,$rndkey0);
2834         &add    ($len,0x50);
2835         &jle    (&label("cbc_dec_clear_tail_collected"));
2836         &movups (&QWP(0,$out),$inout0);
2837         &lea    ($out,&DWP(0x10,$out));
2838 &set_label("cbc_dec_tail");
2839         &movups ($inout0,&QWP(0,$inp));
2840         &movaps ($in0,$inout0);
2841         &cmp    ($len,0x10);
2842         &jbe    (&label("cbc_dec_one"));
2843
2844         &movups ($inout1,&QWP(0x10,$inp));
2845         &movaps ($in1,$inout1);
2846         &cmp    ($len,0x20);
2847         &jbe    (&label("cbc_dec_two"));
2848
2849         &movups ($inout2,&QWP(0x20,$inp));
2850         &cmp    ($len,0x30);
2851         &jbe    (&label("cbc_dec_three"));
2852
2853         &movups ($inout3,&QWP(0x30,$inp));
2854         &cmp    ($len,0x40);
2855         &jbe    (&label("cbc_dec_four"));
2856
2857         &movups ($inout4,&QWP(0x40,$inp));
2858         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2859         &movups ($inout0,&QWP(0,$inp));
2860         &xorps  ($inout5,$inout5);
2861         &call   ("_aesni_decrypt6");
2862         &movups ($rndkey1,&QWP(0,$inp));
2863         &movups ($rndkey0,&QWP(0x10,$inp));
2864         &xorps  ($inout0,&QWP(0,"esp"));        # ^= IV
2865         &xorps  ($inout1,$rndkey1);
2866         &movups ($rndkey1,&QWP(0x20,$inp));
2867         &xorps  ($inout2,$rndkey0);
2868         &movups ($rndkey0,&QWP(0x30,$inp));
2869         &xorps  ($inout3,$rndkey1);
2870         &movups ($ivec,&QWP(0x40,$inp));        # IV
2871         &xorps  ($inout4,$rndkey0);
2872         &movups (&QWP(0,$out),$inout0);
2873         &movups (&QWP(0x10,$out),$inout1);
2874         &pxor   ($inout1,$inout1);
2875         &movups (&QWP(0x20,$out),$inout2);
2876         &pxor   ($inout2,$inout2);
2877         &movups (&QWP(0x30,$out),$inout3);
2878         &pxor   ($inout3,$inout3);
2879         &lea    ($out,&DWP(0x40,$out));
2880         &movaps ($inout0,$inout4);
2881         &pxor   ($inout4,$inout4);
2882         &sub    ($len,0x50);
2883         &jmp    (&label("cbc_dec_tail_collected"));
2884
2885 &set_label("cbc_dec_one",16);
2886         if ($inline)
2887         {   &aesni_inline_generate1("dec");     }
2888         else
2889         {   &call       ("_aesni_decrypt1");    }
2890         &xorps  ($inout0,$ivec);
2891         &movaps ($ivec,$in0);
2892         &sub    ($len,0x10);
2893         &jmp    (&label("cbc_dec_tail_collected"));
2894
2895 &set_label("cbc_dec_two",16);
2896         &call   ("_aesni_decrypt2");
2897         &xorps  ($inout0,$ivec);
2898         &xorps  ($inout1,$in0);
2899         &movups (&QWP(0,$out),$inout0);
2900         &movaps ($inout0,$inout1);
2901         &pxor   ($inout1,$inout1);
2902         &lea    ($out,&DWP(0x10,$out));
2903         &movaps ($ivec,$in1);
2904         &sub    ($len,0x20);
2905         &jmp    (&label("cbc_dec_tail_collected"));
2906
2907 &set_label("cbc_dec_three",16);
2908         &call   ("_aesni_decrypt3");
2909         &xorps  ($inout0,$ivec);
2910         &xorps  ($inout1,$in0);
2911         &xorps  ($inout2,$in1);
2912         &movups (&QWP(0,$out),$inout0);
2913         &movaps ($inout0,$inout2);
2914         &pxor   ($inout2,$inout2);
2915         &movups (&QWP(0x10,$out),$inout1);
2916         &pxor   ($inout1,$inout1);
2917         &lea    ($out,&DWP(0x20,$out));
2918         &movups ($ivec,&QWP(0x20,$inp));
2919         &sub    ($len,0x30);
2920         &jmp    (&label("cbc_dec_tail_collected"));
2921
2922 &set_label("cbc_dec_four",16);
2923         &call   ("_aesni_decrypt4");
2924         &movups ($rndkey1,&QWP(0x10,$inp));
2925         &movups ($rndkey0,&QWP(0x20,$inp));
2926         &xorps  ($inout0,$ivec);
2927         &movups ($ivec,&QWP(0x30,$inp));
2928         &xorps  ($inout1,$in0);
2929         &movups (&QWP(0,$out),$inout0);
2930         &xorps  ($inout2,$rndkey1);
2931         &movups (&QWP(0x10,$out),$inout1);
2932         &pxor   ($inout1,$inout1);
2933         &xorps  ($inout3,$rndkey0);
2934         &movups (&QWP(0x20,$out),$inout2);
2935         &pxor   ($inout2,$inout2);
2936         &lea    ($out,&DWP(0x30,$out));
2937         &movaps ($inout0,$inout3);
2938         &pxor   ($inout3,$inout3);
2939         &sub    ($len,0x40);
2940         &jmp    (&label("cbc_dec_tail_collected"));
2941
2942 &set_label("cbc_dec_clear_tail_collected",16);
2943         &pxor   ($inout1,$inout1);
2944         &pxor   ($inout2,$inout2);
2945         &pxor   ($inout3,$inout3);
2946         &pxor   ($inout4,$inout4);
2947 &set_label("cbc_dec_tail_collected");
2948         &and    ($len,15);
2949         &jnz    (&label("cbc_dec_tail_partial"));
2950         &movups (&QWP(0,$out),$inout0);
2951         &pxor   ($rndkey0,$rndkey0);
2952         &jmp    (&label("cbc_ret"));
2953
2954 &set_label("cbc_dec_tail_partial",16);
2955         &movaps (&QWP(0,"esp"),$inout0);
2956         &pxor   ($rndkey0,$rndkey0);
2957         &mov    ("ecx",16);
2958         &mov    ($inp,"esp");
2959         &sub    ("ecx",$len);
2960         &data_word(0xA4F3F689);         # rep movsb
2961         &movdqa (&QWP(0,"esp"),$inout0);
2962
2963 &set_label("cbc_ret");
2964         &mov    ("esp",&DWP(16,"esp")); # pull original %esp
2965         &mov    ($key_,&wparam(4));
2966         &pxor   ($inout0,$inout0);
2967         &pxor   ($rndkey1,$rndkey1);
2968         &movups (&QWP(0,$key_),$ivec);  # output IV
2969         &pxor   ($ivec,$ivec);
2970 &set_label("cbc_abort");
2971 &function_end("${PREFIX}_cbc_encrypt");
2972 \f
2973 ######################################################################
2974 # Mechanical port from aesni-x86_64.pl.
2975 #
2976 # _aesni_set_encrypt_key is a private interface,
2977 # input:
2978 #       "eax"   const unsigned char *userKey
2979 #       $rounds int bits
2980 #       $key    AES_KEY *key
2981 # output:
2982 #       "eax"   return code
2983 #       $rounds rounds
2984
2985 &function_begin_B("_aesni_set_encrypt_key");
2986         &push   ("ebp");
2987         &push   ("ebx");
2988         &test   ("eax","eax");
2989         &jz     (&label("bad_pointer"));
2990         &test   ($key,$key);
2991         &jz     (&label("bad_pointer"));
2992
2993         &call   (&label("pic"));
2994 &set_label("pic");
2995         &blindpop("ebx");
2996         &lea    ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2997
2998         &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2999         &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
3000         &xorps  ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
3001         &mov    ("ebp",&DWP(4,"ebp"));
3002         &lea    ($key,&DWP(16,$key));
3003         &and    ("ebp",1<<28|1<<11);    # AVX and XOP bits
3004         &cmp    ($rounds,256);
3005         &je     (&label("14rounds"));
3006         &cmp    ($rounds,192);
3007         &je     (&label("12rounds"));
3008         &cmp    ($rounds,128);
3009         &jne    (&label("bad_keybits"));
3010
3011 &set_label("10rounds",16);
3012         &cmp            ("ebp",1<<28);
3013         &je             (&label("10rounds_alt"));
3014
3015         &mov            ($rounds,9);
3016         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3017         &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
3018         &call           (&label("key_128_cold"));
3019         &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
3020         &call           (&label("key_128"));
3021         &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
3022         &call           (&label("key_128"));
3023         &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
3024         &call           (&label("key_128"));
3025         &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
3026         &call           (&label("key_128"));
3027         &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
3028         &call           (&label("key_128"));
3029         &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
3030         &call           (&label("key_128"));
3031         &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
3032         &call           (&label("key_128"));
3033         &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
3034         &call           (&label("key_128"));
3035         &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
3036         &call           (&label("key_128"));
3037         &$movekey       (&QWP(0,$key),"xmm0");
3038         &mov            (&DWP(80,$key),$rounds);
3039
3040         &jmp    (&label("good_key"));
3041
3042 &set_label("key_128",16);
3043         &$movekey       (&QWP(0,$key),"xmm0");
3044         &lea            ($key,&DWP(16,$key));
3045 &set_label("key_128_cold");
3046         &shufps         ("xmm4","xmm0",0b00010000);
3047         &xorps          ("xmm0","xmm4");
3048         &shufps         ("xmm4","xmm0",0b10001100);
3049         &xorps          ("xmm0","xmm4");
3050         &shufps         ("xmm1","xmm1",0b11111111);     # critical path
3051         &xorps          ("xmm0","xmm1");
3052         &ret();
3053
3054 &set_label("10rounds_alt",16);
3055         &movdqa         ("xmm5",&QWP(0x00,"ebx"));
3056         &mov            ($rounds,8);
3057         &movdqa         ("xmm4",&QWP(0x20,"ebx"));
3058         &movdqa         ("xmm2","xmm0");
3059         &movdqu         (&QWP(-16,$key),"xmm0");
3060
3061 &set_label("loop_key128");
3062         &pshufb         ("xmm0","xmm5");
3063         &aesenclast     ("xmm0","xmm4");
3064         &pslld          ("xmm4",1);
3065         &lea            ($key,&DWP(16,$key));
3066
3067         &movdqa         ("xmm3","xmm2");
3068         &pslldq         ("xmm2",4);
3069         &pxor           ("xmm3","xmm2");
3070         &pslldq         ("xmm2",4);
3071         &pxor           ("xmm3","xmm2");
3072         &pslldq         ("xmm2",4);
3073         &pxor           ("xmm2","xmm3");
3074
3075         &pxor           ("xmm0","xmm2");
3076         &movdqu         (&QWP(-16,$key),"xmm0");
3077         &movdqa         ("xmm2","xmm0");
3078
3079         &dec            ($rounds);
3080         &jnz            (&label("loop_key128"));
3081
3082         &movdqa         ("xmm4",&QWP(0x30,"ebx"));
3083
3084         &pshufb         ("xmm0","xmm5");
3085         &aesenclast     ("xmm0","xmm4");
3086         &pslld          ("xmm4",1);
3087
3088         &movdqa         ("xmm3","xmm2");
3089         &pslldq         ("xmm2",4);
3090         &pxor           ("xmm3","xmm2");
3091         &pslldq         ("xmm2",4);
3092         &pxor           ("xmm3","xmm2");
3093         &pslldq         ("xmm2",4);
3094         &pxor           ("xmm2","xmm3");
3095
3096         &pxor           ("xmm0","xmm2");
3097         &movdqu         (&QWP(0,$key),"xmm0");
3098
3099         &movdqa         ("xmm2","xmm0");
3100         &pshufb         ("xmm0","xmm5");
3101         &aesenclast     ("xmm0","xmm4");
3102
3103         &movdqa         ("xmm3","xmm2");
3104         &pslldq         ("xmm2",4);
3105         &pxor           ("xmm3","xmm2");
3106         &pslldq         ("xmm2",4);
3107         &pxor           ("xmm3","xmm2");
3108         &pslldq         ("xmm2",4);
3109         &pxor           ("xmm2","xmm3");
3110
3111         &pxor           ("xmm0","xmm2");
3112         &movdqu         (&QWP(16,$key),"xmm0");
3113
3114         &mov            ($rounds,9);
3115         &mov            (&DWP(96,$key),$rounds);
3116
3117         &jmp    (&label("good_key"));
3118
3119 &set_label("12rounds",16);
3120         &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
3121         &cmp            ("ebp",1<<28);
3122         &je             (&label("12rounds_alt"));
3123
3124         &mov            ($rounds,11);
3125         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3126         &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
3127         &call           (&label("key_192a_cold"));
3128         &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
3129         &call           (&label("key_192b"));
3130         &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
3131         &call           (&label("key_192a"));
3132         &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
3133         &call           (&label("key_192b"));
3134         &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
3135         &call           (&label("key_192a"));
3136         &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
3137         &call           (&label("key_192b"));
3138         &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
3139         &call           (&label("key_192a"));
3140         &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
3141         &call           (&label("key_192b"));
3142         &$movekey       (&QWP(0,$key),"xmm0");
3143         &mov            (&DWP(48,$key),$rounds);
3144
3145         &jmp    (&label("good_key"));
3146
3147 &set_label("key_192a",16);
3148         &$movekey       (&QWP(0,$key),"xmm0");
3149         &lea            ($key,&DWP(16,$key));
3150 &set_label("key_192a_cold",16);
3151         &movaps         ("xmm5","xmm2");
3152 &set_label("key_192b_warm");
3153         &shufps         ("xmm4","xmm0",0b00010000);
3154         &movdqa         ("xmm3","xmm2");
3155         &xorps          ("xmm0","xmm4");
3156         &shufps         ("xmm4","xmm0",0b10001100);
3157         &pslldq         ("xmm3",4);
3158         &xorps          ("xmm0","xmm4");
3159         &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
3160         &pxor           ("xmm2","xmm3");
3161         &pxor           ("xmm0","xmm1");
3162         &pshufd         ("xmm3","xmm0",0b11111111);
3163         &pxor           ("xmm2","xmm3");
3164         &ret();
3165
3166 &set_label("key_192b",16);
3167         &movaps         ("xmm3","xmm0");
3168         &shufps         ("xmm5","xmm0",0b01000100);
3169         &$movekey       (&QWP(0,$key),"xmm5");
3170         &shufps         ("xmm3","xmm2",0b01001110);
3171         &$movekey       (&QWP(16,$key),"xmm3");
3172         &lea            ($key,&DWP(32,$key));
3173         &jmp            (&label("key_192b_warm"));
3174
3175 &set_label("12rounds_alt",16);
3176         &movdqa         ("xmm5",&QWP(0x10,"ebx"));
3177         &movdqa         ("xmm4",&QWP(0x20,"ebx"));
3178         &mov            ($rounds,8);
3179         &movdqu         (&QWP(-16,$key),"xmm0");
3180
3181 &set_label("loop_key192");
3182         &movq           (&QWP(0,$key),"xmm2");
3183         &movdqa         ("xmm1","xmm2");
3184         &pshufb         ("xmm2","xmm5");
3185         &aesenclast     ("xmm2","xmm4");
3186         &pslld          ("xmm4",1);
3187         &lea            ($key,&DWP(24,$key));
3188
3189         &movdqa         ("xmm3","xmm0");
3190         &pslldq         ("xmm0",4);
3191         &pxor           ("xmm3","xmm0");
3192         &pslldq         ("xmm0",4);
3193         &pxor           ("xmm3","xmm0");
3194         &pslldq         ("xmm0",4);
3195         &pxor           ("xmm0","xmm3");
3196
3197         &pshufd         ("xmm3","xmm0",0xff);
3198         &pxor           ("xmm3","xmm1");
3199         &pslldq         ("xmm1",4);
3200         &pxor           ("xmm3","xmm1");
3201
3202         &pxor           ("xmm0","xmm2");
3203         &pxor           ("xmm2","xmm3");
3204         &movdqu         (&QWP(-16,$key),"xmm0");
3205
3206         &dec            ($rounds);
3207         &jnz            (&label("loop_key192"));
3208
3209         &mov    ($rounds,11);
3210         &mov    (&DWP(32,$key),$rounds);