x86[_64] assembly pack: add optimized AES-NI OCB subroutines.
[openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
14 #
15 # Performance.
16 #
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
23 #
24 #       16-byte     64-byte     256-byte    1-KB        8-KB
25 #       53-67%      67-84%      91-94%      95-98%      97-99.5%
26 #
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
31
32 # January 2011
33 #
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
39 # module.
40
41 # April 2011
42 #
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
45
46 # November 2015
47 #
48 # Add aesni_ocb_[en|de]crypt.
49
50 ######################################################################
51 # Current large-block performance in cycles per byte processed with
52 # 128-bit key (less is better).
53 #
54 #               CBC en-/decrypt CTR     XTS     ECB     OCB
55 # Westmere      3.77/1.37       1.37    1.52    1.27
56 # * Bridge      5.07/0.98       0.99    1.09    0.91    1.10
57 # Haswell       4.44/0.80       0.97    1.03    0.72    0.76
58 # Silvermont    5.77/3.56       3.67    4.03    3.46    4.03
59 # Bulldozer     5.80/0.98       1.05    1.24    0.93    1.23
60
# Build-time configuration, perlasm bootstrap and the register aliases that
# every code generator below refers to.
61 $PREFIX="aesni";        # if $PREFIX is set to "AES", the script
62                         # generates drop-in replacement for
63                         # crypto/aes/asm/aes-586.pl:-)
64 $inline=1;              # inline _aesni_[en|de]crypt
65
# Capture the directory part of this script's path so that x86asm.pl is
# searched for both next to the script and under ../../perlasm.
66 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
67 push(@INC,"${dir}","${dir}../../perlasm");
68 require "x86asm.pl";
69
70 &asm_init($ARGV[0],$0);
71
72 &external_label("OPENSSL_ia32cap_P");
73 &static_label("key_const");
74
# $movekey is the emitter used for round-key loads. Both branches select
# movups, so the choice does not currently depend on $PREFIX.
75 if ($PREFIX eq "aesni") { $movekey=\&movups; }
76 else                    { $movekey=\&movups; }
77
# General-purpose register roles.
78 $len="eax";
79 $rounds="ecx";
80 $key="edx";
81 $inp="esi";
82 $out="edi";
83 $rounds_="ebx"; # backup copy for $rounds
84 $key_="ebp";    # backup copy for $key
85
# XMM register roles. xmm5, xmm6 and xmm7 each carry two aliases, so
# $inout3/$in1, $inout4/$in0 and $inout5/$ivec cannot be live simultaneously.
86 $rndkey0="xmm0";
87 $rndkey1="xmm1";
88 $inout0="xmm2";
89 $inout1="xmm3";
90 $inout2="xmm4";
91 $inout3="xmm5"; $in1="xmm5";
92 $inout4="xmm6"; $in0="xmm6";
93 $inout5="xmm7"; $ivec="xmm7";
94
95 # AESNI extension
# Emit AESKEYGENASSIST as raw bytes: 66 0F 3A DF, a register-direct ModR/M
# byte built from the two register numbers, then the immediate.
#   $dst, $src - "xmm0".."xmm7" register names
#   $imm       - 8-bit immediate appended as the last byte
# NOTE: if either operand does not match xmm[0-7] nothing is emitted and no
# error is raised.
96 sub aeskeygenassist
97 { my($dst,$src,$imm)=@_;
98     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
99     {   &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm);   }
100 }
# Shared encoder for the register-to-register AES-NI instructions: emits
# 66 0F 38 <opcodelet> followed by a register-direct ModR/M byte.
#   $opcodelet - final opcode byte selecting the instruction
#   $dst, $src - "xmm0".."xmm7" register names
# NOTE: operands that do not match xmm[0-7] are silently ignored, i.e. no
# bytes are emitted.
101 sub aescommon
102 { my($opcodelet,$dst,$src)=@_;
103     if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
104     {   &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
105 }
# Per-instruction wrappers: each forwards its ($dst,$src) arguments to
# aescommon with the opcode byte for that instruction (0xdb..0xdf).
106 sub aesimc      { aescommon(0xdb,@_); }
107 sub aesenc      { aescommon(0xdc,@_); }
108 sub aesenclast  { aescommon(0xdd,@_); }
109 sub aesdec      { aescommon(0xde,@_); }
110 sub aesdeclast  { aescommon(0xdf,@_); }
111 \f
112 # Inline version of internal aesni_[en|de]crypt1
# Emit, at the point of call, a looped single-block AES transformation.
#   $p     - "enc" or "dec"; selects aes${p}/aes${p}last and the label prefix
#   $inout - register holding the block, defaults to $inout0
#   $ivec  - optional register; when given it is xored with round key 0 and
#            then into $inout, replacing the plain round-key-0 whitening
# The emitted code advances $key and counts $rounds down to zero, so callers
# that need either value afterwards must keep their own copy.
# $sn is private to this sub and numbers the expansions so that each emitted
# loop label is unique.
113 { my $sn;
114 sub aesni_inline_generate1
115 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
116   $sn++;
117
118     &$movekey           ($rndkey0,&QWP(0,$key));
119     &$movekey           ($rndkey1,&QWP(16,$key));
120     &xorps              ($ivec,$rndkey0)        if (defined($ivec));
121     &lea                ($key,&DWP(32,$key));
122     &xorps              ($inout,$ivec)          if (defined($ivec));
123     &xorps              ($inout,$rndkey0)       if (!defined($ivec));
124     &set_label("${p}1_loop_$sn");
125         eval"&aes${p}   ($inout,$rndkey1)";
126         &dec            ($rounds);
            # the key load and lea below leave the flags from dec intact
            # for the jnz
127         &$movekey       ($rndkey1,&QWP(0,$key));
128         &lea            ($key,&DWP(16,$key));
129     &jnz                (&label("${p}1_loop_$sn"));
130     eval"&aes${p}last   ($inout,$rndkey1)";
131 }}
132
# Emit the out-of-line helper _aesni_${p}rypt1: a fully unrolled single-block
# AES transformation.
#   $p     - "enc" or "dec"
#   $inout - register holding the block, defaults to $inout0
# The emitted routine compares $rounds with 11 and enters the unrolled round
# sequence at one of three points: the "${p}128" label when below 11, the
# "${p}192" label when equal, and the top of the sequence otherwise. $key is
# advanced so the shared tail can use fixed offsets from it; $key is therefore
# modified by the routine.
133 sub aesni_generate1     # fully unrolled loop
134 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
135
136     &function_begin_B("_aesni_${p}rypt1");
137         &movups         ($rndkey0,&QWP(0,$key));
138         &$movekey       ($rndkey1,&QWP(0x10,$key));
139         &xorps          ($inout,$rndkey0);
140         &$movekey       ($rndkey0,&QWP(0x20,$key));
141         &lea            ($key,&DWP(0x30,$key));
142         &cmp            ($rounds,11);
143         &jb             (&label("${p}128"));
            # lea does not touch flags, so je still tests the cmp above
144         &lea            ($key,&DWP(0x20,$key));
145         &je             (&label("${p}192"));
146         &lea            ($key,&DWP(0x20,$key));
147         eval"&aes${p}   ($inout,$rndkey1)";
148         &$movekey       ($rndkey1,&QWP(-0x40,$key));
149         eval"&aes${p}   ($inout,$rndkey0)";
150         &$movekey       ($rndkey0,&QWP(-0x30,$key));
151     &set_label("${p}192");
152         eval"&aes${p}   ($inout,$rndkey1)";
153         &$movekey       ($rndkey1,&QWP(-0x20,$key));
154         eval"&aes${p}   ($inout,$rndkey0)";
155         &$movekey       ($rndkey0,&QWP(-0x10,$key));
156     &set_label("${p}128");
            # common tail: nine more rounds followed by the final round
157         eval"&aes${p}   ($inout,$rndkey1)";
158         &$movekey       ($rndkey1,&QWP(0,$key));
159         eval"&aes${p}   ($inout,$rndkey0)";
160         &$movekey       ($rndkey0,&QWP(0x10,$key));
161         eval"&aes${p}   ($inout,$rndkey1)";
162         &$movekey       ($rndkey1,&QWP(0x20,$key));
163         eval"&aes${p}   ($inout,$rndkey0)";
164         &$movekey       ($rndkey0,&QWP(0x30,$key));
165         eval"&aes${p}   ($inout,$rndkey1)";
166         &$movekey       ($rndkey1,&QWP(0x40,$key));
167         eval"&aes${p}   ($inout,$rndkey0)";
168         &$movekey       ($rndkey0,&QWP(0x50,$key));
169         eval"&aes${p}   ($inout,$rndkey1)";
170         &$movekey       ($rndkey1,&QWP(0x60,$key));
171         eval"&aes${p}   ($inout,$rndkey0)";
172         &$movekey       ($rndkey0,&QWP(0x70,$key));
173         eval"&aes${p}   ($inout,$rndkey1)";
174     eval"&aes${p}last   ($inout,$rndkey0)";
175     &ret();
176     &function_end_B("_aesni_${p}rypt1");
177 }
178 \f
179 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Encrypts one 16-byte block read from inp and stores the result at out.
# The round count is read from offset 240 of the key structure. The
# out-of-line _aesni_encrypt1 helper is generated only when $inline is off;
# otherwise the single-block sequence is expanded in place.
180 &aesni_generate1("enc") if (!$inline);
181 &function_begin_B("${PREFIX}_encrypt");
182         &mov    ("eax",&wparam(0));
183         &mov    ($key,&wparam(2));
184         &movups ($inout0,&QWP(0,"eax"));
185         &mov    ($rounds,&DWP(240,$key));
186         &mov    ("eax",&wparam(1));             # eax is reused for out
187         if ($inline)
188         {   &aesni_inline_generate1("enc");     }
189         else
190         {   &call       ("_aesni_encrypt1");    }
191         &pxor   ($rndkey0,$rndkey0);            # clear register bank
192         &pxor   ($rndkey1,$rndkey1);
193         &movups (&QWP(0,"eax"),$inout0);
194         &pxor   ($inout0,$inout0);
195         &ret    ();
196 &function_end_B("${PREFIX}_encrypt");
197
198 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Decrypts one 16-byte block read from inp and stores the result at out.
# The round count is read from offset 240 of the key structure. The
# out-of-line _aesni_decrypt1 helper is generated only when $inline is off;
# otherwise the single-block sequence is expanded in place.
199 &aesni_generate1("dec") if(!$inline);
200 &function_begin_B("${PREFIX}_decrypt");
201         &mov    ("eax",&wparam(0));
202         &mov    ($key,&wparam(2));
203         &movups ($inout0,&QWP(0,"eax"));
204         &mov    ($rounds,&DWP(240,$key));
205         &mov    ("eax",&wparam(1));             # eax is reused for out
206         if ($inline)
207         {   &aesni_inline_generate1("dec");     }
208         else
209         {   &call       ("_aesni_decrypt1");    }
210         &pxor   ($rndkey0,$rndkey0);            # clear register bank
211         &pxor   ($rndkey1,$rndkey1);
212         &movups (&QWP(0,"eax"),$inout0);
213         &pxor   ($inout0,$inout0);
214         &ret    ();
215 &function_end_B("${PREFIX}_decrypt");
216
217 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
218 # factor. Why were 3x subroutines originally used in loops? Even though
219 # aes[enc|dec] latency was originally 6, it could be scheduled only
220 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
221 # utilization, i.e. when subroutine's throughput is virtually same as
222 # of non-interleaved subroutine [for number of input blocks up to 3].
223 # This is why it originally made no sense to implement 2x subroutine.
224 # But times change and it became appropriate to spend extra 192 bytes
225 # on 2x subroutine on Atom Silvermont account. For processors that
226 # can schedule aes[enc|dec] every cycle optimal interleave factor
227 # equals to corresponding instructions latency. 8x is optimal for
228 # * Bridge, but it's unfeasible to accommodate such implementation
229 # in XMM registers addressable in 32-bit mode and therefore maximum
230 # of 6x is used instead...
231
# Emit _aesni_${p}rypt2, which transforms $inout0 and $inout1 in parallel.
#   $p - "enc" or "dec"
# The emitted routine scales $rounds by 16, points $key past the schedule at
# key+32+16*rounds and turns $rounds into a negative byte index (16-16*rounds)
# that the loop steps up by 32, i.e. two round keys per iteration, until it
# reaches zero. $key and $rounds are both modified.
232 sub aesni_generate2
233 { my $p=shift;
234
235     &function_begin_B("_aesni_${p}rypt2");
236         &$movekey       ($rndkey0,&QWP(0,$key));
237         &shl            ($rounds,4);
238         &$movekey       ($rndkey1,&QWP(16,$key));
239         &xorps          ($inout0,$rndkey0);
240         &pxor           ($inout1,$rndkey0);
241         &$movekey       ($rndkey0,&QWP(32,$key));
242         &lea            ($key,&DWP(32,$key,$rounds));
243         &neg            ($rounds);
244         &add            ($rounds,16);
245
246     &set_label("${p}2_loop");
247         eval"&aes${p}   ($inout0,$rndkey1)";
248         eval"&aes${p}   ($inout1,$rndkey1)";
249         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
250         &add            ($rounds,32);
251         eval"&aes${p}   ($inout0,$rndkey0)";
252         eval"&aes${p}   ($inout1,$rndkey0)";
253         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
            # jnz tests the add above; nothing in between alters the flags
254         &jnz            (&label("${p}2_loop"));
255     eval"&aes${p}       ($inout0,$rndkey1)";
256     eval"&aes${p}       ($inout1,$rndkey1)";
257     eval"&aes${p}last   ($inout0,$rndkey0)";
258     eval"&aes${p}last   ($inout1,$rndkey0)";
259     &ret();
260     &function_end_B("_aesni_${p}rypt2");
261 }
262
# Emit _aesni_${p}rypt3, which transforms $inout0..$inout2 in parallel.
#   $p - "enc" or "dec"
# Same structure as the 2x generator: $key is moved to key+32+16*rounds and
# $rounds becomes a negative byte index stepped by 32 (two round keys per
# iteration) until it reaches zero. $key and $rounds are both modified.
263 sub aesni_generate3
264 { my $p=shift;
265
266     &function_begin_B("_aesni_${p}rypt3");
267         &$movekey       ($rndkey0,&QWP(0,$key));
268         &shl            ($rounds,4);
269         &$movekey       ($rndkey1,&QWP(16,$key));
270         &xorps          ($inout0,$rndkey0);
271         &pxor           ($inout1,$rndkey0);
272         &pxor           ($inout2,$rndkey0);
273         &$movekey       ($rndkey0,&QWP(32,$key));
274         &lea            ($key,&DWP(32,$key,$rounds));
275         &neg            ($rounds);
276         &add            ($rounds,16);
277
278     &set_label("${p}3_loop");
279         eval"&aes${p}   ($inout0,$rndkey1)";
280         eval"&aes${p}   ($inout1,$rndkey1)";
281         eval"&aes${p}   ($inout2,$rndkey1)";
282         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
283         &add            ($rounds,32);
284         eval"&aes${p}   ($inout0,$rndkey0)";
285         eval"&aes${p}   ($inout1,$rndkey0)";
286         eval"&aes${p}   ($inout2,$rndkey0)";
287         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
            # jnz tests the add above; nothing in between alters the flags
288         &jnz            (&label("${p}3_loop"));
289     eval"&aes${p}       ($inout0,$rndkey1)";
290     eval"&aes${p}       ($inout1,$rndkey1)";
291     eval"&aes${p}       ($inout2,$rndkey1)";
292     eval"&aes${p}last   ($inout0,$rndkey0)";
293     eval"&aes${p}last   ($inout1,$rndkey0)";
294     eval"&aes${p}last   ($inout2,$rndkey0)";
295     &ret();
296     &function_end_B("_aesni_${p}rypt3");
297 }
298
299 # 4x interleave is implemented to improve small block performance,
300 # most notably [and naturally] 4 block by ~30%. One can argue that one
301 # should have implemented 5x as well, but improvement would be <20%,
302 # so it's not worth it...
# Emit _aesni_${p}rypt4, which transforms $inout0..$inout3 in parallel.
#   $p - "enc" or "dec"
# Same structure as the 2x/3x generators: $key is moved to key+32+16*rounds
# and $rounds becomes a negative byte index stepped by 32 (two round keys per
# iteration) until it reaches zero. $key and $rounds are both modified.
303 sub aesni_generate4
304 { my $p=shift;
305
306     &function_begin_B("_aesni_${p}rypt4");
307         &$movekey       ($rndkey0,&QWP(0,$key));
308         &$movekey       ($rndkey1,&QWP(16,$key));
309         &shl            ($rounds,4);
310         &xorps          ($inout0,$rndkey0);
311         &pxor           ($inout1,$rndkey0);
312         &pxor           ($inout2,$rndkey0);
313         &pxor           ($inout3,$rndkey0);
314         &$movekey       ($rndkey0,&QWP(32,$key));
315         &lea            ($key,&DWP(32,$key,$rounds));
316         &neg            ($rounds);
            # 0F 1F 40 00 is a 4-byte NOP; presumably padding that places
            # the loop label on a favourable boundary - TODO confirm
317         &data_byte      (0x0f,0x1f,0x40,0x00);
318         &add            ($rounds,16);
319
320     &set_label("${p}4_loop");
321         eval"&aes${p}   ($inout0,$rndkey1)";
322         eval"&aes${p}   ($inout1,$rndkey1)";
323         eval"&aes${p}   ($inout2,$rndkey1)";
324         eval"&aes${p}   ($inout3,$rndkey1)";
325         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
326         &add            ($rounds,32);
327         eval"&aes${p}   ($inout0,$rndkey0)";
328         eval"&aes${p}   ($inout1,$rndkey0)";
329         eval"&aes${p}   ($inout2,$rndkey0)";
330         eval"&aes${p}   ($inout3,$rndkey0)";
331         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        # jnz tests the add above; nothing in between alters the flags
332     &jnz                (&label("${p}4_loop"));
333
334     eval"&aes${p}       ($inout0,$rndkey1)";
335     eval"&aes${p}       ($inout1,$rndkey1)";
336     eval"&aes${p}       ($inout2,$rndkey1)";
337     eval"&aes${p}       ($inout3,$rndkey1)";
338     eval"&aes${p}last   ($inout0,$rndkey0)";
339     eval"&aes${p}last   ($inout1,$rndkey0)";
340     eval"&aes${p}last   ($inout2,$rndkey0)";
341     eval"&aes${p}last   ($inout3,$rndkey0)";
342     &ret();
343     &function_end_B("_aesni_${p}rypt4");
344 }
345
# Emit _aesni_${p}rypt6, which transforms $inout0..$inout5 in parallel.
#   $p - "enc" or "dec"
# The prologue interleaves the round-key-0 whitening with the first round on
# $inout0..$inout2, then jumps to the "_inner" label in the middle of the loop
# body to finish that round on $inout3..$inout5. As in the smaller generators
# $key ends up at key+32+16*rounds and $rounds is a negative byte index
# stepped by 32 until zero; both registers are modified.
# "_aesni_${p}rypt6_enter" is declared as a static label and marks a second
# entry point just before the loop's round-key load; it is not referenced
# inside this sub, so presumably code elsewhere in the file jumps or calls
# there after doing its own whitening and first round - verify against users.
346 sub aesni_generate6
347 { my $p=shift;
348
349     &function_begin_B("_aesni_${p}rypt6");
350     &static_label("_aesni_${p}rypt6_enter");
351         &$movekey       ($rndkey0,&QWP(0,$key));
352         &shl            ($rounds,4);
353         &$movekey       ($rndkey1,&QWP(16,$key));
354         &xorps          ($inout0,$rndkey0);
355         &pxor           ($inout1,$rndkey0);     # pxor does better here
356         &pxor           ($inout2,$rndkey0);
357         eval"&aes${p}   ($inout0,$rndkey1)";
358         &pxor           ($inout3,$rndkey0);
359         &pxor           ($inout4,$rndkey0);
360         eval"&aes${p}   ($inout1,$rndkey1)";
361         &lea            ($key,&DWP(32,$key,$rounds));
362         &neg            ($rounds);
363         eval"&aes${p}   ($inout2,$rndkey1)";
364         &pxor           ($inout5,$rndkey0);
365         &$movekey       ($rndkey0,&QWP(0,$key,$rounds));
366         &add            ($rounds,16);
367         &jmp            (&label("_aesni_${p}rypt6_inner"));
368
    # second argument 16: presumably requests 16-byte alignment of the label
369     &set_label("${p}6_loop",16);
370         eval"&aes${p}   ($inout0,$rndkey1)";
371         eval"&aes${p}   ($inout1,$rndkey1)";
372         eval"&aes${p}   ($inout2,$rndkey1)";
373     &set_label("_aesni_${p}rypt6_inner");
374         eval"&aes${p}   ($inout3,$rndkey1)";
375         eval"&aes${p}   ($inout4,$rndkey1)";
376         eval"&aes${p}   ($inout5,$rndkey1)";
377     &set_label("_aesni_${p}rypt6_enter");
378         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
379         &add            ($rounds,32);
380         eval"&aes${p}   ($inout0,$rndkey0)";
381         eval"&aes${p}   ($inout1,$rndkey0)";
382         eval"&aes${p}   ($inout2,$rndkey0)";
383         eval"&aes${p}   ($inout3,$rndkey0)";
384         eval"&aes${p}   ($inout4,$rndkey0)";
385         eval"&aes${p}   ($inout5,$rndkey0)";
386         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        # jnz tests the add above; nothing in between alters the flags
387     &jnz                (&label("${p}6_loop"));
388
389     eval"&aes${p}       ($inout0,$rndkey1)";
390     eval"&aes${p}       ($inout1,$rndkey1)";
391     eval"&aes${p}       ($inout2,$rndkey1)";
392     eval"&aes${p}       ($inout3,$rndkey1)";
393     eval"&aes${p}       ($inout4,$rndkey1)";
394     eval"&aes${p}       ($inout5,$rndkey1)";
395     eval"&aes${p}last   ($inout0,$rndkey0)";
396     eval"&aes${p}last   ($inout1,$rndkey0)";
397     eval"&aes${p}last   ($inout2,$rndkey0)";
398     eval"&aes${p}last   ($inout3,$rndkey0)";
399     eval"&aes${p}last   ($inout4,$rndkey0)";
400     eval"&aes${p}last   ($inout5,$rndkey0)";
401     &ret();
402     &function_end_B("_aesni_${p}rypt6");
403 }
# Instantiate the interleaved helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aesni".
404 &aesni_generate2("enc") if ($PREFIX eq "aesni");
405 &aesni_generate2("dec");
406 &aesni_generate3("enc") if ($PREFIX eq "aesni");
407 &aesni_generate3("dec");
408 &aesni_generate4("enc") if ($PREFIX eq "aesni");
409 &aesni_generate4("dec");
410 &aesni_generate6("enc") if ($PREFIX eq "aesni");
411 &aesni_generate6("dec");
412 \f
413 if ($PREFIX eq "aesni") {
414 ######################################################################
415 # void aesni_ecb_encrypt (const void *in, void *out,
416 #                         size_t length, const AES_KEY *key,
417 #                         int enc);
# Emits aesni_ecb_encrypt(in, out, length, key, enc).
#   wparam(0) in     -> $inp
#   wparam(1) out    -> $out
#   wparam(2) length -> $len, rounded down to a multiple of 16; a result of
#                       zero returns immediately
#   wparam(3) key    -> $key; the round count is read from offset 240
#   wparam(4) enc    -> zero selects the decrypt path, non-zero encrypt
# Both directions share one shape: a main loop that handles 0x60 bytes
# (six blocks) per call to the 6x helper, then a tail for the remaining one
# to five blocks that dispatches to the 1x/2x/3x/4x helpers, or to the 6x
# helper with a zeroed sixth register for five blocks. Because the helpers
# modify $key and $rounds, copies are kept in $key_ and $rounds_ and restored
# after each 6x call. All eight XMM registers are cleared before returning.
418 &function_begin("aesni_ecb_encrypt");
419         &mov    ($inp,&wparam(0));
420         &mov    ($out,&wparam(1));
421         &mov    ($len,&wparam(2));
422         &mov    ($key,&wparam(3));
423         &mov    ($rounds_,&wparam(4));          # enc flag, until overwritten below
424         &and    ($len,-16);
425         &jz     (&label("ecb_ret"));
426         &mov    ($rounds,&DWP(240,$key));
427         &test   ($rounds_,$rounds_);
428         &jz     (&label("ecb_decrypt"));
429
430         &mov    ($key_,$key);           # backup $key
431         &mov    ($rounds_,$rounds);     # backup $rounds
432         &cmp    ($len,0x60);
433         &jb     (&label("ecb_enc_tail"));
434
        # preload the first six blocks and enter the loop past its
        # store/reload half
435         &movdqu ($inout0,&QWP(0,$inp));
436         &movdqu ($inout1,&QWP(0x10,$inp));
437         &movdqu ($inout2,&QWP(0x20,$inp));
438         &movdqu ($inout3,&QWP(0x30,$inp));
439         &movdqu ($inout4,&QWP(0x40,$inp));
440         &movdqu ($inout5,&QWP(0x50,$inp));
441         &lea    ($inp,&DWP(0x60,$inp));
442         &sub    ($len,0x60);
443         &jmp    (&label("ecb_enc_loop6_enter"));
444
        # store the previous six results while loading the next six inputs
445 &set_label("ecb_enc_loop6",16);
446         &movups (&QWP(0,$out),$inout0);
447         &movdqu ($inout0,&QWP(0,$inp));
448         &movups (&QWP(0x10,$out),$inout1);
449         &movdqu ($inout1,&QWP(0x10,$inp));
450         &movups (&QWP(0x20,$out),$inout2);
451         &movdqu ($inout2,&QWP(0x20,$inp));
452         &movups (&QWP(0x30,$out),$inout3);
453         &movdqu ($inout3,&QWP(0x30,$inp));
454         &movups (&QWP(0x40,$out),$inout4);
455         &movdqu ($inout4,&QWP(0x40,$inp));
456         &movups (&QWP(0x50,$out),$inout5);
457         &lea    ($out,&DWP(0x60,$out));
458         &movdqu ($inout5,&QWP(0x50,$inp));
459         &lea    ($inp,&DWP(0x60,$inp));
460 &set_label("ecb_enc_loop6_enter");
461
462         &call   ("_aesni_encrypt6");
463
464         &mov    ($key,$key_);           # restore $key
465         &mov    ($rounds,$rounds_);     # restore $rounds
466         &sub    ($len,0x60);
467         &jnc    (&label("ecb_enc_loop6"));      # loop while no borrow
468
        # flush the last six results, then undo the final subtraction to
        # recover the number of leftover bytes
469         &movups (&QWP(0,$out),$inout0);
470         &movups (&QWP(0x10,$out),$inout1);
471         &movups (&QWP(0x20,$out),$inout2);
472         &movups (&QWP(0x30,$out),$inout3);
473         &movups (&QWP(0x40,$out),$inout4);
474         &movups (&QWP(0x50,$out),$inout5);
475         &lea    ($out,&DWP(0x60,$out));
476         &add    ($len,0x60);
477         &jz     (&label("ecb_ret"));
478
        # one to five blocks remain; the movups loads between a cmp and its
        # second branch do not disturb the flags
479 &set_label("ecb_enc_tail");
480         &movups ($inout0,&QWP(0,$inp));
481         &cmp    ($len,0x20);
482         &jb     (&label("ecb_enc_one"));
483         &movups ($inout1,&QWP(0x10,$inp));
484         &je     (&label("ecb_enc_two"));
485         &movups ($inout2,&QWP(0x20,$inp));
486         &cmp    ($len,0x40);
487         &jb     (&label("ecb_enc_three"));
488         &movups ($inout3,&QWP(0x30,$inp));
489         &je     (&label("ecb_enc_four"));
        # five blocks: run the 6x helper with a zeroed sixth input and
        # discard its sixth output
490         &movups ($inout4,&QWP(0x40,$inp));
491         &xorps  ($inout5,$inout5);
492         &call   ("_aesni_encrypt6");
493         &movups (&QWP(0,$out),$inout0);
494         &movups (&QWP(0x10,$out),$inout1);
495         &movups (&QWP(0x20,$out),$inout2);
496         &movups (&QWP(0x30,$out),$inout3);
497         &movups (&QWP(0x40,$out),$inout4);
498         &jmp    (&label("ecb_ret"));
499
500 &set_label("ecb_enc_one",16);
501         if ($inline)
502         {   &aesni_inline_generate1("enc");     }
503         else
504         {   &call       ("_aesni_encrypt1");    }
505         &movups (&QWP(0,$out),$inout0);
506         &jmp    (&label("ecb_ret"));
507
508 &set_label("ecb_enc_two",16);
509         &call   ("_aesni_encrypt2");
510         &movups (&QWP(0,$out),$inout0);
511         &movups (&QWP(0x10,$out),$inout1);
512         &jmp    (&label("ecb_ret"));
513
514 &set_label("ecb_enc_three",16);
515         &call   ("_aesni_encrypt3");
516         &movups (&QWP(0,$out),$inout0);
517         &movups (&QWP(0x10,$out),$inout1);
518         &movups (&QWP(0x20,$out),$inout2);
519         &jmp    (&label("ecb_ret"));
520
521 &set_label("ecb_enc_four",16);
522         &call   ("_aesni_encrypt4");
523         &movups (&QWP(0,$out),$inout0);
524         &movups (&QWP(0x10,$out),$inout1);
525         &movups (&QWP(0x20,$out),$inout2);
526         &movups (&QWP(0x30,$out),$inout3);
527         &jmp    (&label("ecb_ret"));
528 ######################################################################
# Decrypt path: mirrors the encrypt path using the _aesni_decryptN helpers.
529 &set_label("ecb_decrypt",16);
530         &mov    ($key_,$key);           # backup $key
531         &mov    ($rounds_,$rounds);     # backup $rounds
532         &cmp    ($len,0x60);
533         &jb     (&label("ecb_dec_tail"));
534
535         &movdqu ($inout0,&QWP(0,$inp));
536         &movdqu ($inout1,&QWP(0x10,$inp));
537         &movdqu ($inout2,&QWP(0x20,$inp));
538         &movdqu ($inout3,&QWP(0x30,$inp));
539         &movdqu ($inout4,&QWP(0x40,$inp));
540         &movdqu ($inout5,&QWP(0x50,$inp));
541         &lea    ($inp,&DWP(0x60,$inp));
542         &sub    ($len,0x60);
543         &jmp    (&label("ecb_dec_loop6_enter"));
544
545 &set_label("ecb_dec_loop6",16);
546         &movups (&QWP(0,$out),$inout0);
547         &movdqu ($inout0,&QWP(0,$inp));
548         &movups (&QWP(0x10,$out),$inout1);
549         &movdqu ($inout1,&QWP(0x10,$inp));
550         &movups (&QWP(0x20,$out),$inout2);
551         &movdqu ($inout2,&QWP(0x20,$inp));
552         &movups (&QWP(0x30,$out),$inout3);
553         &movdqu ($inout3,&QWP(0x30,$inp));
554         &movups (&QWP(0x40,$out),$inout4);
555         &movdqu ($inout4,&QWP(0x40,$inp));
556         &movups (&QWP(0x50,$out),$inout5);
557         &lea    ($out,&DWP(0x60,$out));
558         &movdqu ($inout5,&QWP(0x50,$inp));
559         &lea    ($inp,&DWP(0x60,$inp));
560 &set_label("ecb_dec_loop6_enter");
561
562         &call   ("_aesni_decrypt6");
563
564         &mov    ($key,$key_);           # restore $key
565         &mov    ($rounds,$rounds_);     # restore $rounds
566         &sub    ($len,0x60);
567         &jnc    (&label("ecb_dec_loop6"));      # loop while no borrow
568
569         &movups (&QWP(0,$out),$inout0);
570         &movups (&QWP(0x10,$out),$inout1);
571         &movups (&QWP(0x20,$out),$inout2);
572         &movups (&QWP(0x30,$out),$inout3);
573         &movups (&QWP(0x40,$out),$inout4);
574         &movups (&QWP(0x50,$out),$inout5);
575         &lea    ($out,&DWP(0x60,$out));
576         &add    ($len,0x60);
577         &jz     (&label("ecb_ret"));
578
579 &set_label("ecb_dec_tail");
580         &movups ($inout0,&QWP(0,$inp));
581         &cmp    ($len,0x20);
582         &jb     (&label("ecb_dec_one"));
583         &movups ($inout1,&QWP(0x10,$inp));
584         &je     (&label("ecb_dec_two"));
585         &movups ($inout2,&QWP(0x20,$inp));
586         &cmp    ($len,0x40);
587         &jb     (&label("ecb_dec_three"));
588         &movups ($inout3,&QWP(0x30,$inp));
589         &je     (&label("ecb_dec_four"));
        # five blocks: 6x helper with a zeroed sixth input
590         &movups ($inout4,&QWP(0x40,$inp));
591         &xorps  ($inout5,$inout5);
592         &call   ("_aesni_decrypt6");
593         &movups (&QWP(0,$out),$inout0);
594         &movups (&QWP(0x10,$out),$inout1);
595         &movups (&QWP(0x20,$out),$inout2);
596         &movups (&QWP(0x30,$out),$inout3);
597         &movups (&QWP(0x40,$out),$inout4);
598         &jmp    (&label("ecb_ret"));
599
600 &set_label("ecb_dec_one",16);
601         if ($inline)
602         {   &aesni_inline_generate1("dec");     }
603         else
604         {   &call       ("_aesni_decrypt1");    }
605         &movups (&QWP(0,$out),$inout0);
606         &jmp    (&label("ecb_ret"));
607
608 &set_label("ecb_dec_two",16);
609         &call   ("_aesni_decrypt2");
610         &movups (&QWP(0,$out),$inout0);
611         &movups (&QWP(0x10,$out),$inout1);
612         &jmp    (&label("ecb_ret"));
613
614 &set_label("ecb_dec_three",16);
615         &call   ("_aesni_decrypt3");
616         &movups (&QWP(0,$out),$inout0);
617         &movups (&QWP(0x10,$out),$inout1);
618         &movups (&QWP(0x20,$out),$inout2);
619         &jmp    (&label("ecb_ret"));
620
        # the four-block case is last and falls through into ecb_ret
621 &set_label("ecb_dec_four",16);
622         &call   ("_aesni_decrypt4");
623         &movups (&QWP(0,$out),$inout0);
624         &movups (&QWP(0x10,$out),$inout1);
625         &movups (&QWP(0x20,$out),$inout2);
626         &movups (&QWP(0x30,$out),$inout3);
627
628 &set_label("ecb_ret");
629         &pxor   ("xmm0","xmm0");                # clear register bank
630         &pxor   ("xmm1","xmm1");
631         &pxor   ("xmm2","xmm2");
632         &pxor   ("xmm3","xmm3");
633         &pxor   ("xmm4","xmm4");
634         &pxor   ("xmm5","xmm5");
635         &pxor   ("xmm6","xmm6");
636         &pxor   ("xmm7","xmm7");
637 &function_end("aesni_ecb_encrypt");
638 \f
639 ######################################################################
640 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
641 #                         size_t blocks, const AES_KEY *key,
642 #                         const char *ivec,char *cmac);
643 #
644 # Handles only complete blocks, operates on 64-bit counter and
645 # does not update *ivec! Nor does it finalize CMAC value
646 # (see engine/eng_aesni.c for details)
647 #
648 { my $cmac=$inout1;
# Emits aesni_ccm64_encrypt_blocks(in, out, blocks, key, ivec, cmac).
#   wparam(0) in     -> $inp
#   wparam(1) out    -> $out
#   wparam(2) blocks -> $len, decremented once per processed block
#   wparam(3) key    -> $key; the round count is read from offset 240
#   wparam(4) ivec   -> loaded into $ivec, never written back
#   wparam(5) cmac   -> loaded into $cmac, written back on exit
# Each outer iteration pushes the counter block ($inout0) and the CMAC
# accumulator through the same round keys side by side, xors the encrypted
# counter into the input block to form the output, and advances the counter
# with a 64-bit add.
# Scratch frame on the 16-byte-aligned stack:
#   esp+0  .. esp+15   pshufb byte-swap mask
#   esp+16 .. esp+31   counter increment vector (dwords 1,0,0,0)
#   esp+48             caller's esp
# NOTE(review): the loop body runs before $len is first tested, so the
# routine appears to rely on the caller passing blocks >= 1 - confirm.
649 &function_begin("aesni_ccm64_encrypt_blocks");
650         &mov    ($inp,&wparam(0));
651         &mov    ($out,&wparam(1));
652         &mov    ($len,&wparam(2));
653         &mov    ($key,&wparam(3));
654         &mov    ($rounds_,&wparam(4));          # ivec pointer for now
655         &mov    ($rounds,&wparam(5));           # cmac pointer for now
656         &mov    ($key_,"esp");
657         &sub    ("esp",60);
658         &and    ("esp",-16);                    # align stack
659         &mov    (&DWP(48,"esp"),$key_);
660
661         &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
662         &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
663         &mov    ($rounds,&DWP(240,$key));
664
665         # compose byte-swap control mask for pshufb on stack
666         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
667         &mov    (&DWP(4,"esp"),0x08090a0b);
668         &mov    (&DWP(8,"esp"),0x04050607);
669         &mov    (&DWP(12,"esp"),0x00010203);
670
671         # compose counter increment vector on stack
672         &mov    ($rounds_,1);
673         &xor    ($key_,$key_);
674         &mov    (&DWP(16,"esp"),$rounds_);
675         &mov    (&DWP(20,"esp"),$key_);
676         &mov    (&DWP(24,"esp"),$key_);
677         &mov    (&DWP(28,"esp"),$key_);
678
        # $key_ keeps the start of the key schedule, $key is moved to
        # key+32+16*rounds, and $rounds_ holds the negative starting index
        # 16-16*rounds that is reloaded into $rounds for every block
679         &shl    ($rounds,4);
680         &mov    ($rounds_,16);
681         &lea    ($key_,&DWP(0,$key));
682         &movdqa ($inout3,&QWP(0,"esp"));
683         &movdqa ($inout0,$ivec);
684         &lea    ($key,&DWP(32,$key,$rounds));
685         &sub    ($rounds_,$rounds);
686         &pshufb ($ivec,$inout3);
687
688 &set_label("ccm64_enc_outer");
689         &$movekey       ($rndkey0,&QWP(0,$key_));
690         &mov            ($rounds,$rounds_);
691         &movups         ($in0,&QWP(0,$inp));
692
693         &xorps          ($inout0,$rndkey0);
694         &$movekey       ($rndkey1,&QWP(16,$key_));
695         &xorps          ($rndkey0,$in0);
696         &xorps          ($cmac,$rndkey0);               # cmac^=inp
697         &$movekey       ($rndkey0,&QWP(32,$key_));
698
699 &set_label("ccm64_enc2_loop");
700         &aesenc         ($inout0,$rndkey1);
701         &aesenc         ($cmac,$rndkey1);
702         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
703         &add            ($rounds,32);
704         &aesenc         ($inout0,$rndkey0);
705         &aesenc         ($cmac,$rndkey0);
706         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
707         &jnz            (&label("ccm64_enc2_loop"));
708         &aesenc         ($inout0,$rndkey1);
709         &aesenc         ($cmac,$rndkey1);
710         &paddq          ($ivec,&QWP(16,"esp"));
        # the flags from this dec are consumed by the jnz at the end of the
        # outer loop; none of the instructions in between modifies them
711         &dec            ($len);
712         &aesenclast     ($inout0,$rndkey0);
713         &aesenclast     ($cmac,$rndkey0);
714
715         &lea    ($inp,&DWP(16,$inp));
716         &xorps  ($in0,$inout0);                 # inp^=E(ivec)
717         &movdqa ($inout0,$ivec);
718         &movups (&QWP(0,$out),$in0);            # save output
719         &pshufb ($inout0,$inout3);
720         &lea    ($out,&DWP(16,$out));
721         &jnz    (&label("ccm64_enc_outer"));
722
        # restore the caller's esp, then store the updated cmac through the
        # pointer passed as the sixth argument
723         &mov    ("esp",&DWP(48,"esp"));
724         &mov    ($out,&wparam(5));
725         &movups (&QWP(0,$out),$cmac);
726
727         &pxor   ("xmm0","xmm0");                # clear register bank
728         &pxor   ("xmm1","xmm1");
729         &pxor   ("xmm2","xmm2");
730         &pxor   ("xmm3","xmm3");
731         &pxor   ("xmm4","xmm4");
732         &pxor   ("xmm5","xmm5");
733         &pxor   ("xmm6","xmm6");
734         &pxor   ("xmm7","xmm7");
735 &function_end("aesni_ccm64_encrypt_blocks");
736
# aesni_ccm64_decrypt_blocks: CCM decryption of whole 16-byte blocks using a
# 64-bit counter (the counter is advanced with paddq).
# Arguments, as fetched through wparam() below:
#   0: inp   1: out   2: len (block count, decremented once per block)
#   3: key   4: pointer to ivec   5: pointer to cmac (read on entry and
#   written back on exit)
# Stack frame after alignment:
#   0   pshufb byte-swap mask
#   16  counter increment vector (dwords 1,0,0,0)
#   48  saved %esp
# NOTE(review): the first block is processed before len is tested, so this
# presumably requires len >= 1 -- confirm against callers.
737 &function_begin("aesni_ccm64_decrypt_blocks");
738         &mov    ($inp,&wparam(0));
739         &mov    ($out,&wparam(1));
740         &mov    ($len,&wparam(2));
741         &mov    ($key,&wparam(3));
742         &mov    ($rounds_,&wparam(4));          # ivec pointer (temporarily)
743         &mov    ($rounds,&wparam(5));           # cmac pointer (temporarily)
744         &mov    ($key_,"esp");                  # remember caller's %esp
745         &sub    ("esp",60);
746         &and    ("esp",-16);                    # align stack
747         &mov    (&DWP(48,"esp"),$key_);         # save original %esp
748
749         &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
750         &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
751         &mov    ($rounds,&DWP(240,$key));       # key->rounds
752
753         # compose byte-swap control mask for pshufb on stack
754         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
755         &mov    (&DWP(4,"esp"),0x08090a0b);
756         &mov    (&DWP(8,"esp"),0x04050607);
757         &mov    (&DWP(12,"esp"),0x00010203);
758
759         # compose counter increment vector on stack
760         &mov    ($rounds_,1);
761         &xor    ($key_,$key_);
762         &mov    (&DWP(16,"esp"),$rounds_);
763         &mov    (&DWP(20,"esp"),$key_);
764         &mov    (&DWP(24,"esp"),$key_);
765         &mov    (&DWP(28,"esp"),$key_);
766
767         &movdqa ($inout3,&QWP(0,"esp"));        # bswap mask
768         &movdqa ($inout0,$ivec);                # first counter block, as supplied
769
770         &mov    ($key_,$key);                   # backup $key
771         &mov    ($rounds_,$rounds);             # backup $rounds
772
            # Keep $ivec in byte-swapped form from here on so that paddq can
            # increment it; it is swapped back (into $inout0) before each use.
773         &pshufb ($ivec,$inout3);
            # Encrypt the first counter block held in $inout0.
774         if ($inline)
775         {   &aesni_inline_generate1("enc");     }
776         else
777         {   &call       ("_aesni_encrypt1");    }
            # Switch to "twisted" round-key addressing for the main loop:
            # $key points 32 bytes past key_+16*rounds and $rounds/$rounds_
            # hold the negative offset 16-16*rounds, which counts up to zero.
778         &shl    ($rounds_,4);
779         &mov    ($rounds,16);
780         &movups ($in0,&QWP(0,$inp));            # load inp
781         &paddq  ($ivec,&QWP(16,"esp"));         # advance counter
782         &lea    ($inp,&QWP(16,$inp));
783         &sub    ($rounds,$rounds_);
784         &lea    ($key,&DWP(32,$key_,$rounds_));
785         &mov    ($rounds_,$rounds);
786         &jmp    (&label("ccm64_dec_outer"));
787
            # Per-block loop: $inout0 holds E(counter) for the block in $in0.
788 &set_label("ccm64_dec_outer",16);
789         &xorps  ($in0,$inout0);                 # inp ^= E(ivec)
790         &movdqa ($inout0,$ivec);                # next counter (still byte-swapped)
791         &movups (&QWP(0,$out),$in0);            # save output
792         &lea    ($out,&DWP(16,$out));
793         &pshufb ($inout0,$inout3);              # swap next counter back
794
795         &sub    ($len,1);
796         &jz     (&label("ccm64_dec_break"));    # that was the last block
797
            # Round 0 for both streams. key[0] is folded into $in0 first, so
            # the single xor into $cmac applies both the output block and key[0].
798         &$movekey       ($rndkey0,&QWP(0,$key_));
799         &mov            ($rounds,$rounds_);     # reload negative round offset
800         &$movekey       ($rndkey1,&QWP(16,$key_));
801         &xorps          ($in0,$rndkey0);
802         &xorps          ($inout0,$rndkey0);
803         &xorps          ($cmac,$in0);           # cmac^=out
804         &$movekey       ($rndkey0,&QWP(32,$key_));
805
            # Two interleaved AES streams: next counter block ($inout0) and
            # the running $cmac. The add sets the flags tested by jnz; the
            # key load in between is relied on to leave them alone.
806 &set_label("ccm64_dec2_loop");
807         &aesenc         ($inout0,$rndkey1);
808         &aesenc         ($cmac,$rndkey1);
809         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
810         &add            ($rounds,32);
811         &aesenc         ($inout0,$rndkey0);
812         &aesenc         ($cmac,$rndkey0);
813         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
814         &jnz            (&label("ccm64_dec2_loop"));
815         &movups         ($in0,&QWP(0,$inp));    # load inp
816         &paddq          ($ivec,&QWP(16,"esp")); # advance counter
817         &aesenc         ($inout0,$rndkey1);
818         &aesenc         ($cmac,$rndkey1);
819         &aesenclast     ($inout0,$rndkey0);
820         &aesenclast     ($cmac,$rndkey0);
821         &lea            ($inp,&QWP(16,$inp));
822         &jmp    (&label("ccm64_dec_outer"));
823
            # Last block: only $cmac still needs one more encryption, with the
            # final output block (still in $in0) passed along to the helper.
824 &set_label("ccm64_dec_break",16);
825         &mov    ($rounds,&DWP(240,$key_));      # restore untwisted $rounds
826         &mov    ($key,$key_);                   # restore $key
            # NOTE(review): the helper bodies are not visible here. Presumably
            # the inline form xors $in0 into $cmac before encrypting it, and
            # the extra argument to &call is ignored -- confirm that the
            # non-inline path really operates on $cmac rather than $inout0.
827         if ($inline)
828         {   &aesni_inline_generate1("enc",$cmac,$in0);  }
829         else
830         {   &call       ("_aesni_encrypt1",$cmac);      }
831
832         &mov    ("esp",&DWP(48,"esp"));         # restore %esp
833         &mov    ($out,&wparam(5));
834         &movups (&QWP(0,$out),$cmac);           # write cmac back to caller
835
836         &pxor   ("xmm0","xmm0");                # clear register bank
837         &pxor   ("xmm1","xmm1");
838         &pxor   ("xmm2","xmm2");
839         &pxor   ("xmm3","xmm3");
840         &pxor   ("xmm4","xmm4");
841         &pxor   ("xmm5","xmm5");
842         &pxor   ("xmm6","xmm6");
843         &pxor   ("xmm7","xmm7");
844 &function_end("aesni_ccm64_decrypt_blocks");
845 }
846 \f
847 ######################################################################
848 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
849 #                         size_t blocks, const AES_KEY *key,
850 #                         const char *ivec);
851 #
852 # Handles only complete blocks, operates on 32-bit counter and
853 # does not update *ivec! (see crypto/modes/ctr128.c for details)
854 #
855 # stack layout:
856 #       0       pshufb mask
857 #       16      vector addend: 0,6,6,6
858 #       32      counter-less ivec
859 #       48      1st triplet of counter vector
860 #       64      2nd triplet of counter vector
861 #       80      saved %esp
862
# aesni_ctr32_encrypt_blocks: CTR-mode processing of whole blocks with a
# 32-bit big-endian counter kept in the last dword of ivec.
# Arguments, as fetched through wparam() below:
#   0: inp   1: out   2: len (block count)   3: key   4: pointer to ivec
# Structure: a single block takes a shortcut; otherwise blocks are handled six
# at a time in ctr32_loop6 and the remaining 1..5 blocks in ctr32_tail.
863 &function_begin("aesni_ctr32_encrypt_blocks");
864         &mov    ($inp,&wparam(0));
865         &mov    ($out,&wparam(1));
866         &mov    ($len,&wparam(2));
867         &mov    ($key,&wparam(3));
868         &mov    ($rounds_,&wparam(4));          # ivec pointer (temporarily)
869         &mov    ($key_,"esp");                  # remember caller's %esp
870         &sub    ("esp",88);
871         &and    ("esp",-16);                    # align stack
872         &mov    (&DWP(80,"esp"),$key_);         # save original %esp
873
874         &cmp    ($len,1);
875         &je     (&label("ctr32_one_shortcut")); # exactly one block: skip counter setup
876
877         &movdqu ($inout5,&QWP(0,$rounds_));     # load ivec
878
879         # compose byte-swap control mask for pshufb on stack
880         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
881         &mov    (&DWP(4,"esp"),0x08090a0b);
882         &mov    (&DWP(8,"esp"),0x04050607);
883         &mov    (&DWP(12,"esp"),0x00010203);
884
885         # compose counter increment vector on stack
886         &mov    ($rounds,6);
887         &xor    ($key_,$key_);
888         &mov    (&DWP(16,"esp"),$rounds);
889         &mov    (&DWP(20,"esp"),$rounds);
890         &mov    (&DWP(24,"esp"),$rounds);
891         &mov    (&DWP(28,"esp"),$key_);
892
893         &pextrd ($rounds_,$inout5,3);           # pull 32-bit counter
894         &pinsrd ($inout5,$key_,3);              # wipe 32-bit counter
895
896         &mov    ($rounds,&DWP(240,$key));       # key->rounds
897
898         # compose 2 vectors of 3x32-bit counters
            # ($rndkey0 gets counter+0,+1,+2 and $rndkey1 gets counter+3,+4,+5,
            # all in host byte order so that paddd can step them later)
899         &bswap  ($rounds_);
900         &pxor   ($rndkey0,$rndkey0);
901         &pxor   ($rndkey1,$rndkey1);
902         &movdqa ($inout0,&QWP(0,"esp"));        # load byte-swap mask
903         &pinsrd ($rndkey0,$rounds_,0);
904         &lea    ($key_,&DWP(3,$rounds_));
905         &pinsrd ($rndkey1,$key_,0);
906         &inc    ($rounds_);
907         &pinsrd ($rndkey0,$rounds_,1);
908         &inc    ($key_);
909         &pinsrd ($rndkey1,$key_,1);
910         &inc    ($rounds_);
911         &pinsrd ($rndkey0,$rounds_,2);
912         &inc    ($key_);
913         &pinsrd ($rndkey1,$key_,2);
914         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
915         &pshufb ($rndkey0,$inout0);             # byte swap
916         &movdqu ($inout4,&QWP(0,$key));         # key[0]
917         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
918         &pshufb ($rndkey1,$inout0);             # byte swap
919
920         &pshufd ($inout0,$rndkey0,3<<6);        # place counter to upper dword
921         &pshufd ($inout1,$rndkey0,2<<6);
922         &cmp    ($len,6);
923         &jb     (&label("ctr32_tail"));         # fewer than 6 blocks in total
924         &pxor   ($inout5,$inout4);              # counter-less ivec^key[0]
925         &shl    ($rounds,4);
926         &mov    ($rounds_,16);
927         &movdqa (&QWP(32,"esp"),$inout5);       # save counter-less ivec^key[0]
928         &mov    ($key_,$key);                   # backup $key
929         &sub    ($rounds_,$rounds);             # backup twisted $rounds
930         &lea    ($key,&DWP(32,$key,$rounds));
931         &sub    ($len,6);
932         &jmp    (&label("ctr32_loop6"));
933
            # Main loop, six blocks per iteration. On entry $inout0/$inout1
            # already hold the first two counters and $rndkey0/$rndkey1 the
            # byte-swapped triplets.
934 &set_label("ctr32_loop6",16);
935         # inlining _aesni_encrypt6's prologue gives ~6% improvement...
936         &pshufd ($inout2,$rndkey0,1<<6);
937         &movdqa ($rndkey0,&QWP(32,"esp"));      # pull counter-less ivec
938         &pshufd ($inout3,$rndkey1,3<<6);
939         &pxor           ($inout0,$rndkey0);     # merge counter-less ivec
940         &pshufd ($inout4,$rndkey1,2<<6);
941         &pxor           ($inout1,$rndkey0);
942         &pshufd ($inout5,$rndkey1,1<<6);
943         &$movekey       ($rndkey1,&QWP(16,$key_));
944         &pxor           ($inout2,$rndkey0);
945         &pxor           ($inout3,$rndkey0);
946         &aesenc         ($inout0,$rndkey1);
947         &pxor           ($inout4,$rndkey0);
948         &pxor           ($inout5,$rndkey0);
949         &aesenc         ($inout1,$rndkey1);
950         &$movekey       ($rndkey0,&QWP(32,$key_));
951         &mov            ($rounds,$rounds_);     # reload twisted $rounds
952         &aesenc         ($inout2,$rndkey1);
953         &aesenc         ($inout3,$rndkey1);
954         &aesenc         ($inout4,$rndkey1);
955         &aesenc         ($inout5,$rndkey1);
956
            # Remaining rounds are done by the shared 6x routine, entered past
            # its own prologue (defined elsewhere in this file).
957         &call           (&label("_aesni_encrypt6_enter"));
958
            # xor keystream with input and store, interleaved with stepping
            # both counter triplets by 6 for the next iteration.
959         &movups ($rndkey1,&QWP(0,$inp));
960         &movups ($rndkey0,&QWP(0x10,$inp));
961         &xorps  ($inout0,$rndkey1);
962         &movups ($rndkey1,&QWP(0x20,$inp));
963         &xorps  ($inout1,$rndkey0);
964         &movups (&QWP(0,$out),$inout0);
965         &movdqa ($rndkey0,&QWP(16,"esp"));      # load increment
966         &xorps  ($inout2,$rndkey1);
967         &movdqa ($rndkey1,&QWP(64,"esp"));      # load 2nd triplet
968         &movups (&QWP(0x10,$out),$inout1);
969         &movups (&QWP(0x20,$out),$inout2);
970
971         &paddd  ($rndkey1,$rndkey0);            # 2nd triplet increment
972         &paddd  ($rndkey0,&QWP(48,"esp"));      # 1st triplet increment
973         &movdqa ($inout0,&QWP(0,"esp"));        # load byte swap mask
974
975         &movups ($inout1,&QWP(0x30,$inp));
976         &movups ($inout2,&QWP(0x40,$inp));
977         &xorps  ($inout3,$inout1);
978         &movups ($inout1,&QWP(0x50,$inp));
979         &lea    ($inp,&DWP(0x60,$inp));
980         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
981         &pshufb ($rndkey0,$inout0);             # byte swap
982         &xorps  ($inout4,$inout2);
983         &movups (&QWP(0x30,$out),$inout3);
984         &xorps  ($inout5,$inout1);
985         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
986         &pshufb ($rndkey1,$inout0);             # byte swap
987         &movups (&QWP(0x40,$out),$inout4);
988         &pshufd ($inout0,$rndkey0,3<<6);
989         &movups (&QWP(0x50,$out),$inout5);
990         &lea    ($out,&DWP(0x60,$out));
991
992         &pshufd ($inout1,$rndkey0,2<<6);
993         &sub    ($len,6);
994         &jnc    (&label("ctr32_loop6"));        # loop while at least 6 blocks remained
995
996         &add    ($len,6);                       # undo the last subtraction
997         &jz     (&label("ctr32_ret"));
            # Leftover blocks: undo the key[0] pre-xor and the twisted
            # $key/$rounds so that the plain helpers can be used.
998         &movdqu ($inout5,&QWP(0,$key_));
999         &mov    ($key,$key_);
1000         &pxor   ($inout5,&QWP(32,"esp"));       # restore count-less ivec
1001         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1002
            # 1..5 remaining blocks. $inout5 holds the counter-less ivec, which
            # is or-ed into each counter block. The je branches reuse the flags
            # of the preceding cmp; only SSE instructions sit in between.
1003 &set_label("ctr32_tail");
1004         &por    ($inout0,$inout5);
1005         &cmp    ($len,2);
1006         &jb     (&label("ctr32_one"));
1007
1008         &pshufd ($inout2,$rndkey0,1<<6);
1009         &por    ($inout1,$inout5);
1010         &je     (&label("ctr32_two"));
1011
1012         &pshufd ($inout3,$rndkey1,3<<6);
1013         &por    ($inout2,$inout5);
1014         &cmp    ($len,4);
1015         &jb     (&label("ctr32_three"));
1016
1017         &pshufd ($inout4,$rndkey1,2<<6);
1018         &por    ($inout3,$inout5);
1019         &je     (&label("ctr32_four"));
1020
            # Five blocks: run the 6x routine; only five results are stored
            # below, the sixth lane's output is not used.
1021         &por    ($inout4,$inout5);
1022         &call   ("_aesni_encrypt6");
1023         &movups ($rndkey1,&QWP(0,$inp));
1024         &movups ($rndkey0,&QWP(0x10,$inp));
1025         &xorps  ($inout0,$rndkey1);
1026         &movups ($rndkey1,&QWP(0x20,$inp));
1027         &xorps  ($inout1,$rndkey0);
1028         &movups ($rndkey0,&QWP(0x30,$inp));
1029         &xorps  ($inout2,$rndkey1);
1030         &movups ($rndkey1,&QWP(0x40,$inp));
1031         &xorps  ($inout3,$rndkey0);
1032         &movups (&QWP(0,$out),$inout0);
1033         &xorps  ($inout4,$rndkey1);
1034         &movups (&QWP(0x10,$out),$inout1);
1035         &movups (&QWP(0x20,$out),$inout2);
1036         &movups (&QWP(0x30,$out),$inout3);
1037         &movups (&QWP(0x40,$out),$inout4);
1038         &jmp    (&label("ctr32_ret"));
1039
            # len==1 on entry: the ivec is used as the counter block unchanged.
1040 &set_label("ctr32_one_shortcut",16);
1041         &movups ($inout0,&QWP(0,$rounds_));     # load ivec
1042         &mov    ($rounds,&DWP(240,$key));
1043         
1044 &set_label("ctr32_one");
1045         if ($inline)
1046         {   &aesni_inline_generate1("enc");     }
1047         else
1048         {   &call       ("_aesni_encrypt1");    }
1049         &movups ($in0,&QWP(0,$inp));
1050         &xorps  ($in0,$inout0);
1051         &movups (&QWP(0,$out),$in0);
1052         &jmp    (&label("ctr32_ret"));
1053
1054 &set_label("ctr32_two",16);
1055         &call   ("_aesni_encrypt2");
1056         &movups ($inout3,&QWP(0,$inp));
1057         &movups ($inout4,&QWP(0x10,$inp));
1058         &xorps  ($inout0,$inout3);
1059         &xorps  ($inout1,$inout4);
1060         &movups (&QWP(0,$out),$inout0);
1061         &movups (&QWP(0x10,$out),$inout1);
1062         &jmp    (&label("ctr32_ret"));
1063
1064 &set_label("ctr32_three",16);
1065         &call   ("_aesni_encrypt3");
1066         &movups ($inout3,&QWP(0,$inp));
1067         &movups ($inout4,&QWP(0x10,$inp));
1068         &xorps  ($inout0,$inout3);
1069         &movups ($inout5,&QWP(0x20,$inp));
1070         &xorps  ($inout1,$inout4);
1071         &movups (&QWP(0,$out),$inout0);
1072         &xorps  ($inout2,$inout5);
1073         &movups (&QWP(0x10,$out),$inout1);
1074         &movups (&QWP(0x20,$out),$inout2);
1075         &jmp    (&label("ctr32_ret"));
1076
1077 &set_label("ctr32_four",16);
1078         &call   ("_aesni_encrypt4");
1079         &movups ($inout4,&QWP(0,$inp));
1080         &movups ($inout5,&QWP(0x10,$inp));
1081         &movups ($rndkey1,&QWP(0x20,$inp));
1082         &xorps  ($inout0,$inout4);
1083         &movups ($rndkey0,&QWP(0x30,$inp));
1084         &xorps  ($inout1,$inout5);
1085         &movups (&QWP(0,$out),$inout0);
1086         &xorps  ($inout2,$rndkey1);
1087         &movups (&QWP(0x10,$out),$inout1);
1088         &xorps  ($inout3,$rndkey0);
1089         &movups (&QWP(0x20,$out),$inout2);
1090         &movups (&QWP(0x30,$out),$inout3);
1091
            # Common exit: wipe the xmm registers and stack slots 32..79 (ivec
            # and counter copies), then restore the caller's %esp.
1092 &set_label("ctr32_ret");
1093         &pxor   ("xmm0","xmm0");                # clear register bank
1094         &pxor   ("xmm1","xmm1");
1095         &pxor   ("xmm2","xmm2");
1096         &pxor   ("xmm3","xmm3");
1097         &pxor   ("xmm4","xmm4");
1098         &movdqa (&QWP(32,"esp"),"xmm0");        # clear stack
1099         &pxor   ("xmm5","xmm5");
1100         &movdqa (&QWP(48,"esp"),"xmm0");
1101         &pxor   ("xmm6","xmm6");
1102         &movdqa (&QWP(64,"esp"),"xmm0");
1103         &pxor   ("xmm7","xmm7");
1104         &mov    ("esp",&DWP(80,"esp"));         # restore %esp
1105 &function_end("aesni_ctr32_encrypt_blocks");
1106 \f
1107 ######################################################################
1108 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1109 #       const AES_KEY *key1, const AES_KEY *key2,
1110 #       const unsigned char iv[16]);
1111 #
1112 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1113
# aesni_xts_encrypt: XTS encryption.
# Arguments, as fetched through wparam() below:
#   0: inp   1: out   2: len (byte count)   3: key1   4: key2
#   5: clear-text tweak (encrypted with key2 to form the initial tweak)
# Stack frame after alignment:
#   16*0 .. 16*5   per-block tweaks
#   16*6           magic constant (dwords 0x87,0,1,0)
#   16*7+0         saved len (later len%16)
#   16*7+4         saved %esp
# Whole blocks are handled six at a time in xts_enc_loop6, then 1..5 in the
# short paths; a trailing partial block is handled in xts_enc_steal.
# The recurring pshufd/paddq/pand/pcmpgtd/pxor group advances the tweak: each
# 64-bit half is doubled and the bits shifted out are folded back in through
# the magic constant.
1114 &function_begin("aesni_xts_encrypt");
1115         &mov    ($key,&wparam(4));              # key2
1116         &mov    ($inp,&wparam(5));              # clear-text tweak
1117
1118         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1119         &movups ($inout0,&QWP(0,$inp));
            # encrypt the clear-text tweak with key2
1120         if ($inline)
1121         {   &aesni_inline_generate1("enc");     }
1122         else
1123         {   &call       ("_aesni_encrypt1");    }
1124
1125         &mov    ($inp,&wparam(0));
1126         &mov    ($out,&wparam(1));
1127         &mov    ($len,&wparam(2));
1128         &mov    ($key,&wparam(3));              # key1
1129
1130         &mov    ($key_,"esp");                  # remember caller's %esp
1131         &sub    ("esp",16*7+8);
1132         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1133         &and    ("esp",-16);                    # align stack
1134
1135         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1136         &mov    (&DWP(16*6+4,"esp"),0);
1137         &mov    (&DWP(16*6+8,"esp"),1);
1138         &mov    (&DWP(16*6+12,"esp"),0);
1139         &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
1140         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1141
1142         &movdqa ($tweak,$inout0);               # initial tweak
1143         &pxor   ($twtmp,$twtmp);
1144         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1145         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1146
1147         &and    ($len,-16);                     # whole blocks only for now
1148         &mov    ($key_,$key);                   # backup $key
1149         &mov    ($rounds_,$rounds);             # backup $rounds
1150         &sub    ($len,16*6);
1151         &jc     (&label("xts_enc_short"));      # fewer than 6 whole blocks
1152
            # Switch to "twisted" round-key addressing for the 6x loop:
            # $rounds_ = 16-16*rounds, $key = key+32+16*rounds.
1153         &shl    ($rounds,4);
1154         &mov    ($rounds_,16);
1155         &sub    ($rounds_,$rounds);
1156         &lea    ($key,&DWP(32,$key,$rounds));
1157         &jmp    (&label("xts_enc_loop6"));
1158
1159 &set_label("xts_enc_loop6",16);
            # Generation-time loop: emits four unrolled copies that store
            # tweaks 0..3 to the stack and advance $tweak each time.
1160         for ($i=0;$i<4;$i++) {
1161             &pshufd     ($twres,$twtmp,0x13);
1162             &pxor       ($twtmp,$twtmp);
1163             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1164             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1165             &pand       ($twres,$twmask);       # isolate carry and residue
1166             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1167             &pxor       ($tweak,$twres);
1168         }
            # Tweak 4 goes to slot 16*4 ($i becomes 5 afterwards); tweak 5 is
            # built in $inout5 and saved to slot 16*5 further down.
1169         &pshufd ($inout5,$twtmp,0x13);
1170         &movdqa (&QWP(16*$i++,"esp"),$tweak);
1171         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1172          &$movekey      ($rndkey0,&QWP(0,$key_));
1173         &pand   ($inout5,$twmask);              # isolate carry and residue
1174          &movups        ($inout0,&QWP(0,$inp)); # load input
1175         &pxor   ($inout5,$tweak);
1176
1177         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1178         &mov    ($rounds,$rounds_);             # restore $rounds
1179         &movdqu ($inout1,&QWP(16*1,$inp));
1180          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1181         &movdqu ($inout2,&QWP(16*2,$inp));
1182          &pxor          ($inout1,$rndkey0);
1183         &movdqu ($inout3,&QWP(16*3,$inp));
1184          &pxor          ($inout2,$rndkey0);
1185         &movdqu ($inout4,&QWP(16*4,$inp));
1186          &pxor          ($inout3,$rndkey0);
1187         &movdqu ($rndkey1,&QWP(16*5,$inp));     # 6th input block, via $rndkey1
1188          &pxor          ($inout4,$rndkey0);
1189         &lea    ($inp,&DWP(16*6,$inp));
1190         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1191         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1192         &pxor   ($inout5,$rndkey1);             # tweak ^ 6th input block
1193
1194          &$movekey      ($rndkey1,&QWP(16,$key_));
1195         &pxor   ($inout1,&QWP(16*1,"esp"));
1196         &pxor   ($inout2,&QWP(16*2,"esp"));
1197          &aesenc        ($inout0,$rndkey1);
1198         &pxor   ($inout3,&QWP(16*3,"esp"));
1199         &pxor   ($inout4,&QWP(16*4,"esp"));
1200          &aesenc        ($inout1,$rndkey1);
1201         &pxor           ($inout5,$rndkey0);
1202          &$movekey      ($rndkey0,&QWP(32,$key_));
1203          &aesenc        ($inout2,$rndkey1);
1204          &aesenc        ($inout3,$rndkey1);
1205          &aesenc        ($inout4,$rndkey1);
1206          &aesenc        ($inout5,$rndkey1);
            # remaining rounds via the shared 6x routine, entered past its
            # own prologue (defined elsewhere in this file)
1207         &call           (&label("_aesni_encrypt6_enter"));
1208
1209         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1210        &pxor    ($twtmp,$twtmp);
1211         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1212        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1213         &xorps  ($inout1,&QWP(16*1,"esp"));
1214         &movups (&QWP(16*0,$out),$inout0);      # write output
1215         &xorps  ($inout2,&QWP(16*2,"esp"));
1216         &movups (&QWP(16*1,$out),$inout1);
1217         &xorps  ($inout3,&QWP(16*3,"esp"));
1218         &movups (&QWP(16*2,$out),$inout2);
1219         &xorps  ($inout4,&QWP(16*4,"esp"));
1220         &movups (&QWP(16*3,$out),$inout3);
1221         &xorps  ($inout5,$tweak);
1222         &movups (&QWP(16*4,$out),$inout4);
1223        &pshufd  ($twres,$twtmp,0x13);
1224         &movups (&QWP(16*5,$out),$inout5);
1225         &lea    ($out,&DWP(16*6,$out));
1226        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1227
            # advance past the last tweak so $tweak is ready for the next block
1228         &pxor   ($twtmp,$twtmp);
1229         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1230         &pand   ($twres,$twmask);               # isolate carry and residue
1231         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1232         &pxor   ($tweak,$twres);
1233
1234         &sub    ($len,16*6);
1235         &jnc    (&label("xts_enc_loop6"));      # loop while at least 6 blocks remained
1236
1237         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1238         &mov    ($key,$key_);                   # restore $key
1239         &mov    ($rounds_,$rounds);
1240
            # 0..5 whole blocks remain; $tweak is the tweak for the next block.
            # The je branches below reuse the flags of the preceding cmp; only
            # SSE instructions sit in between.
1241 &set_label("xts_enc_short");
1242         &add    ($len,16*6);                    # undo the last subtraction
1243         &jz     (&label("xts_enc_done6x"));
1244
1245         &movdqa ($inout3,$tweak);               # put aside previous tweak
1246         &cmp    ($len,0x20);
1247         &jb     (&label("xts_enc_one"));
1248
1249         &pshufd ($twres,$twtmp,0x13);
1250         &pxor   ($twtmp,$twtmp);
1251         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1252         &pand   ($twres,$twmask);               # isolate carry and residue
1253         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1254         &pxor   ($tweak,$twres);
1255         &je     (&label("xts_enc_two"));
1256
1257         &pshufd ($twres,$twtmp,0x13);
1258         &pxor   ($twtmp,$twtmp);
1259         &movdqa ($inout4,$tweak);               # put aside previous tweak
1260         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1261         &pand   ($twres,$twmask);               # isolate carry and residue
1262         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1263         &pxor   ($tweak,$twres);
1264         &cmp    ($len,0x40);
1265         &jb     (&label("xts_enc_three"));
1266
1267         &pshufd ($twres,$twtmp,0x13);
1268         &pxor   ($twtmp,$twtmp);
1269         &movdqa ($inout5,$tweak);               # put aside previous tweak
1270         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1271         &pand   ($twres,$twmask);               # isolate carry and residue
1272         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1273         &pxor   ($tweak,$twres);
1274         &movdqa (&QWP(16*0,"esp"),$inout3);
1275         &movdqa (&QWP(16*1,"esp"),$inout4);
1276         &je     (&label("xts_enc_four"));
1277
            # Five blocks: tweaks 0..3 live in stack slots 0..3, the fifth is
            # built in $inout5 and saved to slot 4.
1278         &movdqa (&QWP(16*2,"esp"),$inout5);
1279         &pshufd ($inout5,$twtmp,0x13);
1280         &movdqa (&QWP(16*3,"esp"),$tweak);
1281         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1282         &pand   ($inout5,$twmask);              # isolate carry and residue
1283         &pxor   ($inout5,$tweak);
1284
1285         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1286         &movdqu ($inout1,&QWP(16*1,$inp));
1287         &movdqu ($inout2,&QWP(16*2,$inp));
1288         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1289         &movdqu ($inout3,&QWP(16*3,$inp));
1290         &pxor   ($inout1,&QWP(16*1,"esp"));
1291         &movdqu ($inout4,&QWP(16*4,$inp));
1292         &pxor   ($inout2,&QWP(16*2,"esp"));
1293         &lea    ($inp,&DWP(16*5,$inp));
1294         &pxor   ($inout3,&QWP(16*3,"esp"));
1295         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1296         &pxor   ($inout4,$inout5);
1297
            # only five results are stored below; the sixth lane is not used
1298         &call   ("_aesni_encrypt6");
1299
1300         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1301         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1302         &xorps  ($inout1,&QWP(16*1,"esp"));
1303         &xorps  ($inout2,&QWP(16*2,"esp"));
1304         &movups (&QWP(16*0,$out),$inout0);      # write output
1305         &xorps  ($inout3,&QWP(16*3,"esp"));
1306         &movups (&QWP(16*1,$out),$inout1);
1307         &xorps  ($inout4,$tweak);
1308         &movups (&QWP(16*2,$out),$inout2);
1309         &movups (&QWP(16*3,$out),$inout3);
1310         &movups (&QWP(16*4,$out),$inout4);
1311         &lea    ($out,&DWP(16*5,$out));
1312         &jmp    (&label("xts_enc_done"));
1313
            # One block: its tweak is in $inout3.
1314 &set_label("xts_enc_one",16);
1315         &movups ($inout0,&QWP(16*0,$inp));      # load input
1316         &lea    ($inp,&DWP(16*1,$inp));
1317         &xorps  ($inout0,$inout3);              # input^=tweak
1318         if ($inline)
1319         {   &aesni_inline_generate1("enc");     }
1320         else
1321         {   &call       ("_aesni_encrypt1");    }
1322         &xorps  ($inout0,$inout3);              # output^=tweak
1323         &movups (&QWP(16*0,$out),$inout0);      # write output
1324         &lea    ($out,&DWP(16*1,$out));
1325
1326         &movdqa ($tweak,$inout3);               # last tweak
1327         &jmp    (&label("xts_enc_done"));
1328
            # Two blocks: tweaks in $inout3 and $tweak (copied to $inout4).
1329 &set_label("xts_enc_two",16);
1330         &movaps ($inout4,$tweak);               # put aside last tweak
1331
1332         &movups ($inout0,&QWP(16*0,$inp));      # load input
1333         &movups ($inout1,&QWP(16*1,$inp));
1334         &lea    ($inp,&DWP(16*2,$inp));
1335         &xorps  ($inout0,$inout3);              # input^=tweak
1336         &xorps  ($inout1,$inout4);
1337
1338         &call   ("_aesni_encrypt2");
1339
1340         &xorps  ($inout0,$inout3);              # output^=tweak
1341         &xorps  ($inout1,$inout4);
1342         &movups (&QWP(16*0,$out),$inout0);      # write output
1343         &movups (&QWP(16*1,$out),$inout1);
1344         &lea    ($out,&DWP(16*2,$out));
1345
1346         &movdqa ($tweak,$inout4);               # last tweak
1347         &jmp    (&label("xts_enc_done"));
1348
            # Three blocks: tweaks in $inout3, $inout4 and $tweak (copied to $inout5).
1349 &set_label("xts_enc_three",16);
1350         &movaps ($inout5,$tweak);               # put aside last tweak
1351         &movups ($inout0,&QWP(16*0,$inp));      # load input
1352         &movups ($inout1,&QWP(16*1,$inp));
1353         &movups ($inout2,&QWP(16*2,$inp));
1354         &lea    ($inp,&DWP(16*3,$inp));
1355         &xorps  ($inout0,$inout3);              # input^=tweak
1356         &xorps  ($inout1,$inout4);
1357         &xorps  ($inout2,$inout5);
1358
1359         &call   ("_aesni_encrypt3");
1360
1361         &xorps  ($inout0,$inout3);              # output^=tweak
1362         &xorps  ($inout1,$inout4);
1363         &xorps  ($inout2,$inout5);
1364         &movups (&QWP(16*0,$out),$inout0);      # write output
1365         &movups (&QWP(16*1,$out),$inout1);
1366         &movups (&QWP(16*2,$out),$inout2);
1367         &lea    ($out,&DWP(16*3,$out));
1368
1369         &movdqa ($tweak,$inout5);               # last tweak
1370         &jmp    (&label("xts_enc_done"));
1371
            # Four blocks: tweaks 0 and 1 are in stack slots 0 and 1, the third
            # in $inout5 and the fourth in $tweak (copied to $inout4).
1372 &set_label("xts_enc_four",16);
1373         &movaps ($inout4,$tweak);               # put aside last tweak
1374
1375         &movups ($inout0,&QWP(16*0,$inp));      # load input
1376         &movups ($inout1,&QWP(16*1,$inp));
1377         &movups ($inout2,&QWP(16*2,$inp));
1378         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1379         &movups ($inout3,&QWP(16*3,$inp));
1380         &lea    ($inp,&DWP(16*4,$inp));
1381         &xorps  ($inout1,&QWP(16*1,"esp"));
1382         &xorps  ($inout2,$inout5);
1383         &xorps  ($inout3,$inout4);
1384
1385         &call   ("_aesni_encrypt4");
1386
1387         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1388         &xorps  ($inout1,&QWP(16*1,"esp"));
1389         &xorps  ($inout2,$inout5);
1390         &movups (&QWP(16*0,$out),$inout0);      # write output
1391         &xorps  ($inout3,$inout4);
1392         &movups (&QWP(16*1,$out),$inout1);
1393         &movups (&QWP(16*2,$out),$inout2);
1394         &movups (&QWP(16*3,$out),$inout3);
1395         &lea    ($out,&DWP(16*4,$out));
1396
1397         &movdqa ($tweak,$inout4);               # last tweak
1398         &jmp    (&label("xts_enc_done"));
1399
            # Reached when the whole-block count was a multiple of six: $tweak
            # has already been advanced past the last block processed.
1400 &set_label("xts_enc_done6x",16);                # $tweak is pre-calculated
1401         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1402         &and    ($len,15);
1403         &jz     (&label("xts_enc_ret"));        # no partial block
1404         &movdqa ($inout3,$tweak);
1405         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1406         &jmp    (&label("xts_enc_steal"));
1407
            # Reached from the short paths: $tweak is the last tweak used, so
            # the tweak for the stolen block is derived into $inout3 here.
1408 &set_label("xts_enc_done",16);
1409         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1410         &pxor   ($twtmp,$twtmp);
1411         &and    ($len,15);
1412         &jz     (&label("xts_enc_ret"));        # no partial block
1413
1414         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1415         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1416         &pshufd ($inout3,$twtmp,0x13);
1417         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1418         &pand   ($inout3,&QWP(16*6,"esp"));     # isolate carry and residue
1419         &pxor   ($inout3,$tweak);
1420
            # Byte-wise swap for the trailing partial block: each remaining
            # input byte replaces a byte of the last full output block, and
            # the displaced byte becomes the corresponding trailing output byte.
1421 &set_label("xts_enc_steal");
1422         &movz   ($rounds,&BP(0,$inp));
1423         &movz   ($key,&BP(-16,$out));
1424         &lea    ($inp,&DWP(1,$inp));
1425         &mov    (&BP(-16,$out),&LB($rounds));
1426         &mov    (&BP(0,$out),&LB($key));
1427         &lea    ($out,&DWP(1,$out));
1428         &sub    ($len,1);
1429         &jnz    (&label("xts_enc_steal"));
1430
1431         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1432         &mov    ($key,$key_);                   # restore $key
1433         &mov    ($rounds,$rounds_);             # restore $rounds
1434
            # re-encrypt the patched last full block in place with the tweak
            # held in $inout3
1435         &movups ($inout0,&QWP(-16,$out));       # load input
1436         &xorps  ($inout0,$inout3);              # input^=tweak
1437         if ($inline)
1438         {   &aesni_inline_generate1("enc");     }
1439         else
1440         {   &call       ("_aesni_encrypt1");    }
1441         &xorps  ($inout0,$inout3);              # output^=tweak
1442         &movups (&QWP(-16,$out),$inout0);       # write output
1443
            # Common exit: wipe the xmm registers and the six tweak slots on
            # the stack, then restore the caller's %esp.
1444 &set_label("xts_enc_ret");
1445         &pxor   ("xmm0","xmm0");                # clear register bank
1446         &pxor   ("xmm1","xmm1");
1447         &pxor   ("xmm2","xmm2");
1448         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1449         &pxor   ("xmm3","xmm3");
1450         &movdqa (&QWP(16*1,"esp"),"xmm0");
1451         &pxor   ("xmm4","xmm4");
1452         &movdqa (&QWP(16*2,"esp"),"xmm0");
1453         &pxor   ("xmm5","xmm5");
1454         &movdqa (&QWP(16*3,"esp"),"xmm0");
1455         &pxor   ("xmm6","xmm6");
1456         &movdqa (&QWP(16*4,"esp"),"xmm0");
1457         &pxor   ("xmm7","xmm7");
1458         &movdqa (&QWP(16*5,"esp"),"xmm0");
1459         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1460 &function_end("aesni_xts_encrypt");
1461
# aesni_xts_decrypt: emits the AES-NI XTS decryption routine.
# Stack arguments as read below: wparam(0)=input, wparam(1)=output,
# wparam(2)=length in bytes, wparam(3)=key1 (used for the data blocks),
# wparam(4)=key2 (used to encrypt the tweak), wparam(5)=clear-text tweak.
# Outline: the tweak is encrypted under key2; whole 16-byte blocks are
# then decrypted under key1 six at a time; a tail of 1..5 blocks goes
# through dedicated paths; if the length is not a multiple of 16 the last
# whole block and the partial block are finished with ciphertext stealing.
# The xmm registers and the tweak slots on the stack are zeroed on exit.
# Scratch frame (relative to the aligned %esp), as used below:
#   16*0..16*5   per-block tweaks
#   16*6         tweak-update mask (dwords 0x87,0,1,0)
#   16*7+0       original length, later overwritten with length%16
#   16*7+4       caller's %esp
1462 &function_begin("aesni_xts_decrypt");
1463         &mov    ($key,&wparam(4));              # key2
1464         &mov    ($inp,&wparam(5));              # clear-text tweak
1465         # encrypt the clear-text tweak under key2 (one block)
1466         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1467         &movups ($inout0,&QWP(0,$inp));
1468         if ($inline)
1469         {   &aesni_inline_generate1("enc");     }
1470         else
1471         {   &call       ("_aesni_encrypt1");    }
1472
1473         &mov    ($inp,&wparam(0));              # input pointer
1474         &mov    ($out,&wparam(1));              # output pointer
1475         &mov    ($len,&wparam(2));              # length in bytes
1476         &mov    ($key,&wparam(3));              # key1
1477
1478         &mov    ($key_,"esp");                  # remember caller's %esp
1479         &sub    ("esp",16*7+8);                 # reserve scratch frame
1480         &and    ("esp",-16);                    # align stack
1481         # with a partial trailing block, hold one whole block back for stealing
1482         &xor    ($rounds_,$rounds_);            # if(len%16) len-=16;
1483         &test   ($len,15);
1484         &setnz  (&LB($rounds_));                # 1 if len%16 != 0
1485         &shl    ($rounds_,4);                   # ... times 16
1486         &sub    ($len,$rounds_);
1487
1488         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1489         &mov    (&DWP(16*6+4,"esp"),0);
1490         &mov    (&DWP(16*6+8,"esp"),1);
1491         &mov    (&DWP(16*6+12,"esp"),0);
1492         &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
1493         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1494
1495         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1496         &mov    ($key_,$key);                   # backup $key
1497         &mov    ($rounds_,$rounds);             # backup $rounds
1498
1499         &movdqa ($tweak,$inout0);               # initial tweak = block encrypted above
1500         &pxor   ($twtmp,$twtmp);
1501         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1502         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1503
1504         &and    ($len,-16);                     # whole blocks only
1505         &sub    ($len,16*6);                    # bias by one 6-block batch
1506         &jc     (&label("xts_dec_short"));      # fewer than 6 blocks
1507
1508         &shl    ($rounds,4);                    # rounds*16
1509         &mov    ($rounds_,16);
1510         &sub    ($rounds_,$rounds);             # $rounds_ = 16-16*rounds
1511         &lea    ($key,&DWP(32,$key,$rounds));   # $key = key1+32+16*rounds
1512         &jmp    (&label("xts_dec_loop6"));      # NOTE(review): presumably the form _aesni_decrypt6_enter expects - confirm
1513
1514 &set_label("xts_dec_loop6",16);
1515         for ($i=0;$i<4;$i++) {                  # tweaks 0..3 -> stack slots 0..3
1516             &pshufd     ($twres,$twtmp,0x13);
1517             &pxor       ($twtmp,$twtmp);
1518             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1519             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1520             &pand       ($twres,$twmask);       # isolate carry and residue
1521             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1522             &pxor       ($tweak,$twres);
1523         }
1524         &pshufd ($inout5,$twtmp,0x13);
1525         &movdqa (&QWP(16*$i++,"esp"),$tweak);   # tweak 4 -> slot 4; $i becomes 5
1526         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1527          &$movekey      ($rndkey0,&QWP(0,$key_));
1528         &pand   ($inout5,$twmask);              # isolate carry and residue
1529          &movups        ($inout0,&QWP(0,$inp)); # load input
1530         &pxor   ($inout5,$tweak);               # $inout5 = tweak 5
1531
1532         # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1533         &mov    ($rounds,$rounds_);             # 16-16*rounds, computed before the loop
1534         &movdqu ($inout1,&QWP(16*1,$inp));
1535          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1536         &movdqu ($inout2,&QWP(16*2,$inp));
1537          &pxor          ($inout1,$rndkey0);
1538         &movdqu ($inout3,&QWP(16*3,$inp));
1539          &pxor          ($inout2,$rndkey0);
1540         &movdqu ($inout4,&QWP(16*4,$inp));
1541          &pxor          ($inout3,$rndkey0);
1542         &movdqu ($rndkey1,&QWP(16*5,$inp));     # 6th input block, parked in $rndkey1
1543          &pxor          ($inout4,$rndkey0);
1544         &lea    ($inp,&DWP(16*6,$inp));
1545         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1546         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1547         &pxor   ($inout5,$rndkey1);             # tweak 5 ^ 6th input block
1548
1549          &$movekey      ($rndkey1,&QWP(16,$key_));      # rndkey[1]
1550         &pxor   ($inout1,&QWP(16*1,"esp"));
1551         &pxor   ($inout2,&QWP(16*2,"esp"));
1552          &aesdec        ($inout0,$rndkey1);
1553         &pxor   ($inout3,&QWP(16*3,"esp"));
1554         &pxor   ($inout4,&QWP(16*4,"esp"));
1555          &aesdec        ($inout1,$rndkey1);
1556         &pxor           ($inout5,$rndkey0);     # 6th block ^= rndkey[0]
1557          &$movekey      ($rndkey0,&QWP(32,$key_));      # rndkey[2]
1558          &aesdec        ($inout2,$rndkey1);
1559          &aesdec        ($inout3,$rndkey1);
1560          &aesdec        ($inout4,$rndkey1);
1561          &aesdec        ($inout5,$rndkey1);
1562         &call           (&label("_aesni_decrypt6_enter"));     # presumably completes the remaining rounds; defined outside this view
1563
1564         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1565        &pxor    ($twtmp,$twtmp);
1566         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1567        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1568         &xorps  ($inout1,&QWP(16*1,"esp"));
1569         &movups (&QWP(16*0,$out),$inout0);      # write output
1570         &xorps  ($inout2,&QWP(16*2,"esp"));
1571         &movups (&QWP(16*1,$out),$inout1);
1572         &xorps  ($inout3,&QWP(16*3,"esp"));
1573         &movups (&QWP(16*2,$out),$inout2);
1574         &xorps  ($inout4,&QWP(16*4,"esp"));
1575         &movups (&QWP(16*3,$out),$inout3);
1576         &xorps  ($inout5,$tweak);
1577         &movups (&QWP(16*4,$out),$inout4);
1578        &pshufd  ($twres,$twtmp,0x13);
1579         &movups (&QWP(16*5,$out),$inout5);
1580         &lea    ($out,&DWP(16*6,$out));
1581        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1582         # advance $tweak once more so it is ready for the next block
1583         &pxor   ($twtmp,$twtmp);
1584         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1585         &pand   ($twres,$twmask);               # isolate carry and residue
1586         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1587         &pxor   ($tweak,$twres);
1588
1589         &sub    ($len,16*6);
1590         &jnc    (&label("xts_dec_loop6"));      # at least 6 more whole blocks
1591
1592         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1593         &mov    ($key,$key_);                   # restore $key
1594         &mov    ($rounds_,$rounds);             # and the $rounds backup
1595
1596 &set_label("xts_dec_short");
1597         &add    ($len,16*6);                    # undo bias: bytes left in whole blocks
1598         &jz     (&label("xts_dec_done6x"));     # none left
1599
1600         &movdqa ($inout3,$tweak);               # put aside previous tweak
1601         &cmp    ($len,0x20);
1602         &jb     (&label("xts_dec_one"));        # exactly 1 block
1603
1604         &pshufd ($twres,$twtmp,0x13);
1605         &pxor   ($twtmp,$twtmp);
1606         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1607         &pand   ($twres,$twmask);               # isolate carry and residue
1608         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1609         &pxor   ($tweak,$twres);
1610         &je     (&label("xts_dec_two"));        # exactly 2 blocks (flags of cmp 0x20 above)
1611
1612         &pshufd ($twres,$twtmp,0x13);
1613         &pxor   ($twtmp,$twtmp);
1614         &movdqa ($inout4,$tweak);               # put aside previous tweak
1615         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1616         &pand   ($twres,$twmask);               # isolate carry and residue
1617         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1618         &pxor   ($tweak,$twres);
1619         &cmp    ($len,0x40);
1620         &jb     (&label("xts_dec_three"));      # exactly 3 blocks
1621
1622         &pshufd ($twres,$twtmp,0x13);
1623         &pxor   ($twtmp,$twtmp);
1624         &movdqa ($inout5,$tweak);               # put aside previous tweak
1625         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1626         &pand   ($twres,$twmask);               # isolate carry and residue
1627         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1628         &pxor   ($tweak,$twres);
1629         &movdqa (&QWP(16*0,"esp"),$inout3);
1630         &movdqa (&QWP(16*1,"esp"),$inout4);
1631         &je     (&label("xts_dec_four"));       # exactly 4 blocks (flags of cmp 0x40 above)
1632         # five blocks remain: tweaks go to stack slots 0..4
1633         &movdqa (&QWP(16*2,"esp"),$inout5);
1634         &pshufd ($inout5,$twtmp,0x13);
1635         &movdqa (&QWP(16*3,"esp"),$tweak);
1636         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1637         &pand   ($inout5,$twmask);              # isolate carry and residue
1638         &pxor   ($inout5,$tweak);
1639
1640         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1641         &movdqu ($inout1,&QWP(16*1,$inp));
1642         &movdqu ($inout2,&QWP(16*2,$inp));
1643         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1644         &movdqu ($inout3,&QWP(16*3,$inp));
1645         &pxor   ($inout1,&QWP(16*1,"esp"));
1646         &movdqu ($inout4,&QWP(16*4,$inp));
1647         &pxor   ($inout2,&QWP(16*2,"esp"));
1648         &lea    ($inp,&DWP(16*5,$inp));
1649         &pxor   ($inout3,&QWP(16*3,"esp"));
1650         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1651         &pxor   ($inout4,$inout5);
1652
1653         &call   ("_aesni_decrypt6");            # 6-way helper; $inout5 result is not stored
1654
1655         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1656         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1657         &xorps  ($inout1,&QWP(16*1,"esp"));
1658         &xorps  ($inout2,&QWP(16*2,"esp"));
1659         &movups (&QWP(16*0,$out),$inout0);      # write output
1660         &xorps  ($inout3,&QWP(16*3,"esp"));
1661         &movups (&QWP(16*1,$out),$inout1);
1662         &xorps  ($inout4,$tweak);
1663         &movups (&QWP(16*2,$out),$inout2);
1664         &movups (&QWP(16*3,$out),$inout3);
1665         &movups (&QWP(16*4,$out),$inout4);
1666         &lea    ($out,&DWP(16*5,$out));
1667         &jmp    (&label("xts_dec_done"));
1668
1669 &set_label("xts_dec_one",16);
1670         &movups ($inout0,&QWP(16*0,$inp));      # load input
1671         &lea    ($inp,&DWP(16*1,$inp));
1672         &xorps  ($inout0,$inout3);              # input^=tweak
1673         if ($inline)
1674         {   &aesni_inline_generate1("dec");     }
1675         else
1676         {   &call       ("_aesni_decrypt1");    }
1677         &xorps  ($inout0,$inout3);              # output^=tweak
1678         &movups (&QWP(16*0,$out),$inout0);      # write output
1679         &lea    ($out,&DWP(16*1,$out));
1680
1681         &movdqa ($tweak,$inout3);               # last tweak
1682         &jmp    (&label("xts_dec_done"));
1683
1684 &set_label("xts_dec_two",16);
1685         &movaps ($inout4,$tweak);               # put aside last tweak
1686
1687         &movups ($inout0,&QWP(16*0,$inp));      # load input
1688         &movups ($inout1,&QWP(16*1,$inp));
1689         &lea    ($inp,&DWP(16*2,$inp));
1690         &xorps  ($inout0,$inout3);              # input^=tweak
1691         &xorps  ($inout1,$inout4);
1692
1693         &call   ("_aesni_decrypt2");
1694
1695         &xorps  ($inout0,$inout3);              # output^=tweak
1696         &xorps  ($inout1,$inout4);
1697         &movups (&QWP(16*0,$out),$inout0);      # write output
1698         &movups (&QWP(16*1,$out),$inout1);
1699         &lea    ($out,&DWP(16*2,$out));
1700
1701         &movdqa ($tweak,$inout4);               # last tweak
1702         &jmp    (&label("xts_dec_done"));
1703
1704 &set_label("xts_dec_three",16);
1705         &movaps ($inout5,$tweak);               # put aside last tweak
1706         &movups ($inout0,&QWP(16*0,$inp));      # load input
1707         &movups ($inout1,&QWP(16*1,$inp));
1708         &movups ($inout2,&QWP(16*2,$inp));
1709         &lea    ($inp,&DWP(16*3,$inp));
1710         &xorps  ($inout0,$inout3);              # input^=tweak
1711         &xorps  ($inout1,$inout4);
1712         &xorps  ($inout2,$inout5);
1713
1714         &call   ("_aesni_decrypt3");
1715
1716         &xorps  ($inout0,$inout3);              # output^=tweak
1717         &xorps  ($inout1,$inout4);
1718         &xorps  ($inout2,$inout5);
1719         &movups (&QWP(16*0,$out),$inout0);      # write output
1720         &movups (&QWP(16*1,$out),$inout1);
1721         &movups (&QWP(16*2,$out),$inout2);
1722         &lea    ($out,&DWP(16*3,$out));
1723
1724         &movdqa ($tweak,$inout5);               # last tweak
1725         &jmp    (&label("xts_dec_done"));
1726
1727 &set_label("xts_dec_four",16);
1728         &movaps ($inout4,$tweak);               # put aside last tweak
1729         # tweaks 0,1 are in stack slots 0,1; tweaks 2,3 in $inout5,$inout4
1730         &movups ($inout0,&QWP(16*0,$inp));      # load input
1731         &movups ($inout1,&QWP(16*1,$inp));
1732         &movups ($inout2,&QWP(16*2,$inp));
1733         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1734         &movups ($inout3,&QWP(16*3,$inp));
1735         &lea    ($inp,&DWP(16*4,$inp));
1736         &xorps  ($inout1,&QWP(16*1,"esp"));
1737         &xorps  ($inout2,$inout5);
1738         &xorps  ($inout3,$inout4);
1739
1740         &call   ("_aesni_decrypt4");
1741
1742         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1743         &xorps  ($inout1,&QWP(16*1,"esp"));
1744         &xorps  ($inout2,$inout5);
1745         &movups (&QWP(16*0,$out),$inout0);      # write output
1746         &xorps  ($inout3,$inout4);
1747         &movups (&QWP(16*1,$out),$inout1);
1748         &movups (&QWP(16*2,$out),$inout2);
1749         &movups (&QWP(16*3,$out),$inout3);
1750         &lea    ($out,&DWP(16*4,$out));
1751
1752         &movdqa ($tweak,$inout4);               # last tweak
1753         &jmp    (&label("xts_dec_done"));
1754
1755 &set_label("xts_dec_done6x",16);                # $tweak is pre-calculated
1756         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1757         &and    ($len,15);
1758         &jz     (&label("xts_dec_ret"));        # no partial block: finished
1759         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1760         &jmp    (&label("xts_dec_only_one_more"));
1761
1762 &set_label("xts_dec_done",16);
1763         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1764         &pxor   ($twtmp,$twtmp);
1765         &and    ($len,15);
1766         &jz     (&label("xts_dec_ret"));        # no partial block: finished
1767         # $tweak holds the last tweak used; step it to the next one
1768         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1769         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1770         &pshufd ($twres,$twtmp,0x13);
1771         &pxor   ($twtmp,$twtmp);
1772         &movdqa ($twmask,&QWP(16*6,"esp"));
1773         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1774         &pand   ($twres,$twmask);               # isolate carry and residue
1775         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1776         &pxor   ($tweak,$twres);
1777
1778 &set_label("xts_dec_only_one_more");
1779         &pshufd ($inout3,$twtmp,0x13);
1780         &movdqa ($inout4,$tweak);               # put aside previous tweak
1781         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1782         &pand   ($inout3,$twmask);              # isolate carry and residue
1783         &pxor   ($inout3,$tweak);               # $inout3 = tweak after the one in $inout4
1784
1785         &mov    ($key,$key_);                   # restore $key
1786         &mov    ($rounds,$rounds_);             # restore $rounds
1787         # decrypt the held-back whole block with the later tweak ($inout3)
1788         &movups ($inout0,&QWP(0,$inp));         # load input
1789         &xorps  ($inout0,$inout3);              # input^=tweak
1790         if ($inline)
1791         {   &aesni_inline_generate1("dec");     }
1792         else
1793         {   &call       ("_aesni_decrypt1");    }
1794         &xorps  ($inout0,$inout3);              # output^=tweak
1795         &movups (&QWP(0,$out),$inout0);         # write output
1796         # swap $len bytes: partial input tail <-> head of the block just written
1797 &set_label("xts_dec_steal");
1798         &movz   ($rounds,&BP(16,$inp));         # byte of the partial input block
1799         &movz   ($key,&BP(0,$out));             # byte of the block just written
1800         &lea    ($inp,&DWP(1,$inp));
1801         &mov    (&BP(0,$out),&LB($rounds));     # input byte goes into the block at $out
1802         &mov    (&BP(16,$out),&LB($key));       # decrypted byte becomes partial output
1803         &lea    ($out,&DWP(1,$out));
1804         &sub    ($len,1);
1805         &jnz    (&label("xts_dec_steal"));
1806
1807         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1808         &mov    ($key,$key_);                   # restore $key
1809         &mov    ($rounds,$rounds_);             # restore $rounds
1810         # decrypt the merged block in place with the earlier tweak ($inout4)
1811         &movups ($inout0,&QWP(0,$out));         # load input
1812         &xorps  ($inout0,$inout4);              # input^=tweak
1813         if ($inline)
1814         {   &aesni_inline_generate1("dec");     }
1815         else
1816         {   &call       ("_aesni_decrypt1");    }
1817         &xorps  ($inout0,$inout4);              # output^=tweak
1818         &movups (&QWP(0,$out),$inout0);         # write output
1819
1820 &set_label("xts_dec_ret");
1821         &pxor   ("xmm0","xmm0");                # clear register bank
1822         &pxor   ("xmm1","xmm1");
1823         &pxor   ("xmm2","xmm2");
1824         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1825         &pxor   ("xmm3","xmm3");
1826         &movdqa (&QWP(16*1,"esp"),"xmm0");
1827         &pxor   ("xmm4","xmm4");
1828         &movdqa (&QWP(16*2,"esp"),"xmm0");
1829         &pxor   ("xmm5","xmm5");
1830         &movdqa (&QWP(16*3,"esp"),"xmm0");
1831         &pxor   ("xmm6","xmm6");
1832         &movdqa (&QWP(16*4,"esp"),"xmm0");
1833         &pxor   ("xmm7","xmm7");
1834         &movdqa (&QWP(16*5,"esp"),"xmm0");
1835         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1836 &function_end("aesni_xts_decrypt");
1837 }
1838 \f
1839 ######################################################################
1840 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1841 #       const AES_KEY *key, unsigned int start_block_num,
1842 #       unsigned char offset_i[16], const unsigned char L_[][16],
1843 #       unsigned char checksum[16]);
1844 #
1845 {
1846 # offsets within stack frame
1847 my $checksum = 16*6;
1848 my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1849
1850 # reassigned registers
1851 my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1852 # $l_, $block, $inp, $key are permanently allocated in registers;
1853 # remaining non-volatile ones are offloaded to stack, and they even
1854 # stay invariant after being written there.
1855
1856 &function_begin("aesni_ocb_encrypt");
1857         &mov    ($rounds,&wparam(5));           # &offset_i
1858         &mov    ($rounds_,&wparam(7));          # &checksum
1859
1860         &mov    ($inp,&wparam(0));
1861         &mov    ($out,&wparam(1));
1862         &mov    ($len,&wparam(2));
1863         &mov    ($key,&wparam(3));
1864         &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
1865         &mov    ($block,&wparam(4));            # start_block_num
1866         &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
1867         &mov    ($l_,&wparam(6));               # L_
1868
1869         &mov    ($rounds,"esp");
1870         &sub    ("esp",$esp_off+4);             # alloca
1871         &and    ("esp",-16);                    # align stack
1872
1873         &sub    ($out,$inp);
1874         &shl    ($len,4);
1875         &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
1876         &mov    (&DWP($out_off,"esp"),$out);
1877         &mov    (&DWP($end_off,"esp"),$len);
1878         &mov    (&DWP($esp_off,"esp"),$rounds);
1879
1880         &mov    ($rounds,&DWP(240,$key));
1881
1882         &test   ($block,1);
1883         &jnz    (&label("odd"));
1884
1885         &bsf            ($i3,$block);
1886         &add            ($block,1);
1887         &shl            ($i3,4);
1888         &movdqu         ($inout5,&QWP(0,$l_,$i3));
1889         &mov            ($i3,$key);                     # put aside key
1890
1891         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1892         &lea            ($inp,&DWP(16,$inp));
1893
1894         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
1895         &pxor           ($rndkey1,$inout0);             # checksum
1896         &pxor           ($inout0,$inout5);              # ^ offset_i
1897
1898         &movdqa         ($inout4,$rndkey1);
1899         if ($inline)
1900         {   &aesni_inline_generate1("enc");     }
1901         else
1902         {   &call       ("_aesni_encrypt1");    }
1903
1904         &xorps          ($inout0,$inout5);              # ^ offset_i
1905         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
1906         &movdqa         ($rndkey1,$inout4);             # pass the checksum
1907
1908         &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
1909
1910         &mov            ($rounds,&DWP(240,$i3));
1911         &mov            ($key,$i3);                     # restore key
1912         &mov            ($len,&DWP($end_off,"esp"));
1913
1914 &set_label("odd");
1915         &shl            ($rounds,4);
1916         &mov            ($out,16);
1917         &sub            ($out,$rounds);                 # twisted rounds
1918         &mov            (&DWP($key_off,"esp"),$key);
1919         &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
1920         &mov            (&DWP($rounds_off,"esp"),$out);
1921
1922         &cmp            ($inp,$len);
1923         &ja             (&label("short"));
1924         &jmp            (&label("grandloop"));
1925
1926 &set_label("grandloop",32);
1927         &lea            ($i1,&DWP(1,$block));
1928         &lea            ($i3,&DWP(3,$block));
1929         &lea            ($i5,&DWP(5,$block));
1930         &add            ($block,6);
1931         &bsf            ($i1,$i1);
1932         &bsf            ($i3,$i3);
1933         &bsf            ($i5,$i5);
1934         &shl            ($i1,4);
1935         &shl            ($i3,4);
1936         &shl            ($i5,4);
1937         &movdqu         ($inout0,&QWP(0,$l_));
1938         &movdqu         ($inout1,&QWP(0,$l_,$i1));
1939         &mov            ($rounds,&DWP($rounds_off,"esp"));
1940         &movdqa         ($inout2,$inout0);
1941         &movdqu         ($inout3,&QWP(0,$l_,$i3));
1942         &movdqa         ($inout4,$inout0);
1943         &movdqu         ($inout5,&QWP(0,$l_,$i5));
1944
1945         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
1946         &pxor           ($inout1,$inout0);
1947         &movdqa         (&QWP(16*0,"esp"),$inout0);
1948         &pxor           ($inout2,$inout1);
1949         &movdqa         (&QWP(16*1,"esp"),$inout1);
1950         &pxor           ($inout3,$inout2);
1951         &movdqa         (&QWP(16*2,"esp"),$inout2);
1952         &pxor           ($inout4,$inout3);
1953         &movdqa         (&QWP(16*3,"esp"),$inout3);
1954         &pxor           ($inout5,$inout4);
1955         &movdqa         (&QWP(16*4,"esp"),$inout4);
1956         &movdqa         (&QWP(16*5,"esp"),$inout5);
1957
1958         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
1959         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1960         &movdqu         ($inout1,&QWP(16*1,$inp));
1961         &movdqu         ($inout2,&QWP(16*2,$inp));
1962         &movdqu         ($inout3,&QWP(16*3,$inp));
1963         &movdqu         ($inout4,&QWP(16*4,$inp));
1964         &movdqu         ($inout5,&QWP(16*5,$inp));
1965         &lea            ($inp,&DWP(16*6,$inp));
1966
1967         &pxor           ($rndkey1,$inout0);             # checksum
1968         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
1969         &pxor           ($rndkey1,$inout1);
1970         &pxor           ($inout1,$rndkey0);
1971         &pxor           ($rndkey1,$inout2);
1972         &pxor           ($inout2,$rndkey0);
1973         &pxor           ($rndkey1,$inout3);
1974         &pxor           ($inout3,$rndkey0);
1975         &pxor           ($rndkey1,$inout4);
1976         &pxor           ($inout4,$rndkey0);
1977         &pxor           ($rndkey1,$inout5);
1978         &pxor           ($inout5,$rndkey0);
1979         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
1980
1981         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
1982         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
1983         &pxor           ($inout1,&QWP(16*1,"esp"));
1984         &pxor           ($inout2,&QWP(16*2,"esp"));
1985         &pxor           ($inout3,&QWP(16*3,"esp"));
1986         &pxor           ($inout4,&QWP(16*4,"esp"));
1987         &pxor           ($inout5,&QWP(16*5,"esp"));
1988
1989         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
1990         &aesenc         ($inout0,$rndkey1);
1991         &aesenc         ($inout1,$rndkey1);
1992         &aesenc         ($inout2,$rndkey1);
1993         &aesenc         ($inout3,$rndkey1);
1994         &aesenc         ($inout4,$rndkey1);
1995         &aesenc         ($inout5,$rndkey1);
1996
1997         &mov            ($out,&DWP($out_off,"esp"));
1998         &mov            ($len,&DWP($end_off,"esp"));
1999         &call           ("_aesni_encrypt6_enter");
2000
2001         &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
2002         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2003         &pxor           ($inout1,&QWP(16*1,"esp"));
2004         &pxor           ($inout2,&QWP(16*2,"esp"));
2005         &pxor           ($inout3,&QWP(16*3,"esp"));
2006         &pxor           ($inout4,&QWP(16*4,"esp"));
2007         &pxor           ($inout5,$rndkey0);
2008         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2009
2010         &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
2011         &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
2012         &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
2013         &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
2014         &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
2015         &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
2016         &cmp            ($inp,$len);                    # done yet?
2017         &jb             (&label("grandloop"));
2018
2019 &set_label("short");
2020         &add            ($len,16*6);
2021         &sub            ($len,$inp);
2022         &jz             (&label("done"));
2023
2024         &cmp            ($len,16*2);
2025         &jb             (&label("one"));
2026         &je             (&label("two"));
2027
2028         &cmp            ($len,16*4);
2029         &jb             (&label("three"));
2030         &je             (&label("four"));
2031
2032         &lea            ($i1,&DWP(1,$block));
2033         &lea            ($i3,&DWP(3,$block));
2034         &bsf            ($i1,$i1);
2035         &bsf            ($i3,$i3);
2036         &shl            ($i1,4);
2037         &shl            ($i3,4);
2038         &movdqu         ($inout0,&QWP(0,$l_));
2039         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2040         &mov            ($rounds,&DWP($rounds_off,"esp"));
2041         &movdqa         ($inout2,$inout0);
2042         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2043         &movdqa         ($inout4,$inout0);
2044
2045         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2046         &pxor           ($inout1,$inout0);
2047         &movdqa         (&QWP(16*0,"esp"),$inout0);
2048         &pxor           ($inout2,$inout1);
2049         &movdqa         (&QWP(16*1,"esp"),$inout1);
2050         &pxor           ($inout3,$inout2);
2051         &movdqa         (&QWP(16*2,"esp"),$inout2);
2052         &pxor           ($inout4,$inout3);
2053         &movdqa         (&QWP(16*3,"esp"),$inout3);
2054         &pxor           ($inout5,$inout4);
2055         &movdqa         (&QWP(16*4,"esp"),$inout4);
2056
2057         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2058         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2059         &movdqu         ($inout1,&QWP(16*1,$inp));
2060         &movdqu         ($inout2,&QWP(16*2,$inp));
2061         &movdqu         ($inout3,&QWP(16*3,$inp));
2062         &movdqu         ($inout4,&QWP(16*4,$inp));
2063         &pxor           ($inout5,$inout5);
2064
2065         &pxor           ($rndkey1,$inout0);             # checksum
2066         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2067         &pxor           ($rndkey1,$inout1);
2068         &pxor           ($inout1,$rndkey0);
2069         &pxor           ($rndkey1,$inout2);
2070         &pxor           ($inout2,$rndkey0);
2071         &pxor           ($rndkey1,$inout3);
2072         &pxor           ($inout3,$rndkey0);
2073         &pxor           ($rndkey1,$inout4);
2074         &pxor           ($inout4,$rndkey0);
2075         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2076
2077         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2078         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2079         &pxor           ($inout1,&QWP(16*1,"esp"));
2080         &pxor           ($inout2,&QWP(16*2,"esp"));
2081         &pxor           ($inout3,&QWP(16*3,"esp"));
2082         &pxor           ($inout4,&QWP(16*4,"esp"));
2083
2084         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2085         &aesenc         ($inout0,$rndkey1);
2086         &aesenc         ($inout1,$rndkey1);
2087         &aesenc         ($inout2,$rndkey1);
2088         &aesenc         ($inout3,$rndkey1);
2089         &aesenc         ($inout4,$rndkey1);
2090         &aesenc         ($inout5,$rndkey1);
2091
2092         &mov            ($out,&DWP($out_off,"esp"));
2093         &call           ("_aesni_encrypt6_enter");
2094
2095         &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
2096         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2097         &pxor           ($inout1,&QWP(16*1,"esp"));
2098         &pxor           ($inout2,&QWP(16*2,"esp"));
2099         &pxor           ($inout3,&QWP(16*3,"esp"));
2100         &pxor           ($inout4,$rndkey0);
2101         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2102
2103         &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
2104         &movdqu         (&QWP(16*1,$out,$inp),$inout1);
2105         &movdqu         (&QWP(16*2,$out,$inp),$inout2);
2106         &movdqu         (&QWP(16*3,$out,$inp),$inout3);
2107         &movdqu         (&QWP(16*4,$out,$inp),$inout4);
2108
2109         &jmp            (&label("done"));
2110
2111 &set_label("one",16);
2112         &movdqu         ($inout5,&QWP(0,$l_));
2113         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2114
2115         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2116         &mov            ($rounds,&DWP(240,$key));
2117
2118         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2119         &pxor           ($rndkey1,$inout0);             # checksum
2120         &pxor           ($inout0,$inout5);              # ^ offset_i
2121
2122         &movdqa         ($inout4,$rndkey1);
2123         &mov            ($out,&DWP($out_off,"esp"));
2124         if ($inline)
2125         {   &aesni_inline_generate1("enc");     }
2126         else
2127         {   &call       ("_aesni_encrypt1");    }
2128
2129         &xorps          ($inout0,$inout5);              # ^ offset_i
2130         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2131         &movdqa         ($rndkey1,$inout4);             # pass the checksum
2132         &movups         (&QWP(0,$out,$inp),$inout0);
2133
2134         &jmp            (&label("done"));
2135
2136 &set_label("two",16);
2137         &lea            ($i1,&DWP(1,$block));
2138         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2139         &bsf            ($i1,$i1);
2140         &shl            ($i1,4);
2141         &movdqu         ($inout4,&QWP(0,$l_));
2142         &movdqu         ($inout5,&QWP(0,$l_,$i1));
2143
2144         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2145         &movdqu         ($inout1,&QWP(16*1,$inp));
2146         &mov            ($rounds,&DWP(240,$key));
2147
2148         &pxor           ($inout4,$rndkey0);             # ^ last offset_i
2149         &pxor           ($inout5,$inout4);
2150
2151         &pxor           ($rndkey1,$inout0);             # checksum
2152         &pxor           ($inout0,$inout4);              # ^ offset_i
2153         &pxor           ($rndkey1,$inout1);
2154         &pxor           ($inout1,$inout5);
2155
2156         &movdqa         ($inout3,$rndkey1)
2157         &mov            ($out,&DWP($out_off,"esp"));
2158         &call           ("_aesni_encrypt2");
2159
2160         &xorps          ($inout0,$inout4);              # ^ offset_i
2161         &xorps          ($inout1,$inout5);
2162         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2163         &movdqa         ($rndkey1,$inout3);             # pass the checksum
2164         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2165         &movups         (&QWP(16*1,$out,$inp),$inout1);
2166
2167         &jmp            (&label("done"));
2168
2169 &set_label("three",16);
2170         &lea            ($i1,&DWP(1,$block));
2171         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2172         &bsf            ($i1,$i1);
2173         &shl            ($i1,4);
2174         &movdqu         ($inout3,&QWP(0,$l_));
2175         &movdqu         ($inout4,&QWP(0,$l_,$i1));
2176         &movdqa         ($inout5,$inout3);
2177
2178         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2179         &movdqu         ($inout1,&QWP(16*1,$inp));
2180         &movdqu         ($inout2,&QWP(16*2,$inp));
2181         &mov            ($rounds,&DWP(240,$key));
2182
2183         &pxor           ($inout3,$rndkey0);             # ^ last offset_i
2184         &pxor           ($inout4,$inout3);
2185         &pxor           ($inout5,$inout4);
2186
2187         &pxor           ($rndkey1,$inout0);             # checksum
2188         &pxor           ($inout0,$inout3);              # ^ offset_i
2189         &pxor           ($rndkey1,$inout1);
2190         &pxor           ($inout1,$inout4);
2191         &pxor           ($rndkey1,$inout2);
2192         &pxor           ($inout2,$inout5);
2193
2194         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2195         &mov            ($out,&DWP($out_off,"esp"));
2196         &call           ("_aesni_encrypt3");
2197
2198         &xorps          ($inout0,$inout3);              # ^ offset_i
2199         &xorps          ($inout1,$inout4);
2200         &xorps          ($inout2,$inout5);
2201         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2202         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2203         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2204         &movups         (&QWP(16*1,$out,$inp),$inout1);
2205         &movups         (&QWP(16*2,$out,$inp),$inout2);
2206
2207         &jmp            (&label("done"));
2208
2209 &set_label("four",16);
2210         &lea            ($i1,&DWP(1,$block));
2211         &lea            ($i3,&DWP(3,$block));
2212         &bsf            ($i1,$i1);
2213         &bsf            ($i3,$i3);
2214         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2215         &shl            ($i1,4);
2216         &shl            ($i3,4);
2217         &movdqu         ($inout2,&QWP(0,$l_));
2218         &movdqu         ($inout3,&QWP(0,$l_,$i1));
2219         &movdqa         ($inout4,$inout2);
2220         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2221
2222         &pxor           ($inout2,$rndkey0);             # ^ last offset_i
2223         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2224         &pxor           ($inout3,$inout2);
2225         &movdqu         ($inout1,&QWP(16*1,$inp));
2226         &pxor           ($inout4,$inout3);
2227         &movdqa         (&QWP(16*0,"esp"),$inout2);
2228         &pxor           ($inout5,$inout4);
2229         &movdqa         (&QWP(16*1,"esp"),$inout3);
2230         &movdqu         ($inout2,&QWP(16*2,$inp));
2231         &movdqu         ($inout3,&QWP(16*3,$inp));
2232         &mov            ($rounds,&DWP(240,$key));
2233
2234         &pxor           ($rndkey1,$inout0);             # checksum
2235         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2236         &pxor           ($rndkey1,$inout1);
2237         &pxor           ($inout1,&QWP(16*1,"esp"));
2238         &pxor           ($rndkey1,$inout2);
2239         &pxor           ($inout2,$inout4);
2240         &pxor           ($rndkey1,$inout3);
2241         &pxor           ($inout3,$inout5);
2242
2243         &movdqa         (&QWP($checksum,"esp"),$rndkey1)
2244         &mov            ($out,&DWP($out_off,"esp"));
2245         &call           ("_aesni_encrypt4");
2246
2247         &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2248         &xorps          ($inout1,&QWP(16*1,"esp"));
2249         &xorps          ($inout2,$inout4);
2250         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2251         &xorps          ($inout3,$inout5);
2252         &movups         (&QWP(16*1,$out,$inp),$inout1);
2253         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2254         &movups         (&QWP(16*2,$out,$inp),$inout2);
2255         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2256         &movups         (&QWP(16*3,$out,$inp),$inout3);
2257
2258 &set_label("done");
2259         &mov    ($key,&DWP($esp_off,"esp"));
2260         &pxor   ($inout0,$inout0);              # clear register bank
2261         &pxor   ($inout1,$inout1);
2262         &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
2263         &pxor   ($inout2,$inout2);
2264         &movdqa (&QWP(16*1,"esp"),$inout0);
2265         &pxor   ($inout3,$inout3);
2266         &movdqa (&QWP(16*2,"esp"),$inout0);
2267         &pxor   ($inout4,$inout4);
2268         &movdqa (&QWP(16*3,"esp"),$inout0);
2269         &pxor   ($inout5,$inout5);
2270         &movdqa (&QWP(16*4,"esp"),$inout0);
2271         &movdqa (&QWP(16*5,"esp"),$inout0);
2272         &movdqa (&QWP(16*6,"esp"),$inout0);
2273
2274         &lea    ("esp",&DWP(0,$key));
2275         &mov    ($rounds,&wparam(5));           # &offset_i
2276         &mov    ($rounds_,&wparam(7));          # &checksum
2277         &movdqu (&QWP(0,$rounds),$rndkey0);
2278         &pxor   ($rndkey0,$rndkey0);
2279         &movdqu (&QWP(0,$rounds_),$rndkey1);
2280         &pxor   ($rndkey1,$rndkey1);
2281 &function_end("aesni_ocb_encrypt");
2282
# aesni_ocb_decrypt: emits the bulk OCB decryption routine for whole
# 16-byte blocks.  Arguments, as fetched below through &wparam(N):
#   0: input pointer                  1: output pointer
#   2: number of 16-byte blocks (scaled by 16 below)
#   3: key schedule (round count is read from offset 240)
#   4: number of the first block to process
#   5: pointer to the running offset   (loaded on entry, stored on exit)
#   6: table of 16-byte L_ values, indexed by bsf(block number)
#   7: pointer to the running checksum (loaded on entry, stored on exit)
# For every block: offset ^= L_[bsf(block number)], the input is xor-ed
# with that offset before and after the block cipher, and the resulting
# output block is xor-ed into the checksum.
# Between phases $rndkey0 carries the last offset and $rndkey1 carries
# the checksum; both registers are reused for round keys in the 5- and
# 6-block paths, which is why the checksum is parked on the stack there.
2283 &function_begin("aesni_ocb_decrypt");
2284         &mov    ($rounds,&wparam(5));           # &offset_i
2285         &mov    ($rounds_,&wparam(7));          # &checksum
2286
2287         &mov    ($inp,&wparam(0));
2288         &mov    ($out,&wparam(1));
2289         &mov    ($len,&wparam(2));
2290         &mov    ($key,&wparam(3));
2291         &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
2292         &mov    ($block,&wparam(4));            # start_block_num
2293         &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
2294         &mov    ($l_,&wparam(6));               # L_
2295
             # Build a 16-byte aligned scratch frame (slots are accessed with
             # movdqa below); the caller's %esp is kept in $rounds until it is
             # saved at $esp_off.
2296         &mov    ($rounds,"esp");
2297         &sub    ("esp",$esp_off+4);             # alloca
2298         &and    ("esp",-16);                    # align stack
2299
             # $out becomes the distance out-inp, so every store below is
             # addressed as ($out + $inp + displacement).
2300         &sub    ($out,$inp);
2301         &shl    ($len,4);
2302         &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
2303         &mov    (&DWP($out_off,"esp"),$out);
2304         &mov    (&DWP($end_off,"esp"),$len);
2305         &mov    (&DWP($esp_off,"esp"),$rounds);
2306
2307         &mov    ($rounds,&DWP(240,$key));
2308
             # Even starting block number: handle one block up front so that
             # $block is odd from label "odd" onwards.
             # NOTE(review): there is no length check before this leading
             # block - presumably the caller guarantees at least one block;
             # confirm against the C caller.
2309         &test   ($block,1);
2310         &jnz    (&label("odd"));
2311
2312         &bsf            ($i3,$block);
2313         &add            ($block,1);
2314         &shl            ($i3,4);
2315         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2316         &mov            ($i3,$key);                     # put aside key
2317
2318         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2319         &lea            ($inp,&DWP(16,$inp));
2320
2321         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2322         &pxor           ($inout0,$inout5);              # ^ offset_i
2323
             # $rndkey1 is moved aside here and restored after the block
             # cipher step (checksum is kept in $inout4 meanwhile).
2324         &movdqa         ($inout4,$rndkey1);
2325         if ($inline)
2326         {   &aesni_inline_generate1("dec");     }
2327         else
2328         {   &call       ("_aesni_decrypt1");    }
2329
2330         &xorps          ($inout0,$inout5);              # ^ offset_i
2331         &movaps         ($rndkey1,$inout4);             # pass the checksum
2332         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2333         &xorps          ($rndkey1,$inout0);             # checksum
2334         &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
2335
2336         &mov            ($rounds,&DWP(240,$i3));
2337         &mov            ($key,$i3);                     # restore key
2338         &mov            ($len,&DWP($end_off,"esp"));
2339
             # Set up the "twisted" key addressing used by the 5-/6-block
             # paths: $rounds_off holds 16-16*rounds and $key points at
             # key+32+16*rounds, so (-48,$key,$rounds) is round key 0,
             # (-32,...) round key 1 and (-16,...) round key 2.  The
             # untouched key pointer is saved at $key_off for the 1..4-block
             # tails.
2340 &set_label("odd");
2341         &shl            ($rounds,4);
2342         &mov            ($out,16);
2343         &sub            ($out,$rounds);                 # twisted rounds
2344         &mov            (&DWP($key_off,"esp"),$key);
2345         &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
2346         &mov            (&DWP($rounds_off,"esp"),$out);
2347
             # $len is "end of input - 96": if $inp is already past it, fewer
             # than six blocks remain.
2348         &cmp            ($inp,$len);
2349         &ja             (&label("short"));
2350         &jmp            (&label("grandloop"));
2351
             # Main loop, six blocks per iteration.  $block is odd here, so
             # blocks +0/+2/+4 have bit 0 set and use L_[0]; only blocks
             # +1/+3/+5 need a bsf lookup.
2352 &set_label("grandloop",32);
2353         &lea            ($i1,&DWP(1,$block));
2354         &lea            ($i3,&DWP(3,$block));
2355         &lea            ($i5,&DWP(5,$block));
2356         &add            ($block,6);
2357         &bsf            ($i1,$i1);
2358         &bsf            ($i3,$i3);
2359         &bsf            ($i5,$i5);
2360         &shl            ($i1,4);
2361         &shl            ($i3,4);
2362         &shl            ($i5,4);
2363         &movdqu         ($inout0,&QWP(0,$l_));
2364         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2365         &mov            ($rounds,&DWP($rounds_off,"esp"));
2366         &movdqa         ($inout2,$inout0);
2367         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2368         &movdqa         ($inout4,$inout0);
2369         &movdqu         ($inout5,&QWP(0,$l_,$i5));
2370
             # Chain the six offsets (each is the previous one xor its L_
             # value) and park them in frame slots 0..5; the registers are
             # needed for the data blocks next.
2371         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2372         &pxor           ($inout1,$inout0);
2373         &movdqa         (&QWP(16*0,"esp"),$inout0);
2374         &pxor           ($inout2,$inout1);
2375         &movdqa         (&QWP(16*1,"esp"),$inout1);
2376         &pxor           ($inout3,$inout2);
2377         &movdqa         (&QWP(16*2,"esp"),$inout2);
2378         &pxor           ($inout4,$inout3);
2379         &movdqa         (&QWP(16*3,"esp"),$inout3);
2380         &pxor           ($inout5,$inout4);
2381         &movdqa         (&QWP(16*4,"esp"),$inout4);
2382         &movdqa         (&QWP(16*5,"esp"),$inout5);
2383
2384         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2385         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2386         &movdqu         ($inout1,&QWP(16*1,$inp));
2387         &movdqu         ($inout2,&QWP(16*2,$inp));
2388         &movdqu         ($inout3,&QWP(16*3,$inp));
2389         &movdqu         ($inout4,&QWP(16*4,$inp));
2390         &movdqu         ($inout5,&QWP(16*5,$inp));
2391         &lea            ($inp,&DWP(16*6,$inp));
2392
             # Checksum goes to the stack because $rndkey1 is about to be
             # loaded with a round key.
2393         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2394         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2395         &pxor           ($inout1,$rndkey0);
2396         &pxor           ($inout2,$rndkey0);
2397         &pxor           ($inout3,$rndkey0);
2398         &pxor           ($inout4,$rndkey0);
2399         &pxor           ($inout5,$rndkey0);
2400
2401         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2402         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2403         &pxor           ($inout1,&QWP(16*1,"esp"));
2404         &pxor           ($inout2,&QWP(16*2,"esp"));
2405         &pxor           ($inout3,&QWP(16*3,"esp"));
2406         &pxor           ($inout4,&QWP(16*4,"esp"));
2407         &pxor           ($inout5,&QWP(16*5,"esp"));
2408
             # First aesdec round is issued here, with the following round
             # key already loaded into $rndkey0; the helper entered below is
             # defined elsewhere in the file and presumably performs the
             # remaining rounds - verify against _aesni_decrypt6.
2409         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2410         &aesdec         ($inout0,$rndkey1);
2411         &aesdec         ($inout1,$rndkey1);
2412         &aesdec         ($inout2,$rndkey1);
2413         &aesdec         ($inout3,$rndkey1);
2414         &aesdec         ($inout4,$rndkey1);
2415         &aesdec         ($inout5,$rndkey1);
2416
2417         &mov            ($out,&DWP($out_off,"esp"));
2418         &mov            ($len,&DWP($end_off,"esp"));
2419         &call           ("_aesni_decrypt6_enter");
2420
2421         &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
2422         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2423         &movdqa         ($rndkey1,&QWP($checksum,"esp"));
2424         &pxor           ($inout1,&QWP(16*1,"esp"));
2425         &pxor           ($inout2,&QWP(16*2,"esp"));
2426         &pxor           ($inout3,&QWP(16*3,"esp"));
2427         &pxor           ($inout4,&QWP(16*4,"esp"));
2428         &pxor           ($inout5,$rndkey0);
2429
             # $inp was already advanced by 96, hence the negative
             # displacements on the stores.
2430         &pxor           ($rndkey1,$inout0);             # checksum
2431         &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
2432         &pxor           ($rndkey1,$inout1);
2433         &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
2434         &pxor           ($rndkey1,$inout2);
2435         &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
2436         &pxor           ($rndkey1,$inout3);
2437         &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
2438         &pxor           ($rndkey1,$inout4);
2439         &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
2440         &pxor           ($rndkey1,$inout5);
2441         &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
2442         &cmp            ($inp,$len);                    # done yet?
2443         &jb             (&label("grandloop"));
2444
             # Tail: $len becomes the number of bytes still to process, then
             # dispatch on 0 / 1 / 2 / 3 / 4 blocks; anything else falls
             # through to the five-block code.
2445 &set_label("short");
2446         &add            ($len,16*6);
2447         &sub            ($len,$inp);
2448         &jz             (&label("done"));
2449
2450         &cmp            ($len,16*2);
2451         &jb             (&label("one"));
2452         &je             (&label("two"));
2453
2454         &cmp            ($len,16*4);
2455         &jb             (&label("three"));
2456         &je             (&label("four"));
2457
             # Five blocks: same scheme as the grand loop, run through the
             # six-way helper with the sixth lane zeroed.
2458         &lea            ($i1,&DWP(1,$block));
2459         &lea            ($i3,&DWP(3,$block));
2460         &bsf            ($i1,$i1);
2461         &bsf            ($i3,$i3);
2462         &shl            ($i1,4);
2463         &shl            ($i3,4);
2464         &movdqu         ($inout0,&QWP(0,$l_));
2465         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2466         &mov            ($rounds,&DWP($rounds_off,"esp"));
2467         &movdqa         ($inout2,$inout0);
2468         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2469         &movdqa         ($inout4,$inout0);
2470
             # The xor into $inout5 below has no lasting effect: $inout5 is
             # cleared before it is next read.
2471         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2472         &pxor           ($inout1,$inout0);
2473         &movdqa         (&QWP(16*0,"esp"),$inout0);
2474         &pxor           ($inout2,$inout1);
2475         &movdqa         (&QWP(16*1,"esp"),$inout1);
2476         &pxor           ($inout3,$inout2);
2477         &movdqa         (&QWP(16*2,"esp"),$inout2);
2478         &pxor           ($inout4,$inout3);
2479         &movdqa         (&QWP(16*3,"esp"),$inout3);
2480         &pxor           ($inout5,$inout4);
2481         &movdqa         (&QWP(16*4,"esp"),$inout4);
2482
2483         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2484         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2485         &movdqu         ($inout1,&QWP(16*1,$inp));
2486         &movdqu         ($inout2,&QWP(16*2,$inp));
2487         &movdqu         ($inout3,&QWP(16*3,$inp));
2488         &movdqu         ($inout4,&QWP(16*4,$inp));
2489         &pxor           ($inout5,$inout5);
2490
2491         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2492         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2493         &pxor           ($inout1,$rndkey0);
2494         &pxor           ($inout2,$rndkey0);
2495         &pxor           ($inout3,$rndkey0);
2496         &pxor           ($inout4,$rndkey0);
2497
2498         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2499         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2500         &pxor           ($inout1,&QWP(16*1,"esp"));
2501         &pxor           ($inout2,&QWP(16*2,"esp"));
2502         &pxor           ($inout3,&QWP(16*3,"esp"));
2503         &pxor           ($inout4,&QWP(16*4,"esp"));
2504
2505         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2506         &aesdec         ($inout0,$rndkey1);
2507         &aesdec         ($inout1,$rndkey1);
2508         &aesdec         ($inout2,$rndkey1);
2509         &aesdec         ($inout3,$rndkey1);
2510         &aesdec         ($inout4,$rndkey1);
2511         &aesdec         ($inout5,$rndkey1);
2512
2513         &mov            ($out,&DWP($out_off,"esp"));
2514         &call           ("_aesni_decrypt6_enter");
2515
2516         &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
2517         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2518         &movdqa         ($rndkey1,&QWP($checksum,"esp"));
2519         &pxor           ($inout1,&QWP(16*1,"esp"));
2520         &pxor           ($inout2,&QWP(16*2,"esp"));
2521         &pxor           ($inout3,&QWP(16*3,"esp"));
2522         &pxor           ($inout4,$rndkey0);
2523
             # $inp is not advanced in the tail paths, so displacements are
             # positive from here on.
2524         &pxor           ($rndkey1,$inout0);             # checksum
2525         &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
2526         &pxor           ($rndkey1,$inout1);
2527         &movdqu         (&QWP(16*1,$out,$inp),$inout1);
2528         &pxor           ($rndkey1,$inout2);
2529         &movdqu         (&QWP(16*2,$out,$inp),$inout2);
2530         &pxor           ($rndkey1,$inout3);
2531         &movdqu         (&QWP(16*3,$out,$inp),$inout3);
2532         &pxor           ($rndkey1,$inout4);
2533         &movdqu         (&QWP(16*4,$out,$inp),$inout4);
2534
2535         &jmp            (&label("done"));
2536
             # One block left.  $block is odd, so the L_ entry is L_[0] and no
             # bsf is needed.  The 1..4-block tails reload the plain key
             # pointer and round count instead of the twisted form.
2537 &set_label("one",16);
2538         &movdqu         ($inout5,&QWP(0,$l_));
2539         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2540
2541         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2542         &mov            ($rounds,&DWP(240,$key));
2543
2544         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2545         &pxor           ($inout0,$inout5);              # ^ offset_i
2546
2547         &movdqa         ($inout4,$rndkey1);
2548         &mov            ($out,&DWP($out_off,"esp"));
2549         if ($inline)
2550         {   &aesni_inline_generate1("dec");     }
2551         else
2552         {   &call       ("_aesni_decrypt1");    }
2553
2554         &xorps          ($inout0,$inout5);              # ^ offset_i
2555         &movaps         ($rndkey1,$inout4);             # pass the checksum
2556         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2557         &xorps          ($rndkey1,$inout0);             # checksum
2558         &movups         (&QWP(0,$out,$inp),$inout0);
2559
2560         &jmp            (&label("done"));
2561
             # Two blocks left: offsets live in $inout4/$inout5, the checksum
             # is carried in $inout3 across the call.
2562 &set_label("two",16);
2563         &lea            ($i1,&DWP(1,$block));
2564         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2565         &bsf            ($i1,$i1);
2566         &shl            ($i1,4);
2567         &movdqu         ($inout4,&QWP(0,$l_));
2568         &movdqu         ($inout5,&QWP(0,$l_,$i1));
2569
2570         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2571         &movdqu         ($inout1,&QWP(16*1,$inp));
2572         &mov            ($rounds,&DWP(240,$key));
2573
2574         &movdqa         ($inout3,$rndkey1);
2575         &pxor           ($inout4,$rndkey0);             # ^ last offset_i
2576         &pxor           ($inout5,$inout4);
2577
2578         &pxor           ($inout0,$inout4);              # ^ offset_i
2579         &pxor           ($inout1,$inout5);
2580
2581         &mov            ($out,&DWP($out_off,"esp"));
2582         &call           ("_aesni_decrypt2");
2583
2584         &xorps          ($inout0,$inout4);              # ^ offset_i
2585         &xorps          ($inout1,$inout5);
2586         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2587         &xorps          ($inout3,$inout0);              # checksum
2588         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2589         &xorps          ($inout3,$inout1);
2590         &movups         (&QWP(16*1,$out,$inp),$inout1);
2591         &movaps         ($rndkey1,$inout3);             # pass the checksum
2592
2593         &jmp            (&label("done"));
2594
             # Three blocks left: offsets live in $inout3..$inout5 (first and
             # third start from L_[0]); the checksum is parked on the stack
             # across the call.
2595 &set_label("three",16);
2596         &lea            ($i1,&DWP(1,$block));
2597         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2598         &bsf            ($i1,$i1);
2599         &shl            ($i1,4);
2600         &movdqu         ($inout3,&QWP(0,$l_));
2601         &movdqu         ($inout4,&QWP(0,$l_,$i1));
2602         &movdqa         ($inout5,$inout3);
2603
2604         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2605         &movdqu         ($inout1,&QWP(16*1,$inp));
2606         &movdqu         ($inout2,&QWP(16*2,$inp));
2607         &mov            ($rounds,&DWP(240,$key));
2608
2609         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2610         &pxor           ($inout3,$rndkey0);             # ^ last offset_i
2611         &pxor           ($inout4,$inout3);
2612         &pxor           ($inout5,$inout4);
2613
2614         &pxor           ($inout0,$inout3);              # ^ offset_i
2615         &pxor           ($inout1,$inout4);
2616         &pxor           ($inout2,$inout5);
2617
2618         &mov            ($out,&DWP($out_off,"esp"));
2619         &call           ("_aesni_decrypt3");
2620
2621         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2622         &xorps          ($inout0,$inout3);              # ^ offset_i
2623         &xorps          ($inout1,$inout4);
2624         &xorps          ($inout2,$inout5);
2625         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2626         &pxor           ($rndkey1,$inout0);             # checksum
2627         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2628         &movups         (&QWP(16*1,$out,$inp),$inout1);
2629         &pxor           ($rndkey1,$inout1);
2630         &movups         (&QWP(16*2,$out,$inp),$inout2);
2631         &pxor           ($rndkey1,$inout2);
2632
2633         &jmp            (&label("done"));
2634
             # Four blocks left: the first two offsets are spilled to frame
             # slots 0 and 1 because $inout2/$inout3 are needed for data; the
             # last two stay in $inout4/$inout5.  Falls through to "done".
2635 &set_label("four",16);
2636         &lea            ($i1,&DWP(1,$block));
2637         &lea            ($i3,&DWP(3,$block));
2638         &bsf            ($i1,$i1);
2639         &bsf            ($i3,$i3);
2640         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2641         &shl            ($i1,4);
2642         &shl            ($i3,4);
2643         &movdqu         ($inout2,&QWP(0,$l_));
2644         &movdqu         ($inout3,&QWP(0,$l_,$i1));
2645         &movdqa         ($inout4,$inout2);
2646         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2647
2648         &pxor           ($inout2,$rndkey0);             # ^ last offset_i
2649         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2650         &pxor           ($inout3,$inout2);
2651         &movdqu         ($inout1,&QWP(16*1,$inp));
2652         &pxor           ($inout4,$inout3);
2653         &movdqa         (&QWP(16*0,"esp"),$inout2);
2654         &pxor           ($inout5,$inout4);
2655         &movdqa         (&QWP(16*1,"esp"),$inout3);
2656         &movdqu         ($inout2,&QWP(16*2,$inp));
2657         &movdqu         ($inout3,&QWP(16*3,$inp));
2658         &mov            ($rounds,&DWP(240,$key));
2659
2660         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2661         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2662         &pxor           ($inout1,&QWP(16*1,"esp"));
2663         &pxor           ($inout2,$inout4);
2664         &pxor           ($inout3,$inout5);
2665
2666         &mov            ($out,&DWP($out_off,"esp"));
2667         &call           ("_aesni_decrypt4");
2668
2669         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2670         &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2671         &xorps          ($inout1,&QWP(16*1,"esp"));
2672         &xorps          ($inout2,$inout4);
2673         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2674         &pxor           ($rndkey1,$inout0);             # checksum
2675         &xorps          ($inout3,$inout5);
2676         &movups         (&QWP(16*1,$out,$inp),$inout1);
2677         &pxor           ($rndkey1,$inout1);
2678         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2679         &movups         (&QWP(16*2,$out,$inp),$inout2);
2680         &pxor           ($rndkey1,$inout2);
2681         &movups         (&QWP(16*3,$out,$inp),$inout3);
2682         &pxor           ($rndkey1,$inout3);
2683
             # Epilogue: fetch the saved %esp, zero the data registers and
             # seven 16-byte frame slots, drop the frame, then write the final
             # offset ($rndkey0) and checksum ($rndkey1) back through the
             # caller's pointers and zero those two registers as well.
2684 &set_label("done");
2685         &mov    ($key,&DWP($esp_off,"esp"));
2686         &pxor   ($inout0,$inout0);              # clear register bank
2687         &pxor   ($inout1,$inout1);
2688         &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
2689         &pxor   ($inout2,$inout2);
2690         &movdqa (&QWP(16*1,"esp"),$inout0);
2691         &pxor   ($inout3,$inout3);
2692         &movdqa (&QWP(16*2,"esp"),$inout0);
2693         &pxor   ($inout4,$inout4);
2694         &movdqa (&QWP(16*3,"esp"),$inout0);
2695         &pxor   ($inout5,$inout5);
2696         &movdqa (&QWP(16*4,"esp"),$inout0);
2697         &movdqa (&QWP(16*5,"esp"),$inout0);
2698         &movdqa (&QWP(16*6,"esp"),$inout0);
2699
2700         &lea    ("esp",&DWP(0,$key));
2701         &mov    ($rounds,&wparam(5));           # &offset_i
2702         &mov    ($rounds_,&wparam(7));          # &checksum
2703         &movdqu (&QWP(0,$rounds),$rndkey0);
2704         &pxor   ($rndkey0,$rndkey0);
2705         &movdqu (&QWP(0,$rounds_),$rndkey1);
2706         &pxor   ($rndkey1,$rndkey1);
2707 &function_end("aesni_ocb_decrypt");
2708 }
2709 }
2710 \f
2711 ######################################################################
2712 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
2713 #                           size_t length, const AES_KEY *key,
2714 #                           unsigned char *ivp,const int enc);
2715 &function_begin("${PREFIX}_cbc_encrypt");
2716         &mov    ($inp,&wparam(0));
2717         &mov    ($rounds_,"esp");
2718         &mov    ($out,&wparam(1));
2719         &sub    ($rounds_,24);
2720         &mov    ($len,&wparam(2));
2721         &and    ($rounds_,-16);
2722         &mov    ($key,&wparam(3));
2723         &mov    ($key_,&wparam(4));
2724         &test   ($len,$len);
2725         &jz     (&label("cbc_abort"));
2726
2727         &cmp    (&wparam(5),0);
2728         &xchg   ($rounds_,"esp");               # alloca
2729         &movups ($ivec,&QWP(0,$key_));          # load IV
2730         &mov    ($rounds,&DWP(240,$key));
2731         &mov    ($key_,$key);                   # backup $key
2732         &mov    (&DWP(16,"esp"),$rounds_);      # save original %esp
2733         &mov    ($rounds_,$rounds);             # backup $rounds
2734         &je     (&label("cbc_decrypt"));
2735
2736         &movaps ($inout0,$ivec);
2737         &cmp    ($len,16);
2738         &jb     (&label("cbc_enc_tail"));
2739         &sub    ($len,16);
2740         &jmp    (&label("cbc_enc_loop"));
2741
2742 &set_label("cbc_enc_loop",16);
2743         &movups ($ivec,&QWP(0,$inp));           # input actually
2744         &lea    ($inp,&DWP(16,$inp));
2745         if ($inline)
2746         {   &aesni_inline_generate1("enc",$inout0,$ivec);       }
2747         else
2748         {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
2749         &mov    ($rounds,$rounds_);     # restore $rounds
2750         &mov    ($key,$key_);           # restore $key
2751         &movups (&QWP(0,$out),$inout0); # store output
2752         &lea    ($out,&DWP(16,$out));
2753         &sub    ($len,16);
2754         &jnc    (&label("cbc_enc_loop"));
2755         &add    ($len,16);
2756         &jnz    (&label("cbc_enc_tail"));
2757         &movaps ($ivec,$inout0);
2758         &pxor   ($inout0,$inout0);
2759         &jmp    (&label("cbc_ret"));
2760
2761 &set_label("cbc_enc_tail");
2762         &mov    ("ecx",$len);           # zaps $rounds
2763         &data_word(0xA4F3F689);         # rep movsb
2764         &mov    ("ecx",16);             # zero tail
2765         &sub    ("ecx",$len);
2766         &xor    ("eax","eax");          # zaps $len
2767         &data_word(0xAAF3F689);         # rep stosb
2768         &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
2769         &mov    ($rounds,$rounds_);     # restore $rounds
2770         &mov    ($inp,$out);            # $inp and $out are the same
2771         &mov    ($key,$key_);           # restore $key
2772         &jmp    (&label("cbc_enc_loop"));
2773 ######################################################################
2774 &set_label("cbc_decrypt",16);
2775         &cmp    ($len,0x50);
2776         &jbe    (&label("cbc_dec_tail"));
2777         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2778         &sub    ($len,0x50);
2779         &jmp    (&label("cbc_dec_loop6_enter"));
2780
2781 &set_label("cbc_dec_loop6",16);
2782         &movaps (&QWP(0,"esp"),$rndkey0);       # save IV
2783         &movups (&QWP(0,$out),$inout5);
2784         &lea    ($out,&DWP(0x10,$out));
2785 &set_label("cbc_dec_loop6_enter");
2786         &movdqu ($inout0,&QWP(0,$inp));
2787         &movdqu ($inout1,&QWP(0x10,$inp));
2788         &movdqu ($inout2,&QWP(0x20,$inp));
2789         &movdqu ($inout3,&QWP(0x30,$inp));
2790         &movdqu ($inout4,&QWP(0x40,$inp));
2791         &movdqu ($inout5,&QWP(0x50,$inp));
2792
2793         &call   ("_aesni_decrypt6");
2794
2795         &movups ($rndkey1,&QWP(0,$inp));
2796         &movups ($rndkey0,&QWP(0x10,$inp));
2797         &xorps  ($inout0,&QWP(0,"esp"));        # ^=IV
2798         &xorps  ($inout1,$rndkey1);
2799         &movups ($rndkey1,&QWP(0x20,$inp));
2800         &xorps  ($inout2,$rndkey0);
2801         &movups ($rndkey0,&QWP(0x30,$inp));
2802         &xorps  ($inout3,$rndkey1);
2803         &movups ($rndkey1,&QWP(0x40,$inp));
2804         &xorps  ($inout4,$rndkey0);
2805         &movups ($rndkey0,&QWP(0x50,$inp));     # IV
2806         &xorps  ($inout5,$rndkey1);
2807         &movups (&QWP(0,$out),$inout0);
2808         &movups (&QWP(0x10,$out),$inout1);
2809         &lea    ($inp,&DWP(0x60,$inp));
2810         &movups (&QWP(0x20,$out),$inout2);
2811         &mov    ($rounds,$rounds_);             # restore $rounds
2812         &movups (&QWP(0x30,$out),$inout3);
2813         &mov    ($key,$key_);                   # restore $key
2814         &movups (&QWP(0x40,$out),$inout4);
2815         &lea    ($out,&DWP(0x50,$out));
2816         &sub    ($len,0x60);
2817         &ja     (&label("cbc_dec_loop6"));
2818
2819         &movaps ($inout0,$inout5);
2820         &movaps ($ivec,$rndkey0);
2821         &add    ($len,0x50);
2822         &jle    (&label("cbc_dec_clear_tail_collected"));
2823         &movups (&QWP(0,$out),$inout0);
2824         &lea    ($out,&DWP(0x10,$out));
2825 &set_label("cbc_dec_tail");
2826         &movups ($inout0,&QWP(0,$inp));
2827         &movaps ($in0,$inout0);
2828         &cmp    ($len,0x10);
2829         &jbe    (&label("cbc_dec_one"));
2830
2831         &movups ($inout1,&QWP(0x10,$inp));
2832         &movaps ($in1,$inout1);
2833         &cmp    ($len,0x20);
2834         &jbe    (&label("cbc_dec_two"));
2835
2836         &movups ($inout2,&QWP(0x20,$inp));
2837         &cmp    ($len,0x30);
2838         &jbe    (&label("cbc_dec_three"));
2839
2840         &movups ($inout3,&QWP(0x30,$inp));
2841         &cmp    ($len,0x40);
2842         &jbe    (&label("cbc_dec_four"));
2843
2844         &movups ($inout4,&QWP(0x40,$inp));
2845         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2846         &movups ($inout0,&QWP(0,$inp));
2847         &xorps  ($inout5,$inout5);
2848         &call   ("_aesni_decrypt6");
2849         &movups ($rndkey1,&QWP(0,$inp));
2850         &movups ($rndkey0,&QWP(0x10,$inp));
2851         &xorps  ($inout0,&QWP(0,"esp"));        # ^= IV
2852         &xorps  ($inout1,$rndkey1);
2853         &movups ($rndkey1,&QWP(0x20,$inp));
2854         &xorps  ($inout2,$rndkey0);
2855         &movups ($rndkey0,&QWP(0x30,$inp));
2856         &xorps  ($inout3,$rndkey1);
2857         &movups ($ivec,&QWP(0x40,$inp));        # IV
2858         &xorps  ($inout4,$rndkey0);
2859         &movups (&QWP(0,$out),$inout0);
2860         &movups (&QWP(0x10,$out),$inout1);
2861         &pxor   ($inout1,$inout1);
2862         &movups (&QWP(0x20,$out),$inout2);
2863         &pxor   ($inout2,$inout2);
2864         &movups (&QWP(0x30,$out),$inout3);
2865         &pxor   ($inout3,$inout3);
2866         &lea    ($out,&DWP(0x40,$out));
2867         &movaps ($inout0,$inout4);
2868         &pxor   ($inout4,$inout4);
2869         &sub    ($len,0x50);
2870         &jmp    (&label("cbc_dec_tail_collected"));
2871
2872 &set_label("cbc_dec_one",16);
2873         if ($inline)
2874         {   &aesni_inline_generate1("dec");     }
2875         else
2876         {   &call       ("_aesni_decrypt1");    }
2877         &xorps  ($inout0,$ivec);
2878         &movaps ($ivec,$in0);
2879         &sub    ($len,0x10);
2880         &jmp    (&label("cbc_dec_tail_collected"));
2881
2882 &set_label("cbc_dec_two",16);
2883         &call   ("_aesni_decrypt2");
2884         &xorps  ($inout0,$ivec);
2885         &xorps  ($inout1,$in0);
2886         &movups (&QWP(0,$out),$inout0);
2887         &movaps ($inout0,$inout1);
2888         &pxor   ($inout1,$inout1);
2889         &lea    ($out,&DWP(0x10,$out));
2890         &movaps ($ivec,$in1);
2891         &sub    ($len,0x20);
2892         &jmp    (&label("cbc_dec_tail_collected"));
2893
2894 &set_label("cbc_dec_three",16);
2895         &call   ("_aesni_decrypt3");
2896         &xorps  ($inout0,$ivec);
2897         &xorps  ($inout1,$in0);
2898         &xorps  ($inout2,$in1);
2899         &movups (&QWP(0,$out),$inout0);
2900         &movaps ($inout0,$inout2);
2901         &pxor   ($inout2,$inout2);
2902         &movups (&QWP(0x10,$out),$inout1);
2903         &pxor   ($inout1,$inout1);
2904         &lea    ($out,&DWP(0x20,$out));
2905         &movups ($ivec,&QWP(0x20,$inp));
2906         &sub    ($len,0x30);
2907         &jmp    (&label("cbc_dec_tail_collected"));
2908
2909 &set_label("cbc_dec_four",16);
2910         &call   ("_aesni_decrypt4");
2911         &movups ($rndkey1,&QWP(0x10,$inp));
2912         &movups ($rndkey0,&QWP(0x20,$inp));
2913         &xorps  ($inout0,$ivec);
2914         &movups ($ivec,&QWP(0x30,$inp));
2915         &xorps  ($inout1,$in0);
2916         &movups (&QWP(0,$out),$inout0);
2917         &xorps  ($inout2,$rndkey1);
2918         &movups (&QWP(0x10,$out),$inout1);
2919         &pxor   ($inout1,$inout1);
2920         &xorps  ($inout3,$rndkey0);
2921         &movups (&QWP(0x20,$out),$inout2);
2922         &pxor   ($inout2,$inout2);
2923         &lea    ($out,&DWP(0x30,$out));
2924         &movaps ($inout0,$inout3);
2925         &pxor   ($inout3,$inout3);
2926         &sub    ($len,0x40);
2927         &jmp    (&label("cbc_dec_tail_collected"));
2928
2929 &set_label("cbc_dec_clear_tail_collected",16);
2930         &pxor   ($inout1,$inout1);
2931         &pxor   ($inout2,$inout2);
2932         &pxor   ($inout3,$inout3);
2933         &pxor   ($inout4,$inout4);
2934 &set_label("cbc_dec_tail_collected");
2935         &and    ($len,15);
2936         &jnz    (&label("cbc_dec_tail_partial"));
2937         &movups (&QWP(0,$out),$inout0);
2938         &pxor   ($rndkey0,$rndkey0);
2939         &jmp    (&label("cbc_ret"));
2940
2941 &set_label("cbc_dec_tail_partial",16);
2942         &movaps (&QWP(0,"esp"),$inout0);
2943         &pxor   ($rndkey0,$rndkey0);
2944         &mov    ("ecx",16);
2945         &mov    ($inp,"esp");
2946         &sub    ("ecx",$len);
2947         &data_word(0xA4F3F689);         # rep movsb
2948         &movdqa (&QWP(0,"esp"),$inout0);
2949
2950 &set_label("cbc_ret");
2951         &mov    ("esp",&DWP(16,"esp")); # pull original %esp
2952         &mov    ($key_,&wparam(4));
2953         &pxor   ($inout0,$inout0);
2954         &pxor   ($rndkey1,$rndkey1);
2955         &movups (&QWP(0,$key_),$ivec);  # output IV
2956         &pxor   ($ivec,$ivec);
2957 &set_label("cbc_abort");
2958 &function_end("${PREFIX}_cbc_encrypt");
2959 \f
2960 ######################################################################
2961 # Mechanical port from aesni-x86_64.pl.
2962 #
2963 # _aesni_set_encrypt_key is private interface,
2964 # input:
2965 #       "eax"   const unsigned char *userKey
2966 #       $rounds int bits
2967 #       $key    AES_KEY *key
2968 # output:
2969 #       "eax"   return code
2970 #       $rounds rounds
2971
2972 &function_begin_B("_aesni_set_encrypt_key");
2973         &push   ("ebp");
2974         &push   ("ebx");
2975         &test   ("eax","eax");
2976         &jz     (&label("bad_pointer"));
2977         &test   ($key,$key);
2978         &jz     (&label("bad_pointer"));
2979
2980         &call   (&label("pic"));
2981 &set_label("pic");
2982         &blindpop("ebx");
2983         &lea    ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2984
2985         &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2986         &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
2987         &xorps  ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
2988         &mov    ("ebp",&DWP(4,"ebp"));
2989         &lea    ($key,&DWP(16,$key));
2990         &and    ("ebp",1<<28|1<<11);    # AVX and XOP bits
2991         &cmp    ($rounds,256);
2992         &je     (&label("14rounds"));
2993         &cmp    ($rounds,192);
2994         &je     (&label("12rounds"));
2995         &cmp    ($rounds,128);
2996         &jne    (&label("bad_keybits"));
2997
2998 &set_label("10rounds",16);
2999         &cmp            ("ebp",1<<28);
3000         &je             (&label("10rounds_alt"));
3001
3002         &mov            ($rounds,9);
3003         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3004         &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
3005         &call           (&label("key_128_cold"));
3006         &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
3007         &call           (&label("key_128"));
3008         &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
3009         &call           (&label("key_128"));
3010         &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
3011         &call           (&label("key_128"));
3012         &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
3013         &call           (&label("key_128"));
3014         &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
3015         &call           (&label("key_128"));
3016         &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
3017         &call           (&label("key_128"));
3018         &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
3019         &call           (&label("key_128"));
3020         &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
3021         &call           (&label("key_128"));
3022         &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
3023         &call           (&label("key_128"));
3024         &$movekey       (&QWP(0,$key),"xmm0");
3025         &mov            (&DWP(80,$key),$rounds);
3026
3027         &jmp    (&label("good_key"));
3028
3029 &set_label("key_128",16);
3030         &$movekey       (&QWP(0,$key),"xmm0");
3031         &lea            ($key,&DWP(16,$key));
3032 &set_label("key_128_cold");
3033         &shufps         ("xmm4","xmm0",0b00010000);
3034         &xorps          ("xmm0","xmm4");
3035         &shufps         ("xmm4","xmm0",0b10001100);
3036         &xorps          ("xmm0","xmm4");
3037         &shufps         ("xmm1","xmm1",0b11111111);     # critical path
3038         &xorps          ("xmm0","xmm1");
3039         &ret();
3040
3041 &set_label("10rounds_alt",16);
3042         &movdqa         ("xmm5",&QWP(0x00,"ebx"));
3043         &mov            ($rounds,8);
3044         &movdqa         ("xmm4",&QWP(0x20,"ebx"));
3045         &movdqa         ("xmm2","xmm0");
3046         &movdqu         (&QWP(-16,$key),"xmm0");
3047
3048 &set_label("loop_key128");
3049         &pshufb         ("xmm0","xmm5");
3050         &aesenclast     ("xmm0","xmm4");
3051         &pslld          ("xmm4",1);
3052         &lea            ($key,&DWP(16,$key));
3053
3054         &movdqa         ("xmm3","xmm2");
3055         &pslldq         ("xmm2",4);
3056         &pxor           ("xmm3","xmm2");
3057         &pslldq         ("xmm2",4);
3058         &pxor           ("xmm3","xmm2");
3059         &pslldq         ("xmm2",4);
3060         &pxor           ("xmm2","xmm3");
3061
3062         &pxor           ("xmm0","xmm2");
3063         &movdqu         (&QWP(-16,$key),"xmm0");
3064         &movdqa         ("xmm2","xmm0");
3065
3066         &dec            ($rounds);
3067         &jnz            (&label("loop_key128"));
3068
3069         &movdqa         ("xmm4",&QWP(0x30,"ebx"));
3070
3071         &pshufb         ("xmm0","xmm5");
3072         &aesenclast     ("xmm0","xmm4");
3073         &pslld          ("xmm4",1);
3074
3075         &movdqa         ("xmm3","xmm2");
3076         &pslldq         ("xmm2",4);
3077         &pxor           ("xmm3","xmm2");
3078         &pslldq         ("xmm2",4);
3079         &pxor           ("xmm3","xmm2");
3080         &pslldq         ("xmm2",4);
3081         &pxor           ("xmm2","xmm3");
3082
3083         &pxor           ("xmm0","xmm2");
3084         &movdqu         (&QWP(0,$key),"xmm0");
3085
3086         &movdqa         ("xmm2","xmm0");
3087         &pshufb         ("xmm0","xmm5");
3088         &aesenclast     ("xmm0","xmm4");
3089
3090         &movdqa         ("xmm3","xmm2");
3091         &pslldq         ("xmm2",4);
3092         &pxor           ("xmm3","xmm2");
3093         &pslldq         ("xmm2",4);
3094         &pxor           ("xmm3","xmm2");
3095         &pslldq         ("xmm2",4);
3096         &pxor           ("xmm2","xmm3");
3097
3098         &pxor           ("xmm0","xmm2");
3099         &movdqu         (&QWP(16,$key),"xmm0");
3100
3101         &mov            ($rounds,9);
3102         &mov            (&DWP(96,$key),$rounds);
3103
3104         &jmp    (&label("good_key"));
3105
3106 &set_label("12rounds",16);
3107         &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
3108         &cmp            ("ebp",1<<28);
3109         &je             (&label("12rounds_alt"));
3110
3111         &mov            ($rounds,11);
3112         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3113         &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
3114         &call           (&label("key_192a_cold"));
3115         &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
3116         &call           (&label("key_192b"));
3117         &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
3118         &call           (&label("key_192a"));
3119         &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
3120         &call           (&label("key_192b"));
3121         &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
3122         &call           (&label("key_192a"));
3123         &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
3124         &call           (&label("key_192b"));
3125         &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
3126         &call           (&label("key_192a"));
3127         &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
3128         &call           (&label("key_192b"));
3129         &$movekey       (&QWP(0,$key),"xmm0");
3130         &mov            (&DWP(48,$key),$rounds);
3131
3132         &jmp    (&label("good_key"));
3133
3134 &set_label("key_192a",16);
3135         &$movekey       (&QWP(0,$key),"xmm0");
3136         &lea            ($key,&DWP(16,$key));
3137 &set_label("key_192a_cold",16);
3138         &movaps         ("xmm5","xmm2");
3139 &set_label("key_192b_warm");
3140         &shufps         ("xmm4","xmm0",0b00010000);
3141         &movdqa         ("xmm3","xmm2");
3142         &xorps          ("xmm0","xmm4");
3143         &shufps