1c714531eae7f17a5c39ebb0d9d97b53012f179b
[openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #! /usr/bin/env perl
2 # Copyright 2009-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for Intel AES-NI extension. In
18 # OpenSSL context it's used with Intel engine, but can also be used as
19 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
20 # details].
21 #
22 # Performance.
23 #
24 # To start with see corresponding paragraph in aesni-x86_64.pl...
25 # Instead of filling table similar to one found there I've chosen to
26 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
27 # The simplified table below represents 32-bit performance relative
28 # to 64-bit one in every given point. Ratios vary for different
29 # encryption modes, therefore interval values.
30 #
31 #       16-byte     64-byte     256-byte    1-KB        8-KB
32 #       53-67%      67-84%      91-94%      95-98%      97-99.5%
33 #
34 # Lower ratios for smaller block sizes are perfectly understandable,
35 # because function call overhead is higher in 32-bit mode. Largest
36 # 8-KB block performance is virtually same: 32-bit code is less than
37 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
38
39 # January 2011
40 #
41 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
42 # interleaves at most 6 aes[enc|dec] instructions, because there are
43 # not enough registers for 8x interleave [which should be optimal for
44 # Sandy Bridge]. Actually, performance results for 6x interleave
45 # factor presented in aesni-x86_64.pl (except for CTR) are for this
46 # module.
47
48 # April 2011
49 #
50 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
51 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
52
53 # November 2015
54 #
55 # Add aesni_ocb_[en|de]crypt.
56
57 ######################################################################
58 # Current large-block performance in cycles per byte processed with
59 # 128-bit key (less is better).
60 #
61 #               CBC en-/decrypt CTR     XTS     ECB     OCB
62 # Westmere      3.77/1.37       1.37    1.52    1.27
63 # * Bridge      5.07/0.98       0.99    1.09    0.91    1.10
64 # Haswell       4.44/0.80       0.97    1.03    0.72    0.76
65 # Skylake       2.68/0.65       0.65    0.66    0.64    0.66
66 # Silvermont    5.77/3.56       3.67    4.03    3.46    4.03
67 # Goldmont      3.84/1.39       1.39    1.63    1.31    1.70
68 # Bulldozer     5.80/0.98       1.05    1.24    0.93    1.23
69
$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)
$inline=1;              # inline _aesni_[en|de]crypt

# Make the perlasm helpers loadable from the script's own directory and
# from ../../perlasm relative to it. When $0 carries no directory part
# the prefix is explicitly empty instead of whatever a failed match left
# in $1.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

# The last command-line argument, when present, names the output file.
# Use 3-argument open so the name is taken literally, and fail loudly
# rather than silently emitting the assembly somewhere else.
$output = pop;
if ($output)
{   open(STDOUT,">",$output) or die "can't open $output: $!";   }

&asm_init($ARGV[0]);

&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");

# Round keys are loaded with movups whatever $PREFIX is; both arms of
# the former if/else selected the very same instruction.
$movekey=\&movups;
88
# General-purpose register allocation.
($len,$rounds,$key,$inp,$out)=("eax","ecx","edx","esi","edi");
($rounds_,$key_)=("ebx","ebp");         # backup copies for $rounds and $key

# XMM register allocation: two round-key registers followed by six
# data registers.
($rndkey0,$rndkey1)=("xmm0","xmm1");
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5)=map("xmm$_",2..7);

# Alternative names sharing the three upper data registers.
($in1,$in0,$ivec)=($inout3,$inout4,$inout5);
105
# AESNI extension
#
# The AES-NI instructions are emitted as raw opcode bytes through
# &data_byte rather than as mnemonics. Only the register-to-register
# form with both operands in xmm0..xmm7 is encoded; for any other
# operand pair nothing is emitted.

# aeskeygenassist dst,src,imm8 (66 0F 3A DF /r ib)
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);
    if (@regno)
    {   my $modrm=0xc0|($regno[0]<<3)|$regno[1];        # mod=11, reg=dst, r/m=src
        &data_byte(0x66,0x0f,0x3a,0xdf,$modrm,$imm);
    }
}

# Shared encoder for the 66 0F 38 xx /r family; $opcodelet selects the
# actual instruction.
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
  my @regno=("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/);
    if (@regno)
    {   my $modrm=0xc0|($regno[0]<<3)|$regno[1];        # mod=11, reg=dst, r/m=src
        &data_byte(0x66,0x0f,0x38,$opcodelet,$modrm);
    }
}
sub aesimc      { &aescommon(0xdb,@_); }
sub aesenc      { &aescommon(0xdc,@_); }
sub aesenclast  { &aescommon(0xdd,@_); }
sub aesdec      { &aescommon(0xde,@_); }
sub aesdeclast  { &aescommon(0xdf,@_); }
122 \f
# Inline version of internal aesni_[en|de]crypt1
#
# Emits, in place, the code processing a single block:
#   $p      "enc" or "dec", selects aes${p}/aes${p}last;
#   $inout  register holding the block, $inout0 when omitted;
#   $ivec   optional register; when given, it is xored with round key 0
#           and then into $inout, replacing the plain xor of $inout
#           with round key 0.
# The emitted code advances $key and counts $rounds down to zero. Every
# invocation gets its own loop label through the private counter.
{ my $serial;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my $loop="${p}1_loop_".(++$serial);

    $inout=$inout0 unless (defined($inout));

    &$movekey           ($rndkey0,&QWP(0,$key));
    &$movekey           ($rndkey1,&QWP(16,$key));
    &xorps              ($ivec,$rndkey0)        if (defined($ivec));
    &lea                ($key,&DWP(32,$key));
    if (defined($ivec)) { &xorps($inout,$ivec);    }
    else                { &xorps($inout,$rndkey0); }
    &set_label($loop);
        &$round         ($inout,$rndkey1);
        &dec            ($rounds);
        &$movekey       ($rndkey1,&QWP(0,$key));
        &lea            ($key,&DWP(16,$key));
    &jnz                (&label($loop));
    &$final             ($inout,$rndkey1);
}}
143
sub aesni_generate1     # fully unrolled loop
# Emits the out-of-line _aesni_${p}rypt1 subroutine processing one block
# held in $inout ($inout0 when omitted). The round count is compared
# with 11: below jumps to the ${p}128 entry, equal to ${p}192, above
# falls through four extra rounds first. $key is pre-advanced so that
# all entries share the same tail.
{ my ($p,$inout)=@_;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};

    $inout=$inout0 unless (defined($inout));

    # Two rounds: consume $rndkey1, then $rndkey0, reloading each from
    # $off and $off+0x10 relative to $key right after use.
    my $two_rounds=sub
    { my $off=shift;
        &$round         ($inout,$rndkey1);
        &$movekey       ($rndkey1,&QWP($off,$key));
        &$round         ($inout,$rndkey0);
        &$movekey       ($rndkey0,&QWP($off+0x10,$key));
    };

    &function_begin_B("_aesni_${p}rypt1");
        &movups         ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &xorps          ($inout,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &cmp            ($rounds,11);
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key));
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        &$two_rounds    (-0x40);
    &set_label("${p}192");
        &$two_rounds    (-0x20);
    &set_label("${p}128");
        foreach my $off (0,0x20,0x40,0x60)
        {   &$two_rounds($off);   }
        &$round         ($inout,$rndkey1);
    &$final             ($inout,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
189 \f
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. The two functions differ only in the
# direction, so both are emitted from the same template, encrypt first.
# Each one is preceded by the out-of-line _aesni_${p}rypt1 helper when
# inlining is switched off.
foreach my $p ("enc","dec")
{   &aesni_generate1($p) if (!$inline);
    &function_begin_B("${PREFIX}_${p}rypt");
        &mov    ("eax",&wparam(0));
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # load input block
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));
        if ($inline)
        {   &aesni_inline_generate1($p);        }
        else
        {   &call       ("_aesni_${p}rypt1");   }
        &pxor   ($rndkey0,$rndkey0);            # clear register bank
        &pxor   ($rndkey1,$rndkey1);
        &movups (&QWP(0,"eax"),$inout0);        # store output block
        &pxor   ($inout0,$inout0);
        &ret    ();
    &function_end_B("${PREFIX}_${p}rypt");
}
227
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why was a 3x subroutine originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the
# same as that of a non-interleaved subroutine [for number of input
# blocks up to 3]. This is why it originally made no sense to implement
# a 2x subroutine. But times change and it became appropriate to spend
# an extra 192 bytes on a 2x subroutine for the sake of Atom Silvermont.
# For processors that can schedule aes[enc|dec] every cycle the optimal
# interleave factor equals the corresponding instruction's latency. 8x
# is optimal for * Bridge, but it's infeasible to accommodate such an
# implementation in XMM registers addressable in 32-bit mode and
# therefore a maximum of 6x is used instead...
242
# Emits _aesni_${p}rypt2, which processes $inout0 and $inout1 in
# parallel. $rounds is scaled by 16 and negated so that it serves as a
# negative index off the end of the key schedule and reaches zero
# exactly when the loop has to stop.
sub aesni_generate2
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1);

    # Apply $insn with round key $rk to every block register in turn.
    my $each=sub
    { my ($insn,$rk)=@_;
        foreach my $reg (@blk) { &$insn($reg,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt2");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}2_loop");
        &$each          ($round,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &$each          ($round,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}2_loop"));
    &$each              ($round,$rndkey1);
    &$each              ($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt2");
}
273
# Emits _aesni_${p}rypt3, which processes $inout0..$inout2 in parallel.
# $rounds is scaled by 16 and negated so that it serves as a negative
# index off the end of the key schedule and reaches zero exactly when
# the loop has to stop.
sub aesni_generate3
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2);

    # Apply $insn with round key $rk to every block register in turn.
    my $each=sub
    { my ($insn,$rk)=@_;
        foreach my $reg (@blk) { &$insn($reg,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        foreach my $reg ($inout1,$inout2)
        {   &pxor       ($reg,$rndkey0);        }
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &add            ($rounds,16);

    &set_label("${p}3_loop");
        &$each          ($round,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &$each          ($round,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("${p}3_loop"));
    &$each              ($round,$rndkey1);
    &$each              ($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
309
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_${p}rypt4, which processes $inout0..$inout3 in parallel.
sub aesni_generate4
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2,$inout3);

    # Apply $insn with round key $rk to every block register in turn.
    my $each=sub
    { my ($insn,$rk)=@_;
        foreach my $reg (@blk) { &$insn($reg,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shl            ($rounds,4);
        &xorps          ($inout0,$rndkey0);
        foreach my $reg ($inout1,$inout2,$inout3)
        {   &pxor       ($reg,$rndkey0);        }
        &$movekey       ($rndkey0,&QWP(32,$key));
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        # 0F 1F 40 00 is a 4-byte NOP; presumably padding placed ahead
        # of the loop (TODO confirm the intended alignment).
        &data_byte      (0x0f,0x1f,0x40,0x00);
        &add            ($rounds,16);

    &set_label("${p}4_loop");
        &$each          ($round,$rndkey1);
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &$each          ($round,$rndkey0);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}4_loop"));

    &$each              ($round,$rndkey1);
    &$each              ($final,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
356
# Emits _aesni_${p}rypt6, which processes $inout0..$inout5 in parallel.
# The prologue interleaves the initial whitening xors with the first
# round for the first three blocks and then jumps into the middle of
# the loop body to finish that round for the remaining three.
# _aesni_${p}rypt6_enter is declared static; presumably code elsewhere
# in this file enters the loop at that label (verify against callers).
sub aesni_generate6
{ my $p=shift;
  my $round=\&{"aes${p}"};
  my $final=\&{"aes${p}last"};
  my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    # Apply $insn with round key $rk to the listed registers in turn.
    my $each=sub
    { my ($insn,$rk,@regs)=@_;
        foreach my $reg (@regs) { &$insn($reg,$rk); }
    };

    &function_begin_B("_aesni_${p}rypt6");
    &static_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &shl            ($rounds,4);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &xorps          ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);     # pxor does better here
        &pxor           ($inout2,$rndkey0);
        &$round         ($inout0,$rndkey1);
        &pxor           ($inout3,$rndkey0);
        &pxor           ($inout4,$rndkey0);
        &$round         ($inout1,$rndkey1);
        &lea            ($key,&DWP(32,$key,$rounds));
        &neg            ($rounds);
        &$round         ($inout2,$rndkey1);
        &pxor           ($inout5,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0,$key,$rounds));
        &add            ($rounds,16);
        &jmp            (&label("_aesni_${p}rypt6_inner"));

    &set_label("${p}6_loop",16);
        &$each          ($round,$rndkey1,@blk[0..2]);
    &set_label("_aesni_${p}rypt6_inner");
        &$each          ($round,$rndkey1,@blk[3..5]);
    &set_label("_aesni_${p}rypt6_enter");
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        &$each          ($round,$rndkey0,@blk);
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
    &jnz                (&label("${p}6_loop"));

    &$each              ($round,$rndkey1,@blk);
    &$each              ($final,$rndkey0,@blk);
    &ret();
    &function_end_B("_aesni_${p}rypt6");
}
# Instantiate the interleaved subroutines in order of interleave
# factor. The decrypt flavour is always emitted, the encrypt one only
# when $PREFIX is "aesni".
foreach my $generate (\&aesni_generate2,\&aesni_generate3,
                      \&aesni_generate4,\&aesni_generate6)
{   &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
423 \f
424 if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# length is rounded down to a multiple of 16 and nothing is done when
# the result is zero. A non-zero enc selects encryption, zero selects
# decryption. Input is consumed six blocks at a time for as long as
# possible; a remainder of one to five blocks is handled by the tail
# code, with five blocks going through the 6x subroutine with the sixth
# register zeroed. All XMM registers are cleared before returning.
#
# The encrypt and decrypt halves are mirror images of one another and
# are therefore emitted from a single template, encrypt first.
&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds_,&wparam(4));
        &and    ($len,-16);
        &jz     (&label("ecb_ret"));
        &mov    ($rounds,&DWP(240,$key));
        &test   ($rounds_,$rounds_);
        &jz     (&label("ecb_decrypt"));

foreach my $p ("enc","dec")
{ my @blk=($inout0,$inout1,$inout2,$inout3,$inout4,$inout5);

    # Store the first $n block registers to consecutive 16-byte slots
    # at $out.
    my $store=sub
    { my $n=shift;
        foreach my $i (0..$n-1)
        {   &movups     (&QWP(0x10*$i,$out),$blk[$i]);  }
    };

######################################################################
&set_label("ecb_decrypt",16)    if ($p eq "dec");
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &cmp    ($len,0x60);
        &jb     (&label("ecb_${p}_tail"));

        foreach my $i (0..5)
        {   &movdqu     ($blk[$i],&QWP(0x10*$i,$inp));  }
        &lea    ($inp,&DWP(0x60,$inp));
        &sub    ($len,0x60);
        &jmp    (&label("ecb_${p}_loop6_enter"));

&set_label("ecb_${p}_loop6",16);
        # store the previous batch while loading the next one
        foreach my $i (0..4)
        {   &movups     (&QWP(0x10*$i,$out),$blk[$i]);
            &movdqu     ($blk[$i],&QWP(0x10*$i,$inp));
        }
        &movups (&QWP(0x50,$out),$inout5);
        &lea    ($out,&DWP(0x60,$out));
        &movdqu ($inout5,&QWP(0x50,$inp));
        &lea    ($inp,&DWP(0x60,$inp));
&set_label("ecb_${p}_loop6_enter");

        &call   ("_aesni_${p}rypt6");

        &mov    ($key,$key_);           # restore $key
        &mov    ($rounds,$rounds_);     # restore $rounds
        &sub    ($len,0x60);
        &jnc    (&label("ecb_${p}_loop6"));

        &$store (6);
        &lea    ($out,&DWP(0x60,$out));
        &add    ($len,0x60);
        &jz     (&label("ecb_ret"));

&set_label("ecb_${p}_tail");
        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x20);
        &jb     (&label("ecb_${p}_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_${p}_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x40);
        &jb     (&label("ecb_${p}_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &je     (&label("ecb_${p}_four"));
        &movups ($inout4,&QWP(0x40,$inp));
        &xorps  ($inout5,$inout5);      # five blocks: zero the sixth
        &call   ("_aesni_${p}rypt6");
        &$store (5);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_one",16);
        if ($inline)
        {   &aesni_inline_generate1($p);        }
        else
        {   &call       ("_aesni_${p}rypt1");   }
        &$store (1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_two",16);
        &call   ("_aesni_${p}rypt2");
        &$store (2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_three",16);
        &call   ("_aesni_${p}rypt3");
        &$store (3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_${p}_four",16);
        &call   ("_aesni_${p}rypt4");
        &$store (4);
        # the decrypt half is emitted last and falls through to ecb_ret
        &jmp    (&label("ecb_ret"))     if ($p eq "enc");
}

&set_label("ecb_ret");
        foreach my $i (0..7)            # clear register bank
        {   &pxor       ("xmm$i","xmm$i");      }
&function_end("aesni_ecb_encrypt");
649 \f
650 ######################################################################
651 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
652 #                         size_t blocks, const AES_KEY *key,
653 #                         const char *ivec,char *cmac);
654 #
655 # Handles only complete blocks, operates on 64-bit counter and
656 # does not update *ivec! Nor does it finalize CMAC value
657 # (see engine/eng_aesni.c for details)
658 #
659 { my $cmac=$inout1;
# Encrypt flavour. For every block the counter block and the running
# CMAC value are pushed through the cipher side by side, then the
# keystream is xored into the input and the counter is incremented.
# A 16-byte aligned scratch area is carved out on the stack:
#    0(%esp)    byte-swap control mask for pshufb
#   16(%esp)    counter increment vector
#   48(%esp)    caller's %esp
&function_begin("aesni_ccm64_encrypt_blocks");
        foreach my $i (0..5)
        {   &mov    (($inp,$out,$len,$key,$rounds_,$rounds)[$i],&wparam($i));   }
        &mov    ($key_,"esp");
        &sub    ("esp",60);
        &and    ("esp",-16);                    # align stack
        &mov    (&DWP(48,"esp"),$key_);

        &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
        &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
        &mov    ($rounds,&DWP(240,$key));

        # compose byte-swap control mask for pshufb on stack
        { my $off=0;
          foreach my $dword (0x0c0d0e0f,0x08090a0b,0x04050607,0x00010203)
          {   &mov  (&DWP($off,"esp"),$dword);
              $off+=4;
          }
        }

        # compose counter increment vector on stack
        &mov    ($rounds_,1);
        &xor    ($key_,$key_);
        &mov    (&DWP(16,"esp"),$rounds_);
        foreach my $off (20,24,28)
        {   &mov    (&DWP($off,"esp"),$key_);   }

        # turn $rounds into a negative byte index relative to the end
        # of the key schedule, kept in $rounds_ for reuse per block
        &shl    ($rounds,4);
        &mov    ($rounds_,16);
        &lea    ($key_,&DWP(0,$key));
        &movdqa ($inout3,&QWP(0,"esp"));
        &movdqa ($inout0,$ivec);
        &lea    ($key,&DWP(32,$key,$rounds));
        &sub    ($rounds_,$rounds);
        &pshufb ($ivec,$inout3);

&set_label("ccm64_enc_outer");
        &$movekey       ($rndkey0,&QWP(0,$key_));
        &mov            ($rounds,$rounds_);
        &movups         ($in0,&QWP(0,$inp));

        &xorps          ($inout0,$rndkey0);
        &$movekey       ($rndkey1,&QWP(16,$key_));
        &xorps          ($rndkey0,$in0);
        &xorps          ($cmac,$rndkey0);               # cmac^=inp
        &$movekey       ($rndkey0,&QWP(32,$key_));

&set_label("ccm64_enc2_loop");
        foreach my $reg ($inout0,$cmac)
        {   &aesenc     ($reg,$rndkey1);        }
        &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
        &add            ($rounds,32);
        foreach my $reg ($inout0,$cmac)
        {   &aesenc     ($reg,$rndkey0);        }
        &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
        &jnz            (&label("ccm64_enc2_loop"));
        foreach my $reg ($inout0,$cmac)
        {   &aesenc     ($reg,$rndkey1);        }
        &paddq          ($ivec,&QWP(16,"esp"));
        &dec            ($len);
        foreach my $reg ($inout0,$cmac)
        {   &aesenclast ($reg,$rndkey0);        }

        &lea    ($inp,&DWP(16,$inp));
        &xorps  ($in0,$inout0);                 # inp^=E(ivec)
        &movdqa ($inout0,$ivec);
        &movups (&QWP(0,$out),$in0);            # save output
        &pshufb ($inout0,$inout3);
        &lea    ($out,&DWP(16,$out));
        &jnz    (&label("ccm64_enc_outer"));

        &mov    ("esp",&DWP(48,"esp"));         # restore caller's stack pointer
        &mov    ($out,&wparam(5));
        &movups (&QWP(0,$out),$cmac);           # write back cmac

        foreach my $i (0..7)                    # clear register bank
        {   &pxor   ("xmm$i","xmm$i");  }
&function_end("aesni_ccm64_encrypt_blocks");
747
748 &function_begin("aesni_ccm64_decrypt_blocks");     # counter-mode decrypt of 16-byte blocks; every output block is also folded into $cmac
749         &mov    ($inp,&wparam(0));              # arg 0: input pointer
750         &mov    ($out,&wparam(1));              # arg 1: output pointer
751         &mov    ($len,&wparam(2));              # arg 2: count of 16-byte blocks (decremented once per block below)
752         &mov    ($key,&wparam(3));              # arg 3: key schedule
753         &mov    ($rounds_,&wparam(4));          # arg 4: pointer to ivec
754         &mov    ($rounds,&wparam(5));           # arg 5: pointer to cmac (read here, written back on exit)
755         &mov    ($key_,"esp");                  # remember incoming %esp
756         &sub    ("esp",60);
757         &and    ("esp",-16);                    # align stack
758         &mov    (&DWP(48,"esp"),$key_);         # frame: 0=pshufb mask, 16=counter increment, 48=saved %esp
759
760         &movdqu ($ivec,&QWP(0,$rounds_));       # load ivec
761         &movdqu ($cmac,&QWP(0,$rounds));        # load cmac
762         &mov    ($rounds,&DWP(240,$key));       # round count is read from offset 240 of the key schedule
763
764         # compose byte-swap control mask for pshufb on stack
765         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
766         &mov    (&DWP(4,"esp"),0x08090a0b);
767         &mov    (&DWP(8,"esp"),0x04050607);
768         &mov    (&DWP(12,"esp"),0x00010203);    # mask bytes are 15,14,...,0: pshufb with it reverses all 16 bytes
769
770         # compose counter increment vector on stack
771         &mov    ($rounds_,1);
772         &xor    ($key_,$key_);
773         &mov    (&DWP(16,"esp"),$rounds_);      # dwords 1,0,0,0: paddq with this adds 1 to the low qword only
774         &mov    (&DWP(20,"esp"),$key_);
775         &mov    (&DWP(24,"esp"),$key_);
776         &mov    (&DWP(28,"esp"),$key_);
777
778         &movdqa ($inout3,&QWP(0,"esp"));        # bswap mask
779         &movdqa ($inout0,$ivec);                # first counter block, still in memory byte order
780
781         &mov    ($key_,$key);                   # backup $key
782         &mov    ($rounds_,$rounds);             # backup $rounds
783
784         &pshufb ($ivec,$inout3);                # keep the running counter byte-reversed so paddq can increment it
785         if ($inline)                            # encrypt the first counter block: $inout0 = E(ivec)
786         {   &aesni_inline_generate1("enc");     }
787         else
788         {   &call       ("_aesni_encrypt1");    }
789         &shl    ($rounds_,4);                   # $rounds_ = 16*rounds
790         &mov    ($rounds,16);
791         &movups ($in0,&QWP(0,$inp));            # load inp
792         &paddq  ($ivec,&QWP(16,"esp"));         # counter += 1
793         &lea    ($inp,&QWP(16,$inp));           # advance input by one block
794         &sub    ($rounds,$rounds_);             # $rounds = 16-16*rounds, a negative byte offset
795         &lea    ($key,&DWP(32,$key_,$rounds_)); # $key = $key_+32+16*rounds; round keys are then fetched at $key+$rounds
796         &mov    ($rounds_,$rounds);             # backup twisted $rounds
797         &jmp    (&label("ccm64_dec_outer"));
798
799 &set_label("ccm64_dec_outer",16);
800         &xorps  ($in0,$inout0);                 # inp ^= E(ivec)
801         &movdqa ($inout0,$ivec);                # next counter block (byte-reversed form)
802         &movups (&QWP(0,$out),$in0);            # save output
803         &lea    ($out,&DWP(16,$out));
804         &pshufb ($inout0,$inout3);              # reverse back to memory byte order
805
806         &sub    ($len,1);                       # NOTE(review): $len==0 on entry is not special-cased here
807         &jz     (&label("ccm64_dec_break"));    # that was the last block: only the final cmac update remains
808
809         &$movekey       ($rndkey0,&QWP(0,$key_));       # key[0]
810         &mov            ($rounds,$rounds_);     # restore twisted $rounds
811         &$movekey       ($rndkey1,&QWP(16,$key_));      # key[1]
812         &xorps          ($in0,$rndkey0);        # fold key[0] into the output block before it enters cmac
813         &xorps          ($inout0,$rndkey0);     # counter block ^= key[0]
814         &xorps          ($cmac,$in0);           # cmac^=out (key[0] comes along via $in0)
815         &$movekey       ($rndkey0,&QWP(32,$key_));      # key[2]
816
817 &set_label("ccm64_dec2_loop");                  # two rounds per pass, applied to both $inout0 and $cmac
818         &aesenc         ($inout0,$rndkey1);
819         &aesenc         ($cmac,$rndkey1);
820         &$movekey       ($rndkey1,&QWP(0,$key,$rounds));
821         &add            ($rounds,32);           # step over two round keys; result is zero on the last pass
822         &aesenc         ($inout0,$rndkey0);
823         &aesenc         ($cmac,$rndkey0);
824         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
825         &jnz            (&label("ccm64_dec2_loop"));    # tests the result of the add above
826         &movups         ($in0,&QWP(0,$inp));    # load inp
827         &paddq          ($ivec,&QWP(16,"esp")); # counter += 1
828         &aesenc         ($inout0,$rndkey1);
829         &aesenc         ($cmac,$rndkey1);
830         &aesenclast     ($inout0,$rndkey0);
831         &aesenclast     ($cmac,$rndkey0);
832         &lea            ($inp,&QWP(16,$inp));   # advance input by one block
833         &jmp    (&label("ccm64_dec_outer"));
834
835 &set_label("ccm64_dec_break",16);
836         &mov    ($rounds,&DWP(240,$key_));      # reload the untwisted round count
837         &mov    ($key,$key_);                   # restore $key
838         if ($inline)                            # final cmac update using the last output block in $in0; presumably cmac = E(cmac ^ in0) - helper is defined outside this chunk
839         {   &aesni_inline_generate1("enc",$cmac,$in0);  }
840         else
841         {   &call       ("_aesni_encrypt1",$cmac);      }
842
843         &mov    ("esp",&DWP(48,"esp"));         # restore %esp
844         &mov    ($out,&wparam(5));              # reload cmac pointer
845         &movups (&QWP(0,$out),$cmac);           # write updated cmac back
846
847         &pxor   ("xmm0","xmm0");                # clear register bank
848         &pxor   ("xmm1","xmm1");
849         &pxor   ("xmm2","xmm2");
850         &pxor   ("xmm3","xmm3");
851         &pxor   ("xmm4","xmm4");
852         &pxor   ("xmm5","xmm5");
853         &pxor   ("xmm6","xmm6");
854         &pxor   ("xmm7","xmm7");
855 &function_end("aesni_ccm64_decrypt_blocks");
856 }
857 \f
858 ######################################################################
859 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
860 #                         size_t blocks, const AES_KEY *key,
861 #                         const char *ivec);
862 #
863 # Handles only complete blocks, operates on 32-bit counter and
864 # does not update *ivec! (see crypto/modes/ctr128.c for details)
865 #
866 # stack layout:
867 #       0       pshufb mask
868 #       16      vector addend: 0,6,6,6
869 #       32      counter-less ivec
870 #       48      1st triplet of counter vector
871 #       64      2nd triplet of counter vector
872 #       80      saved %esp
873
874 &function_begin("aesni_ctr32_encrypt_blocks");     # CTR mode; the counter is the byte-swapped last dword of ivec, six blocks per main-loop pass
875         &mov    ($inp,&wparam(0));              # arg 0: in
876         &mov    ($out,&wparam(1));              # arg 1: out
877         &mov    ($len,&wparam(2));              # arg 2: number of 16-byte blocks
878         &mov    ($key,&wparam(3));              # arg 3: key schedule
879         &mov    ($rounds_,&wparam(4));          # arg 4: ivec pointer
880         &mov    ($key_,"esp");                  # remember incoming %esp
881         &sub    ("esp",88);
882         &and    ("esp",-16);                    # align stack
883         &mov    (&DWP(80,"esp"),$key_);         # saved %esp lives at 80(%esp)
884
885         &cmp    ($len,1);
886         &je     (&label("ctr32_one_shortcut")); # exactly one block: skip the counter-vector setup
887
888         &movdqu ($inout5,&QWP(0,$rounds_));     # load ivec
889
890         # compose byte-swap control mask for pshufb on stack
891         &mov    (&DWP(0,"esp"),0x0c0d0e0f);
892         &mov    (&DWP(4,"esp"),0x08090a0b);
893         &mov    (&DWP(8,"esp"),0x04050607);
894         &mov    (&DWP(12,"esp"),0x00010203);    # mask bytes are 15,14,...,0: pshufb with it reverses all 16 bytes
895
896         # compose counter increment vector on stack
897         &mov    ($rounds,6);
898         &xor    ($key_,$key_);
899         &mov    (&DWP(16,"esp"),$rounds);       # dwords 6,6,6,0: paddd with this advances three counters by 6
900         &mov    (&DWP(20,"esp"),$rounds);
901         &mov    (&DWP(24,"esp"),$rounds);
902         &mov    (&DWP(28,"esp"),$key_);
903
904         &pextrd ($rounds_,$inout5,3);           # pull 32-bit counter
905         &pinsrd ($inout5,$key_,3);              # wipe 32-bit counter
906
907         &mov    ($rounds,&DWP(240,$key));       # key->rounds
908
909         # compose 2 vectors of 3x32-bit counters
910         &bswap  ($rounds_);                     # counter as a native integer
911         &pxor   ($rndkey0,$rndkey0);
912         &pxor   ($rndkey1,$rndkey1);
913         &movdqa ($inout0,&QWP(0,"esp"));        # load byte-swap mask
914         &pinsrd ($rndkey0,$rounds_,0);          # 1st triplet: n, n+1, n+2 in dwords 0..2
915         &lea    ($key_,&DWP(3,$rounds_));       # $key_ = n+3
916         &pinsrd ($rndkey1,$key_,0);             # 2nd triplet: n+3, n+4, n+5 in dwords 0..2
917         &inc    ($rounds_);
918         &pinsrd ($rndkey0,$rounds_,1);
919         &inc    ($key_);
920         &pinsrd ($rndkey1,$key_,1);
921         &inc    ($rounds_);
922         &pinsrd ($rndkey0,$rounds_,2);
923         &inc    ($key_);
924         &pinsrd ($rndkey1,$key_,2);
925         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
926         &pshufb ($rndkey0,$inout0);             # byte swap
927         &movdqu ($inout4,&QWP(0,$key));         # key[0]
928         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
929         &pshufb ($rndkey1,$inout0);             # byte swap
930
931         &pshufd ($inout0,$rndkey0,3<<6);        # place counter to upper dword
932         &pshufd ($inout1,$rndkey0,2<<6);        # likewise for the next counter; the other dwords copy source dword 0
933         &cmp    ($len,6);
934         &jb     (&label("ctr32_tail"));         # fewer than 6 blocks in total
935         &pxor   ($inout5,$inout4);              # counter-less ivec^key[0]
936         &shl    ($rounds,4);                    # $rounds = 16*rounds
937         &mov    ($rounds_,16);
938         &movdqa (&QWP(32,"esp"),$inout5);       # save counter-less ivec^key[0]
939         &mov    ($key_,$key);                   # backup $key
940         &sub    ($rounds_,$rounds);             # backup twisted $rounds
941         &lea    ($key,&DWP(32,$key,$rounds));   # $key = original key+32+16*rounds
942         &sub    ($len,6);                       # bias $len so that a borrow ends the loop
943         &jmp    (&label("ctr32_loop6"));
944
945 &set_label("ctr32_loop6",16);
946         # inlining _aesni_encrypt6's prologue gives ~6% improvement...
947         &pshufd ($inout2,$rndkey0,1<<6);
948         &movdqa ($rndkey0,&QWP(32,"esp"));      # pull counter-less ivec
949         &pshufd ($inout3,$rndkey1,3<<6);
950         &pxor           ($inout0,$rndkey0);     # merge counter-less ivec
951         &pshufd ($inout4,$rndkey1,2<<6);
952         &pxor           ($inout1,$rndkey0);
953         &pshufd ($inout5,$rndkey1,1<<6);
954         &$movekey       ($rndkey1,&QWP(16,$key_));      # key[1]
955         &pxor           ($inout2,$rndkey0);
956         &pxor           ($inout3,$rndkey0);
957         &aesenc         ($inout0,$rndkey1);
958         &pxor           ($inout4,$rndkey0);
959         &pxor           ($inout5,$rndkey0);
960         &aesenc         ($inout1,$rndkey1);
961         &$movekey       ($rndkey0,&QWP(32,$key_));      # key[2]
962         &mov            ($rounds,$rounds_);     # restore twisted $rounds
963         &aesenc         ($inout2,$rndkey1);
964         &aesenc         ($inout3,$rndkey1);
965         &aesenc         ($inout4,$rndkey1);
966         &aesenc         ($inout5,$rndkey1);
967
968         &call           (&label("_aesni_encrypt6_enter"));      # remaining rounds; label is defined outside this chunk
969
970         &movups ($rndkey1,&QWP(0,$inp));
971         &movups ($rndkey0,&QWP(0x10,$inp));
972         &xorps  ($inout0,$rndkey1);             # output = input ^ encrypted counter block
973         &movups ($rndkey1,&QWP(0x20,$inp));
974         &xorps  ($inout1,$rndkey0);
975         &movups (&QWP(0,$out),$inout0);
976         &movdqa ($rndkey0,&QWP(16,"esp"));      # load increment
977         &xorps  ($inout2,$rndkey1);
978         &movdqa ($rndkey1,&QWP(64,"esp"));      # load 2nd triplet
979         &movups (&QWP(0x10,$out),$inout1);
980         &movups (&QWP(0x20,$out),$inout2);
981
982         &paddd  ($rndkey1,$rndkey0);            # 2nd triplet increment
983         &paddd  ($rndkey0,&QWP(48,"esp"));      # 1st triplet increment
984         &movdqa ($inout0,&QWP(0,"esp"));        # load byte swap mask
985
986         &movups ($inout1,&QWP(0x30,$inp));
987         &movups ($inout2,&QWP(0x40,$inp));
988         &xorps  ($inout3,$inout1);
989         &movups ($inout1,&QWP(0x50,$inp));
990         &lea    ($inp,&DWP(0x60,$inp));         # advance input by 6 blocks
991         &movdqa (&QWP(48,"esp"),$rndkey0);      # save 1st triplet
992         &pshufb ($rndkey0,$inout0);             # byte swap
993         &xorps  ($inout4,$inout2);
994         &movups (&QWP(0x30,$out),$inout3);
995         &xorps  ($inout5,$inout1);
996         &movdqa (&QWP(64,"esp"),$rndkey1);      # save 2nd triplet
997         &pshufb ($rndkey1,$inout0);             # byte swap
998         &movups (&QWP(0x40,$out),$inout4);
999         &pshufd ($inout0,$rndkey0,3<<6);        # counter for the next first block, same layout as before the loop
1000         &movups (&QWP(0x50,$out),$inout5);
1001         &lea    ($out,&DWP(0x60,$out));         # advance output by 6 blocks
1002
1003         &pshufd ($inout1,$rndkey0,2<<6);
1004         &sub    ($len,6);
1005         &jnc    (&label("ctr32_loop6"));        # no borrow: at least 6 more blocks
1006
1007         &add    ($len,6);                       # undo the bias: 0..5 blocks remain
1008         &jz     (&label("ctr32_ret"));
1009         &movdqu ($inout5,&QWP(0,$key_));        # key[0]
1010         &mov    ($key,$key_);                   # restore $key
1011         &pxor   ($inout5,&QWP(32,"esp"));       # restore counter-less ivec (xor key[0] back out of the saved value)
1012         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1013
1014 &set_label("ctr32_tail");                       # 1..5 blocks; $inout5 holds the counter-less ivec
1015         &por    ($inout0,$inout5);              # merge counter-less ivec into block 0
1016         &cmp    ($len,2);
1017         &jb     (&label("ctr32_one"));
1018
1019         &pshufd ($inout2,$rndkey0,1<<6);
1020         &por    ($inout1,$inout5);
1021         &je     (&label("ctr32_two"));          # still testing the cmp with 2 above
1022
1023         &pshufd ($inout3,$rndkey1,3<<6);
1024         &por    ($inout2,$inout5);
1025         &cmp    ($len,4);
1026         &jb     (&label("ctr32_three"));
1027
1028         &pshufd ($inout4,$rndkey1,2<<6);
1029         &por    ($inout3,$inout5);
1030         &je     (&label("ctr32_four"));         # still testing the cmp with 4 above
1031
1032         &por    ($inout4,$inout5);              # five blocks left
1033         &call   ("_aesni_encrypt6");            # 6-way routine; only $inout0..$inout4 are stored below
1034         &movups ($rndkey1,&QWP(0,$inp));
1035         &movups ($rndkey0,&QWP(0x10,$inp));
1036         &xorps  ($inout0,$rndkey1);
1037         &movups ($rndkey1,&QWP(0x20,$inp));
1038         &xorps  ($inout1,$rndkey0);
1039         &movups ($rndkey0,&QWP(0x30,$inp));
1040         &xorps  ($inout2,$rndkey1);
1041         &movups ($rndkey1,&QWP(0x40,$inp));
1042         &xorps  ($inout3,$rndkey0);
1043         &movups (&QWP(0,$out),$inout0);
1044         &xorps  ($inout4,$rndkey1);
1045         &movups (&QWP(0x10,$out),$inout1);
1046         &movups (&QWP(0x20,$out),$inout2);
1047         &movups (&QWP(0x30,$out),$inout3);
1048         &movups (&QWP(0x40,$out),$inout4);
1049         &jmp    (&label("ctr32_ret"));
1050
1051 &set_label("ctr32_one_shortcut",16);            # reached from the entry check when $len==1
1052         &movups ($inout0,&QWP(0,$rounds_));     # load ivec
1053         &mov    ($rounds,&DWP(240,$key));       # key->rounds
1054
1055 &set_label("ctr32_one");
1056         if ($inline)                            # encrypt the single counter block in $inout0
1057         {   &aesni_inline_generate1("enc");     }
1058         else
1059         {   &call       ("_aesni_encrypt1");    }
1060         &movups ($in0,&QWP(0,$inp));
1061         &xorps  ($in0,$inout0);                 # output = input ^ encrypted counter block
1062         &movups (&QWP(0,$out),$in0);
1063         &jmp    (&label("ctr32_ret"));
1064
1065 &set_label("ctr32_two",16);
1066         &call   ("_aesni_encrypt2");
1067         &movups ($inout3,&QWP(0,$inp));         # $inout3/$inout4 serve as input scratch here
1068         &movups ($inout4,&QWP(0x10,$inp));
1069         &xorps  ($inout0,$inout3);
1070         &xorps  ($inout1,$inout4);
1071         &movups (&QWP(0,$out),$inout0);
1072         &movups (&QWP(0x10,$out),$inout1);
1073         &jmp    (&label("ctr32_ret"));
1074
1075 &set_label("ctr32_three",16);
1076         &call   ("_aesni_encrypt3");
1077         &movups ($inout3,&QWP(0,$inp));         # $inout3..$inout5 serve as input scratch here
1078         &movups ($inout4,&QWP(0x10,$inp));
1079         &xorps  ($inout0,$inout3);
1080         &movups ($inout5,&QWP(0x20,$inp));
1081         &xorps  ($inout1,$inout4);
1082         &movups (&QWP(0,$out),$inout0);
1083         &xorps  ($inout2,$inout5);
1084         &movups (&QWP(0x10,$out),$inout1);
1085         &movups (&QWP(0x20,$out),$inout2);
1086         &jmp    (&label("ctr32_ret"));
1087
1088 &set_label("ctr32_four",16);
1089         &call   ("_aesni_encrypt4");
1090         &movups ($inout4,&QWP(0,$inp));         # $inout4, $inout5, $rndkey1, $rndkey0 serve as input scratch here
1091         &movups ($inout5,&QWP(0x10,$inp));
1092         &movups ($rndkey1,&QWP(0x20,$inp));
1093         &xorps  ($inout0,$inout4);
1094         &movups ($rndkey0,&QWP(0x30,$inp));
1095         &xorps  ($inout1,$inout5);
1096         &movups (&QWP(0,$out),$inout0);
1097         &xorps  ($inout2,$rndkey1);
1098         &movups (&QWP(0x10,$out),$inout1);
1099         &xorps  ($inout3,$rndkey0);
1100         &movups (&QWP(0x20,$out),$inout2);
1101         &movups (&QWP(0x30,$out),$inout3);      # falls through to ctr32_ret
1102
1103 &set_label("ctr32_ret");
1104         &pxor   ("xmm0","xmm0");                # clear register bank
1105         &pxor   ("xmm1","xmm1");
1106         &pxor   ("xmm2","xmm2");
1107         &pxor   ("xmm3","xmm3");
1108         &pxor   ("xmm4","xmm4");
1109         &movdqa (&QWP(32,"esp"),"xmm0");        # clear stack
1110         &pxor   ("xmm5","xmm5");
1111         &movdqa (&QWP(48,"esp"),"xmm0");        # ivec and counter slots (32, 48, 64) are zeroed
1112         &pxor   ("xmm6","xmm6");
1113         &movdqa (&QWP(64,"esp"),"xmm0");
1114         &pxor   ("xmm7","xmm7");
1115         &mov    ("esp",&DWP(80,"esp"));         # restore %esp
1116 &function_end("aesni_ctr32_encrypt_blocks");
1117 \f
1118 ######################################################################
1119 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1120 #       const AES_KEY *key1, const AES_KEY *key2,
1121 #       const unsigned char iv[16]);
1122 #
1123 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
1124
1125 &function_begin("aesni_xts_encrypt");              # XTS encrypt of len bytes; a trailing partial block is handled by the byte-swap loop at xts_enc_steal
1126         &mov    ($key,&wparam(4));              # key2
1127         &mov    ($inp,&wparam(5));              # clear-text tweak
1128
1129         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1130         &movups ($inout0,&QWP(0,$inp));
1131         if ($inline)                            # encrypt the tweak with key2; $inout0 is used as the initial tweak below
1132         {   &aesni_inline_generate1("enc");     }
1133         else
1134         {   &call       ("_aesni_encrypt1");    }
1135
1136         &mov    ($inp,&wparam(0));              # arg 0: input pointer
1137         &mov    ($out,&wparam(1));              # arg 1: output pointer
1138         &mov    ($len,&wparam(2));              # arg 2: length in bytes
1139         &mov    ($key,&wparam(3));              # key1
1140
1141         &mov    ($key_,"esp");                  # remember incoming %esp
1142         &sub    ("esp",16*7+8);                 # frame: 16*0..16*5 tweaks, 16*6 magic constant, 16*7+0 len, 16*7+4 saved %esp
1143         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1144         &and    ("esp",-16);                    # align stack
1145
1146         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1147         &mov    (&DWP(16*6+4,"esp"),0);
1148         &mov    (&DWP(16*6+8,"esp"),1);         # dwords 0x87,0,1,0: residue for the low qword, carry bit for the high qword
1149         &mov    (&DWP(16*6+12,"esp"),0);
1150         &mov    (&DWP(16*7+0,"esp"),$len);      # save original $len
1151         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1152
1153         &movdqa ($tweak,$inout0);               # initial tweak
1154         &pxor   ($twtmp,$twtmp);
1155         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1156         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1157
1158         &and    ($len,-16);                     # whole blocks only; the remainder is handled at the end
1159         &mov    ($key_,$key);                   # backup $key
1160         &mov    ($rounds_,$rounds);             # backup $rounds
1161         &sub    ($len,16*6);                    # bias $len so that a borrow means fewer than 6 blocks
1162         &jc     (&label("xts_enc_short"));
1163
1164         &shl    ($rounds,4);                    # $rounds = 16*rounds
1165         &mov    ($rounds_,16);
1166         &sub    ($rounds_,$rounds);             # $rounds_ = 16-16*rounds (twisted round counter)
1167         &lea    ($key,&DWP(32,$key,$rounds));   # $key = key1+32+16*rounds
1168         &jmp    (&label("xts_enc_loop6"));
1169
1170 &set_label("xts_enc_loop6",16);
1171         for ($i=0;$i<4;$i++) {                  # emitted four times: store tweak $i, then derive the next one
1172             &pshufd     ($twres,$twtmp,0x13);
1173             &pxor       ($twtmp,$twtmp);
1174             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1175             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1176             &pand       ($twres,$twmask);       # isolate carry and residue
1177             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1178             &pxor       ($tweak,$twres);
1179         }
1180         &pshufd ($inout5,$twtmp,0x13);
1181         &movdqa (&QWP(16*$i++,"esp"),$tweak);   # tweak 4 goes to slot 4; $i becomes 5
1182         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1183          &$movekey      ($rndkey0,&QWP(0,$key_));
1184         &pand   ($inout5,$twmask);              # isolate carry and residue
1185          &movups        ($inout0,&QWP(0,$inp)); # load input
1186         &pxor   ($inout5,$tweak);               # $inout5 = tweak 5
1187
1188         # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1189         &mov    ($rounds,$rounds_);             # restore $rounds
1190         &movdqu ($inout1,&QWP(16*1,$inp));
1191          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1192         &movdqu ($inout2,&QWP(16*2,$inp));
1193          &pxor          ($inout1,$rndkey0);
1194         &movdqu ($inout3,&QWP(16*3,$inp));
1195          &pxor          ($inout2,$rndkey0);
1196         &movdqu ($inout4,&QWP(16*4,$inp));
1197          &pxor          ($inout3,$rndkey0);
1198         &movdqu ($rndkey1,&QWP(16*5,$inp));     # sixth input block, parked in $rndkey1 for the moment
1199          &pxor          ($inout4,$rndkey0);
1200         &lea    ($inp,&DWP(16*6,$inp));         # advance input by 6 blocks
1201         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1202         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1203         &pxor   ($inout5,$rndkey1);             # tweak 5 ^ sixth input block
1204
1205          &$movekey      ($rndkey1,&QWP(16,$key_));
1206         &pxor   ($inout1,&QWP(16*1,"esp"));
1207         &pxor   ($inout2,&QWP(16*2,"esp"));
1208          &aesenc        ($inout0,$rndkey1);
1209         &pxor   ($inout3,&QWP(16*3,"esp"));
1210         &pxor   ($inout4,&QWP(16*4,"esp"));
1211          &aesenc        ($inout1,$rndkey1);
1212         &pxor           ($inout5,$rndkey0);     # ... ^ rndkey[0]
1213          &$movekey      ($rndkey0,&QWP(32,$key_));
1214          &aesenc        ($inout2,$rndkey1);
1215          &aesenc        ($inout3,$rndkey1);
1216          &aesenc        ($inout4,$rndkey1);
1217          &aesenc        ($inout5,$rndkey1);
1218         &call           (&label("_aesni_encrypt6_enter"));      # remaining rounds; label is defined outside this chunk
1219
1220         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1221        &pxor    ($twtmp,$twtmp);
1222         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1223        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1224         &xorps  ($inout1,&QWP(16*1,"esp"));
1225         &movups (&QWP(16*0,$out),$inout0);      # write output
1226         &xorps  ($inout2,&QWP(16*2,"esp"));
1227         &movups (&QWP(16*1,$out),$inout1);
1228         &xorps  ($inout3,&QWP(16*3,"esp"));
1229         &movups (&QWP(16*2,$out),$inout2);
1230         &xorps  ($inout4,&QWP(16*4,"esp"));
1231         &movups (&QWP(16*3,$out),$inout3);
1232         &xorps  ($inout5,$tweak);
1233         &movups (&QWP(16*4,$out),$inout4);
1234        &pshufd  ($twres,$twtmp,0x13);
1235         &movups (&QWP(16*5,$out),$inout5);
1236         &lea    ($out,&DWP(16*6,$out));         # advance output by 6 blocks
1237        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1238
1239         &pxor   ($twtmp,$twtmp);                # derive the tweak for the next block from the last one
1240         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1241         &pand   ($twres,$twmask);               # isolate carry and residue
1242         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1243         &pxor   ($tweak,$twres);
1244
1245         &sub    ($len,16*6);
1246         &jnc    (&label("xts_enc_loop6"));      # no borrow: at least 6 more blocks
1247
1248         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1249         &mov    ($key,$key_);                   # restore $key
1250         &mov    ($rounds_,$rounds);
1251
1252 &set_label("xts_enc_short");
1253         &add    ($len,16*6);                    # undo the bias: 0..5 whole blocks remain
1254         &jz     (&label("xts_enc_done6x"));
1255
1256         &movdqa ($inout3,$tweak);               # put aside previous tweak
1257         &cmp    ($len,0x20);
1258         &jb     (&label("xts_enc_one"));
1259
1260         &pshufd ($twres,$twtmp,0x13);
1261         &pxor   ($twtmp,$twtmp);
1262         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1263         &pand   ($twres,$twmask);               # isolate carry and residue
1264         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1265         &pxor   ($tweak,$twres);
1266         &je     (&label("xts_enc_two"));        # still testing the cmp with 0x20 above
1267
1268         &pshufd ($twres,$twtmp,0x13);
1269         &pxor   ($twtmp,$twtmp);
1270         &movdqa ($inout4,$tweak);               # put aside previous tweak
1271         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1272         &pand   ($twres,$twmask);               # isolate carry and residue
1273         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1274         &pxor   ($tweak,$twres);
1275         &cmp    ($len,0x40);
1276         &jb     (&label("xts_enc_three"));
1277
1278         &pshufd ($twres,$twtmp,0x13);
1279         &pxor   ($twtmp,$twtmp);
1280         &movdqa ($inout5,$tweak);               # put aside previous tweak
1281         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1282         &pand   ($twres,$twmask);               # isolate carry and residue
1283         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1284         &pxor   ($tweak,$twres);
1285         &movdqa (&QWP(16*0,"esp"),$inout3);     # tweaks 0 and 1 go to the stack
1286         &movdqa (&QWP(16*1,"esp"),$inout4);
1287         &je     (&label("xts_enc_four"));       # still testing the cmp with 0x40 above
1288
1289         &movdqa (&QWP(16*2,"esp"),$inout5);     # five blocks left: tweaks 2 and 3 go to the stack as well
1290         &pshufd ($inout5,$twtmp,0x13);
1291         &movdqa (&QWP(16*3,"esp"),$tweak);
1292         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1293         &pand   ($inout5,$twmask);              # isolate carry and residue
1294         &pxor   ($inout5,$tweak);               # $inout5 = tweak 4
1295
1296         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1297         &movdqu ($inout1,&QWP(16*1,$inp));
1298         &movdqu ($inout2,&QWP(16*2,$inp));
1299         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1300         &movdqu ($inout3,&QWP(16*3,$inp));
1301         &pxor   ($inout1,&QWP(16*1,"esp"));
1302         &movdqu ($inout4,&QWP(16*4,$inp));
1303         &pxor   ($inout2,&QWP(16*2,"esp"));
1304         &lea    ($inp,&DWP(16*5,$inp));
1305         &pxor   ($inout3,&QWP(16*3,"esp"));
1306         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1307         &pxor   ($inout4,$inout5);
1308
1309         &call   ("_aesni_encrypt6");            # 6-way routine; only $inout0..$inout4 are stored below
1310
1311         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1312         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1313         &xorps  ($inout1,&QWP(16*1,"esp"));
1314         &xorps  ($inout2,&QWP(16*2,"esp"));
1315         &movups (&QWP(16*0,$out),$inout0);      # write output
1316         &xorps  ($inout3,&QWP(16*3,"esp"));
1317         &movups (&QWP(16*1,$out),$inout1);
1318         &xorps  ($inout4,$tweak);
1319         &movups (&QWP(16*2,$out),$inout2);
1320         &movups (&QWP(16*3,$out),$inout3);
1321         &movups (&QWP(16*4,$out),$inout4);
1322         &lea    ($out,&DWP(16*5,$out));
1323         &jmp    (&label("xts_enc_done"));
1324
1325 &set_label("xts_enc_one",16);
1326         &movups ($inout0,&QWP(16*0,$inp));      # load input
1327         &lea    ($inp,&DWP(16*1,$inp));
1328         &xorps  ($inout0,$inout3);              # input^=tweak
1329         if ($inline)                            # encrypt the single block in $inout0
1330         {   &aesni_inline_generate1("enc");     }
1331         else
1332         {   &call       ("_aesni_encrypt1");    }
1333         &xorps  ($inout0,$inout3);              # output^=tweak
1334         &movups (&QWP(16*0,$out),$inout0);      # write output
1335         &lea    ($out,&DWP(16*1,$out));
1336
1337         &movdqa ($tweak,$inout3);               # last tweak
1338         &jmp    (&label("xts_enc_done"));
1339
1340 &set_label("xts_enc_two",16);
1341         &movaps ($inout4,$tweak);               # put aside last tweak
1342
1343         &movups ($inout0,&QWP(16*0,$inp));      # load input
1344         &movups ($inout1,&QWP(16*1,$inp));
1345         &lea    ($inp,&DWP(16*2,$inp));
1346         &xorps  ($inout0,$inout3);              # input^=tweak
1347         &xorps  ($inout1,$inout4);
1348
1349         &call   ("_aesni_encrypt2");
1350
1351         &xorps  ($inout0,$inout3);              # output^=tweak
1352         &xorps  ($inout1,$inout4);
1353         &movups (&QWP(16*0,$out),$inout0);      # write output
1354         &movups (&QWP(16*1,$out),$inout1);
1355         &lea    ($out,&DWP(16*2,$out));
1356
1357         &movdqa ($tweak,$inout4);               # last tweak
1358         &jmp    (&label("xts_enc_done"));
1359
1360 &set_label("xts_enc_three",16);
1361         &movaps ($inout5,$tweak);               # put aside last tweak
1362         &movups ($inout0,&QWP(16*0,$inp));      # load input
1363         &movups ($inout1,&QWP(16*1,$inp));
1364         &movups ($inout2,&QWP(16*2,$inp));
1365         &lea    ($inp,&DWP(16*3,$inp));
1366         &xorps  ($inout0,$inout3);              # input^=tweak
1367         &xorps  ($inout1,$inout4);
1368         &xorps  ($inout2,$inout5);
1369
1370         &call   ("_aesni_encrypt3");
1371
1372         &xorps  ($inout0,$inout3);              # output^=tweak
1373         &xorps  ($inout1,$inout4);
1374         &xorps  ($inout2,$inout5);
1375         &movups (&QWP(16*0,$out),$inout0);      # write output
1376         &movups (&QWP(16*1,$out),$inout1);
1377         &movups (&QWP(16*2,$out),$inout2);
1378         &lea    ($out,&DWP(16*3,$out));
1379
1380         &movdqa ($tweak,$inout5);               # last tweak
1381         &jmp    (&label("xts_enc_done"));
1382
1383 &set_label("xts_enc_four",16);                  # tweaks 0,1 are on the stack, tweak 2 in $inout5, tweak 3 in $tweak
1384         &movaps ($inout4,$tweak);               # put aside last tweak
1385
1386         &movups ($inout0,&QWP(16*0,$inp));      # load input
1387         &movups ($inout1,&QWP(16*1,$inp));
1388         &movups ($inout2,&QWP(16*2,$inp));
1389         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1390         &movups ($inout3,&QWP(16*3,$inp));
1391         &lea    ($inp,&DWP(16*4,$inp));
1392         &xorps  ($inout1,&QWP(16*1,"esp"));
1393         &xorps  ($inout2,$inout5);
1394         &xorps  ($inout3,$inout4);
1395
1396         &call   ("_aesni_encrypt4");
1397
1398         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1399         &xorps  ($inout1,&QWP(16*1,"esp"));
1400         &xorps  ($inout2,$inout5);
1401         &movups (&QWP(16*0,$out),$inout0);      # write output
1402         &xorps  ($inout3,$inout4);
1403         &movups (&QWP(16*1,$out),$inout1);
1404         &movups (&QWP(16*2,$out),$inout2);
1405         &movups (&QWP(16*3,$out),$inout3);
1406         &lea    ($out,&DWP(16*4,$out));
1407
1408         &movdqa ($tweak,$inout4);               # last tweak
1409         &jmp    (&label("xts_enc_done"));
1410
1411 &set_label("xts_enc_done6x",16);                # $tweak is pre-calculated
1412         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1413         &and    ($len,15);                      # bytes in the trailing partial block
1414         &jz     (&label("xts_enc_ret"));
1415         &movdqa ($inout3,$tweak);               # tweak for the stolen block
1416         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1417         &jmp    (&label("xts_enc_steal"));
1418
1419 &set_label("xts_enc_done",16);                  # $tweak holds the last tweak used
1420         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1421         &pxor   ($twtmp,$twtmp);
1422         &and    ($len,15);                      # bytes in the trailing partial block
1423         &jz     (&label("xts_enc_ret"));
1424
1425         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1426         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1427         &pshufd ($inout3,$twtmp,0x13);
1428         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1429         &pand   ($inout3,&QWP(16*6,"esp"));     # isolate carry and residue
1430         &pxor   ($inout3,$tweak);               # $inout3 = next tweak, used for the stolen block
1431
1432 &set_label("xts_enc_steal");                    # byte loop, runs $len%16 times
1433         &movz   ($rounds,&BP(0,$inp));          # next trailing input byte
1434         &movz   ($key,&BP(-16,$out));           # matching byte of the last full output block
1435         &lea    ($inp,&DWP(1,$inp));
1436         &mov    (&BP(-16,$out),&LB($rounds));   # input byte replaces it ...
1437         &mov    (&BP(0,$out),&LB($key));        # ... and the old output byte moves to the tail
1438         &lea    ($out,&DWP(1,$out));
1439         &sub    ($len,1);
1440         &jnz    (&label("xts_enc_steal"));
1441
1442         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1443         &mov    ($key,$key_);                   # restore $key
1444         &mov    ($rounds,$rounds_);             # restore $rounds
1445
1446         &movups ($inout0,&QWP(-16,$out));       # load input
1447         &xorps  ($inout0,$inout3);              # input^=tweak
1448         if ($inline)                            # re-encrypt the patched block at -16($out)
1449         {   &aesni_inline_generate1("enc");     }
1450         else
1451         {   &call       ("_aesni_encrypt1");    }
1452         &xorps  ($inout0,$inout3);              # output^=tweak
1453         &movups (&QWP(-16,$out),$inout0);       # write output
1454
1455 &set_label("xts_enc_ret");
1456         &pxor   ("xmm0","xmm0");                # clear register bank
1457         &pxor   ("xmm1","xmm1");
1458         &pxor   ("xmm2","xmm2");
1459         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1460         &pxor   ("xmm3","xmm3");
1461         &movdqa (&QWP(16*1,"esp"),"xmm0");      # the six tweak slots 16*0..16*5 are zeroed
1462         &pxor   ("xmm4","xmm4");
1463         &movdqa (&QWP(16*2,"esp"),"xmm0");
1464         &pxor   ("xmm5","xmm5");
1465         &movdqa (&QWP(16*3,"esp"),"xmm0");
1466         &pxor   ("xmm6","xmm6");
1467         &movdqa (&QWP(16*4,"esp"),"xmm0");
1468         &pxor   ("xmm7","xmm7");
1469         &movdqa (&QWP(16*5,"esp"),"xmm0");
1470         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1471 &function_end("aesni_xts_encrypt");
1472
# aesni_xts_decrypt -- generator for the XTS-mode AES-NI decryption routine.
#
# Arguments, in &wparam() order as read below:
#   0  input pointer                 -> $inp
#   1  output pointer                -> $out
#   2  byte length                   -> $len
#   3  key1 (decrypts the data)      -> $key
#   4  key2 (encrypts the tweak)
#   5  pointer to the clear-text tweak
#
# Frame carved out below the caller's %esp (16-byte aligned):
#   16*0..16*5  tweaks of the blocks in flight (zeroed at xts_dec_ret)
#   16*6        tweak-update mask, dwords {0x87,0,1,0}
#   16*7+0      $len after the stealing adjustment, later just $len%16
#   16*7+4      caller's %esp
#
# Outline: encrypt the tweak with key2; decrypt six blocks per pass in
# xts_dec_loop6; finish 1..5 leftover whole blocks in xts_dec_short and
# its xts_dec_one..four branches; if $len is not a multiple of 16, end
# with ciphertext stealing (xts_dec_only_one_more / xts_dec_steal).
#
# NOTE(review): the register behind each $inout*/$tw*/$rndkey* name and the
# contracts of the _aesni_* helpers are defined outside this part of the
# file; comments below describe only what this routine does with them.
1473 &function_begin("aesni_xts_decrypt");
1474         &mov    ($key,&wparam(4));              # key2
1475         &mov    ($inp,&wparam(5));              # clear-text tweak
1476
1477         &mov    ($rounds,&DWP(240,$key));       # key2->rounds
1478         &movups ($inout0,&QWP(0,$inp));         # load clear-text tweak
             # The tweak goes through the *encrypt* primitive even though
             # this is the decrypt routine; the result is picked up from
             # $inout0 further down (copied into $tweak).
1479         if ($inline)
1480         {   &aesni_inline_generate1("enc");     }
1481         else
1482         {   &call       ("_aesni_encrypt1");    }
1483
1484         &mov    ($inp,&wparam(0));
1485         &mov    ($out,&wparam(1));
1486         &mov    ($len,&wparam(2));
1487         &mov    ($key,&wparam(3));              # key1
1488
1489         &mov    ($key_,"esp");                  # remember caller's %esp
1490         &sub    ("esp",16*7+8);
1491         &and    ("esp",-16);                    # align stack
1492
             # Hold one whole block back when a partial block trails the
             # input: it is decrypted separately in the stealing path.
1493         &xor    ($rounds_,$rounds_);            # if(len%16) len-=16;
1494         &test   ($len,15);
1495         &setnz  (&LB($rounds_));
1496         &shl    ($rounds_,4);
1497         &sub    ($len,$rounds_);
1498
1499         &mov    (&DWP(16*6+0,"esp"),0x87);      # compose the magic constant
1500         &mov    (&DWP(16*6+4,"esp"),0);
1501         &mov    (&DWP(16*6+8,"esp"),1);
1502         &mov    (&DWP(16*6+12,"esp"),0);
1503         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len (already reduced by 16 if len%16); len%16 is unchanged
1504         &mov    (&DWP(16*7+4,"esp"),$key_);     # save original %esp
1505
1506         &mov    ($rounds,&DWP(240,$key));       # key1->rounds
1507         &mov    ($key_,$key);                   # backup $key
1508         &mov    ($rounds_,$rounds);             # backup $rounds
1509
1510         &movdqa ($tweak,$inout0);               # encrypted tweak becomes the running tweak
1511         &pxor   ($twtmp,$twtmp);
1512         &movdqa ($twmask,&QWP(6*16,"esp"));     # 0x0...010...87
1513         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1514
1515         &and    ($len,-16);                     # whole blocks only from here on
1516         &sub    ($len,16*6);
1517         &jc     (&label("xts_dec_short"));      # fewer than six whole blocks
1518
             # For the six-block loop: $rounds_ = 16-16*rounds and
             # $key = key1+32+16*rounds, while $key_ keeps the untouched
             # key1 pointer. Presumably this is the form
             # _aesni_decrypt6_enter expects -- confirm against its definition.
1519         &shl    ($rounds,4);
1520         &mov    ($rounds_,16);
1521         &sub    ($rounds_,$rounds);
1522         &lea    ($key,&DWP(32,$key,$rounds));
1523         &jmp    (&label("xts_dec_loop6"));
1524
1525 &set_label("xts_dec_loop6",16);
             # Each pass of this Perl loop emits one "tweak *= x" step and
             # parks the current tweak in slot $i.  $twtmp carries the
             # per-dword sign bits of $tweak (pcmpgtd against zero);
             # pshufd 0x13 moves dword 3 to dword 0 and dword 1 to dword 2;
             # $twmask {0x87,0,1,0} then leaves the 0x87 reduction for a
             # bit shifted out of the top and the carry across the 64-bit
             # halves that paddq does not propagate.
1526         for ($i=0;$i<4;$i++) {
1527             &pshufd     ($twres,$twtmp,0x13);
1528             &pxor       ($twtmp,$twtmp);
1529             &movdqa     (&QWP(16*$i,"esp"),$tweak);
1530             &paddq      ($tweak,$tweak);        # &psllq($tweak,1);
1531             &pand       ($twres,$twmask);       # isolate carry and residue
1532             &pcmpgtd    ($twtmp,$tweak);        # broadcast upper bits
1533             &pxor       ($tweak,$twres);
1534         }
             # $i is 4 here: the post-increment stores the fifth tweak in
             # slot 4 and leaves $i at 5 for the "save last tweak" below.
             # The sixth tweak is built in $inout5 instead of $tweak.
1535         &pshufd ($inout5,$twtmp,0x13);
1536         &movdqa (&QWP(16*$i++,"esp"),$tweak);
1537         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1538          &$movekey      ($rndkey0,&QWP(0,$key_));
1539         &pand   ($inout5,$twmask);              # isolate carry and residue
1540          &movups        ($inout0,&QWP(0,$inp)); # load input
1541         &pxor   ($inout5,$tweak);
1542
1543         # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1544         &mov    ($rounds,$rounds_);
1545         &movdqu ($inout1,&QWP(16*1,$inp));
1546          &xorps         ($inout0,$rndkey0);     # input^=rndkey[0]
1547         &movdqu ($inout2,&QWP(16*2,$inp));
1548          &pxor          ($inout1,$rndkey0);
1549         &movdqu ($inout3,&QWP(16*3,$inp));
1550          &pxor          ($inout2,$rndkey0);
1551         &movdqu ($inout4,&QWP(16*4,$inp));
1552          &pxor          ($inout3,$rndkey0);
1553         &movdqu ($rndkey1,&QWP(16*5,$inp));     # 6th block lands in $rndkey1: $inout5 still holds its tweak
1554          &pxor          ($inout4,$rndkey0);
1555         &lea    ($inp,&DWP(16*6,$inp));
1556         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1557         &movdqa (&QWP(16*$i,"esp"),$inout5);    # save last tweak
1558         &pxor   ($inout5,$rndkey1);             # tweak^=6th input block
1559
1560          &$movekey      ($rndkey1,&QWP(16,$key_));
1561         &pxor   ($inout1,&QWP(16*1,"esp"));
1562         &pxor   ($inout2,&QWP(16*2,"esp"));
1563          &aesdec        ($inout0,$rndkey1);
1564         &pxor   ($inout3,&QWP(16*3,"esp"));
1565         &pxor   ($inout4,&QWP(16*4,"esp"));
1566          &aesdec        ($inout1,$rndkey1);
1567         &pxor           ($inout5,$rndkey0);     # ...and finally ^=rndkey[0]
1568          &$movekey      ($rndkey0,&QWP(32,$key_));
1569          &aesdec        ($inout2,$rndkey1);
1570          &aesdec        ($inout3,$rndkey1);
1571          &aesdec        ($inout4,$rndkey1);
1572          &aesdec        ($inout5,$rndkey1);
1573         &call           (&label("_aesni_decrypt6_enter"));
1574
             # Un-tweak and store the six results; the lines indented one
             # column less start the next tweak update in between.
1575         &movdqa ($tweak,&QWP(16*5,"esp"));      # last tweak
1576        &pxor    ($twtmp,$twtmp);
1577         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1578        &pcmpgtd ($twtmp,$tweak);                # broadcast upper bits
1579         &xorps  ($inout1,&QWP(16*1,"esp"));
1580         &movups (&QWP(16*0,$out),$inout0);      # write output
1581         &xorps  ($inout2,&QWP(16*2,"esp"));
1582         &movups (&QWP(16*1,$out),$inout1);
1583         &xorps  ($inout3,&QWP(16*3,"esp"));
1584         &movups (&QWP(16*2,$out),$inout2);
1585         &xorps  ($inout4,&QWP(16*4,"esp"));
1586         &movups (&QWP(16*3,$out),$inout3);
1587         &xorps  ($inout5,$tweak);
1588         &movups (&QWP(16*4,$out),$inout4);
1589        &pshufd  ($twres,$twtmp,0x13);
1590         &movups (&QWP(16*5,$out),$inout5);
1591         &lea    ($out,&DWP(16*6,$out));
1592        &movdqa  ($twmask,&QWP(16*6,"esp"));     # 0x0...010...87
1593
             # Finish advancing $tweak so that it (and its sign bits in
             # $twtmp) is ready for whatever block comes next.
1594         &pxor   ($twtmp,$twtmp);
1595         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1596         &pand   ($twres,$twmask);               # isolate carry and residue
1597         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1598         &pxor   ($tweak,$twres);
1599
1600         &sub    ($len,16*6);
1601         &jnc    (&label("xts_dec_loop6"));      # at least six more whole blocks
1602
1603         &mov    ($rounds,&DWP(240,$key_));      # restore $rounds
1604         &mov    ($key,$key_);                   # restore $key
1605         &mov    ($rounds_,$rounds);
1606
1607 &set_label("xts_dec_short");                    # 0..5 whole blocks remain
1608         &add    ($len,16*6);                    # undo the bias: $len = bytes in whole blocks left
1609         &jz     (&label("xts_dec_done6x"));
1610
             # Dispatch on the block count.  The SSE instructions placed
             # between each cmp and its second branch leave EFLAGS alone,
             # which is what lets je reuse the result of the earlier cmp.
             # Tweaks are generated on the way: $inout3, $inout4, $inout5
             # receive the 1st, 2nd and 3rd, $tweak is always the newest.
1611         &movdqa ($inout3,$tweak);               # put aside previous tweak
1612         &cmp    ($len,0x20);
1613         &jb     (&label("xts_dec_one"));
1614
1615         &pshufd ($twres,$twtmp,0x13);
1616         &pxor   ($twtmp,$twtmp);
1617         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1618         &pand   ($twres,$twmask);               # isolate carry and residue
1619         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1620         &pxor   ($tweak,$twres);
1621         &je     (&label("xts_dec_two"));        # flags still from cmp $len,0x20
1622
1623         &pshufd ($twres,$twtmp,0x13);
1624         &pxor   ($twtmp,$twtmp);
1625         &movdqa ($inout4,$tweak);               # put aside previous tweak
1626         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1627         &pand   ($twres,$twmask);               # isolate carry and residue
1628         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1629         &pxor   ($tweak,$twres);
1630         &cmp    ($len,0x40);
1631         &jb     (&label("xts_dec_three"));
1632
1633         &pshufd ($twres,$twtmp,0x13);
1634         &pxor   ($twtmp,$twtmp);
1635         &movdqa ($inout5,$tweak);               # put aside previous tweak
1636         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1637         &pand   ($twres,$twmask);               # isolate carry and residue
1638         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1639         &pxor   ($tweak,$twres);
1640         &movdqa (&QWP(16*0,"esp"),$inout3);     # 1st and 2nd tweak to slots 0 and 1
1641         &movdqa (&QWP(16*1,"esp"),$inout4);
1642         &je     (&label("xts_dec_four"));       # flags still from cmp $len,0x40
1643
             # Five blocks: remaining tweaks go to slots 2..4, the fifth
             # one being built in $inout5.
1644         &movdqa (&QWP(16*2,"esp"),$inout5);
1645         &pshufd ($inout5,$twtmp,0x13);
1646         &movdqa (&QWP(16*3,"esp"),$tweak);
1647         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1648         &pand   ($inout5,$twmask);              # isolate carry and residue
1649         &pxor   ($inout5,$tweak);
1650
1651         &movdqu ($inout0,&QWP(16*0,$inp));      # load input
1652         &movdqu ($inout1,&QWP(16*1,$inp));
1653         &movdqu ($inout2,&QWP(16*2,$inp));
1654         &pxor   ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1655         &movdqu ($inout3,&QWP(16*3,$inp));
1656         &pxor   ($inout1,&QWP(16*1,"esp"));
1657         &movdqu ($inout4,&QWP(16*4,$inp));
1658         &pxor   ($inout2,&QWP(16*2,"esp"));
1659         &lea    ($inp,&DWP(16*5,$inp));
1660         &pxor   ($inout3,&QWP(16*3,"esp"));
1661         &movdqa (&QWP(16*4,"esp"),$inout5);     # save last tweak
1662         &pxor   ($inout4,$inout5);
1663
             # The six-block helper is used for five blocks; $inout5 goes
             # in holding the fifth tweak and its result is never stored.
1664         &call   ("_aesni_decrypt6");
1665
1666         &movaps ($tweak,&QWP(16*4,"esp"));      # last tweak
1667         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1668         &xorps  ($inout1,&QWP(16*1,"esp"));
1669         &xorps  ($inout2,&QWP(16*2,"esp"));
1670         &movups (&QWP(16*0,$out),$inout0);      # write output
1671         &xorps  ($inout3,&QWP(16*3,"esp"));
1672         &movups (&QWP(16*1,$out),$inout1);
1673         &xorps  ($inout4,$tweak);
1674         &movups (&QWP(16*2,$out),$inout2);
1675         &movups (&QWP(16*3,$out),$inout3);
1676         &movups (&QWP(16*4,$out),$inout4);
1677         &lea    ($out,&DWP(16*5,$out));
1678         &jmp    (&label("xts_dec_done"));
1679
1680 &set_label("xts_dec_one",16);                   # one block, tweak in $inout3
1681         &movups ($inout0,&QWP(16*0,$inp));      # load input
1682         &lea    ($inp,&DWP(16*1,$inp));
1683         &xorps  ($inout0,$inout3);              # input^=tweak
1684         if ($inline)
1685         {   &aesni_inline_generate1("dec");     }
1686         else
1687         {   &call       ("_aesni_decrypt1");    }
1688         &xorps  ($inout0,$inout3);              # output^=tweak
1689         &movups (&QWP(16*0,$out),$inout0);      # write output
1690         &lea    ($out,&DWP(16*1,$out));
1691
1692         &movdqa ($tweak,$inout3);               # last tweak
1693         &jmp    (&label("xts_dec_done"));
1694
1695 &set_label("xts_dec_two",16);                   # two blocks, tweaks in $inout3 and $tweak
1696         &movaps ($inout4,$tweak);               # put aside last tweak
1697
1698         &movups ($inout0,&QWP(16*0,$inp));      # load input
1699         &movups ($inout1,&QWP(16*1,$inp));
1700         &lea    ($inp,&DWP(16*2,$inp));
1701         &xorps  ($inout0,$inout3);              # input^=tweak
1702         &xorps  ($inout1,$inout4);
1703
1704         &call   ("_aesni_decrypt2");
1705
1706         &xorps  ($inout0,$inout3);              # output^=tweak
1707         &xorps  ($inout1,$inout4);
1708         &movups (&QWP(16*0,$out),$inout0);      # write output
1709         &movups (&QWP(16*1,$out),$inout1);
1710         &lea    ($out,&DWP(16*2,$out));
1711
1712         &movdqa ($tweak,$inout4);               # last tweak
1713         &jmp    (&label("xts_dec_done"));
1714
1715 &set_label("xts_dec_three",16);                 # three blocks, tweaks in $inout3, $inout4 and $tweak
1716         &movaps ($inout5,$tweak);               # put aside last tweak
1717         &movups ($inout0,&QWP(16*0,$inp));      # load input
1718         &movups ($inout1,&QWP(16*1,$inp));
1719         &movups ($inout2,&QWP(16*2,$inp));
1720         &lea    ($inp,&DWP(16*3,$inp));
1721         &xorps  ($inout0,$inout3);              # input^=tweak
1722         &xorps  ($inout1,$inout4);
1723         &xorps  ($inout2,$inout5);
1724
1725         &call   ("_aesni_decrypt3");
1726
1727         &xorps  ($inout0,$inout3);              # output^=tweak
1728         &xorps  ($inout1,$inout4);
1729         &xorps  ($inout2,$inout5);
1730         &movups (&QWP(16*0,$out),$inout0);      # write output
1731         &movups (&QWP(16*1,$out),$inout1);
1732         &movups (&QWP(16*2,$out),$inout2);
1733         &lea    ($out,&DWP(16*3,$out));
1734
1735         &movdqa ($tweak,$inout5);               # last tweak
1736         &jmp    (&label("xts_dec_done"));
1737
     # Four blocks: tweaks 1 and 2 sit in slots 0 and 1, tweak 3 in
     # $inout5, tweak 4 in $tweak.  $inout3/$inout4 are needed for data,
     # hence the first two tweaks are read back from the stack.
1738 &set_label("xts_dec_four",16);
1739         &movaps ($inout4,$tweak);               # put aside last tweak
1740
1741         &movups ($inout0,&QWP(16*0,$inp));      # load input
1742         &movups ($inout1,&QWP(16*1,$inp));
1743         &movups ($inout2,&QWP(16*2,$inp));
1744         &xorps  ($inout0,&QWP(16*0,"esp"));     # input^=tweak
1745         &movups ($inout3,&QWP(16*3,$inp));
1746         &lea    ($inp,&DWP(16*4,$inp));
1747         &xorps  ($inout1,&QWP(16*1,"esp"));
1748         &xorps  ($inout2,$inout5);
1749         &xorps  ($inout3,$inout4);
1750
1751         &call   ("_aesni_decrypt4");
1752
1753         &xorps  ($inout0,&QWP(16*0,"esp"));     # output^=tweak
1754         &xorps  ($inout1,&QWP(16*1,"esp"));
1755         &xorps  ($inout2,$inout5);
1756         &movups (&QWP(16*0,$out),$inout0);      # write output
1757         &xorps  ($inout3,$inout4);
1758         &movups (&QWP(16*1,$out),$inout1);
1759         &movups (&QWP(16*2,$out),$inout2);
1760         &movups (&QWP(16*3,$out),$inout3);
1761         &lea    ($out,&DWP(16*4,$out));
1762
1763         &movdqa ($tweak,$inout4);               # last tweak
1764         &jmp    (&label("xts_dec_done"));
1765
     # Reached from xts_dec_short with no whole block left: $tweak and
     # $twtmp were already advanced (before the loop or at its tail), so
     # the extra update done in xts_dec_done is skipped.
1766 &set_label("xts_dec_done6x",16);                # $tweak is pre-calculated
1767         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1768         &and    ($len,15);
1769         &jz     (&label("xts_dec_ret"));        # no partial block: finished
1770         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1771         &jmp    (&label("xts_dec_only_one_more"));
1772
     # Reached from the 1..5 block paths with $tweak holding the tweak of
     # the block decrypted last; step it once to get the first unused one.
1773 &set_label("xts_dec_done",16);
1774         &mov    ($len,&DWP(16*7+0,"esp"));      # restore original $len
1775         &pxor   ($twtmp,$twtmp);
1776         &and    ($len,15);
1777         &jz     (&label("xts_dec_ret"));        # no partial block: finished
1778
1779         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1780         &mov    (&DWP(16*7+0,"esp"),$len);      # save $len%16
1781         &pshufd ($twres,$twtmp,0x13);
1782         &pxor   ($twtmp,$twtmp);
1783         &movdqa ($twmask,&QWP(16*6,"esp"));     # reload the tweak-update mask
1784         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1785         &pand   ($twres,$twmask);               # isolate carry and residue
1786         &pcmpgtd($twtmp,$tweak);                # broadcast upper bits
1787         &pxor   ($tweak,$twres);
1788
     # Ciphertext stealing.  The held-back whole block is decrypted with
     # the *following* tweak ($inout3), while the current one is kept in
     # $inout4 for the block reassembled by xts_dec_steal.
1789 &set_label("xts_dec_only_one_more");
1790         &pshufd ($inout3,$twtmp,0x13);
1791         &movdqa ($inout4,$tweak);               # put aside previous tweak
1792         &paddq  ($tweak,$tweak);                # &psllq($tweak,1);
1793         &pand   ($inout3,$twmask);              # isolate carry and residue
1794         &pxor   ($inout3,$tweak);
1795
1796         &mov    ($key,$key_);                   # restore $key
1797         &mov    ($rounds,$rounds_);             # restore $rounds
1798
1799         &movups ($inout0,&QWP(0,$inp));         # load input
1800         &xorps  ($inout0,$inout3);              # input^=tweak
1801         if ($inline)
1802         {   &aesni_inline_generate1("dec");     }
1803         else
1804         {   &call       ("_aesni_decrypt1");    }
1805         &xorps  ($inout0,$inout3);              # output^=tweak
1806         &movups (&QWP(0,$out),$inout0);         # write output
1807
     # Runs $len (= len%16) times, one byte per pass: byte i of the partial
     # input block at $inp+16 replaces byte i of the block just written at
     # $out, and the displaced byte becomes byte i of the partial output
     # block at $out+16.  $rounds and $key are borrowed as byte scratch,
     # which is why both are restored after the loop.
1808 &set_label("xts_dec_steal");
1809         &movz   ($rounds,&BP(16,$inp));
1810         &movz   ($key,&BP(0,$out));
1811         &lea    ($inp,&DWP(1,$inp));
1812         &mov    (&BP(0,$out),&LB($rounds));
1813         &mov    (&BP(16,$out),&LB($key));
1814         &lea    ($out,&DWP(1,$out));
1815         &sub    ($len,1);
1816         &jnz    (&label("xts_dec_steal"));
1817
1818         &sub    ($out,&DWP(16*7+0,"esp"));      # rewind $out
1819         &mov    ($key,$key_);                   # restore $key
1820         &mov    ($rounds,$rounds_);             # restore $rounds
1821
             # Decrypt the reassembled block in place with the earlier tweak.
1822         &movups ($inout0,&QWP(0,$out));         # load input
1823         &xorps  ($inout0,$inout4);              # input^=tweak
1824         if ($inline)
1825         {   &aesni_inline_generate1("dec");     }
1826         else
1827         {   &call       ("_aesni_decrypt1");    }
1828         &xorps  ($inout0,$inout4);              # output^=tweak
1829         &movups (&QWP(0,$out),$inout0);         # write output
1830
     # Common exit: zero xmm0-xmm7 and the six tweak slots, then switch back
     # to the caller's stack.  Slots 16*6 and 16*7 (mask, length, saved
     # %esp) are not overwritten.
1831 &set_label("xts_dec_ret");
1832         &pxor   ("xmm0","xmm0");                # clear register bank
1833         &pxor   ("xmm1","xmm1");
1834         &pxor   ("xmm2","xmm2");
1835         &movdqa (&QWP(16*0,"esp"),"xmm0");      # clear stack
1836         &pxor   ("xmm3","xmm3");
1837         &movdqa (&QWP(16*1,"esp"),"xmm0");
1838         &pxor   ("xmm4","xmm4");
1839         &movdqa (&QWP(16*2,"esp"),"xmm0");
1840         &pxor   ("xmm5","xmm5");
1841         &movdqa (&QWP(16*3,"esp"),"xmm0");
1842         &pxor   ("xmm6","xmm6");
1843         &movdqa (&QWP(16*4,"esp"),"xmm0");
1844         &pxor   ("xmm7","xmm7");
1845         &movdqa (&QWP(16*5,"esp"),"xmm0");
1846         &mov    ("esp",&DWP(16*7+4,"esp"));     # restore %esp
1847 &function_end("aesni_xts_decrypt");
1848 }
1849 \f
1850 ######################################################################
1851 # void aesni_ocb_[en|de]crypt(const char *inp, char *out, size_t blocks,
1852 #       const AES_KEY *key, unsigned int start_block_num,
1853 #       unsigned char offset_i[16], const unsigned char L_[][16],
1854 #       unsigned char checksum[16]);
1855 #
1856 {
1857 # offsets within stack frame
1858 my $checksum = 16*6;
1859 my ($key_off,$rounds_off,$out_off,$end_off,$esp_off)=map(16*7+4*$_,(0..4));
1860
1861 # reassigned registers
1862 my ($l_,$block,$i1,$i3,$i5) = ($rounds_,$key_,$rounds,$len,$out);
1863 # $l_, $block, $inp, $key are permanently allocated in registers;
1864 # the remaining non-volatile values are offloaded to the stack, and
1865 # they stay invariant once written there.
1866
1867 &function_begin("aesni_ocb_encrypt");
1868         &mov    ($rounds,&wparam(5));           # &offset_i
1869         &mov    ($rounds_,&wparam(7));          # &checksum
1870
1871         &mov    ($inp,&wparam(0));
1872         &mov    ($out,&wparam(1));
1873         &mov    ($len,&wparam(2));
1874         &mov    ($key,&wparam(3));
1875         &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
1876         &mov    ($block,&wparam(4));            # start_block_num
1877         &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
1878         &mov    ($l_,&wparam(6));               # L_
1879
1880         &mov    ($rounds,"esp");
1881         &sub    ("esp",$esp_off+4);             # alloca
1882         &and    ("esp",-16);                    # align stack
1883
1884         &sub    ($out,$inp);
1885         &shl    ($len,4);
1886         &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
1887         &mov    (&DWP($out_off,"esp"),$out);
1888         &mov    (&DWP($end_off,"esp"),$len);
1889         &mov    (&DWP($esp_off,"esp"),$rounds);
1890
1891         &mov    ($rounds,&DWP(240,$key));
1892
1893         &test   ($block,1);
1894         &jnz    (&label("odd"));
1895
1896         &bsf            ($i3,$block);
1897         &add            ($block,1);
1898         &shl            ($i3,4);
1899         &movdqu         ($inout5,&QWP(0,$l_,$i3));
1900         &mov            ($i3,$key);                     # put aside key
1901
1902         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1903         &lea            ($inp,&DWP(16,$inp));
1904
1905         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
1906         &pxor           ($rndkey1,$inout0);             # checksum
1907         &pxor           ($inout0,$inout5);              # ^ offset_i
1908
1909         &movdqa         ($inout4,$rndkey1);
1910         if ($inline)
1911         {   &aesni_inline_generate1("enc");     }
1912         else
1913         {   &call       ("_aesni_encrypt1");    }
1914
1915         &xorps          ($inout0,$inout5);              # ^ offset_i
1916         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
1917         &movdqa         ($rndkey1,$inout4);             # pass the checksum
1918
1919         &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
1920
1921         &mov            ($rounds,&DWP(240,$i3));
1922         &mov            ($key,$i3);                     # restore key
1923         &mov            ($len,&DWP($end_off,"esp"));
1924
1925 &set_label("odd");
1926         &shl            ($rounds,4);
1927         &mov            ($out,16);
1928         &sub            ($out,$rounds);                 # twisted rounds
1929         &mov            (&DWP($key_off,"esp"),$key);
1930         &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
1931         &mov            (&DWP($rounds_off,"esp"),$out);
1932
1933         &cmp            ($inp,$len);
1934         &ja             (&label("short"));
1935         &jmp            (&label("grandloop"));
1936
1937 &set_label("grandloop",32);
1938         &lea            ($i1,&DWP(1,$block));
1939         &lea            ($i3,&DWP(3,$block));
1940         &lea            ($i5,&DWP(5,$block));
1941         &add            ($block,6);
1942         &bsf            ($i1,$i1);
1943         &bsf            ($i3,$i3);
1944         &bsf            ($i5,$i5);
1945         &shl            ($i1,4);
1946         &shl            ($i3,4);
1947         &shl            ($i5,4);
1948         &movdqu         ($inout0,&QWP(0,$l_));
1949         &movdqu         ($inout1,&QWP(0,$l_,$i1));
1950         &mov            ($rounds,&DWP($rounds_off,"esp"));
1951         &movdqa         ($inout2,$inout0);
1952         &movdqu         ($inout3,&QWP(0,$l_,$i3));
1953         &movdqa         ($inout4,$inout0);
1954         &movdqu         ($inout5,&QWP(0,$l_,$i5));
1955
1956         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
1957         &pxor           ($inout1,$inout0);
1958         &movdqa         (&QWP(16*0,"esp"),$inout0);
1959         &pxor           ($inout2,$inout1);
1960         &movdqa         (&QWP(16*1,"esp"),$inout1);
1961         &pxor           ($inout3,$inout2);
1962         &movdqa         (&QWP(16*2,"esp"),$inout2);
1963         &pxor           ($inout4,$inout3);
1964         &movdqa         (&QWP(16*3,"esp"),$inout3);
1965         &pxor           ($inout5,$inout4);
1966         &movdqa         (&QWP(16*4,"esp"),$inout4);
1967         &movdqa         (&QWP(16*5,"esp"),$inout5);
1968
1969         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
1970         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
1971         &movdqu         ($inout1,&QWP(16*1,$inp));
1972         &movdqu         ($inout2,&QWP(16*2,$inp));
1973         &movdqu         ($inout3,&QWP(16*3,$inp));
1974         &movdqu         ($inout4,&QWP(16*4,$inp));
1975         &movdqu         ($inout5,&QWP(16*5,$inp));
1976         &lea            ($inp,&DWP(16*6,$inp));
1977
1978         &pxor           ($rndkey1,$inout0);             # checksum
1979         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
1980         &pxor           ($rndkey1,$inout1);
1981         &pxor           ($inout1,$rndkey0);
1982         &pxor           ($rndkey1,$inout2);
1983         &pxor           ($inout2,$rndkey0);
1984         &pxor           ($rndkey1,$inout3);
1985         &pxor           ($inout3,$rndkey0);
1986         &pxor           ($rndkey1,$inout4);
1987         &pxor           ($inout4,$rndkey0);
1988         &pxor           ($rndkey1,$inout5);
1989         &pxor           ($inout5,$rndkey0);
1990         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
1991
1992         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
1993         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
1994         &pxor           ($inout1,&QWP(16*1,"esp"));
1995         &pxor           ($inout2,&QWP(16*2,"esp"));
1996         &pxor           ($inout3,&QWP(16*3,"esp"));
1997         &pxor           ($inout4,&QWP(16*4,"esp"));
1998         &pxor           ($inout5,&QWP(16*5,"esp"));
1999
2000         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2001         &aesenc         ($inout0,$rndkey1);
2002         &aesenc         ($inout1,$rndkey1);
2003         &aesenc         ($inout2,$rndkey1);
2004         &aesenc         ($inout3,$rndkey1);
2005         &aesenc         ($inout4,$rndkey1);
2006         &aesenc         ($inout5,$rndkey1);
2007
2008         &mov            ($out,&DWP($out_off,"esp"));
2009         &mov            ($len,&DWP($end_off,"esp"));
2010         &call           ("_aesni_encrypt6_enter");
2011
2012         &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
2013         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2014         &pxor           ($inout1,&QWP(16*1,"esp"));
2015         &pxor           ($inout2,&QWP(16*2,"esp"));
2016         &pxor           ($inout3,&QWP(16*3,"esp"));
2017         &pxor           ($inout4,&QWP(16*4,"esp"));
2018         &pxor           ($inout5,$rndkey0);
2019         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2020
2021         &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
2022         &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
2023         &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
2024         &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
2025         &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
2026         &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
2027         &cmp            ($inp,$len);                    # done yet?
2028         &jb             (&label("grandloop"));
2029
2030 &set_label("short");
2031         &add            ($len,16*6);
2032         &sub            ($len,$inp);
2033         &jz             (&label("done"));
2034
2035         &cmp            ($len,16*2);
2036         &jb             (&label("one"));
2037         &je             (&label("two"));
2038
2039         &cmp            ($len,16*4);
2040         &jb             (&label("three"));
2041         &je             (&label("four"));
2042
2043         &lea            ($i1,&DWP(1,$block));
2044         &lea            ($i3,&DWP(3,$block));
2045         &bsf            ($i1,$i1);
2046         &bsf            ($i3,$i3);
2047         &shl            ($i1,4);
2048         &shl            ($i3,4);
2049         &movdqu         ($inout0,&QWP(0,$l_));
2050         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2051         &mov            ($rounds,&DWP($rounds_off,"esp"));
2052         &movdqa         ($inout2,$inout0);
2053         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2054         &movdqa         ($inout4,$inout0);
2055
2056         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2057         &pxor           ($inout1,$inout0);
2058         &movdqa         (&QWP(16*0,"esp"),$inout0);
2059         &pxor           ($inout2,$inout1);
2060         &movdqa         (&QWP(16*1,"esp"),$inout1);
2061         &pxor           ($inout3,$inout2);
2062         &movdqa         (&QWP(16*2,"esp"),$inout2);
2063         &pxor           ($inout4,$inout3);
2064         &movdqa         (&QWP(16*3,"esp"),$inout3);
2065         &pxor           ($inout5,$inout4);
2066         &movdqa         (&QWP(16*4,"esp"),$inout4);
2067
2068         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2069         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2070         &movdqu         ($inout1,&QWP(16*1,$inp));
2071         &movdqu         ($inout2,&QWP(16*2,$inp));
2072         &movdqu         ($inout3,&QWP(16*3,$inp));
2073         &movdqu         ($inout4,&QWP(16*4,$inp));
2074         &pxor           ($inout5,$inout5);
2075
2076         &pxor           ($rndkey1,$inout0);             # checksum
2077         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2078         &pxor           ($rndkey1,$inout1);
2079         &pxor           ($inout1,$rndkey0);
2080         &pxor           ($rndkey1,$inout2);
2081         &pxor           ($inout2,$rndkey0);
2082         &pxor           ($rndkey1,$inout3);
2083         &pxor           ($inout3,$rndkey0);
2084         &pxor           ($rndkey1,$inout4);
2085         &pxor           ($inout4,$rndkey0);
2086         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2087
2088         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2089         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2090         &pxor           ($inout1,&QWP(16*1,"esp"));
2091         &pxor           ($inout2,&QWP(16*2,"esp"));
2092         &pxor           ($inout3,&QWP(16*3,"esp"));
2093         &pxor           ($inout4,&QWP(16*4,"esp"));
2094
2095         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2096         &aesenc         ($inout0,$rndkey1);
2097         &aesenc         ($inout1,$rndkey1);
2098         &aesenc         ($inout2,$rndkey1);
2099         &aesenc         ($inout3,$rndkey1);
2100         &aesenc         ($inout4,$rndkey1);
2101         &aesenc         ($inout5,$rndkey1);
2102
2103         &mov            ($out,&DWP($out_off,"esp"));
2104         &call           ("_aesni_encrypt6_enter");
2105
2106         &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
2107         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2108         &pxor           ($inout1,&QWP(16*1,"esp"));
2109         &pxor           ($inout2,&QWP(16*2,"esp"));
2110         &pxor           ($inout3,&QWP(16*3,"esp"));
2111         &pxor           ($inout4,$rndkey0);
2112         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2113
2114         &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
2115         &movdqu         (&QWP(16*1,$out,$inp),$inout1);
2116         &movdqu         (&QWP(16*2,$out,$inp),$inout2);
2117         &movdqu         (&QWP(16*3,$out,$inp),$inout3);
2118         &movdqu         (&QWP(16*4,$out,$inp),$inout4);
2119
2120         &jmp            (&label("done"));
2121
2122 &set_label("one",16);
2123         &movdqu         ($inout5,&QWP(0,$l_));
2124         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2125
2126         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2127         &mov            ($rounds,&DWP(240,$key));
2128
2129         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2130         &pxor           ($rndkey1,$inout0);             # checksum
2131         &pxor           ($inout0,$inout5);              # ^ offset_i
2132
2133         &movdqa         ($inout4,$rndkey1);
2134         &mov            ($out,&DWP($out_off,"esp"));
2135         if ($inline)
2136         {   &aesni_inline_generate1("enc");     }
2137         else
2138         {   &call       ("_aesni_encrypt1");    }
2139
2140         &xorps          ($inout0,$inout5);              # ^ offset_i
2141         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2142         &movdqa         ($rndkey1,$inout4);             # pass the checksum
2143         &movups         (&QWP(0,$out,$inp),$inout0);
2144
2145         &jmp            (&label("done"));
2146
2147 &set_label("two",16);
2148         &lea            ($i1,&DWP(1,$block));
2149         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2150         &bsf            ($i1,$i1);
2151         &shl            ($i1,4);
2152         &movdqu         ($inout4,&QWP(0,$l_));
2153         &movdqu         ($inout5,&QWP(0,$l_,$i1));
2154
2155         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2156         &movdqu         ($inout1,&QWP(16*1,$inp));
2157         &mov            ($rounds,&DWP(240,$key));
2158
2159         &pxor           ($inout4,$rndkey0);             # ^ last offset_i
2160         &pxor           ($inout5,$inout4);
2161
2162         &pxor           ($rndkey1,$inout0);             # checksum
2163         &pxor           ($inout0,$inout4);              # ^ offset_i
2164         &pxor           ($rndkey1,$inout1);
2165         &pxor           ($inout1,$inout5);
2166
2167         &movdqa         ($inout3,$rndkey1)
2168         &mov            ($out,&DWP($out_off,"esp"));
2169         &call           ("_aesni_encrypt2");
2170
2171         &xorps          ($inout0,$inout4);              # ^ offset_i
2172         &xorps          ($inout1,$inout5);
2173         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2174         &movdqa         ($rndkey1,$inout3);             # pass the checksum
2175         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2176         &movups         (&QWP(16*1,$out,$inp),$inout1);
2177
2178         &jmp            (&label("done"));
2179
# Tail path for three blocks.  Offsets come from L_[0], the L_ entry
# selected by bsf($block+1), and L_[0] again, each chained onto the one
# before it starting from the previous offset in $rndkey0.  The checksum
# is spilled to the $checksum stack slot around the call.
2180 &set_label("three",16);
2181         &lea            ($i1,&DWP(1,$block));
2182         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2183         &bsf            ($i1,$i1);
2184         &shl            ($i1,4);
2185         &movdqu         ($inout3,&QWP(0,$l_));
2186         &movdqu         ($inout4,&QWP(0,$l_,$i1));
2187         &movdqa         ($inout5,$inout3);              # third offset also starts from L_[0]
2188
2189         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2190         &movdqu         ($inout1,&QWP(16*1,$inp));
2191         &movdqu         ($inout2,&QWP(16*2,$inp));
2192         &mov            ($rounds,&DWP(240,$key));
2193
2194         &pxor           ($inout3,$rndkey0);             # ^ last offset_i
2195         &pxor           ($inout4,$inout3);
2196         &pxor           ($inout5,$inout4);
2197
2198         &pxor           ($rndkey1,$inout0);             # checksum
2199         &pxor           ($inout0,$inout3);              # ^ offset_i
2200         &pxor           ($rndkey1,$inout1);
2201         &pxor           ($inout1,$inout4);
2202         &pxor           ($rndkey1,$inout2);
2203         &pxor           ($inout2,$inout5);
2204
2205         &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # spill checksum
2206         &mov            ($out,&DWP($out_off,"esp"));
2207         &call           ("_aesni_encrypt3");
2208
2209         &xorps          ($inout0,$inout3);              # ^ offset_i
2210         &xorps          ($inout1,$inout4);
2211         &xorps          ($inout2,$inout5);
2212         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2213         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2214         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2215         &movups         (&QWP(16*1,$out,$inp),$inout1);
2216         &movups         (&QWP(16*2,$out,$inp),$inout2);
2217
2218         &jmp            (&label("done"));
2219
# Tail path for four blocks.  Offsets come from L_[0], the L_ entry for
# bsf($block+1), L_[0] again and the L_ entry for bsf($block+3), chained
# from the previous offset in $rndkey0.  The first two offsets are kept in
# stack slots 16*0 and 16*1 because $inout2/$inout3 are reused for input;
# the last two stay in $inout4/$inout5.  Falls through into "done".
2220 &set_label("four",16);
2221         &lea            ($i1,&DWP(1,$block));
2222         &lea            ($i3,&DWP(3,$block));
2223         &bsf            ($i1,$i1);
2224         &bsf            ($i3,$i3);
2225         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2226         &shl            ($i1,4);
2227         &shl            ($i3,4);
2228         &movdqu         ($inout2,&QWP(0,$l_));
2229         &movdqu         ($inout3,&QWP(0,$l_,$i1));
2230         &movdqa         ($inout4,$inout2);
2231         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2232
2233         &pxor           ($inout2,$rndkey0);             # ^ last offset_i
2234         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2235         &pxor           ($inout3,$inout2);
2236         &movdqu         ($inout1,&QWP(16*1,$inp));
2237         &pxor           ($inout4,$inout3);
2238         &movdqa         (&QWP(16*0,"esp"),$inout2);
2239         &pxor           ($inout5,$inout4);
2240         &movdqa         (&QWP(16*1,"esp"),$inout3);
2241         &movdqu         ($inout2,&QWP(16*2,$inp));
2242         &movdqu         ($inout3,&QWP(16*3,$inp));
2243         &mov            ($rounds,&DWP(240,$key));
2244
2245         &pxor           ($rndkey1,$inout0);             # checksum
2246         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2247         &pxor           ($rndkey1,$inout1);
2248         &pxor           ($inout1,&QWP(16*1,"esp"));
2249         &pxor           ($rndkey1,$inout2);
2250         &pxor           ($inout2,$inout4);
2251         &pxor           ($rndkey1,$inout3);
2252         &pxor           ($inout3,$inout5);
2253
2254         &movdqa         (&QWP($checksum,"esp"),$rndkey1);       # spill checksum
2255         &mov            ($out,&DWP($out_off,"esp"));
2256         &call           ("_aesni_encrypt4");
2257
2258         &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2259         &xorps          ($inout1,&QWP(16*1,"esp"));
2260         &xorps          ($inout2,$inout4);
2261         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2262         &xorps          ($inout3,$inout5);
2263         &movups         (&QWP(16*1,$out,$inp),$inout1);
2264         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2265         &movups         (&QWP(16*2,$out,$inp),$inout2);
2266         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2267         &movups         (&QWP(16*3,$out,$inp),$inout3);
2268
# Common exit.  The tail paths leave the last offset_i in $rndkey0 and the
# checksum in $rndkey1.  All six $inout registers and stack slots 16*0
# through 16*6 are zeroed, %esp is restored, and the two values are written
# back through the pointers in wparam(5) and wparam(7) before $rndkey0 and
# $rndkey1 are zeroed as well.
2269 &set_label("done");
2270         &mov    ($key,&DWP($esp_off,"esp"));    # value restored into %esp below
2271         &pxor   ($inout0,$inout0);              # clear register bank
2272         &pxor   ($inout1,$inout1);
2273         &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
2274         &pxor   ($inout2,$inout2);
2275         &movdqa (&QWP(16*1,"esp"),$inout0);
2276         &pxor   ($inout3,$inout3);
2277         &movdqa (&QWP(16*2,"esp"),$inout0);
2278         &pxor   ($inout4,$inout4);
2279         &movdqa (&QWP(16*3,"esp"),$inout0);
2280         &pxor   ($inout5,$inout5);
2281         &movdqa (&QWP(16*4,"esp"),$inout0);
2282         &movdqa (&QWP(16*5,"esp"),$inout0);
2283         &movdqa (&QWP(16*6,"esp"),$inout0);
2284
2285         &lea    ("esp",&DWP(0,$key));           # leave the scratch frame
2286         &mov    ($rounds,&wparam(5));           # &offset_i
2287         &mov    ($rounds_,&wparam(7));          # &checksum
2288         &movdqu (&QWP(0,$rounds),$rndkey0);     # write back offset_i
2289         &pxor   ($rndkey0,$rndkey0);
2290         &movdqu (&QWP(0,$rounds_),$rndkey1);    # write back checksum
2291         &pxor   ($rndkey1,$rndkey1);
2292 &function_end("aesni_ocb_encrypt");
2293
######################################################################
# aesni_ocb_decrypt -- OCB-mode bulk decryption of whole 16-byte blocks.
#
# Stack arguments, as fetched with wparam() below:
#   0  input pointer
#   1  output pointer
#   2  number of 16-byte blocks (turned into a byte count with shl 4)
#   3  key schedule; the round count is read from offset 240
#   4  start_block_num, the index of the first block to process
#   5  pointer to the running offset_i (loaded on entry, stored on exit)
#   6  L_ table, indexed in 16-byte entries
#   7  pointer to the running checksum (loaded on entry, stored on exit)
#
# The checksum is the xor of the decrypted output blocks.  Input is
# consumed six blocks per "grandloop" pass; a remainder of one to five
# blocks is finished by the tail paths.  "done" wipes the scratch frame
# and the $inout registers before offset_i and the checksum are written
# back.
#
# bsf returns the index of the lowest set bit, which is 0 for an odd
# number.  The code therefore makes $block odd first and afterwards reads
# L_[0] directly for $block, $block+2, $block+4, using bsf only for
# $block+1, $block+3 and $block+5.
2294 &function_begin("aesni_ocb_decrypt");
2295         &mov    ($rounds,&wparam(5));           # &offset_i
2296         &mov    ($rounds_,&wparam(7));          # &checksum
2297
2298         &mov    ($inp,&wparam(0));
2299         &mov    ($out,&wparam(1));
2300         &mov    ($len,&wparam(2));
2301         &mov    ($key,&wparam(3));
2302         &movdqu ($rndkey0,&QWP(0,$rounds));     # load offset_i
2303         &mov    ($block,&wparam(4));            # start_block_num
2304         &movdqu ($rndkey1,&QWP(0,$rounds_));    # load checksum
2305         &mov    ($l_,&wparam(6));               # L_
2306
             # Build a 16-byte aligned scratch frame.  The caller's %esp is
             # held in $rounds until it is stored at $esp_off.
2307         &mov    ($rounds,"esp");
2308         &sub    ("esp",$esp_off+4);             # alloca
2309         &and    ("esp",-16);                    # align stack
2310
             # From here on $out holds (out - inp), so output is addressed as
             # $out+$inp, and $len marks the last position from which six
             # whole blocks can still be read.
2311         &sub    ($out,$inp);
2312         &shl    ($len,4);
2313         &lea    ($len,&DWP(-16*6,$inp,$len));   # end of input - 16*6
2314         &mov    (&DWP($out_off,"esp"),$out);
2315         &mov    (&DWP($end_off,"esp"),$len);
2316         &mov    (&DWP($esp_off,"esp"),$rounds);
2317
2318         &mov    ($rounds,&DWP(240,$key));
2319
             # An even start_block_num gets one block processed up front, using
             # the L_ entry selected by bsf($block), so that $block is odd on
             # reaching "odd".
2320         &test   ($block,1);
2321         &jnz    (&label("odd"));
2322
2323         &bsf            ($i3,$block);
2324         &add            ($block,1);
2325         &shl            ($i3,4);
2326         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2327         &mov            ($i3,$key);                     # put aside key
2328
2329         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2330         &lea            ($inp,&DWP(16,$inp));
2331
2332         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2333         &pxor           ($inout0,$inout5);              # ^ offset_i
2334
2335         &movdqa         ($inout4,$rndkey1);             # park checksum in $inout4
2336         if ($inline)
2337         {   &aesni_inline_generate1("dec");     }
2338         else
2339         {   &call       ("_aesni_decrypt1");    }
2340
2341         &xorps          ($inout0,$inout5);              # ^ offset_i
2342         &movaps         ($rndkey1,$inout4);             # pass the checksum
2343         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2344         &xorps          ($rndkey1,$inout0);             # checksum
2345         &movups         (&QWP(-16,$out,$inp),$inout0);  # store output
2346
             # NOTE(review): $len is reloaded here after $i3 was used as
             # scratch, which suggests the two share a register; the aliases
             # are declared outside this excerpt -- confirm there.
2347         &mov            ($rounds,&DWP(240,$i3));
2348         &mov            ($key,$i3);                     # restore key
2349         &mov            ($len,&DWP($end_off,"esp"));
2350
             # $rounds becomes 16-16*rounds (saved at $rounds_off) and $key is
             # moved to key+32+16*rounds.  With that pairing
             # &QWP(-48,$key,$rounds) resolves to round key 0, -32 to round
             # key 1 and -16 to round key 2.  $out is only scratch here; its
             # real value was saved at $out_off above.
2351 &set_label("odd");
2352         &shl            ($rounds,4);
2353         &mov            ($out,16);
2354         &sub            ($out,$rounds);                 # twisted rounds
2355         &mov            (&DWP($key_off,"esp"),$key);
2356         &lea            ($key,&DWP(32,$key,$rounds));   # end of key schedule
2357         &mov            (&DWP($rounds_off,"esp"),$out);
2358
2359         &cmp            ($inp,$len);
2360         &ja             (&label("short"));              # fewer than six blocks left
2361         &jmp            (&label("grandloop"));
2362
             # Main loop: six blocks per pass.  The six offsets are chained
             # from the previous offset in $rndkey0 and kept in stack slots
             # 16*0..16*5, because all six $inout registers are needed for data.
2363 &set_label("grandloop",32);
2364         &lea            ($i1,&DWP(1,$block));
2365         &lea            ($i3,&DWP(3,$block));
2366         &lea            ($i5,&DWP(5,$block));
2367         &add            ($block,6);
2368         &bsf            ($i1,$i1);
2369         &bsf            ($i3,$i3);
2370         &bsf            ($i5,$i5);
2371         &shl            ($i1,4);
2372         &shl            ($i3,4);
2373         &shl            ($i5,4);
2374         &movdqu         ($inout0,&QWP(0,$l_));
2375         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2376         &mov            ($rounds,&DWP($rounds_off,"esp"));
2377         &movdqa         ($inout2,$inout0);
2378         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2379         &movdqa         ($inout4,$inout0);
2380         &movdqu         ($inout5,&QWP(0,$l_,$i5));
2381
2382         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2383         &pxor           ($inout1,$inout0);
2384         &movdqa         (&QWP(16*0,"esp"),$inout0);
2385         &pxor           ($inout2,$inout1);
2386         &movdqa         (&QWP(16*1,"esp"),$inout1);
2387         &pxor           ($inout3,$inout2);
2388         &movdqa         (&QWP(16*2,"esp"),$inout2);
2389         &pxor           ($inout4,$inout3);
2390         &movdqa         (&QWP(16*3,"esp"),$inout3);
2391         &pxor           ($inout5,$inout4);
2392         &movdqa         (&QWP(16*4,"esp"),$inout4);
2393         &movdqa         (&QWP(16*5,"esp"),$inout5);
2394
2395         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2396         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2397         &movdqu         ($inout1,&QWP(16*1,$inp));
2398         &movdqu         ($inout2,&QWP(16*2,$inp));
2399         &movdqu         ($inout3,&QWP(16*3,$inp));
2400         &movdqu         ($inout4,&QWP(16*4,$inp));
2401         &movdqu         ($inout5,&QWP(16*5,$inp));
2402         &lea            ($inp,&DWP(16*6,$inp));
2403
             # $rndkey1 is about to carry round keys, so the checksum is spilled.
2404         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2405         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2406         &pxor           ($inout1,$rndkey0);
2407         &pxor           ($inout2,$rndkey0);
2408         &pxor           ($inout3,$rndkey0);
2409         &pxor           ($inout4,$rndkey0);
2410         &pxor           ($inout5,$rndkey0);
2411
2412         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2413         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2414         &pxor           ($inout1,&QWP(16*1,"esp"));
2415         &pxor           ($inout2,&QWP(16*2,"esp"));
2416         &pxor           ($inout3,&QWP(16*3,"esp"));
2417         &pxor           ($inout4,&QWP(16*4,"esp"));
2418         &pxor           ($inout5,&QWP(16*5,"esp"));
2419
2420         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2421         &aesdec         ($inout0,$rndkey1);
2422         &aesdec         ($inout1,$rndkey1);
2423         &aesdec         ($inout2,$rndkey1);
2424         &aesdec         ($inout3,$rndkey1);
2425         &aesdec         ($inout4,$rndkey1);
2426         &aesdec         ($inout5,$rndkey1);
2427
             # The first round was issued above; _aesni_decrypt6_enter is
             # defined outside this excerpt and presumably runs the rest.
2428         &mov            ($out,&DWP($out_off,"esp"));
2429         &mov            ($len,&DWP($end_off,"esp"));
2430         &call           ("_aesni_decrypt6_enter");
2431
2432         &movdqa         ($rndkey0,&QWP(16*5,"esp"));    # pass last offset_i
2433         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2434         &movdqa         ($rndkey1,&QWP($checksum,"esp"));
2435         &pxor           ($inout1,&QWP(16*1,"esp"));
2436         &pxor           ($inout2,&QWP(16*2,"esp"));
2437         &pxor           ($inout3,&QWP(16*3,"esp"));
2438         &pxor           ($inout4,&QWP(16*4,"esp"));
2439         &pxor           ($inout5,$rndkey0);
2440
2441         &pxor           ($rndkey1,$inout0);             # checksum
2442         &movdqu         (&QWP(-16*6,$out,$inp),$inout0);# store output
2443         &pxor           ($rndkey1,$inout1);
2444         &movdqu         (&QWP(-16*5,$out,$inp),$inout1);
2445         &pxor           ($rndkey1,$inout2);
2446         &movdqu         (&QWP(-16*4,$out,$inp),$inout2);
2447         &pxor           ($rndkey1,$inout3);
2448         &movdqu         (&QWP(-16*3,$out,$inp),$inout3);
2449         &pxor           ($rndkey1,$inout4);
2450         &movdqu         (&QWP(-16*2,$out,$inp),$inout4);
2451         &pxor           ($rndkey1,$inout5);
2452         &movdqu         (&QWP(-16*1,$out,$inp),$inout5);
             # $inp == $len means exactly six blocks remain.  They must take
             # another pass here, because "short" handles at most five; hence
             # below-or-equal rather than below.
2453         &cmp            ($inp,$len);                    # done yet?
2454         &jbe            (&label("grandloop"));
2455
             # Tail dispatch on the remaining byte count: 0 -> done,
             # 16 -> one, 32 -> two, 48 -> three, 64 -> four, otherwise the
             # five-block code that follows.
2456 &set_label("short");
2457         &add            ($len,16*6);
2458         &sub            ($len,$inp);
2459         &jz             (&label("done"));
2460
2461         &cmp            ($len,16*2);
2462         &jb             (&label("one"));
2463         &je             (&label("two"));
2464
2465         &cmp            ($len,16*4);
2466         &jb             (&label("three"));
2467         &je             (&label("four"));
2468
             # Five blocks: same shape as a grandloop pass, with the sixth lane
             # zeroed and only five results stored.  $inp is not advanced, so
             # stores use positive displacements.
2469         &lea            ($i1,&DWP(1,$block));
2470         &lea            ($i3,&DWP(3,$block));
2471         &bsf            ($i1,$i1);
2472         &bsf            ($i3,$i3);
2473         &shl            ($i1,4);
2474         &shl            ($i3,4);
2475         &movdqu         ($inout0,&QWP(0,$l_));
2476         &movdqu         ($inout1,&QWP(0,$l_,$i1));
2477         &mov            ($rounds,&DWP($rounds_off,"esp"));
2478         &movdqa         ($inout2,$inout0);
2479         &movdqu         ($inout3,&QWP(0,$l_,$i3));
2480         &movdqa         ($inout4,$inout0);
2481
2482         &pxor           ($inout0,$rndkey0);             # ^ last offset_i
2483         &pxor           ($inout1,$inout0);
2484         &movdqa         (&QWP(16*0,"esp"),$inout0);
2485         &pxor           ($inout2,$inout1);
2486         &movdqa         (&QWP(16*1,"esp"),$inout1);
2487         &pxor           ($inout3,$inout2);
2488         &movdqa         (&QWP(16*2,"esp"),$inout2);
2489         &pxor           ($inout4,$inout3);
2490         &movdqa         (&QWP(16*3,"esp"),$inout3);
             # NOTE(review): the next result is unused; $inout5 is not loaded
             # on this path and is zeroed before the rounds start.
2491         &pxor           ($inout5,$inout4);
2492         &movdqa         (&QWP(16*4,"esp"),$inout4);
2493
2494         &$movekey       ($rndkey0,&QWP(-48,$key,$rounds));
2495         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2496         &movdqu         ($inout1,&QWP(16*1,$inp));
2497         &movdqu         ($inout2,&QWP(16*2,$inp));
2498         &movdqu         ($inout3,&QWP(16*3,$inp));
2499         &movdqu         ($inout4,&QWP(16*4,$inp));
2500         &pxor           ($inout5,$inout5);              # unused sixth lane
2501
2502         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2503         &pxor           ($inout0,$rndkey0);             # ^ roundkey[0]
2504         &pxor           ($inout1,$rndkey0);
2505         &pxor           ($inout2,$rndkey0);
2506         &pxor           ($inout3,$rndkey0);
2507         &pxor           ($inout4,$rndkey0);
2508
2509         &$movekey       ($rndkey1,&QWP(-32,$key,$rounds));
2510         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2511         &pxor           ($inout1,&QWP(16*1,"esp"));
2512         &pxor           ($inout2,&QWP(16*2,"esp"));
2513         &pxor           ($inout3,&QWP(16*3,"esp"));
2514         &pxor           ($inout4,&QWP(16*4,"esp"));
2515
2516         &$movekey       ($rndkey0,&QWP(-16,$key,$rounds));
2517         &aesdec         ($inout0,$rndkey1);
2518         &aesdec         ($inout1,$rndkey1);
2519         &aesdec         ($inout2,$rndkey1);
2520         &aesdec         ($inout3,$rndkey1);
2521         &aesdec         ($inout4,$rndkey1);
2522         &aesdec         ($inout5,$rndkey1);
2523
2524         &mov            ($out,&DWP($out_off,"esp"));
2525         &call           ("_aesni_decrypt6_enter");
2526
2527         &movdqa         ($rndkey0,&QWP(16*4,"esp"));    # pass last offset_i
2528         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2529         &movdqa         ($rndkey1,&QWP($checksum,"esp"));
2530         &pxor           ($inout1,&QWP(16*1,"esp"));
2531         &pxor           ($inout2,&QWP(16*2,"esp"));
2532         &pxor           ($inout3,&QWP(16*3,"esp"));
2533         &pxor           ($inout4,$rndkey0);
2534
2535         &pxor           ($rndkey1,$inout0);             # checksum
2536         &movdqu         (&QWP(16*0,$out,$inp),$inout0); # store output
2537         &pxor           ($rndkey1,$inout1);
2538         &movdqu         (&QWP(16*1,$out,$inp),$inout1);
2539         &pxor           ($rndkey1,$inout2);
2540         &movdqu         (&QWP(16*2,$out,$inp),$inout2);
2541         &pxor           ($rndkey1,$inout3);
2542         &movdqu         (&QWP(16*3,$out,$inp),$inout3);
2543         &pxor           ($rndkey1,$inout4);
2544         &movdqu         (&QWP(16*4,$out,$inp),$inout4);
2545
2546         &jmp            (&label("done"));
2547
             # One block: offset is L_[0] chained onto the previous offset.
             # $key and $rounds are reloaded because "odd" repointed them.
2548 &set_label("one",16);
2549         &movdqu         ($inout5,&QWP(0,$l_));
2550         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2551
2552         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2553         &mov            ($rounds,&DWP(240,$key));
2554
2555         &pxor           ($inout5,$rndkey0);             # ^ last offset_i
2556         &pxor           ($inout0,$inout5);              # ^ offset_i
2557
2558         &movdqa         ($inout4,$rndkey1);             # park checksum in $inout4
2559         &mov            ($out,&DWP($out_off,"esp"));
2560         if ($inline)
2561         {   &aesni_inline_generate1("dec");     }
2562         else
2563         {   &call       ("_aesni_decrypt1");    }
2564
2565         &xorps          ($inout0,$inout5);              # ^ offset_i
2566         &movaps         ($rndkey1,$inout4);             # pass the checksum
2567         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2568         &xorps          ($rndkey1,$inout0);             # checksum
2569         &movups         (&QWP(0,$out,$inp),$inout0);
2570
2571         &jmp            (&label("done"));
2572
             # Two blocks: offsets in $inout4/$inout5, checksum parked in
             # $inout3 and updated there after decryption.
2573 &set_label("two",16);
2574         &lea            ($i1,&DWP(1,$block));
2575         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2576         &bsf            ($i1,$i1);
2577         &shl            ($i1,4);
2578         &movdqu         ($inout4,&QWP(0,$l_));
2579         &movdqu         ($inout5,&QWP(0,$l_,$i1));
2580
2581         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2582         &movdqu         ($inout1,&QWP(16*1,$inp));
2583         &mov            ($rounds,&DWP(240,$key));
2584
2585         &movdqa         ($inout3,$rndkey1);
2586         &pxor           ($inout4,$rndkey0);             # ^ last offset_i
2587         &pxor           ($inout5,$inout4);
2588
2589         &pxor           ($inout0,$inout4);              # ^ offset_i
2590         &pxor           ($inout1,$inout5);
2591
2592         &mov            ($out,&DWP($out_off,"esp"));
2593         &call           ("_aesni_decrypt2");
2594
2595         &xorps          ($inout0,$inout4);              # ^ offset_i
2596         &xorps          ($inout1,$inout5);
2597         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2598         &xorps          ($inout3,$inout0);              # checksum
2599         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2600         &xorps          ($inout3,$inout1);
2601         &movups         (&QWP(16*1,$out,$inp),$inout1);
2602         &movaps         ($rndkey1,$inout3);             # pass the checksum
2603
2604         &jmp            (&label("done"));
2605
             # Three blocks: offsets in $inout3..$inout5, checksum spilled to
             # the $checksum stack slot around the call.
2606 &set_label("three",16);
2607         &lea            ($i1,&DWP(1,$block));
2608         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2609         &bsf            ($i1,$i1);
2610         &shl            ($i1,4);
2611         &movdqu         ($inout3,&QWP(0,$l_));
2612         &movdqu         ($inout4,&QWP(0,$l_,$i1));
2613         &movdqa         ($inout5,$inout3);
2614
2615         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2616         &movdqu         ($inout1,&QWP(16*1,$inp));
2617         &movdqu         ($inout2,&QWP(16*2,$inp));
2618         &mov            ($rounds,&DWP(240,$key));
2619
2620         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2621         &pxor           ($inout3,$rndkey0);             # ^ last offset_i
2622         &pxor           ($inout4,$inout3);
2623         &pxor           ($inout5,$inout4);
2624
2625         &pxor           ($inout0,$inout3);              # ^ offset_i
2626         &pxor           ($inout1,$inout4);
2627         &pxor           ($inout2,$inout5);
2628
2629         &mov            ($out,&DWP($out_off,"esp"));
2630         &call           ("_aesni_decrypt3");
2631
2632         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2633         &xorps          ($inout0,$inout3);              # ^ offset_i
2634         &xorps          ($inout1,$inout4);
2635         &xorps          ($inout2,$inout5);
2636         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2637         &pxor           ($rndkey1,$inout0);             # checksum
2638         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2639         &movups         (&QWP(16*1,$out,$inp),$inout1);
2640         &pxor           ($rndkey1,$inout1);
2641         &movups         (&QWP(16*2,$out,$inp),$inout2);
2642         &pxor           ($rndkey1,$inout2);
2643
2644         &jmp            (&label("done"));
2645
             # Four blocks: the first two offsets live in stack slots 16*0 and
             # 16*1 (their registers are reused for input), the last two in
             # $inout4/$inout5.  Falls through into "done".
2646 &set_label("four",16);
2647         &lea            ($i1,&DWP(1,$block));
2648         &lea            ($i3,&DWP(3,$block));
2649         &bsf            ($i1,$i1);
2650         &bsf            ($i3,$i3);
2651         &mov            ($key,&DWP($key_off,"esp"));    # restore key
2652         &shl            ($i1,4);
2653         &shl            ($i3,4);
2654         &movdqu         ($inout2,&QWP(0,$l_));
2655         &movdqu         ($inout3,&QWP(0,$l_,$i1));
2656         &movdqa         ($inout4,$inout2);
2657         &movdqu         ($inout5,&QWP(0,$l_,$i3));
2658
2659         &pxor           ($inout2,$rndkey0);             # ^ last offset_i
2660         &movdqu         ($inout0,&QWP(16*0,$inp));      # load input
2661         &pxor           ($inout3,$inout2);
2662         &movdqu         ($inout1,&QWP(16*1,$inp));
2663         &pxor           ($inout4,$inout3);
2664         &movdqa         (&QWP(16*0,"esp"),$inout2);
2665         &pxor           ($inout5,$inout4);
2666         &movdqa         (&QWP(16*1,"esp"),$inout3);
2667         &movdqu         ($inout2,&QWP(16*2,$inp));
2668         &movdqu         ($inout3,&QWP(16*3,$inp));
2669         &mov            ($rounds,&DWP(240,$key));
2670
2671         &movdqa         (&QWP($checksum,"esp"),$rndkey1);
2672         &pxor           ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2673         &pxor           ($inout1,&QWP(16*1,"esp"));
2674         &pxor           ($inout2,$inout4);
2675         &pxor           ($inout3,$inout5);
2676
2677         &mov            ($out,&DWP($out_off,"esp"));
2678         &call           ("_aesni_decrypt4");
2679
2680         &movdqa         ($rndkey1,&QWP($checksum,"esp"));# pass the checksum
2681         &xorps          ($inout0,&QWP(16*0,"esp"));     # ^ offset_i
2682         &xorps          ($inout1,&QWP(16*1,"esp"));
2683         &xorps          ($inout2,$inout4);
2684         &movups         (&QWP(16*0,$out,$inp),$inout0); # store output
2685         &pxor           ($rndkey1,$inout0);             # checksum
2686         &xorps          ($inout3,$inout5);
2687         &movups         (&QWP(16*1,$out,$inp),$inout1);
2688         &pxor           ($rndkey1,$inout1);
2689         &movdqa         ($rndkey0,$inout5);             # pass last offset_i
2690         &movups         (&QWP(16*2,$out,$inp),$inout2);
2691         &pxor           ($rndkey1,$inout2);
2692         &movups         (&QWP(16*3,$out,$inp),$inout3);
2693         &pxor           ($rndkey1,$inout3);
2694
             # Common exit: $rndkey0 holds the last offset_i and $rndkey1 the
             # checksum.  The $inout registers and stack slots 16*0..16*6 are
             # zeroed, %esp is restored from the value saved at $esp_off, and
             # both values are written back through wparam(5) and wparam(7).
2695 &set_label("done");
2696         &mov    ($key,&DWP($esp_off,"esp"));
2697         &pxor   ($inout0,$inout0);              # clear register bank
2698         &pxor   ($inout1,$inout1);
2699         &movdqa (&QWP(16*0,"esp"),$inout0);     # clear stack
2700         &pxor   ($inout2,$inout2);
2701         &movdqa (&QWP(16*1,"esp"),$inout0);
2702         &pxor   ($inout3,$inout3);
2703         &movdqa (&QWP(16*2,"esp"),$inout0);
2704         &pxor   ($inout4,$inout4);
2705         &movdqa (&QWP(16*3,"esp"),$inout0);
2706         &pxor   ($inout5,$inout5);
2707         &movdqa (&QWP(16*4,"esp"),$inout0);
2708         &movdqa (&QWP(16*5,"esp"),$inout0);
2709         &movdqa (&QWP(16*6,"esp"),$inout0);
2710
2711         &lea    ("esp",&DWP(0,$key));
2712         &mov    ($rounds,&wparam(5));           # &offset_i
2713         &mov    ($rounds_,&wparam(7));          # &checksum
2714         &movdqu (&QWP(0,$rounds),$rndkey0);
2715         &pxor   ($rndkey0,$rndkey0);
2716         &movdqu (&QWP(0,$rounds_),$rndkey1);
2717         &pxor   ($rndkey1,$rndkey1);
2718 &function_end("aesni_ocb_decrypt");
2719 }
2720 }
2721 \f
2722 ######################################################################
2723 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
2724 #                           size_t length, const AES_KEY *key,
2725 #                           unsigned char *ivp,const int enc);
2726 &function_begin("${PREFIX}_cbc_encrypt");
2727         &mov    ($inp,&wparam(0));
2728         &mov    ($rounds_,"esp");
2729         &mov    ($out,&wparam(1));
2730         &sub    ($rounds_,24);
2731         &mov    ($len,&wparam(2));
2732         &and    ($rounds_,-16);
2733         &mov    ($key,&wparam(3));
2734         &mov    ($key_,&wparam(4));
2735         &test   ($len,$len);
2736         &jz     (&label("cbc_abort"));
2737
2738         &cmp    (&wparam(5),0);
2739         &xchg   ($rounds_,"esp");               # alloca
2740         &movups ($ivec,&QWP(0,$key_));          # load IV
2741         &mov    ($rounds,&DWP(240,$key));
2742         &mov    ($key_,$key);                   # backup $key
2743         &mov    (&DWP(16,"esp"),$rounds_);      # save original %esp
2744         &mov    ($rounds_,$rounds);             # backup $rounds
2745         &je     (&label("cbc_decrypt"));
2746
2747         &movaps ($inout0,$ivec);
2748         &cmp    ($len,16);
2749         &jb     (&label("cbc_enc_tail"));
2750         &sub    ($len,16);
2751         &jmp    (&label("cbc_enc_loop"));
2752
2753 &set_label("cbc_enc_loop",16);
2754         &movups ($ivec,&QWP(0,$inp));           # input actually
2755         &lea    ($inp,&DWP(16,$inp));
2756         if ($inline)
2757         {   &aesni_inline_generate1("enc",$inout0,$ivec);       }
2758         else
2759         {   &xorps($inout0,$ivec); &call("_aesni_encrypt1");    }
2760         &mov    ($rounds,$rounds_);     # restore $rounds
2761         &mov    ($key,$key_);           # restore $key
2762         &movups (&QWP(0,$out),$inout0); # store output
2763         &lea    ($out,&DWP(16,$out));
2764         &sub    ($len,16);
2765         &jnc    (&label("cbc_enc_loop"));
2766         &add    ($len,16);
2767         &jnz    (&label("cbc_enc_tail"));
2768         &movaps ($ivec,$inout0);
2769         &pxor   ($inout0,$inout0);
2770         &jmp    (&label("cbc_ret"));
2771
2772 &set_label("cbc_enc_tail");
2773         &mov    ("ecx",$len);           # zaps $rounds
2774         &data_word(0xA4F3F689);         # rep movsb
2775         &mov    ("ecx",16);             # zero tail
2776         &sub    ("ecx",$len);
2777         &xor    ("eax","eax");          # zaps $len
2778         &data_word(0xAAF3F689);         # rep stosb
2779         &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
2780         &mov    ($rounds,$rounds_);     # restore $rounds
2781         &mov    ($inp,$out);            # $inp and $out are the same
2782         &mov    ($key,$key_);           # restore $key
2783         &jmp    (&label("cbc_enc_loop"));
2784 ######################################################################
2785 &set_label("cbc_decrypt",16);
2786         &cmp    ($len,0x50);
2787         &jbe    (&label("cbc_dec_tail"));
2788         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2789         &sub    ($len,0x50);
2790         &jmp    (&label("cbc_dec_loop6_enter"));
2791
2792 &set_label("cbc_dec_loop6",16);
2793         &movaps (&QWP(0,"esp"),$rndkey0);       # save IV
2794         &movups (&QWP(0,$out),$inout5);
2795         &lea    ($out,&DWP(0x10,$out));
2796 &set_label("cbc_dec_loop6_enter");
2797         &movdqu ($inout0,&QWP(0,$inp));
2798         &movdqu ($inout1,&QWP(0x10,$inp));
2799         &movdqu ($inout2,&QWP(0x20,$inp));
2800         &movdqu ($inout3,&QWP(0x30,$inp));
2801         &movdqu ($inout4,&QWP(0x40,$inp));
2802         &movdqu ($inout5,&QWP(0x50,$inp));
2803
2804         &call   ("_aesni_decrypt6");
2805
2806         &movups ($rndkey1,&QWP(0,$inp));
2807         &movups ($rndkey0,&QWP(0x10,$inp));
2808         &xorps  ($inout0,&QWP(0,"esp"));        # ^=IV
2809         &xorps  ($inout1,$rndkey1);
2810         &movups ($rndkey1,&QWP(0x20,$inp));
2811         &xorps  ($inout2,$rndkey0);
2812         &movups ($rndkey0,&QWP(0x30,$inp));
2813         &xorps  ($inout3,$rndkey1);
2814         &movups ($rndkey1,&QWP(0x40,$inp));
2815         &xorps  ($inout4,$rndkey0);
2816         &movups ($rndkey0,&QWP(0x50,$inp));     # IV
2817         &xorps  ($inout5,$rndkey1);
2818         &movups (&QWP(0,$out),$inout0);
2819         &movups (&QWP(0x10,$out),$inout1);
2820         &lea    ($inp,&DWP(0x60,$inp));
2821         &movups (&QWP(0x20,$out),$inout2);
2822         &mov    ($rounds,$rounds_);             # restore $rounds
2823         &movups (&QWP(0x30,$out),$inout3);
2824         &mov    ($key,$key_);                   # restore $key
2825         &movups (&QWP(0x40,$out),$inout4);
2826         &lea    ($out,&DWP(0x50,$out));
2827         &sub    ($len,0x60);
2828         &ja     (&label("cbc_dec_loop6"));
2829
2830         &movaps ($inout0,$inout5);
2831         &movaps ($ivec,$rndkey0);
2832         &add    ($len,0x50);
2833         &jle    (&label("cbc_dec_clear_tail_collected"));
2834         &movups (&QWP(0,$out),$inout0);
2835         &lea    ($out,&DWP(0x10,$out));
2836 &set_label("cbc_dec_tail");
2837         &movups ($inout0,&QWP(0,$inp));
2838         &movaps ($in0,$inout0);
2839         &cmp    ($len,0x10);
2840         &jbe    (&label("cbc_dec_one"));
2841
2842         &movups ($inout1,&QWP(0x10,$inp));
2843         &movaps ($in1,$inout1);
2844         &cmp    ($len,0x20);
2845         &jbe    (&label("cbc_dec_two"));
2846
2847         &movups ($inout2,&QWP(0x20,$inp));
2848         &cmp    ($len,0x30);
2849         &jbe    (&label("cbc_dec_three"));
2850
2851         &movups ($inout3,&QWP(0x30,$inp));
2852         &cmp    ($len,0x40);
2853         &jbe    (&label("cbc_dec_four"));
2854
2855         &movups ($inout4,&QWP(0x40,$inp));
2856         &movaps (&QWP(0,"esp"),$ivec);          # save IV
2857         &movups ($inout0,&QWP(0,$inp));
2858         &xorps  ($inout5,$inout5);
2859         &call   ("_aesni_decrypt6");
2860         &movups ($rndkey1,&QWP(0,$inp));
2861         &movups ($rndkey0,&QWP(0x10,$inp));
2862         &xorps  ($inout0,&QWP(0,"esp"));        # ^= IV
2863         &xorps  ($inout1,$rndkey1);
2864         &movups ($rndkey1,&QWP(0x20,$inp));
2865         &xorps  ($inout2,$rndkey0);
2866         &movups ($rndkey0,&QWP(0x30,$inp));
2867         &xorps  ($inout3,$rndkey1);
2868         &movups ($ivec,&QWP(0x40,$inp));        # IV
2869         &xorps  ($inout4,$rndkey0);
2870         &movups (&QWP(0,$out),$inout0);
2871         &movups (&QWP(0x10,$out),$inout1);
2872         &pxor   ($inout1,$inout1);
2873         &movups (&QWP(0x20,$out),$inout2);
2874         &pxor   ($inout2,$inout2);
2875         &movups (&QWP(0x30,$out),$inout3);
2876         &pxor   ($inout3,$inout3);
2877         &lea    ($out,&DWP(0x40,$out));
2878         &movaps ($inout0,$inout4);
2879         &pxor   ($inout4,$inout4);
2880         &sub    ($len,0x50);
2881         &jmp    (&label("cbc_dec_tail_collected"));
2882
2883 &set_label("cbc_dec_one",16);
2884         if ($inline)
2885         {   &aesni_inline_generate1("dec");     }
2886         else
2887         {   &call       ("_aesni_decrypt1");    }
2888         &xorps  ($inout0,$ivec);
2889         &movaps ($ivec,$in0);
2890         &sub    ($len,0x10);
2891         &jmp    (&label("cbc_dec_tail_collected"));
2892
2893 &set_label("cbc_dec_two",16);
2894         &call   ("_aesni_decrypt2");
2895         &xorps  ($inout0,$ivec);
2896         &xorps  ($inout1,$in0);
2897         &movups (&QWP(0,$out),$inout0);
2898         &movaps ($inout0,$inout1);
2899         &pxor   ($inout1,$inout1);
2900         &lea    ($out,&DWP(0x10,$out));
2901         &movaps ($ivec,$in1);
2902         &sub    ($len,0x20);
2903         &jmp    (&label("cbc_dec_tail_collected"));
2904
2905 &set_label("cbc_dec_three",16);
2906         &call   ("_aesni_decrypt3");
2907         &xorps  ($inout0,$ivec);
2908         &xorps  ($inout1,$in0);
2909         &xorps  ($inout2,$in1);
2910         &movups (&QWP(0,$out),$inout0);
2911         &movaps ($inout0,$inout2);
2912         &pxor   ($inout2,$inout2);
2913         &movups (&QWP(0x10,$out),$inout1);
2914         &pxor   ($inout1,$inout1);
2915         &lea    ($out,&DWP(0x20,$out));
2916         &movups ($ivec,&QWP(0x20,$inp));
2917         &sub    ($len,0x30);
2918         &jmp    (&label("cbc_dec_tail_collected"));
2919
2920 &set_label("cbc_dec_four",16);
2921         &call   ("_aesni_decrypt4");
2922         &movups ($rndkey1,&QWP(0x10,$inp));
2923         &movups ($rndkey0,&QWP(0x20,$inp));
2924         &xorps  ($inout0,$ivec);
2925         &movups ($ivec,&QWP(0x30,$inp));
2926         &xorps  ($inout1,$in0);
2927         &movups (&QWP(0,$out),$inout0);
2928         &xorps  ($inout2,$rndkey1);
2929         &movups (&QWP(0x10,$out),$inout1);
2930         &pxor   ($inout1,$inout1);
2931         &xorps  ($inout3,$rndkey0);
2932         &movups (&QWP(0x20,$out),$inout2);
2933         &pxor   ($inout2,$inout2);
2934         &lea    ($out,&DWP(0x30,$out));
2935         &movaps ($inout0,$inout3);
2936         &pxor   ($inout3,$inout3);
2937         &sub    ($len,0x40);
2938         &jmp    (&label("cbc_dec_tail_collected"));
2939
2940 &set_label("cbc_dec_clear_tail_collected",16);
2941         &pxor   ($inout1,$inout1);
2942         &pxor   ($inout2,$inout2);
2943         &pxor   ($inout3,$inout3);
2944         &pxor   ($inout4,$inout4);
2945 &set_label("cbc_dec_tail_collected");
2946         &and    ($len,15);
2947         &jnz    (&label("cbc_dec_tail_partial"));
2948         &movups (&QWP(0,$out),$inout0);
2949         &pxor   ($rndkey0,$rndkey0);
2950         &jmp    (&label("cbc_ret"));
2951
2952 &set_label("cbc_dec_tail_partial",16);
2953         &movaps (&QWP(0,"esp"),$inout0);
2954         &pxor   ($rndkey0,$rndkey0);
2955         &mov    ("ecx",16);
2956         &mov    ($inp,"esp");
2957         &sub    ("ecx",$len);
2958         &data_word(0xA4F3F689);         # rep movsb
2959         &movdqa (&QWP(0,"esp"),$inout0);
2960
2961 &set_label("cbc_ret");
2962         &mov    ("esp",&DWP(16,"esp")); # pull original %esp
2963         &mov    ($key_,&wparam(4));
2964         &pxor   ($inout0,$inout0);
2965         &pxor   ($rndkey1,$rndkey1);
2966         &movups (&QWP(0,$key_),$ivec);  # output IV
2967         &pxor   ($ivec,$ivec);
2968 &set_label("cbc_abort");
2969 &function_end("${PREFIX}_cbc_encrypt");
2970 \f
2971 ######################################################################
2972 # Mechanical port from aesni-x86_64.pl.
2973 #
2974 # _aesni_set_encrypt_key is a private interface,
2975 # input:
2976 #       "eax"   const unsigned char *userKey
2977 #       $rounds int bits
2978 #       $key    AES_KEY *key
2979 # output:
2980 #       "eax"   return code (0 success, -1 NULL pointer, -2 bad key bits)
2981 #       $rounds rounds-1 (9, 11 or 13) on success
2982
# Key-schedule generator shared by the public set_encrypt_key and
# set_decrypt_key entry points further down.  Control flow:
#   - NULL check on "eax" (userKey) and $key      -> bad_pointer, "eax" = -1
#   - dispatch on $rounds (key length in bits, 128/192/256); any other
#     value                                        -> bad_keybits, "eax" = -2
#   - every key size has two code paths: one built on aeskeygenassist and
#     an "_alt" one built on pshufb+aesenclast.  The "_alt" path is taken
#     when, of the two capability bits examined, only bit 28 is set.
#   - all successful paths end in good_key, which wipes xmm0-xmm5 and
#     returns 0 in "eax".
# On success $rounds holds 9, 11 or 13 and the same value has been stored at
# byte offset 240 from the start of the schedule (each path uses a different
# displacement because $key has advanced by a different amount).
2983 &function_begin_B("_aesni_set_encrypt_key");
2984         &push   ("ebp");
2985         &push   ("ebx");
             # Reject a NULL userKey ("eax") or a NULL schedule pointer ($key).
2986         &test   ("eax","eax");
2987         &jz     (&label("bad_pointer"));
2988         &test   ($key,$key);
2989         &jz     (&label("bad_pointer"));
2990
             # call/pop pair: fetch the address of "pic" into ebx, then turn it
             # into the address of the key_const table (position-independent).
2991         &call   (&label("pic"));
2992 &set_label("pic");
2993         &blindpop("ebx");
2994         &lea    ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
2995
             # NOTE(review): picmeup is a perlasm helper defined outside this
             # file; presumably it leaves the address of OPENSSL_ia32cap_P in
             # ebp -- confirm against x86asm.pl.
2996         &picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
2997         &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
2998         &xorps  ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
2999         &mov    ("ebp",&DWP(4,"ebp"));  # second dword of the capability vector
3000         &lea    ($key,&DWP(16,$key));   # $key runs one slot ahead; round 0 goes to -16
3001         &and    ("ebp",1<<28|1<<11);    # AVX and XOP bits
             # Dispatch on the requested key length in bits.
3002         &cmp    ($rounds,256);
3003         &je     (&label("14rounds"));
3004         &cmp    ($rounds,192);
3005         &je     (&label("12rounds"));
3006         &cmp    ($rounds,128);
3007         &jne    (&label("bad_keybits"));
3008
     # ---- 128-bit key -------------------------------------------------
3009 &set_label("10rounds",16);
             # ebp == 1<<28 exactly when bit 28 is set and bit 11 is clear.
3010         &cmp            ("ebp",1<<28);
3011         &je             (&label("10rounds_alt"));
3012
             # aeskeygenassist path.  The immediate is the round constant; the
             # key_128 helper stores the previous round key and derives the next.
3013         &mov            ($rounds,9);
3014         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3015         &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
3016         &call           (&label("key_128_cold"));
3017         &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
3018         &call           (&label("key_128"));
3019         &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
3020         &call           (&label("key_128"));
3021         &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
3022         &call           (&label("key_128"));
3023         &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
3024         &call           (&label("key_128"));
3025         &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
3026         &call           (&label("key_128"));
3027         &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
3028         &call           (&label("key_128"));
3029         &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
3030         &call           (&label("key_128"));
3031         &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
3032         &call           (&label("key_128"));
3033         &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
3034         &call           (&label("key_128"));
             # Store the last round key, then $rounds.  $key is now 160 bytes
             # past the schedule start (16 + 9*16), so 80 lands on offset 240.
3035         &$movekey       (&QWP(0,$key),"xmm0");
3036         &mov            (&DWP(80,$key),$rounds);
3037
3038         &jmp    (&label("good_key"));
3039
     # Local subroutine: store the round key in xmm0, advance $key by 16,
     # then compute the next round key into xmm0.  The key_128_cold entry
     # skips the store/advance (used for the very first step).
3040 &set_label("key_128",16);
3041         &$movekey       (&QWP(0,$key),"xmm0");
3042         &lea            ($key,&DWP(16,$key));
3043 &set_label("key_128_cold");
             # The two shufps/xorps pairs turn xmm0 into the running XOR of its
             # own dwords (d0, d0^d1, d0^d1^d2, d0^d1^d2^d3); xmm4 is scratch
             # whose low dword stays zero throughout.
3044         &shufps         ("xmm4","xmm0",0b00010000);
3045         &xorps          ("xmm0","xmm4");
3046         &shufps         ("xmm4","xmm0",0b10001100);
3047         &xorps          ("xmm0","xmm4");
             # Broadcast dword 3 of the aeskeygenassist result and fold it in.
3048         &shufps         ("xmm1","xmm1",0b11111111);     # critical path
3049         &xorps          ("xmm0","xmm1");
3050         &ret();
3051
     # 128-bit key, pshufb+aesenclast variant.
3052 &set_label("10rounds_alt",16);
3053         &movdqa         ("xmm5",&QWP(0x00,"ebx"));      # pshufb mask from key_const
3054         &mov            ($rounds,8);                    # loop counter: 8 iterations
3055         &movdqa         ("xmm4",&QWP(0x20,"ebx"));      # round constant, starts at 1
3056         &movdqa         ("xmm2","xmm0");                # xmm2 = copy of current round key
3057         &movdqu         (&QWP(-16,$key),"xmm0");        # store round 0
3058
3059 &set_label("loop_key128");
             # xmm0 = shuffled last dword pushed through aesenclast with the
             # round constant in xmm4 as "round key"; xmm4 is then doubled.
3060         &pshufb         ("xmm0","xmm5");
3061         &aesenclast     ("xmm0","xmm4");
3062         &pslld          ("xmm4",1);
3063         &lea            ($key,&DWP(16,$key));
3064
             # xmm2 = running XOR of the previous round key's dwords, built
             # from three 4-byte left shifts.
3065         &movdqa         ("xmm3","xmm2");
3066         &pslldq         ("xmm2",4);
3067         &pxor           ("xmm3","xmm2");
3068         &pslldq         ("xmm2",4);
3069         &pxor           ("xmm3","xmm2");
3070         &pslldq         ("xmm2",4);
3071         &pxor           ("xmm2","xmm3");
3072
             # Combine into the new round key, store it, keep a copy in xmm2.
3073         &pxor           ("xmm0","xmm2");
3074         &movdqu         (&QWP(-16,$key),"xmm0");
3075         &movdqa         ("xmm2","xmm0");
3076
3077         &dec            ($rounds);
3078         &jnz            (&label("loop_key128"));
3079
             # The last two round keys are unrolled: the round constant is
             # reloaded as 0x1b from key_const and doubled once in between.
3080         &movdqa         ("xmm4",&QWP(0x30,"ebx"));
3081
3082         &pshufb         ("xmm0","xmm5");
3083         &aesenclast     ("xmm0","xmm4");
3084         &pslld          ("xmm4",1);
3085
3086         &movdqa         ("xmm3","xmm2");
3087         &pslldq         ("xmm2",4);
3088         &pxor           ("xmm3","xmm2");
3089         &pslldq         ("xmm2",4);
3090         &pxor           ("xmm3","xmm2");
3091         &pslldq         ("xmm2",4);
3092         &pxor           ("xmm2","xmm3");
3093
3094         &pxor           ("xmm0","xmm2");
3095         &movdqu         (&QWP(0,$key),"xmm0");
3096
3097         &movdqa         ("xmm2","xmm0");
3098         &pshufb         ("xmm0","xmm5");
3099         &aesenclast     ("xmm0","xmm4");
3100
3101         &movdqa         ("xmm3","xmm2");
3102         &pslldq         ("xmm2",4);
3103         &pxor           ("xmm3","xmm2");
3104         &pslldq         ("xmm2",4);
3105         &pxor           ("xmm3","xmm2");
3106         &pslldq         ("xmm2",4);
3107         &pxor           ("xmm2","xmm3");
3108
3109         &pxor           ("xmm0","xmm2");
3110         &movdqu         (&QWP(16,$key),"xmm0");
3111
             # $key is 144 bytes past the schedule start here (16 + 8*16),
             # so displacement 96 is again offset 240.
3112         &mov            ($rounds,9);
3113         &mov            (&DWP(96,$key),$rounds);
3114
3115         &jmp    (&label("good_key"));
3116
     # ---- 192-bit key -------------------------------------------------
3117 &set_label("12rounds",16);
3118         &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
3119         &cmp            ("ebp",1<<28);
3120         &je             (&label("12rounds_alt"));
3121
             # aeskeygenassist path.  Each step yields 192 bits of schedule,
             # so the helpers alternate: key_192a stores one 16-byte slot,
             # key_192b stores two.
3122         &mov            ($rounds,11);
3123         &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
3124         &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
3125         &call           (&label("key_192a_cold"));
3126         &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
3127         &call           (&label("key_192b"));
3128         &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
3129         &call           (&label("key_192a"));
3130         &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
3131         &call           (&label("key_192b"));
3132         &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
3133         &call           (&label("key_192a"));
3134         &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
3135         &call           (&label("key_192b"));
3136         &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
3137         &call           (&label("key_192a"));
3138         &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
3139         &call           (&label("key_192b"));
             # $key is 192 bytes past the schedule start (16 + 3*16 + 4*32);
             # 192 + 48 = offset 240.
3140         &$movekey       (&QWP(0,$key),"xmm0");
3141         &mov            (&DWP(48,$key),$rounds);
3142
3143         &jmp    (&label("good_key"));
3144
     # Local subroutine: store xmm0 as one slot, advance $key by 16, then
     # derive the next 192 bits into xmm0/xmm2.  key_192a_cold skips the
     # store; both entries save the incoming xmm2 in xmm5 for key_192b.
3145 &set_label("key_192a",16);
3146         &$movekey       (&QWP(0,$key),"xmm0");
3147         &lea            ($key,&DWP(16,$key));
3148 &set_label("key_192a_cold",16);
3149         &movaps         ("xmm5","xmm2");
     # Shared tail, also reached from key_192b (which must not overwrite xmm5).
3150 &set_label("key_192b_warm");
3151         &shufps         ("xmm4","xmm0",0b00010000);
3152         &movdqa         ("xmm3","xmm2");
3153         &xorps          ("xmm0","xmm4");
3154         &shufps         ("xmm4","xmm0",0b10001100);
3155         &pslldq         ("xmm3",4);
3156         &xorps          ("xmm0","xmm4");
3157         &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
3158         &pxor           ("xmm2","xmm3");
3159         &pxor           ("xmm0","xmm1");
3160         &pshufd         ("xmm3","xmm0",0b11111111);
3161         &pxor           ("xmm2","xmm3");
3162         &ret();
3163
     # Local subroutine: assemble two 16-byte slots from xmm5/xmm0/xmm2,
     # store them, advance $key by 32, then continue in the shared tail.
3164 &set_label("key_192b",16);
3165         &movaps         ("xmm3","xmm0");
3166         &shufps         ("xmm5","xmm0",0b01000100);
3167         &$movekey       (&QWP(0,$key),"xmm5");
3168         &shufps         ("xmm3","xmm2",0b01001110);
3169         &$movekey       (&QWP(16,$key),"xmm3");
3170         &lea            ($key,&DWP(32,$key));
3171         &jmp            (&label("key_192b_warm"));
3172
     # 192-bit key, pshufb+aesenclast variant.
3173 &set_label("12rounds_alt",16);
3174         &movdqa         ("xmm5",&QWP(0x10,"ebx"));      # pshufb mask for the 64-bit tail
3175         &movdqa         ("xmm4",&QWP(0x20,"ebx"));      # round constant, starts at 1
3176         &mov            ($rounds,8);                    # loop counter: 8 iterations
3177         &movdqu         (&QWP(-16,$key),"xmm0");        # first 128 bits of the key
3178
     # Each iteration emits 24 bytes of schedule: the low 8 bytes of xmm2,
     # then (after $key += 24) the 16 bytes of the updated xmm0.
3179 &set_label("loop_key192");
3180         &movq           (&QWP(0,$key),"xmm2");
3181         &movdqa         ("xmm1","xmm2");
3182         &pshufb         ("xmm2","xmm5");
3183         &aesenclast     ("xmm2","xmm4");
3184         &pslld          ("xmm4",1);                     # double the round constant
3185         &lea            ($key,&DWP(24,$key));
3186
             # xmm0 = running XOR of its own dwords (via byte shifts).
3187         &movdqa         ("xmm3","xmm0");
3188         &pslldq         ("xmm0",4);
3189         &pxor           ("xmm3","xmm0");
3190         &pslldq         ("xmm0",4);
3191         &pxor           ("xmm3","xmm0");
3192         &pslldq         ("xmm0",4);
3193         &pxor           ("xmm0","xmm3");
3194
             # Propagate the top dword of xmm0 into the 64-bit tail (xmm1).
3195         &pshufd         ("xmm3","xmm0",0xff);
3196         &pxor           ("xmm3","xmm1");
3197         &pslldq         ("xmm1",4);
3198         &pxor           ("xmm3","xmm1");
3199
3200         &pxor           ("xmm0","xmm2");
3201         &pxor           ("xmm2","xmm3");
3202         &movdqu         (&QWP(-16,$key),"xmm0");
3203
3204         &dec            ($rounds);
3205         &jnz            (&label("loop_key192"));
3206
             # $key is 208 bytes past the schedule start (16 + 8*24);
             # 208 + 32 = offset 240.
3207         &mov    ($rounds,11);
3208         &mov    (&DWP(32,$key),$rounds);
3209
3210         &jmp    (&label("good_key"));
3211
     # ---- 256-bit key -------------------------------------------------
3212 &set_label("14rounds",16);
3213         &movups         ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
3214         &lea            ($key,&DWP(16,$key));           # $key now two slots ahead
3215         &cmp            ("ebp",1<<28);
3216         &je             (&label("14rounds_alt"));
3217
             # aeskeygenassist path.  key_256a updates xmm0 (even round keys),
             # key_256b updates xmm2 (odd round keys); each stores the other
             # register first.
3218         &mov            ($rounds,13);
3219         &$movekey       (&QWP(-32,$key),"xmm0");        # round 0
3220         &$movekey       (&QWP(-16,$key),"xmm2");        # round 1
3221         &aeskeygenassist("xmm1","xmm2",0x01);           # round 2
3222         &call           (&label("key_256a_cold"));
3223         &aeskeygenassist("xmm1","xmm0",0x01);           # round 3
3224         &call           (&label("key_256b"));
3225         &aeskeygenassist("xmm1","xmm2",0x02);           # round 4
3226         &call           (&label("key_256a"));
3227         &aeskeygenassist("xmm1","xmm0",0x02);           # round 5
3228         &call           (&label("key_256b"));
3229         &aeskeygenassist("xmm1","xmm2",0x04);           # round 6
3230         &call           (&label("key_256a"));
3231         &aeskeygenassist("xmm1","xmm0",0x04);           # round 7
3232         &call           (&label("key_256b"));
3233         &aeskeygenassist("xmm1","xmm2",0x08);           # round 8
3234         &call           (&label("key_256a"));
3235         &aeskeygenassist("xmm1","xmm0",0x08);           # round 9
3236         &call           (&label("key_256b"));
3237         &aeskeygenassist("xmm1","xmm2",0x10);           # round 10
3238         &call           (&label("key_256a"));
3239         &aeskeygenassist("xmm1","xmm0",0x10);           # round 11
3240         &call           (&label("key_256b"));
3241         &aeskeygenassist("xmm1","xmm2",0x20);           # round 12
3242         &call           (&label("key_256a"));
3243         &aeskeygenassist("xmm1","xmm0",0x20);           # round 13
3244         &call           (&label("key_256b"));
3245         &aeskeygenassist("xmm1","xmm2",0x40);           # round 14
3246         &call           (&label("key_256a"));
             # $key is 224 bytes past the schedule start (32 + 12*16);
             # 224 + 16 = offset 240.
3247         &$movekey       (&QWP(0,$key),"xmm0");
3248         &mov            (&DWP(16,$key),$rounds);
3249         &xor            ("eax","eax");                  # good_key zeroes eax again
3250
3251         &jmp    (&label("good_key"));
3252
     # Local subroutine: store xmm2, advance $key by 16, derive the next
     # even round key into xmm0.  key_256a_cold skips the store/advance.
3253 &set_label("key_256a",16);
3254         &$movekey       (&QWP(0,$key),"xmm2");
3255         &lea            ($key,&DWP(16,$key));
3256 &set_label("key_256a_cold");
3257         &shufps         ("xmm4","xmm0",0b00010000);
3258         &xorps          ("xmm0","xmm4");
3259         &shufps         ("xmm4","xmm0",0b10001100);
3260         &xorps          ("xmm0","xmm4");
3261         &shufps         ("xmm1","xmm1",0b11111111);     # critical path
3262         &xorps          ("xmm0","xmm1");
3263         &ret();
3264
     # Local subroutine: store xmm0, advance $key by 16, derive the next
     # odd round key into xmm2.  Uses dword 2 of the aeskeygenassist
     # result (0b10101010) where key_256a uses dword 3.
3265 &set_label("key_256b",16);
3266         &$movekey       (&QWP(0,$key),"xmm0");
3267         &lea            ($key,&DWP(16,$key));
3268
3269         &shufps         ("xmm4","xmm2",0b00010000);
3270         &xorps          ("xmm2","xmm4");
3271         &shufps         ("xmm4","xmm2",0b10001100);
3272         &xorps          ("xmm2","xmm4");
3273         &shufps         ("xmm1","xmm1",0b10101010);     # critical path
3274         &xorps          ("xmm2","xmm1");
3275         &ret();
3276
     # 256-bit key, pshufb+aesenclast variant.
3277 &set_label("14rounds_alt",16);
3278         &movdqa         ("xmm5",&QWP(0x00,"ebx"));      # pshufb mask from key_const
3279         &movdqa         ("xmm4",&QWP(0x20,"ebx"));      # round constant, starts at 1
3280         &mov            ($rounds,7);                    # loop counter
3281         &movdqu         (&QWP(-32,$key),"xmm0");        # first half of the key
3282         &movdqa         ("xmm1","xmm2");                # xmm1 = copy of second half
3283         &movdqu         (&QWP(-16,$key),"xmm2");        # second half of the key
3284
     # Each full iteration emits two 16-byte slots; the 7th pass leaves
     # through done_key256 after storing only the first of them.
3285 &set_label("loop_key256");
3286         &pshufb         ("xmm2","xmm5");
3287         &aesenclast     ("xmm2","xmm4");
3288
3289         &movdqa         ("xmm3","xmm0");
3290         &pslldq         ("xmm0",4);
3291         &pxor           ("xmm3","xmm0");
3292         &pslldq         ("xmm0",4);
3293         &pxor           ("xmm3","xmm0");
3294         &pslldq         ("xmm0",4);
3295         &pxor           ("xmm0","xmm3");
3296         &pslld          ("xmm4",1);                     # double the round constant
3297
3298         &pxor           ("xmm0","xmm2");
3299         &movdqu         (&QWP(0,$key),"xmm0");
3300
3301         &dec            ($rounds);
3302         &jz             (&label("done_key256"));
3303
             # Second slot of the pair: broadcast the top dword of the new xmm0
             # and run it through aesenclast with an all-zero "round key"
             # (no shuffle mask and no round constant on this step).
3304         &pshufd         ("xmm2","xmm0",0xff);
3305         &pxor           ("xmm3","xmm3");
3306         &aesenclast     ("xmm2","xmm3");
3307
3308         &movdqa         ("xmm3","xmm1");
3309         &pslldq         ("xmm1",4);
3310         &pxor           ("xmm3","xmm1");
3311         &pslldq         ("xmm1",4);
3312         &pxor           ("xmm3","xmm1");
3313         &pslldq         ("xmm1",4);
3314         &pxor           ("xmm1","xmm3");
3315
3316         &pxor           ("xmm2","xmm1");
3317         &movdqu         (&QWP(16,$key),"xmm2");
3318         &lea            ($key,&DWP(32,$key));
3319         &movdqa         ("xmm1","xmm2");
3320         &jmp            (&label("loop_key256"));
3321
3322 &set_label("done_key256");
             # $key is 224 bytes past the schedule start (32 + 6*32);
             # 224 + 16 = offset 240.
3323         &mov            ($rounds,13);
3324         &mov            (&DWP(16,$key),$rounds);
3325
     # Common success exit: wipe every xmm register that held key material,
     # return 0 and restore the two saved registers.
3326 &set_label("good_key");
3327         &pxor   ("xmm0","xmm0");
3328         &pxor   ("xmm1","xmm1");
3329         &pxor   ("xmm2","xmm2");
3330         &pxor   ("xmm3","xmm3");
3331         &pxor   ("xmm4","xmm4");
3332         &pxor   ("xmm5","xmm5");
3333         &xor    ("eax","eax");
3334         &pop    ("ebx");
3335         &pop    ("ebp");
3336         &ret    ();
3337
     # Error exit for a NULL userKey or key pointer; no key bytes were loaded.
3338 &set_label("bad_pointer",4);
3339         &mov    ("eax",-1);
3340         &pop    ("ebx");
3341         &pop    ("ebp");
3342         &ret    ();
     # Error exit for an unsupported bit length; xmm0 already holds the first
     # 16 bytes of the user key, so it is wiped before returning.
3343 &set_label("bad_keybits",4);
3344         &pxor   ("xmm0","xmm0");
3345         &mov    ("eax",-2);
3346         &pop    ("ebx");
3347         &pop    ("ebp");
3348         &ret    ();
3349 &function_end_B("_aesni_set_encrypt_key");
3350
3351 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
3352 #                              AES_KEY *key)
# Public entry point: thin wrapper that moves the three arguments into the
# registers _aesni_set_encrypt_key expects ("eax" = userKey, $rounds = bits,
# $key = key) and returns whatever that routine leaves in "eax".
3353 &function_begin_B("${PREFIX}_set_encrypt_key");
3354         &mov    ("eax",&wparam(0));
3355         &mov    ($rounds,&wparam(1));
3356         &mov    ($key,&wparam(2));
3357         &call   ("_aesni_set_encrypt_key");
3358         &ret    ();
3359 &function_end_B("${PREFIX}_set_encrypt_key");
3360
3361 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
3362 #                              AES_KEY *key)
# Public entry point: builds the encryption schedule with
# _aesni_set_encrypt_key, then converts it in place into a decryption
# schedule by reversing the order of the round keys and applying aesimc to
# every key except the first and last.  Returns the callee's error code
# unchanged on failure, 0 on success.
3363 &function_begin_B("${PREFIX}_set_decrypt_key");
3364         &mov    ("eax",&wparam(0));
3365         &mov    ($rounds,&wparam(1));
3366         &mov    ($key,&wparam(2));
3367         &call   ("_aesni_set_encrypt_key");
3368         &mov    ($key,&wparam(2));     # reload: the callee advanced $key
3369         &shl    ($rounds,4);    # rounds-1 after _aesni_set_encrypt_key
3370         &test   ("eax","eax");         # non-zero return code => bail out
3371         &jnz    (&label("dec_key_ret"));
3372         &lea    ("eax",&DWP(16,$key,$rounds));  # end of key schedule
3373
             # Outermost pair: exchanged as-is, no aesimc.
3374         &$movekey       ("xmm0",&QWP(0,$key));  # just swap
3375         &$movekey       ("xmm1",&QWP(0,"eax"));
3376         &$movekey       (&QWP(0,"eax"),"xmm0");
3377         &$movekey       (&QWP(0,$key),"xmm1");
3378         &lea            ($key,&DWP(16,$key));
3379         &lea            ("eax",&DWP(-16,"eax"));
3380
     # Walk $key upwards and "eax" downwards, exchanging each pair through
     # aesimc, until the two pointers meet.
3381 &set_label("dec_key_inverse");
3382         &$movekey       ("xmm0",&QWP(0,$key));  # swap and inverse
3383         &$movekey       ("xmm1",&QWP(0,"eax"));
3384         &aesimc         ("xmm0","xmm0");
3385         &aesimc         ("xmm1","xmm1");
3386         &lea            ($key,&DWP(16,$key));
3387         &lea            ("eax",&DWP(-16,"eax"));
             # Pointers were already stepped, hence the +16 / -16 displacements.
3388         &$movekey       (&QWP(16,"eax"),"xmm0");
3389         &$movekey       (&QWP(-16,$key),"xmm1");
3390         &cmp            ("eax",$key);
3391         &ja             (&label("dec_key_inverse"));
3392
             # The schedule has an odd number of round keys, so one key is left
             # in the middle; it only needs aesimc, not a swap.
3393         &$movekey       ("xmm0",&QWP(0,$key));  # inverse middle
3394         &aesimc         ("xmm0","xmm0");
3395         &$movekey       (&QWP(0,$key),"xmm0");
3396
             # Wipe the registers that carried round keys.
3397         &pxor           ("xmm0","xmm0");
3398         &pxor           ("xmm1","xmm1");
3399         &xor            ("eax","eax");          # return success
3400 &set_label("dec_key_ret");
3401         &ret    ();
3402 &function_end_B("${PREFIX}_set_decrypt_key");
3403
# Constant table for the "_alt" key-schedule paths, addressed relative to
# "ebx" in _aesni_set_encrypt_key.  Four 16-byte rows:
#   +0x00  pshufb mask (bytes 0d 0e 0f 0c, repeated): picks source bytes
#          13,14,15,12 for every dword; used by the 128- and 256-bit paths
#   +0x10  pshufb mask (bytes 05 06 07 04, repeated): same, but for source
#          bytes 5,6,7,4; used by the 192-bit path
#   +0x20  initial round constant 1 in every dword (doubled with pslld)
#   +0x30  round constant 0x1b in every dword (last two 128-bit rounds)
# followed by a NUL-terminated identification string.
3404 &set_label("key_const",64);
3405 &data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
3406 &data_word(0x04070605,0x04070605,0x04070605,0x04070605);
3407 &data_word(1,1,1,1);
3408 &data_word(0x1b,0x1b,0x1b,0x1b);
3409 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
3410
# Script trailer.  NOTE(review): asm_finish is a perlasm helper defined
# outside this file; presumably it emits the accumulated assembly -- confirm
# against x86asm.pl.
3411 &asm_finish();
3412
# Close STDOUT explicitly and abort with a message if that fails.
3413 close STDOUT or die "error closing STDOUT";