#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# January 2013
#
# This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
# out in http://download.intel.com/design/intarch/papers/323686.pdf, is
# that since AESNI-CBC encrypt exhibits *very* low instruction-level
# parallelism, interleaving it with another algorithm utilizes processor
# resources better and achieves higher performance. SHA256 instruction
# sequences(*) are taken from sha512-x86_64.pl and the AESNI code is
# woven into them. As SHA256 dominates execution time, stitch
# performance does not depend on AES key length. Below are performance
# numbers in cycles per processed byte, less is better, for standalone
# AESNI-CBC encrypt, standalone SHA256, and the stitched subroutine:
#
#                AES-128/-192/-256+SHA256       this(**)gain
# Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
# Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
# Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
# Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
#
# (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
#       Westmere is omitted from the loop; this is because the gain was
#       not estimated high enough to justify the effort;
# (**)  these are EVP-free results, results obtained with 'speed
#       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
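#
# A minimal sketch of the interleaving mechanism used throughout this
# module (illustration only, not part of the generated code): each
# SHA256 round body below is a list of single-instruction strings, and
# one AES-CBC step from @aesni_cbc_block is spliced into each round, so
# eval'ing the strings in order emits the two algorithms interleaved:
#
#   foreach (body_00_15()) { eval; }   # one SHA256 round + one AES step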

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

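# $avx ends up 0, 1 or 2: 0 disables the SIMD paths altogether, 1
# enables the XOP and AVX1 paths, 2 additionally enables the AVX2+BMI
# path (all still subject to the run-time CPUID checks in $func below).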
$shaext=$avx;   ### set to zero if compiling for 1.0.1
$avx=1          if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$func="aesni_cbc_sha256_enc";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
                                "%r8d","%r9d","%r10d","%r11d");
($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
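# The rotation/shift amounts above implement the SHA256 functions
#   Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#   Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#   sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#   sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)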
$rounds=64;

########################################################################
# void aesni_cbc_sha256_enc(const void *inp,
#                       void *out,
#                       size_t length,
#                       const AES_KEY *key,
#                       unsigned char *iv,
#                       SHA256_CTX *ctx,
#                       const void *in0);
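#
# Note on the contract (as implied by the code below, e.g. 'shl \$6,$len'
# in the prologues): |length| is measured in 64-byte blocks; |inp| is
# encrypted to |out| with AES-CBC while the same amount of data at |in0|
# is hashed into |ctx|, and the two input pointers may differ.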
($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

$Tbl="%rbp";

$_inp="16*$SZ+0*8(%rsp)";
$_out="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
$_key="16*$SZ+3*8(%rsp)";
$_ivp="16*$SZ+4*8(%rsp)";
$_ctx="16*$SZ+5*8(%rsp)";
$_in0="16*$SZ+6*8(%rsp)";
$_rsp="16*$SZ+7*8(%rsp)";
$framesz=16*$SZ+8*8;

$code=<<___;
.text

.extern OPENSSL_ia32cap_P
.globl  $func
.type   $func,\@abi-omnipotent
.align  16
$func:
___
                                                if ($avx) {
$code.=<<___;
        lea     OPENSSL_ia32cap_P(%rip),%r11
        mov     \$1,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        mov     0(%r11),%eax
        mov     4(%r11),%r10
___
$code.=<<___ if ($shaext);
        bt      \$61,%r10                       # check for SHA
        jc      ${func}_shaext
___
$code.=<<___;
        mov     %r10,%r11
        shr     \$32,%r11

        test    \$`1<<11`,%r10d                 # check for XOP
        jnz     ${func}_xop
___
$code.=<<___ if ($avx>1);
        and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
        cmp     \$`1<<8|1<<5|1<<3`,%r11d
        je      ${func}_avx2
___
$code.=<<___;
        and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
        and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
        or      %eax,%r10d
        cmp     \$`1<<28|1<<9|1<<30`,%r10d
        je      ${func}_avx
        ud2
___
                                                }
$code.=<<___;
        xor     %eax,%eax
        cmp     \$0,`$win64?"%rcx":"%rdi"`
        je      .Lprobe
        ud2
.Lprobe:
        ret
.size   $func,.-$func

.align  64
.type   $TABLE,\@object
$TABLE:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2

        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
        .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
        .long   0,0,0,0,   0,0,0,0
        .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align  64
___

######################################################################
# SIMD code paths
#
{{{
($iv,$inout,$roundkey,$temp,
 $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));

$aesni_cbc_idx=0;
@aesni_cbc_block = (
##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");
##      &vmovdqu        ($inout,($inp));
##      &mov            ($_inp,$inp);

        '&vpxor         ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',

        '&vpxor         ($inout,$inout,$iv);',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',

        '&vaesenc       ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',

        '&vpand         ($iv,$temp,$mask10);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',

        '&vaesenclast   ($temp,$inout,$roundkey);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',

        '&vpand         ($temp,$temp,$mask12);'.
        ' &vaesenc      ($inout,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0xe0-0x80($inp)");',

        '&vpor          ($iv,$iv,$temp);'.
        ' &vaesenclast  ($temp,$inout,$roundkey);'.
        ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'

##      &mov            ($inp,$_inp);
##      &mov            ($out,$_out);
##      &vpand          ($temp,$temp,$mask14);
##      &vpor           ($iv,$iv,$temp);
##      &vmovdqu        ("($out,$inp)",$iv);
##      &lea            ($inp,"16($inp)");
);
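
# The 16 fragments above push exactly one 16-byte CBC block through the
# AES rounds while 16 SHA256 rounds execute; body_00_15 below splices
# them in one at a time via $aesni_cbc_idx (the vaesenclast/vpand/vpor
# steps select the correct last round for 10-, 12- and 14-round keys).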

my $a4=$T1;
my ($a,$b,$c,$d,$e,$f,$g,$h);

sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
    $arg = "\$$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
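
# A sketch of what the AUTOLOAD thunk does: any undefined sub call is
# turned into one line of assembly, with the last Perl argument becoming
# the first (AT&T-order) operand and numeric arguments getting a '$'
# prefix, e.g.
#
#   &ror($a0,14);           # appends "\tror\t\$14,%r13d\n" to $code
#   &add($h,$a0);           # appends "\tadd\t%r13d,%r11d\n"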

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
        '&mov   ($a,$a1)',
        '&mov   ($a4,$f)',

        '&xor   ($a0,$e)',
        '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
        '&xor   ($a4,$g)',                      # f^g

        '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
        '&xor   ($a1,$a)',
        '&and   ($a4,$e)',                      # (f^g)&e

        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a0,$e)',
        '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
        '&mov   ($a2,$a)',

        '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
        '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
        '&xor   ($a2,$b)',                      # a^b, b^c in next round

        '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
        '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
        '&and   ($a3,$a2)',                     # (b^c)&(a^b)

        '&xor   ($a1,$a)',
        '&add   ($h,$a0)',                      # h+=Sigma1(e)
        '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)

        '&add   ($d,$h)',                       # d+=h
        '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
        '&add   ($h,$a3)',                      # h+=Maj(a,b,c)

        '&mov   ($a0,$d)',
        '&add   ($a1,$h);'.                     # h+=Sigma0(a)
        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
}
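
# Consumption pattern (as used in the code paths below): evaluating the
# returned list of instruction strings once per round emits one SHA256
# round interleaved with one AES-CBC step:
#
#   $aesni_cbc_idx=0;
#   for ($i=0; $i<16; ) { foreach(body_00_15()) { eval; } }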

if ($avx) {{
######################################################################
# XOP code path
#
$code.=<<___;
.type   ${func}_xop,\@function,6
.align  64
${func}_xop:
.Lxop_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_xop:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
        jmp     .Lloop_xop
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
.align  16
.Lloop_xop:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lxop_00_47

.align  16
.Lxop_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub XOP_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
          eval(shift(@insns));
          eval(shift(@insns));
         &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrld         ($t0,$t0,$sigma0[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t1);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[3],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpsrldq        ($t3,$t3,8);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpsrld        ($t2,@X[0],$sigma1[2]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t2);
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
         &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpslldq        ($t3,$t3,8);            # 22 instructions
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
          eval(shift(@insns));
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &XOP_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lxop_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jb      .Lloop_xop

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_xop:
        ret
.size   ${func}_xop,.-${func}_xop
___
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd($_[0],@_) };

$code.=<<___;
.type   ${func}_avx,\@function,6
.align  64
${func}_avx:
.Lavx_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`$framesz+$win64*16*10`,%rsp
        and     \$-64,%rsp              # align stack frame

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        mov     $out,$_out
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx:
        vzeroall

        mov     $inp,%r12               # borrow $a4
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        sub     \$9,%r14

        mov     $SZ*0(%r15),$A
        mov     $SZ*1(%r15),$B
        mov     $SZ*2(%r15),$C
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H

        vmovdqa 0x00(%r13,%r14,8),$mask14
        vmovdqa 0x10(%r13,%r14,8),$mask12
        vmovdqa 0x20(%r13,%r14,8),$mask10
        vmovdqu 0x00-0x80($inp),$roundkey
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%xmm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));

$code.=<<___;
        jmp     .Lloop_avx
.align  16
.Lloop_avx:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu 0x00(%rsi,%r12),@X[0]
        vmovdqu 0x10(%rsi,%r12),@X[1]
        vmovdqu 0x20(%rsi,%r12),@X[2]
        vmovdqu 0x30(%rsi,%r12),@X[3]
        vpshufb $t3,@X[0],@X[0]
        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[1],@X[1]
        vpshufb $t3,@X[2],@X[2]
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        mov     $A,$a1
        vmovdqa $t1,0x10(%rsp)
        mov     $B,$a3
        vmovdqa $t2,0x20(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x30(%rsp)
        mov     $E,$a0
        jmp     .Lavx_00_47

.align  16
.Lavx_00_47:
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        vmovdqu (%r12),$inout           # $a4
        mov     %r12,$_inp              # $a4
___
sub Xupdate_256_AVX () {
        (
        '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
         '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
        '&vpsrld        ($t2,$t0,$sigma0[0]);',
         '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
        '&vpsrld        ($t3,$t0,$sigma0[2])',
        '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
        '&vpxor         ($t0,$t3,$t2)',
         '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
        '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t1)',
        '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
        '&vpxor         ($t0,$t0,$t2)',
         '&vpsrld       ($t2,$t3,$sigma1[2]);',
        '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
         '&vpsrlq       ($t3,$t3,$sigma1[0]);',
        '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
         '&vpshufd      ($t2,$t2,0b10000100)',
         '&vpsrldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
         '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
         '&vpsrld       ($t2,$t3,$sigma1[2])',
         '&vpsrlq       ($t3,$t3,$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3);',
         '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
         '&vpxor        ($t2,$t2,$t3)',
         '&vpshufd      ($t2,$t2,0b11101000)',
         '&vpslldq      ($t2,$t2,8)',
        '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
        );
}
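
# The sequence above vectorizes four steps of the SHA256 message
# schedule at once,
#   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
# with the X[1..4]/X[9..12]/X[14..15] comments giving each operand's
# position relative to @X[0] = W[t-16..t-13].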

sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions

        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        (16*$j."(%rsp)",$t2);
}

    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX_256_00_47($j,\&body_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &mov            ("%r12",$_inp);         # borrow $a4
        &vpand          ($temp,$temp,$mask14);
        &mov            ("%r15",$_out);         # borrow $a2
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r12)",$iv);    # write output
        &lea            ("%r12","16(%r12)");    # inp++

        &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
        &jne    (".Lavx_00_47");

        &vmovdqu        ($inout,"(%r12)");
        &mov            ($_inp,"%r12");

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        foreach(body_00_15()) { eval; }
    }

                                        }
$code.=<<___;
        mov     $_inp,%r12              # borrow $a4
        mov     $_out,%r13              # borrow $a0
        mov     $_ctx,%r15              # borrow $a2
        mov     $_in0,%rsi              # borrow $a3

        vpand   $mask14,$temp,$temp
        mov     $a1,$A
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r13,%r12)         # write output
        lea     16(%r12),%r12           # inp++

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        cmp     $_end,%r12

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)
        jb      .Lloop_avx

        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_avx:
        ret
.size   ${func}_avx,.-${func}_avx
___

if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp
my $PUSH8=8*2*$SZ;
use integer;

sub bodyx_00_15 () {
        # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.

        '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
        '&and   ($a4,$e)',              # f&e
        '&rorx  ($a0,$e,$Sigma1[2])',
        '&rorx  ($a2,$e,$Sigma1[1])',

        '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
        '&lea   ($h,"($h,$a4)")',
        '&andn  ($a4,$e,$g)',           # ~e&g
        '&xor   ($a0,$a2)',

        '&rorx  ($a1,$e,$Sigma1[0])',
        '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
        '&xor   ($a0,$a1)',             # Sigma1(e)
        '&mov   ($a2,$a)',

        '&rorx  ($a4,$a,$Sigma0[2])',
        '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
        '&xor   ($a2,$b)',              # a^b, b^c in next round
        '&rorx  ($a1,$a,$Sigma0[1])',

        '&rorx  ($a0,$a,$Sigma0[0])',
        '&lea   ($d,"($d,$h)")',        # d+=h
        '&and   ($a3,$a2)',             # (b^c)&(a^b)
        @aesni_cbc_block[$aesni_cbc_idx++].
        '&xor   ($a1,$a4)',

        '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
        '&xor   ($a1,$a0)',             # Sigma0(a)
        '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
        '&mov   ($a4,$e)',              # copy of f in future

        '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
        );
        # and at the finish one has to $a+=$a1
}

$code.=<<___;
.type   ${func}_avx2,\@function,6
.align  64
${func}_avx2:
.Lavx2_shortcut:
        mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        mov     %rsp,%r11               # copy %rsp
        sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
        and     \$-256*$SZ,%rsp         # align stack frame
        add     \$`2*$SZ*($rounds-8)`,%rsp

        shl     \$6,$len
        sub     $inp,$out               # re-bias
        sub     $inp,$in0
        add     $inp,$len               # end of input

        #mov    $inp,$_inp              # saved later
        #mov    $out,$_out              # kept in $offload
        mov     $len,$_end
        #mov    $key,$_key              # remains resident in $inp register
        mov     $ivp,$_ivp
        mov     $ctx,$_ctx
        mov     $in0,$_in0
        mov     %r11,$_rsp
___
$code.=<<___ if ($win64);
        movaps  %xmm6,`$framesz+16*0`(%rsp)
        movaps  %xmm7,`$framesz+16*1`(%rsp)
        movaps  %xmm8,`$framesz+16*2`(%rsp)
        movaps  %xmm9,`$framesz+16*3`(%rsp)
        movaps  %xmm10,`$framesz+16*4`(%rsp)
        movaps  %xmm11,`$framesz+16*5`(%rsp)
        movaps  %xmm12,`$framesz+16*6`(%rsp)
        movaps  %xmm13,`$framesz+16*7`(%rsp)
        movaps  %xmm14,`$framesz+16*8`(%rsp)
        movaps  %xmm15,`$framesz+16*9`(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
        vzeroall

        mov     $inp,%r13               # borrow $a0
        vpinsrq \$1,$out,$offload,$offload
        lea     0x80($key),$inp         # size optimization, reassign
        lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
        mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
        mov     $ctx,%r15               # borrow $a2
        mov     $in0,%rsi               # borrow $a3
        vmovdqu ($ivp),$iv              # load IV
        lea     -9(%r14),%r14

        vmovdqa 0x00(%r12,%r14,8),$mask14
        vmovdqa 0x10(%r12,%r14,8),$mask12
        vmovdqa 0x20(%r12,%r14,8),$mask10

        sub     \$-16*$SZ,%r13          # inp++, size optimization
        mov     $SZ*0(%r15),$A
        lea     (%rsi,%r13),%r12        # borrow $a0
        mov     $SZ*1(%r15),$B
        cmp     $len,%r13               # $_end
        mov     $SZ*2(%r15),$C
        cmove   %rsp,%r12               # next block or random data
        mov     $SZ*3(%r15),$D
        mov     $SZ*4(%r15),$E
        mov     $SZ*5(%r15),$F
        mov     $SZ*6(%r15),$G
        mov     $SZ*7(%r15),$H
        vmovdqu 0x00-0x80($inp),$roundkey
___
                                        if ($SZ==4) {   # SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));

$code.=<<___;
        jmp     .Loop_avx2
.align  16
.Loop_avx2:
        vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
        vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
        vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
        vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
        vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3

        vinserti128     \$1,(%r12),@X[0],@X[0]
        vinserti128     \$1,16(%r12),@X[1],@X[1]
         vpshufb        $t3,@X[0],@X[0]
        vinserti128     \$1,32(%r12),@X[2],@X[2]
         vpshufb        $t3,@X[1],@X[1]
        vinserti128     \$1,48(%r12),@X[3],@X[3]

        lea     $TABLE(%rip),$Tbl
        vpshufb $t3,@X[2],@X[2]
        lea     -16*$SZ(%r13),%r13
        vpaddd  0x00($Tbl),@X[0],$t0
        vpshufb $t3,@X[3],@X[3]
        vpaddd  0x20($Tbl),@X[1],$t1
        vpaddd  0x40($Tbl),@X[2],$t2
        vpaddd  0x60($Tbl),@X[3],$t3
        vmovdqa $t0,0x00(%rsp)
        xor     $a1,$a1
        vmovdqa $t1,0x20(%rsp)
        lea     -$PUSH8(%rsp),%rsp
        mov     $B,$a3
        vmovdqa $t2,0x00(%rsp)
        xor     $C,$a3                  # magic
        vmovdqa $t3,0x20(%rsp)
        mov     $F,$a4
        sub     \$-16*2*$SZ,$Tbl        # size optimization
        jmp     .Lavx2_00_47

.align  16
.Lavx2_00_47:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
my $base = "+2*$PUSH8(%rsp)";

        &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
        foreach (Xupdate_256_AVX()) {           # 29 instructions
            eval;
            eval(shift(@insns));
            eval(shift(@insns));
            eval(shift(@insns));
        }
        &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
          foreach (@insns) { eval; }            # remaining instructions
        &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
}
    $aesni_cbc_idx=0;
    for ($i=0,$j=0; $j<4; $j++) {
        &AVX2_256_00_47($j,\&bodyx_00_15,@X);
        push(@X,shift(@X));                     # rotate(@X)
    }
        &vmovq          ("%r13",$offload);      # borrow $a0
        &vpextrq        ("%r15",$offload,1);    # borrow $a2
        &vpand          ($temp,$temp,$mask14);
        &vpor           ($iv,$iv,$temp);
        &vmovdqu        ("(%r15,%r13)",$iv);    # write output
        &lea            ("%r13","16(%r13)");    # inp++

        &lea    ($Tbl,16*2*$SZ."($Tbl)");
        &cmpb   (($SZ-1)."($Tbl)",0);
        &jne    (".Lavx2_00_47");

        &vmovdqu        ($inout,"(%r13)");
        &vpinsrq        ($offload,$offload,"%r13",0);

    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
        foreach(bodyx_00_15()) { eval; }
    }
                                        }
$code.=<<___;
        vpextrq \$1,$offload,%r12               # $_out, borrow $a4
        vmovq   $offload,%r13                   # $_inp, borrow $a0
        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl

        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        vmovdqu $iv,(%r12,%r13)                 # write output
        lea     16(%r13),%r13

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        add     $SZ*7(%r15),$H

        mov     $A,$SZ*0(%r15)
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
        je      .Ldone_avx2

        xor     $a1,$a1
        mov     $B,$a3
        mov     $F,$a4
        xor     $C,$a3                  # magic
        jmp     .Lower_avx2
.align  16
.Lower_avx2:
        vmovdqu (%r13),$inout
        vpinsrq \$0,%r13,$offload,$offload
___
    $aesni_cbc_idx=0;
    for ($i=0; $i<16; ) {
        my $base="+16($Tbl)";
        foreach(bodyx_00_15()) { eval; }
        &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
    }
$code.=<<___;
        vmovq   $offload,%r13                   # borrow $a0
        vpextrq \$1,$offload,%r15               # borrow $a2
        vpand   $mask14,$temp,$temp
        vpor    $temp,$iv,$iv
        lea     -$PUSH8($Tbl),$Tbl
        vmovdqu $iv,(%r15,%r13)                 # write output
        lea     16(%r13),%r13                   # inp++
        cmp     %rsp,$Tbl
        jae     .Lower_avx2

        mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
        lea     16*$SZ(%r13),%r13
        mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
        add     $a1,$A
        lea     `2*$SZ*($rounds-8)`(%rsp),%rsp

        add     $SZ*0(%r15),$A
        add     $SZ*1(%r15),$B
        add     $SZ*2(%r15),$C
        add     $SZ*3(%r15),$D
        add     $SZ*4(%r15),$E
        add     $SZ*5(%r15),$F
        add     $SZ*6(%r15),$G
        lea     (%rsi,%r13),%r12
        add     $SZ*7(%r15),$H

        cmp     $_end,%r13

        mov     $A,$SZ*0(%r15)
        cmove   %rsp,%r12               # next block or stale data
        mov     $B,$SZ*1(%r15)
        mov     $C,$SZ*2(%r15)
        mov     $D,$SZ*3(%r15)
        mov     $E,$SZ*4(%r15)
        mov     $F,$SZ*5(%r15)
        mov     $G,$SZ*6(%r15)
        mov     $H,$SZ*7(%r15)

        jbe     .Loop_avx2
        lea     (%rsp),$Tbl

.Ldone_avx2:
        lea     ($Tbl),%rsp
        mov     $_ivp,$ivp
        mov     $_rsp,%rsi
        vmovdqu $iv,($ivp)              # output IV
        vzeroall
___
$code.=<<___ if ($win64);
        movaps  `$framesz+16*0`(%rsp),%xmm6
        movaps  `$framesz+16*1`(%rsp),%xmm7
        movaps  `$framesz+16*2`(%rsp),%xmm8
        movaps  `$framesz+16*3`(%rsp),%xmm9
        movaps  `$framesz+16*4`(%rsp),%xmm10
        movaps  `$framesz+16*5`(%rsp),%xmm11
        movaps  `$framesz+16*6`(%rsp),%xmm12
        movaps  `$framesz+16*7`(%rsp),%xmm13
        movaps  `$framesz+16*8`(%rsp),%xmm14
        movaps  `$framesz+16*9`(%rsp),%xmm15
___
$code.=<<___;
        mov     (%rsi),%r15
        mov     8(%rsi),%r14
        mov     16(%rsi),%r13
        mov     24(%rsi),%r12
        mov     32(%rsi),%rbp
        mov     40(%rsi),%rbx
        lea     48(%rsi),%rsp
.Lepilogue_avx2:
        ret
.size   ${func}_avx2,.-${func}_avx2
___
}}
}}
{{
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");

my ($rounds,$Tbl)=("%r11d","%rbx");

my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
my @rndkey=("%xmm4","%xmm5");
my $r=0;
my $sn=0;

my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
my @MSG=map("%xmm$_",(10..13));

my $aesenc=sub {
  use integer;
  my ($n,$k)=($r/10,$r%10);
    if ($k==0) {
      $code.=<<___;
        movups          `16*$n`($in0),$in               # load input
        xorps           $rndkey0,$in
___
      $code.=<<___ if ($n);
        movups          $iv,`16*($n-1)`($out,$in0)      # write output
___
      $code.=<<___;
        xorps           $in,$iv
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    } elsif ($k==9) {
      $sn++;
      $code.=<<___;
        cmp             \$11,$rounds
        jb              .Laesenclast$sn
        movups          `32+16*($k+0)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+1)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
        je              .Laesenclast$sn
        movups          `32+16*($k+2)-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
        movups          `32+16*($k+3)-112`($key),$rndkey[0]
        aesenc          $rndkey[1],$iv
.Laesenclast$sn:
        aesenclast      $rndkey[0],$iv
        movups          16-112($key),$rndkey[1]         # forward reference
        nop
___
    } else {
      $code.=<<___;
        movups          `32+16*$k-112`($key),$rndkey[1]
        aesenc          $rndkey[0],$iv
___
    }
    $r++;       unshift(@rndkey,pop(@rndkey));
};
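
# One CBC block is threaded through ten $aesenc slots per 64-byte SHA256
# block ($r counts slots): slot 0 loads and whitens the input and writes
# the previous block out, slot 9 finishes the block, branching on
# $rounds to accommodate 10-, 12- and 14-round (AES-128/-192/-256) key
# schedules.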

if ($shaext) {
my $Tbl="%rax";

$code.=<<___;
.type   ${func}_shaext,\@function,6
.align  32
${func}_shaext:
        mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
___
$code.=<<___ if ($win64);
        mov     %rsp,%rax               # copy %rsp
        lea     `-8-10*16`(%rsp),%rsp
        movaps  %xmm6,-8-10*16(%rax)
        movaps  %xmm7,-8-9*16(%rax)
        movaps  %xmm8,-8-8*16(%rax)
        movaps  %xmm9,-8-7*16(%rax)
        movaps  %xmm10,-8-6*16(%rax)
        movaps  %xmm11,-8-5*16(%rax)
        movaps  %xmm12,-8-4*16(%rax)
        movaps  %xmm13,-8-3*16(%rax)
        movaps  %xmm14,-8-2*16(%rax)
        movaps  %xmm15,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
        lea             K256+0x80(%rip),$Tbl
        movdqu          ($ctx),$ABEF            # DCBA
        movdqu          16($ctx),$CDGH          # HGFE
        movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask

        mov             240($key),$rounds
        sub             $in0,$out
        movups          ($key),$rndkey0         # $key[0]
        movups          16($key),$rndkey[0]     # forward reference
        lea             112($key),$key          # size optimization

        pshufd          \$0x1b,$ABEF,$Wi        # ABCD
        pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
        pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
        movdqa          $TMP,$BSWAP             # offload
        palignr         \$8,$CDGH,$ABEF         # ABEF
        punpcklqdq      $Wi,$CDGH               # CDGH

        jmp     .Loop_shaext

.align  16
.Loop_shaext:
        movdqu          ($inp),@MSG[0]
        movdqu          0x10($inp),@MSG[1]
        movdqu          0x20($inp),@MSG[2]
        pshufb          $TMP,@MSG[0]
        movdqu          0x30($inp),@MSG[3]

        movdqa          0*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        pshufb          $TMP,@MSG[1]
        movdqa          $CDGH,$CDGH_SAVE        # offload
        movdqa          $ABEF,$ABEF_SAVE        # offload
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 0-3
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          1*32-0x80($Tbl),$Wi
        paddd           @MSG[1],$Wi
        pshufb          $TMP,@MSG[2]
        lea             0x40($inp),$inp
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 4-7
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          2*32-0x80($Tbl),$Wi
        paddd           @MSG[2],$Wi
        pshufb          $TMP,@MSG[3]
        sha256msg1      @MSG[1],@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 8-11
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[3],$TMP
        palignr         \$4,@MSG[2],$TMP
        paddd           $TMP,@MSG[0]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          3*32-0x80($Tbl),$Wi
        paddd           @MSG[3],$Wi
        sha256msg2      @MSG[3],@MSG[0]
        sha256msg1      @MSG[2],@MSG[1]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 12-15
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        movdqa          @MSG[0],$TMP
        palignr         \$4,@MSG[3],$TMP
        paddd           $TMP,@MSG[1]
        sha256rnds2     $CDGH,$ABEF
___
for($i=4;$i<16-3;$i++) {
        &$aesenc()      if (($r%10)==0);
$code.=<<___;
        movdqa          $i*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        sha256msg2      @MSG[0],@MSG[1]
        sha256msg1      @MSG[3],@MSG[2]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 16-19...
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[1],$TMP
        palignr         \$4,@MSG[0],$TMP
        paddd           $TMP,@MSG[2]
___
        &$aesenc();
        &$aesenc()      if ($r==19);
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF
___
        push(@MSG,shift(@MSG));
}
$code.=<<___;
        movdqa          13*32-0x80($Tbl),$Wi
        paddd           @MSG[0],$Wi
        sha256msg2      @MSG[0],@MSG[1]
        sha256msg1      @MSG[3],@MSG[2]
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 52-55
        pshufd          \$0x0e,$Wi,$Wi
        movdqa          @MSG[1],$TMP
        palignr         \$4,@MSG[0],$TMP
        paddd           $TMP,@MSG[2]
___
        &$aesenc();
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          14*32-0x80($Tbl),$Wi
        paddd           @MSG[1],$Wi
        sha256msg2      @MSG[1],@MSG[2]
        movdqa          $BSWAP,$TMP
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 56-59
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF

        movdqa          15*32-0x80($Tbl),$Wi
        paddd           @MSG[2],$Wi
___
        &$aesenc();
        &$aesenc();
$code.=<<___;
        sha256rnds2     $ABEF,$CDGH             # 60-63
        pshufd          \$0x0e,$Wi,$Wi
___
        &$aesenc();
$code.=<<___;
        sha256rnds2     $CDGH,$ABEF
        #pxor           $CDGH,$rndkey0          # black magic
___
        while ($r<40)   { &$aesenc(); }         # remaining aesenc's
$code.=<<___;
        #xorps          $CDGH,$rndkey0          # black magic
        paddd           $CDGH_SAVE,$CDGH
        paddd           $ABEF_SAVE,$ABEF

        dec             $len
        movups          $iv,48($out,$in0)       # write output
        lea             64($in0),$in0
        jnz             .Loop_shaext

        pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
        pshufd          \$0x1b,$ABEF,$TMP       # FEBA
        pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
        punpckhqdq      $CDGH,$ABEF             # DCBA
        palignr         \$8,$TMP,$CDGH          # HGFE

        movups          $iv,($ivp)              # write IV
        movdqu          $ABEF,($ctx)
        movdqu          $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
        movaps  0*16(%rsp),%xmm6
        movaps  1*16(%rsp),%xmm7
        movaps  2*16(%rsp),%xmm8
        movaps  3*16(%rsp),%xmm9
        movaps  4*16(%rsp),%xmm10
        movaps  5*16(%rsp),%xmm11
        movaps  6*16(%rsp),%xmm12
        movaps  7*16(%rsp),%xmm13
        movaps  8*16(%rsp),%xmm14
        movaps  9*16(%rsp),%xmm15
        lea     8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
        ret
.size   ${func}_shaext,.-${func}_shaext
___
}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___ if ($avx);
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # prologue label
        cmp     %r10,%rbx               # context->Rip<prologue label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue
___
$code.=<<___ if ($shaext);
        lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
        cmp     %r10,%rbx
        jb      .Lnot_in_shaext

        lea     (%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq
        lea     168(%rax),%rax          # adjust stack pointer
        jmp     .Lin_prologue
.Lnot_in_shaext:
___
$code.=<<___ if ($avx>1);
        lea     .Lavx2_shortcut(%rip),%r10
        cmp     %r10,%rbx               # context->Rip<avx2_shortcut
        jb      .Lnot_in_avx2

        and     \$-256*$SZ,%rax
        add     \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
        mov     %rax,%rsi               # put aside Rsp
        mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
        lea     48(%rax),%rax

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler

.section        .pdata
        .rva    .LSEH_begin_${func}_xop
        .rva    .LSEH_end_${func}_xop
        .rva    .LSEH_info_${func}_xop

        .rva    .LSEH_begin_${func}_avx
        .rva    .LSEH_end_${func}_avx
        .rva    .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
        .rva    .LSEH_begin_${func}_avx2
        .rva    .LSEH_end_${func}_avx2
        .rva    .LSEH_info_${func}_avx2
___
$code.=<<___ if ($shaext);
        .rva    .LSEH_begin_${func}_shaext
        .rva    .LSEH_end_${func}_shaext
        .rva    .LSEH_info_${func}_shaext
___
$code.=<<___ if ($avx);
.section        .xdata
.align  8
.LSEH_info_${func}_xop:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]

.LSEH_info_${func}_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
___
$code.=<<___ if ($shaext);
.LSEH_info_${func}_shaext:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
___
}

####################################################################
sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if($dst>=8);
    $rex|=0x01                  if($src>=8);
    unshift @opcode,$rex|0x40   if($rex);
}

{
  my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

  sub sha256op38 {
    my $instr = shift;

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$2,$1);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$_[0];
    }
  }
}
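
# Example of the hardcoded encoding above (for assemblers that lack SHA
# extension support): "sha256rnds2 %xmm1,%xmm2" matches the regexp with
# $1=1 (source) and $2=2 (destination), no REX prefix is needed, and the
# line is replaced with
#
#   .byte   0x0f,0x38,0xcb,0xd1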

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
print $code;
close STDOUT;