1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows processor
16 # resources to be utilized better, yielding better overall performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # AESNI code is woven into it. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that Westmere
31 #       is left out of the loop; this is because the gain was not
32 #       estimated to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
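# For reference, the "gain" column compares the sum of the standalone results
# with the stitched one: e.g. for AES-128 on Sandy Bridge (5.05+11.6)/13.0 is
# ~1.28, i.e. the quoted +28%, and for AES-256 (7.05+11.6)/13.0 is ~1.43,
# i.e. +43%.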
35
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=11);
60 }
61
62 open OUT,"| \"$^X\" $xlate $flavour $output";
63 *STDOUT=*OUT;
64
65 $func="aesni_cbc_sha256_enc";
66 $TABLE="K256";
67 $SZ=4;
68 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
69                                 "%r8d","%r9d","%r10d","%r11d");
70 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
71 @Sigma0=( 2,13,22);
72 @Sigma1=( 6,11,25);
73 @sigma0=( 7,18, 3);
74 @sigma1=(17,19,10);
75 $rounds=64;
76
77 ########################################################################
78 # void aesni_cbc_sha256_enc(const void *inp,
79 #                       void *out,
80 #                       size_t length,
81 #                       const AES_KEY *key,
82 #                       unsigned char *iv,
83 #                       SHA256_CTX *ctx,
84 #                       const void *in0);
85 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
86 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
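# An illustrative call (hypothetical variable names; only the argument
# convention is taken from the code below, which multiplies "length" by 64
# and hashes "in0" while encrypting "inp" into "out"):
#
#       aesni_cbc_sha256_enc(ptext, ctext, blocks, &aes_key, iv, &sha_ctx, ptext);
#
# i.e. "length" is a count of complete 64-byte SHA256 blocks, not bytes.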
87
88 $Tbl="%rbp";
89
90 $_inp="16*$SZ+0*8(%rsp)";
91 $_out="16*$SZ+1*8(%rsp)";
92 $_end="16*$SZ+2*8(%rsp)";
93 $_key="16*$SZ+3*8(%rsp)";
94 $_ivp="16*$SZ+4*8(%rsp)";
95 $_ctx="16*$SZ+5*8(%rsp)";
96 $_in0="16*$SZ+6*8(%rsp)";
97 $_rsp="16*$SZ+7*8(%rsp)";
98 $framesz=16*$SZ+8*8;
99
100 $code=<<___;
101 .text
102
103 .extern OPENSSL_ia32cap_P
104 .globl  $func
105 .type   $func,\@abi-omnipotent
106 .align  16
107 $func:
108 ___
109 $code.=<<___ if ($avx);
110         lea     OPENSSL_ia32cap_P(%rip),%r11
111         mov     \$1,%eax
112         cmp     \$0,`$win64?"%rcx":"%rdi"`
113         je      .Lprobe
114         mov     0(%r11),%eax
115         mov     4(%r11),%r10
116
117         bt      \$61,%r10                       # check for SHA
118         jc      ${func}_shaext
119
120         mov     %r10,%r11
121         shr     \$32,%r11
122
123         test    \$`1<<11`,%r10d                 # check for XOP
124         jnz     ${func}_xop
125 ___
126 $code.=<<___ if ($avx>1);
127         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
128         cmp     \$`1<<8|1<<5|1<<3`,%r11d
129         je      ${func}_avx2
130 ___
131 $code.=<<___ if ($avx);
132         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
133         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
134         or      %eax,%r10d
135         cmp     \$`1<<28|1<<9|1<<30`,%r10d
136         je      ${func}_avx
137         ud2
138 ___
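# Dispatch summary (derived from the code above and the fallback below): a
# call with a NULL first argument acts as a capability probe and simply
# returns 1 when the stitched paths are compiled in, 0 otherwise; real calls
# prefer the SHA extension path, then XOP, then AVX2 (which also requires
# BMI1+BMI2), then AVX on Intel CPUs with SSSE3, and hit ud2 if the caller
# failed to probe first.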
139 $code.=<<___;
140         xor     %eax,%eax
141         cmp     \$0,`$win64?"%rcx":"%rdi"`
142         je      .Lprobe
143         ud2
144 .Lprobe:
145         ret
146 .size   $func,.-$func
147
148 .align  64
149 .type   $TABLE,\@object
150 $TABLE:
151         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
152         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
153         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
154         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
155         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
156         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
157         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
158         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
159         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
160         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
161         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
162         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
163         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
164         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
165         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
166         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
167         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
168         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
169         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
170         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
171         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
172         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
173         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
174         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
175         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
176         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
177         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
178         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
179         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
180         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
181         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
182         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
183
184         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
185         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
186         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
187         .long   0,0,0,0,   0,0,0,0
188         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
189 .align  64
190 ___
191
192 ######################################################################
193 # SIMD code paths
194 #
195 {{{
196 ($iv,$inout,$roundkey,$temp,
197  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
198
199 $aesni_cbc_idx=0;
200 @aesni_cbc_block = (
201 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
202 ##      &vmovdqu        ($inout,($inp));
203 ##      &mov            ($_inp,$inp);
204
205         '&vpxor         ($inout,$inout,$roundkey);'.
206         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
207
208         '&vpxor         ($inout,$inout,$iv);',
209
210         '&vaesenc       ($inout,$inout,$roundkey);'.
211         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
212
213         '&vaesenc       ($inout,$inout,$roundkey);'.
214         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
215
216         '&vaesenc       ($inout,$inout,$roundkey);'.
217         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
218
219         '&vaesenc       ($inout,$inout,$roundkey);'.
220         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
221
222         '&vaesenc       ($inout,$inout,$roundkey);'.
223         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
224
225         '&vaesenc       ($inout,$inout,$roundkey);'.
226         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
227
228         '&vaesenc       ($inout,$inout,$roundkey);'.
229         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
230
231         '&vaesenc       ($inout,$inout,$roundkey);'.
232         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
233
234         '&vaesenc       ($inout,$inout,$roundkey);'.
235         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
236
237         '&vaesenclast   ($temp,$inout,$roundkey);'.
238         ' &vaesenc      ($inout,$inout,$roundkey);'.
239         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
240
241         '&vpand         ($iv,$temp,$mask10);'.
242         ' &vaesenc      ($inout,$inout,$roundkey);'.
243         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
244
245         '&vaesenclast   ($temp,$inout,$roundkey);'.
246         ' &vaesenc      ($inout,$inout,$roundkey);'.
247         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
248
249         '&vpand         ($temp,$temp,$mask12);'.
250         ' &vaesenc      ($inout,$inout,$roundkey);'.
251          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
252
253         '&vpor          ($iv,$iv,$temp);'.
254         ' &vaesenclast  ($temp,$inout,$roundkey);'.
255         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
256
257 ##      &mov            ($inp,$_inp);
258 ##      &mov            ($out,$_out);
259 ##      &vpand          ($temp,$temp,$mask14);
260 ##      &vpor           ($iv,$iv,$temp);
261 ##      &vmovdqu        ("($out,$inp)",$iv);
262 ##      &lea            ($inp,"16($inp)");
263 );
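# One fragment from the list above is spliced into each of the 16 SHA256
# rounds of a batch (via $aesni_cbc_idx in body_00_15 below), so a 16-byte
# CBC block goes through the AES rounds for every 16 SHA256 rounds, i.e.
# four AES blocks per 64-byte SHA256 block, and the data rates of the two
# algorithms match exactly. The $mask10/$mask12/$mask14 blends keep whichever
# vaesenclast result corresponds to the actual key length (10, 12 or 14
# rounds).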
264
265 my $a4=$T1;
266 my ($a,$b,$c,$d,$e,$f,$g,$h);
267
268 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
269 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
270   my $arg = pop;
271     $arg = "\$$arg" if ($arg*1 eq $arg);
272     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
273 }
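# For example, &ror($a0,14) with $a0="%r13d" appends "\tror\t\$14,%r13d\n" to
# $code: the last argument becomes the first operand (gaining a '$' prefix if
# it is numeric) and the remaining arguments follow in reverse, i.e. AT&T
# operand order.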
274
275 sub body_00_15 () {
276         (
277         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
278
279         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
280         '&mov   ($a,$a1)',
281         '&mov   ($a4,$f)',
282
283         '&xor   ($a0,$e)',
284         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
285         '&xor   ($a4,$g)',                      # f^g
286
287         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
288         '&xor   ($a1,$a)',
289         '&and   ($a4,$e)',                      # (f^g)&e
290
291         @aesni_cbc_block[$aesni_cbc_idx++].
292         '&xor   ($a0,$e)',
293         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
294         '&mov   ($a2,$a)',
295
296         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
297         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
298         '&xor   ($a2,$b)',                      # a^b, b^c in next round
299
300         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
301         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
302         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
303
304         '&xor   ($a1,$a)',
305         '&add   ($h,$a0)',                      # h+=Sigma1(e)
306         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
307
308         '&add   ($d,$h)',                       # d+=h
309         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
310         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
311
312         '&mov   ($a0,$d)',
313         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
314         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
315         );
316 }
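# Note: thanks to the register rotation, this round's a^b (in $a2) becomes
# next round's b^c (in $a3 after the swap), so Maj is evaluated above as
# ((b^c)&(a^b))^b; per bit that is b (== a) when a == b and c otherwise,
# which is exactly Maj(a,b,c) and is what the "Ch(a^b,c,b)" comment refers to.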
317
318 if ($avx) {{
319 ######################################################################
320 # XOP code path
321 #
322 $code.=<<___;
323 .type   ${func}_xop,\@function,6
324 .align  64
325 ${func}_xop:
326 .Lxop_shortcut:
327         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
328         push    %rbx
329         push    %rbp
330         push    %r12
331         push    %r13
332         push    %r14
333         push    %r15
334         mov     %rsp,%r11               # copy %rsp
335         sub     \$`$framesz+$win64*16*10`,%rsp
336         and     \$-64,%rsp              # align stack frame
337
338         shl     \$6,$len
339         sub     $inp,$out               # re-bias
340         sub     $inp,$in0
341         add     $inp,$len               # end of input
342
343         #mov    $inp,$_inp              # saved later
344         mov     $out,$_out
345         mov     $len,$_end
346         #mov    $key,$_key              # remains resident in $inp register
347         mov     $ivp,$_ivp
348         mov     $ctx,$_ctx
349         mov     $in0,$_in0
350         mov     %r11,$_rsp
351 ___
352 $code.=<<___ if ($win64);
353         movaps  %xmm6,`$framesz+16*0`(%rsp)
354         movaps  %xmm7,`$framesz+16*1`(%rsp)
355         movaps  %xmm8,`$framesz+16*2`(%rsp)
356         movaps  %xmm9,`$framesz+16*3`(%rsp)
357         movaps  %xmm10,`$framesz+16*4`(%rsp)
358         movaps  %xmm11,`$framesz+16*5`(%rsp)
359         movaps  %xmm12,`$framesz+16*6`(%rsp)
360         movaps  %xmm13,`$framesz+16*7`(%rsp)
361         movaps  %xmm14,`$framesz+16*8`(%rsp)
362         movaps  %xmm15,`$framesz+16*9`(%rsp)
363 ___
364 $code.=<<___;
365 .Lprologue_xop:
366         vzeroall
367
368         mov     $inp,%r12               # borrow $a4
369         lea     0x80($key),$inp         # size optimization, reassign
370         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
371         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
372         mov     $ctx,%r15               # borrow $a2
373         mov     $in0,%rsi               # borrow $a3
374         vmovdqu ($ivp),$iv              # load IV
375         sub     \$9,%r14
376
377         mov     $SZ*0(%r15),$A
378         mov     $SZ*1(%r15),$B
379         mov     $SZ*2(%r15),$C
380         mov     $SZ*3(%r15),$D
381         mov     $SZ*4(%r15),$E
382         mov     $SZ*5(%r15),$F
383         mov     $SZ*6(%r15),$G
384         mov     $SZ*7(%r15),$H
385
386         vmovdqa 0x00(%r13,%r14,8),$mask14
387         vmovdqa 0x10(%r13,%r14,8),$mask12
388         vmovdqa 0x20(%r13,%r14,8),$mask10
389         vmovdqu 0x00-0x80($inp),$roundkey
390         jmp     .Lloop_xop
391 ___
392                                         if ($SZ==4) {   # SHA256
393     my @X = map("%xmm$_",(0..3));
394     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
395
396 $code.=<<___;
397 .align  16
398 .Lloop_xop:
399         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
400         vmovdqu 0x00(%rsi,%r12),@X[0]
401         vmovdqu 0x10(%rsi,%r12),@X[1]
402         vmovdqu 0x20(%rsi,%r12),@X[2]
403         vmovdqu 0x30(%rsi,%r12),@X[3]
404         vpshufb $t3,@X[0],@X[0]
405         lea     $TABLE(%rip),$Tbl
406         vpshufb $t3,@X[1],@X[1]
407         vpshufb $t3,@X[2],@X[2]
408         vpaddd  0x00($Tbl),@X[0],$t0
409         vpshufb $t3,@X[3],@X[3]
410         vpaddd  0x20($Tbl),@X[1],$t1
411         vpaddd  0x40($Tbl),@X[2],$t2
412         vpaddd  0x60($Tbl),@X[3],$t3
413         vmovdqa $t0,0x00(%rsp)
414         mov     $A,$a1
415         vmovdqa $t1,0x10(%rsp)
416         mov     $B,$a3
417         vmovdqa $t2,0x20(%rsp)
418         xor     $C,$a3                  # magic
419         vmovdqa $t3,0x30(%rsp)
420         mov     $E,$a0
421         jmp     .Lxop_00_47
422
423 .align  16
424 .Lxop_00_47:
425         sub     \$-16*2*$SZ,$Tbl        # size optimization
426         vmovdqu (%r12),$inout           # $a4
427         mov     %r12,$_inp              # $a4
428 ___
429 sub XOP_256_00_47 () {
430 my $j = shift;
431 my $body = shift;
432 my @X = @_;
433 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
434
435         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
436           eval(shift(@insns));
437           eval(shift(@insns));
438          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
439           eval(shift(@insns));
440           eval(shift(@insns));
441         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
442           eval(shift(@insns));
443           eval(shift(@insns));
444         &vpsrld         ($t0,$t0,$sigma0[2]);
445           eval(shift(@insns));
446           eval(shift(@insns));
447          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
448           eval(shift(@insns));
449           eval(shift(@insns));
450           eval(shift(@insns));
451           eval(shift(@insns));
452         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
453           eval(shift(@insns));
454           eval(shift(@insns));
455         &vpxor          ($t0,$t0,$t1);
456           eval(shift(@insns));
457           eval(shift(@insns));
458           eval(shift(@insns));
459           eval(shift(@insns));
460          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
461           eval(shift(@insns));
462           eval(shift(@insns));
463         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
464           eval(shift(@insns));
465           eval(shift(@insns));
466          &vpsrld        ($t2,@X[3],$sigma1[2]);
467           eval(shift(@insns));
468           eval(shift(@insns));
469         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
470           eval(shift(@insns));
471           eval(shift(@insns));
472          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
473           eval(shift(@insns));
474           eval(shift(@insns));
475          &vpxor         ($t3,$t3,$t2);
476           eval(shift(@insns));
477           eval(shift(@insns));
478           eval(shift(@insns));
479           eval(shift(@insns));
480          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485         &vpsrldq        ($t3,$t3,8);
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
491           eval(shift(@insns));
492           eval(shift(@insns));
493           eval(shift(@insns));
494           eval(shift(@insns));
495          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
496           eval(shift(@insns));
497           eval(shift(@insns));
498          &vpsrld        ($t2,@X[0],$sigma1[2]);
499           eval(shift(@insns));
500           eval(shift(@insns));
501          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
502           eval(shift(@insns));
503           eval(shift(@insns));
504          &vpxor         ($t3,$t3,$t2);
505           eval(shift(@insns));
506           eval(shift(@insns));
507           eval(shift(@insns));
508           eval(shift(@insns));
509          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514         &vpslldq        ($t3,$t3,8);            # 22 instructions
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523           eval(shift(@insns));
524         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
525           foreach (@insns) { eval; }            # remaining instructions
526         &vmovdqa        (16*$j."(%rsp)",$t2);
527 }
528
529     $aesni_cbc_idx=0;
530     for ($i=0,$j=0; $j<4; $j++) {
531         &XOP_256_00_47($j,\&body_00_15,@X);
532         push(@X,shift(@X));                     # rotate(@X)
533     }
534         &mov            ("%r12",$_inp);         # borrow $a4
535         &vpand          ($temp,$temp,$mask14);
536         &mov            ("%r15",$_out);         # borrow $a2
537         &vpor           ($iv,$iv,$temp);
538         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
539         &lea            ("%r12","16(%r12)");    # inp++
540
541         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
542         &jne    (".Lxop_00_47");
543
544         &vmovdqu        ($inout,"(%r12)");
545         &mov            ($_inp,"%r12");
546
547     $aesni_cbc_idx=0;
548     for ($i=0; $i<16; ) {
549         foreach(body_00_15()) { eval; }
550     }
551                                         }
552 $code.=<<___;
553         mov     $_inp,%r12              # borrow $a4
554         mov     $_out,%r13              # borrow $a0
555         mov     $_ctx,%r15              # borrow $a2
556         mov     $_in0,%rsi              # borrow $a3
557
558         vpand   $mask14,$temp,$temp
559         mov     $a1,$A
560         vpor    $temp,$iv,$iv
561         vmovdqu $iv,(%r13,%r12)         # write output
562         lea     16(%r12),%r12           # inp++
563
564         add     $SZ*0(%r15),$A
565         add     $SZ*1(%r15),$B
566         add     $SZ*2(%r15),$C
567         add     $SZ*3(%r15),$D
568         add     $SZ*4(%r15),$E
569         add     $SZ*5(%r15),$F
570         add     $SZ*6(%r15),$G
571         add     $SZ*7(%r15),$H
572
573         cmp     $_end,%r12
574
575         mov     $A,$SZ*0(%r15)
576         mov     $B,$SZ*1(%r15)
577         mov     $C,$SZ*2(%r15)
578         mov     $D,$SZ*3(%r15)
579         mov     $E,$SZ*4(%r15)
580         mov     $F,$SZ*5(%r15)
581         mov     $G,$SZ*6(%r15)
582         mov     $H,$SZ*7(%r15)
583
584         jb      .Lloop_xop
585
586         mov     $_ivp,$ivp
587         mov     $_rsp,%rsi
588         vmovdqu $iv,($ivp)              # output IV
589         vzeroall
590 ___
591 $code.=<<___ if ($win64);
592         movaps  `$framesz+16*0`(%rsp),%xmm6
593         movaps  `$framesz+16*1`(%rsp),%xmm7
594         movaps  `$framesz+16*2`(%rsp),%xmm8
595         movaps  `$framesz+16*3`(%rsp),%xmm9
596         movaps  `$framesz+16*4`(%rsp),%xmm10
597         movaps  `$framesz+16*5`(%rsp),%xmm11
598         movaps  `$framesz+16*6`(%rsp),%xmm12
599         movaps  `$framesz+16*7`(%rsp),%xmm13
600         movaps  `$framesz+16*8`(%rsp),%xmm14
601         movaps  `$framesz+16*9`(%rsp),%xmm15
602 ___
603 $code.=<<___;
604         mov     (%rsi),%r15
605         mov     8(%rsi),%r14
606         mov     16(%rsi),%r13
607         mov     24(%rsi),%r12
608         mov     32(%rsi),%rbp
609         mov     40(%rsi),%rbx
610         lea     48(%rsi),%rsp
611 .Lepilogue_xop:
612         ret
613 .size   ${func}_xop,.-${func}_xop
614 ___
615 ######################################################################
616 # AVX+shrd code path
617 #
618 local *ror = sub { &shrd(@_[0],@_) };
619
620 $code.=<<___;
621 .type   ${func}_avx,\@function,6
622 .align  64
623 ${func}_avx:
624 .Lavx_shortcut:
625         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
626         push    %rbx
627         push    %rbp
628         push    %r12
629         push    %r13
630         push    %r14
631         push    %r15
632         mov     %rsp,%r11               # copy %rsp
633         sub     \$`$framesz+$win64*16*10`,%rsp
634         and     \$-64,%rsp              # align stack frame
635
636         shl     \$6,$len
637         sub     $inp,$out               # re-bias
638         sub     $inp,$in0
639         add     $inp,$len               # end of input
640
641         #mov    $inp,$_inp              # saved later
642         mov     $out,$_out
643         mov     $len,$_end
644         #mov    $key,$_key              # remains resident in $inp register
645         mov     $ivp,$_ivp
646         mov     $ctx,$_ctx
647         mov     $in0,$_in0
648         mov     %r11,$_rsp
649 ___
650 $code.=<<___ if ($win64);
651         movaps  %xmm6,`$framesz+16*0`(%rsp)
652         movaps  %xmm7,`$framesz+16*1`(%rsp)
653         movaps  %xmm8,`$framesz+16*2`(%rsp)
654         movaps  %xmm9,`$framesz+16*3`(%rsp)
655         movaps  %xmm10,`$framesz+16*4`(%rsp)
656         movaps  %xmm11,`$framesz+16*5`(%rsp)
657         movaps  %xmm12,`$framesz+16*6`(%rsp)
658         movaps  %xmm13,`$framesz+16*7`(%rsp)
659         movaps  %xmm14,`$framesz+16*8`(%rsp)
660         movaps  %xmm15,`$framesz+16*9`(%rsp)
661 ___
662 $code.=<<___;
663 .Lprologue_avx:
664         vzeroall
665
666         mov     $inp,%r12               # borrow $a4
667         lea     0x80($key),$inp         # size optimization, reassign
668         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
669         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
670         mov     $ctx,%r15               # borrow $a2
671         mov     $in0,%rsi               # borrow $a3
672         vmovdqu ($ivp),$iv              # load IV
673         sub     \$9,%r14
674
675         mov     $SZ*0(%r15),$A
676         mov     $SZ*1(%r15),$B
677         mov     $SZ*2(%r15),$C
678         mov     $SZ*3(%r15),$D
679         mov     $SZ*4(%r15),$E
680         mov     $SZ*5(%r15),$F
681         mov     $SZ*6(%r15),$G
682         mov     $SZ*7(%r15),$H
683
684         vmovdqa 0x00(%r13,%r14,8),$mask14
685         vmovdqa 0x10(%r13,%r14,8),$mask12
686         vmovdqa 0x20(%r13,%r14,8),$mask10
687         vmovdqu 0x00-0x80($inp),$roundkey
688 ___
689                                         if ($SZ==4) {   # SHA256
690     my @X = map("%xmm$_",(0..3));
691     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
692
693 $code.=<<___;
694         jmp     .Lloop_avx
695 .align  16
696 .Lloop_avx:
697         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
698         vmovdqu 0x00(%rsi,%r12),@X[0]
699         vmovdqu 0x10(%rsi,%r12),@X[1]
700         vmovdqu 0x20(%rsi,%r12),@X[2]
701         vmovdqu 0x30(%rsi,%r12),@X[3]
702         vpshufb $t3,@X[0],@X[0]
703         lea     $TABLE(%rip),$Tbl
704         vpshufb $t3,@X[1],@X[1]
705         vpshufb $t3,@X[2],@X[2]
706         vpaddd  0x00($Tbl),@X[0],$t0
707         vpshufb $t3,@X[3],@X[3]
708         vpaddd  0x20($Tbl),@X[1],$t1
709         vpaddd  0x40($Tbl),@X[2],$t2
710         vpaddd  0x60($Tbl),@X[3],$t3
711         vmovdqa $t0,0x00(%rsp)
712         mov     $A,$a1
713         vmovdqa $t1,0x10(%rsp)
714         mov     $B,$a3
715         vmovdqa $t2,0x20(%rsp)
716         xor     $C,$a3                  # magic
717         vmovdqa $t3,0x30(%rsp)
718         mov     $E,$a0
719         jmp     .Lavx_00_47
720
721 .align  16
722 .Lavx_00_47:
723         sub     \$-16*2*$SZ,$Tbl        # size optimization
724         vmovdqu (%r12),$inout           # $a4
725         mov     %r12,$_inp              # $a4
726 ___
727 sub Xupdate_256_AVX () {
728         (
729         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
730          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
731         '&vpsrld        ($t2,$t0,$sigma0[0]);',
732          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
733         '&vpsrld        ($t3,$t0,$sigma0[2])',
734         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
735         '&vpxor         ($t0,$t3,$t2)',
736          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
737         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
738         '&vpxor         ($t0,$t0,$t1)',
739         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
740         '&vpxor         ($t0,$t0,$t2)',
741          '&vpsrld       ($t2,$t3,$sigma1[2]);',
742         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
743          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
744         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
745          '&vpxor        ($t2,$t2,$t3);',
746          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
747          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
748          '&vpshufd      ($t2,$t2,0b10000100)',
749          '&vpsrldq      ($t2,$t2,8)',
750         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
751          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
752          '&vpsrld       ($t2,$t3,$sigma1[2])',
753          '&vpsrlq       ($t3,$t3,$sigma1[0])',
754          '&vpxor        ($t2,$t2,$t3);',
755          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
756          '&vpxor        ($t2,$t2,$t3)',
757          '&vpshufd      ($t2,$t2,0b11101000)',
758          '&vpslldq      ($t2,$t2,8)',
759         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
760         );
761 }
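# The shift counts above come from @sigma0=(7,18,3) and @sigma1=(17,19,10),
# i.e. the SHA256 message-schedule functions
#       sigma0(x) = (x >>> 7) ^ (x >>> 18) ^ (x >> 3)
#       sigma1(x) = (x >>> 17) ^ (x >>> 19) ^ (x >> 10)
# with the rotations built out of plain shifts and xors, since pre-XOP SIMD
# has no packed rotate instruction (the XOP path uses vprotd instead).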
762
763 sub AVX_256_00_47 () {
764 my $j = shift;
765 my $body = shift;
766 my @X = @_;
767 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
768
769         foreach (Xupdate_256_AVX()) {           # 29 instructions
770             eval;
771             eval(shift(@insns));
772             eval(shift(@insns));
773             eval(shift(@insns));
774         }
775         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
776           foreach (@insns) { eval; }            # remaining instructions
777         &vmovdqa        (16*$j."(%rsp)",$t2);
778 }
779
780     $aesni_cbc_idx=0;
781     for ($i=0,$j=0; $j<4; $j++) {
782         &AVX_256_00_47($j,\&body_00_15,@X);
783         push(@X,shift(@X));                     # rotate(@X)
784     }
785         &mov            ("%r12",$_inp);         # borrow $a4
786         &vpand          ($temp,$temp,$mask14);
787         &mov            ("%r15",$_out);         # borrow $a2
788         &vpor           ($iv,$iv,$temp);
789         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
790         &lea            ("%r12","16(%r12)");    # inp++
791
792         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
793         &jne    (".Lavx_00_47");
794
795         &vmovdqu        ($inout,"(%r12)");
796         &mov            ($_inp,"%r12");
797
798     $aesni_cbc_idx=0;
799     for ($i=0; $i<16; ) {
800         foreach(body_00_15()) { eval; }
801     }
802
803                                         }
804 $code.=<<___;
805         mov     $_inp,%r12              # borrow $a4
806         mov     $_out,%r13              # borrow $a0
807         mov     $_ctx,%r15              # borrow $a2
808         mov     $_in0,%rsi              # borrow $a3
809
810         vpand   $mask14,$temp,$temp
811         mov     $a1,$A
812         vpor    $temp,$iv,$iv
813         vmovdqu $iv,(%r13,%r12)         # write output
814         lea     16(%r12),%r12           # inp++
815
816         add     $SZ*0(%r15),$A
817         add     $SZ*1(%r15),$B
818         add     $SZ*2(%r15),$C
819         add     $SZ*3(%r15),$D
820         add     $SZ*4(%r15),$E
821         add     $SZ*5(%r15),$F
822         add     $SZ*6(%r15),$G
823         add     $SZ*7(%r15),$H
824
825         cmp     $_end,%r12
826
827         mov     $A,$SZ*0(%r15)
828         mov     $B,$SZ*1(%r15)
829         mov     $C,$SZ*2(%r15)
830         mov     $D,$SZ*3(%r15)
831         mov     $E,$SZ*4(%r15)
832         mov     $F,$SZ*5(%r15)
833         mov     $G,$SZ*6(%r15)
834         mov     $H,$SZ*7(%r15)
835         jb      .Lloop_avx
836
837         mov     $_ivp,$ivp
838         mov     $_rsp,%rsi
839         vmovdqu $iv,($ivp)              # output IV
840         vzeroall
841 ___
842 $code.=<<___ if ($win64);
843         movaps  `$framesz+16*0`(%rsp),%xmm6
844         movaps  `$framesz+16*1`(%rsp),%xmm7
845         movaps  `$framesz+16*2`(%rsp),%xmm8
846         movaps  `$framesz+16*3`(%rsp),%xmm9
847         movaps  `$framesz+16*4`(%rsp),%xmm10
848         movaps  `$framesz+16*5`(%rsp),%xmm11
849         movaps  `$framesz+16*6`(%rsp),%xmm12
850         movaps  `$framesz+16*7`(%rsp),%xmm13
851         movaps  `$framesz+16*8`(%rsp),%xmm14
852         movaps  `$framesz+16*9`(%rsp),%xmm15
853 ___
854 $code.=<<___;
855         mov     (%rsi),%r15
856         mov     8(%rsi),%r14
857         mov     16(%rsi),%r13
858         mov     24(%rsi),%r12
859         mov     32(%rsi),%rbp
860         mov     40(%rsi),%rbx
861         lea     48(%rsi),%rsp
862 .Lepilogue_avx:
863         ret
864 .size   ${func}_avx,.-${func}_avx
865 ___
866
867 if ($avx>1) {{
868 ######################################################################
869 # AVX2+BMI code path
870 #
871 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
872 my $PUSH8=8*2*$SZ;
873 use integer;
874
875 sub bodyx_00_15 () {
876         # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f
877         (
878         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
879
880         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
881         '&and   ($a4,$e)',              # f&e
882         '&rorx  ($a0,$e,$Sigma1[2])',
883         '&rorx  ($a2,$e,$Sigma1[1])',
884
885         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
886         '&lea   ($h,"($h,$a4)")',
887         '&andn  ($a4,$e,$g)',           # ~e&g
888         '&xor   ($a0,$a2)',
889
890         '&rorx  ($a1,$e,$Sigma1[0])',
891         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
892         '&xor   ($a0,$a1)',             # Sigma1(e)
893         '&mov   ($a2,$a)',
894
895         '&rorx  ($a4,$a,$Sigma0[2])',
896         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
897         '&xor   ($a2,$b)',              # a^b, b^c in next round
898         '&rorx  ($a1,$a,$Sigma0[1])',
899
900         '&rorx  ($a0,$a,$Sigma0[0])',
901         '&lea   ($d,"($d,$h)")',        # d+=h
902         '&and   ($a3,$a2)',             # (b^c)&(a^b)
903         @aesni_cbc_block[$aesni_cbc_idx++].
904         '&xor   ($a1,$a4)',
905
906         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
907         '&xor   ($a1,$a0)',             # Sigma0(a)
908         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
909         '&mov   ($a4,$e)',              # copy of f in future
910
911         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
912         );
913         # and at the finish one has to $a+=$a1
914 }
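# Note: (e&f) and (~e&g) are never set in the same bit position, so summing
# them with lea (a flags-preserving three-operand add) yields the same value
# as or-ing them, i.e. Ch(e,f,g), as the "(e&f)+(~e&g)" comment above notes.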
915
916 $code.=<<___;
917 .type   ${func}_avx2,\@function,6
918 .align  64
919 ${func}_avx2:
920 .Lavx2_shortcut:
921         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
922         push    %rbx
923         push    %rbp
924         push    %r12
925         push    %r13
926         push    %r14
927         push    %r15
928         mov     %rsp,%r11               # copy %rsp
929         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
930         and     \$-256*$SZ,%rsp         # align stack frame
931         add     \$`2*$SZ*($rounds-8)`,%rsp
932
933         shl     \$6,$len
934         sub     $inp,$out               # re-bias
935         sub     $inp,$in0
936         add     $inp,$len               # end of input
937
938         #mov    $inp,$_inp              # saved later
939         #mov    $out,$_out              # kept in $offload
940         mov     $len,$_end
941         #mov    $key,$_key              # remains resident in $inp register
942         mov     $ivp,$_ivp
943         mov     $ctx,$_ctx
944         mov     $in0,$_in0
945         mov     %r11,$_rsp
946 ___
947 $code.=<<___ if ($win64);
948         movaps  %xmm6,`$framesz+16*0`(%rsp)
949         movaps  %xmm7,`$framesz+16*1`(%rsp)
950         movaps  %xmm8,`$framesz+16*2`(%rsp)
951         movaps  %xmm9,`$framesz+16*3`(%rsp)
952         movaps  %xmm10,`$framesz+16*4`(%rsp)
953         movaps  %xmm11,`$framesz+16*5`(%rsp)
954         movaps  %xmm12,`$framesz+16*6`(%rsp)
955         movaps  %xmm13,`$framesz+16*7`(%rsp)
956         movaps  %xmm14,`$framesz+16*8`(%rsp)
957         movaps  %xmm15,`$framesz+16*9`(%rsp)
958 ___
959 $code.=<<___;
960 .Lprologue_avx2:
961         vzeroall
962
963         mov     $inp,%r13               # borrow $a0
964         vpinsrq \$1,$out,$offload,$offload
965         lea     0x80($key),$inp         # size optimization, reassign
966         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
967         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
968         mov     $ctx,%r15               # borrow $a2
969         mov     $in0,%rsi               # borrow $a3
970         vmovdqu ($ivp),$iv              # load IV
971         lea     -9(%r14),%r14
972
973         vmovdqa 0x00(%r12,%r14,8),$mask14
974         vmovdqa 0x10(%r12,%r14,8),$mask12
975         vmovdqa 0x20(%r12,%r14,8),$mask10
976
977         sub     \$-16*$SZ,%r13          # inp++, size optimization
978         mov     $SZ*0(%r15),$A
979         lea     (%rsi,%r13),%r12        # borrow $a0
980         mov     $SZ*1(%r15),$B
981         cmp     $len,%r13               # $_end
982         mov     $SZ*2(%r15),$C
983         cmove   %rsp,%r12               # next block or random data
984         mov     $SZ*3(%r15),$D
985         mov     $SZ*4(%r15),$E
986         mov     $SZ*5(%r15),$F
987         mov     $SZ*6(%r15),$G
988         mov     $SZ*7(%r15),$H
989         vmovdqu 0x00-0x80($inp),$roundkey
990 ___
991                                         if ($SZ==4) {   # SHA256
992     my @X = map("%ymm$_",(0..3));
993     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
994
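# A note on the two-block structure below (a summary of the code, not new
# behaviour): the low 128-bit lanes of @X carry the current 64-byte block and
# the high lanes, filled by vinserti128 from %r12, carry the following one;
# the first pass digests the current block while leaving the second block's
# pre-added W[i]+K[i] values on the stack, which the .Lower_avx2 pass then
# consumes without recomputing the message schedule.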
995 $code.=<<___;
996         jmp     .Loop_avx2
997 .align  16
998 .Loop_avx2:
999         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1000         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1001         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1002         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1003         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1004
1005         vinserti128     \$1,(%r12),@X[0],@X[0]
1006         vinserti128     \$1,16(%r12),@X[1],@X[1]
1007          vpshufb        $t3,@X[0],@X[0]
1008         vinserti128     \$1,32(%r12),@X[2],@X[2]
1009          vpshufb        $t3,@X[1],@X[1]
1010         vinserti128     \$1,48(%r12),@X[3],@X[3]
1011
1012         lea     $TABLE(%rip),$Tbl
1013         vpshufb $t3,@X[2],@X[2]
1014         lea     -16*$SZ(%r13),%r13
1015         vpaddd  0x00($Tbl),@X[0],$t0
1016         vpshufb $t3,@X[3],@X[3]
1017         vpaddd  0x20($Tbl),@X[1],$t1
1018         vpaddd  0x40($Tbl),@X[2],$t2
1019         vpaddd  0x60($Tbl),@X[3],$t3
1020         vmovdqa $t0,0x00(%rsp)
1021         xor     $a1,$a1
1022         vmovdqa $t1,0x20(%rsp)
1023         lea     -$PUSH8(%rsp),%rsp
1024         mov     $B,$a3
1025         vmovdqa $t2,0x00(%rsp)
1026         xor     $C,$a3                  # magic
1027         vmovdqa $t3,0x20(%rsp)
1028         mov     $F,$a4
1029         sub     \$-16*2*$SZ,$Tbl        # size optimization
1030         jmp     .Lavx2_00_47
1031
1032 .align  16
1033 .Lavx2_00_47:
1034         vmovdqu (%r13),$inout
1035         vpinsrq \$0,%r13,$offload,$offload
1036 ___
1037
1038 sub AVX2_256_00_47 () {
1039 my $j = shift;
1040 my $body = shift;
1041 my @X = @_;
1042 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1043 my $base = "+2*$PUSH8(%rsp)";
1044
1045         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1046         foreach (Xupdate_256_AVX()) {           # 29 instructions
1047             eval;
1048             eval(shift(@insns));
1049             eval(shift(@insns));
1050             eval(shift(@insns));
1051         }
1052         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1053           foreach (@insns) { eval; }            # remaining instructions
1054         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1055 }
1056     $aesni_cbc_idx=0;
1057     for ($i=0,$j=0; $j<4; $j++) {
1058         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1059         push(@X,shift(@X));                     # rotate(@X)
1060     }
1061         &vmovq          ("%r13",$offload);      # borrow $a0
1062         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1063         &vpand          ($temp,$temp,$mask14);
1064         &vpor           ($iv,$iv,$temp);
1065         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1066         &lea            ("%r13","16(%r13)");    # inp++
1067
1068         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1069         &cmpb   (($SZ-1)."($Tbl)",0);
1070         &jne    (".Lavx2_00_47");
1071
1072         &vmovdqu        ($inout,"(%r13)");
1073         &vpinsrq        ($offload,$offload,"%r13",0);
1074
1075     $aesni_cbc_idx=0;
1076     for ($i=0; $i<16; ) {
1077         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1078         foreach(bodyx_00_15()) { eval; }
1079     }
1080                                         }
1081 $code.=<<___;
1082         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1083         vmovq   $offload,%r13                   # $_inp, borrow $a0
1084         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1085         add     $a1,$A
1086         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1087
1088         vpand   $mask14,$temp,$temp
1089         vpor    $temp,$iv,$iv
1090         vmovdqu $iv,(%r12,%r13)                 # write output
1091         lea     16(%r13),%r13
1092
1093         add     $SZ*0(%r15),$A
1094         add     $SZ*1(%r15),$B
1095         add     $SZ*2(%r15),$C
1096         add     $SZ*3(%r15),$D
1097         add     $SZ*4(%r15),$E
1098         add     $SZ*5(%r15),$F
1099         add     $SZ*6(%r15),$G
1100         add     $SZ*7(%r15),$H
1101
1102         mov     $A,$SZ*0(%r15)
1103         mov     $B,$SZ*1(%r15)
1104         mov     $C,$SZ*2(%r15)
1105         mov     $D,$SZ*3(%r15)
1106         mov     $E,$SZ*4(%r15)
1107         mov     $F,$SZ*5(%r15)
1108         mov     $G,$SZ*6(%r15)
1109         mov     $H,$SZ*7(%r15)
1110
1111         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1112         je      .Ldone_avx2
1113
1114         xor     $a1,$a1
1115         mov     $B,$a3
1116         mov     $F,$a4
1117         xor     $C,$a3                  # magic
1118         jmp     .Lower_avx2
1119 .align  16
1120 .Lower_avx2:
1121         vmovdqu (%r13),$inout
1122         vpinsrq \$0,%r13,$offload,$offload
1123 ___
1124     $aesni_cbc_idx=0;
1125     for ($i=0; $i<16; ) {
1126         my $base="+16($Tbl)";
1127         foreach(bodyx_00_15()) { eval; }
1128         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1129     }
1130 $code.=<<___;
1131         vmovq   $offload,%r13                   # borrow $a0
1132         vpextrq \$1,$offload,%r15               # borrow $a2
1133         vpand   $mask14,$temp,$temp
1134         vpor    $temp,$iv,$iv
1135         lea     -$PUSH8($Tbl),$Tbl
1136         vmovdqu $iv,(%r15,%r13)                 # write output
1137         lea     16(%r13),%r13                   # inp++
1138         cmp     %rsp,$Tbl
1139         jae     .Lower_avx2
1140
1141         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1142         lea     16*$SZ(%r13),%r13
1143         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1144         add     $a1,$A
1145         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1146
1147         add     $SZ*0(%r15),$A
1148         add     $SZ*1(%r15),$B
1149         add     $SZ*2(%r15),$C
1150         add     $SZ*3(%r15),$D
1151         add     $SZ*4(%r15),$E
1152         add     $SZ*5(%r15),$F
1153         add     $SZ*6(%r15),$G
1154         lea     (%rsi,%r13),%r12
1155         add     $SZ*7(%r15),$H
1156
1157         cmp     $_end,%r13
1158
1159         mov     $A,$SZ*0(%r15)
1160         cmove   %rsp,%r12               # next block or stale data
1161         mov     $B,$SZ*1(%r15)
1162         mov     $C,$SZ*2(%r15)
1163         mov     $D,$SZ*3(%r15)
1164         mov     $E,$SZ*4(%r15)
1165         mov     $F,$SZ*5(%r15)
1166         mov     $G,$SZ*6(%r15)
1167         mov     $H,$SZ*7(%r15)
1168
1169         jbe     .Loop_avx2
1170         lea     (%rsp),$Tbl
1171
1172 .Ldone_avx2:
1173         lea     ($Tbl),%rsp
1174         mov     $_ivp,$ivp
1175         mov     $_rsp,%rsi
1176         vmovdqu $iv,($ivp)              # output IV
1177         vzeroall
1178 ___
1179 $code.=<<___ if ($win64);
1180         movaps  `$framesz+16*0`(%rsp),%xmm6
1181         movaps  `$framesz+16*1`(%rsp),%xmm7
1182         movaps  `$framesz+16*2`(%rsp),%xmm8
1183         movaps  `$framesz+16*3`(%rsp),%xmm9
1184         movaps  `$framesz+16*4`(%rsp),%xmm10
1185         movaps  `$framesz+16*5`(%rsp),%xmm11
1186         movaps  `$framesz+16*6`(%rsp),%xmm12
1187         movaps  `$framesz+16*7`(%rsp),%xmm13
1188         movaps  `$framesz+16*8`(%rsp),%xmm14
1189         movaps  `$framesz+16*9`(%rsp),%xmm15
1190 ___
1191 $code.=<<___;
1192         mov     (%rsi),%r15
1193         mov     8(%rsi),%r14
1194         mov     16(%rsi),%r13
1195         mov     24(%rsi),%r12
1196         mov     32(%rsi),%rbp
1197         mov     40(%rsi),%rbx
1198         lea     48(%rsi),%rsp
1199 .Lepilogue_avx2:
1200         ret
1201 .size   ${func}_avx2,.-${func}_avx2
1202 ___
1203 }}
1204 }}
1205 {{
1206 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1207
1208 my ($rounds,$Tbl)=("%r11d","%rbx");
1209
1210 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1211 my @rndkey=("%xmm4","%xmm5");
1212 my $r=0;
1213 my $sn=0;
1214
1215 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1216 my @MSG=map("%xmm$_",(10..13));
1217
1218 my $aesenc=sub {
1219   use integer;
1220   my ($n,$k)=($r/10,$r%10);
1221     if ($k==0) {
1222       $code.=<<___;
1223         movups          `16*$n`($in0),$in               # load input
1224         xorps           $rndkey0,$in
1225 ___
1226       $code.=<<___ if ($n);
1227         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1228 ___
1229       $code.=<<___;
1230         xorps           $in,$iv
1231         movups          `32+16*$k-112`($key),$rndkey[1]
1232         aesenc          $rndkey[0],$iv
1233 ___
1234     } elsif ($k==9) {
1235       $sn++;
1236       $code.=<<___;
1237         cmp             \$11,$rounds
1238         jb              .Laesenclast$sn
1239         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1240         aesenc          $rndkey[0],$iv
1241         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1242         aesenc          $rndkey[1],$iv
1243         je              .Laesenclast$sn
1244         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1245         aesenc          $rndkey[0],$iv
1246         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1247         aesenc          $rndkey[1],$iv
1248 .Laesenclast$sn:
1249         aesenclast      $rndkey[0],$iv
1250         movups          16-112($key),$rndkey[1]         # forward reference
1251         nop
1252 ___
1253     } else {
1254       $code.=<<___;
1255         movups          `32+16*$k-112`($key),$rndkey[1]
1256         aesenc          $rndkey[0],$iv
1257 ___
1258     }
1259     $r++;       unshift(@rndkey,pop(@rndkey));
1260 };
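# Note: $key is advanced by 112 bytes below ("lea 112($key),$key"), which is
# why the round keys above are addressed as `32+16*$k-112`($key); the
# cmp \$11/jb/je ladder at $k==9 appends the two extra rounds for AES-192 or
# the four extra rounds for AES-256, based on the count read from 240($key).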
1261
1262 $code.=<<___;
1263 .type   ${func}_shaext,\@function,6
1264 .align  32
1265 ${func}_shaext:
1266         mov     %rsp,%rax
1267         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1268         push    %rbx
1269 ___
1270 $code.=<<___ if ($win64);
1271         lea     `-4*16`(%rsp),%rsp
1272         movaps  %xmm6,-8-10*16(%rax)
1273         movaps  %xmm7,-8-9*16(%rax)
1274         movaps  %xmm8,-8-8*16(%rax)
1275         movaps  %xmm9,-8-7*16(%rax)
1276         movaps  %xmm10,-8-6*16(%rax)
1277         movaps  %xmm11,-8-5*16(%rax)
1278         movaps  %xmm12,-8-4*16(%rax)
1279         movaps  %xmm13,-8-3*16(%rax)
1280         movaps  %xmm14,-8-2*16(%rax)
1281         movaps  %xmm15,-8-1*16(%rax)
1282 .Lprologue_shaext:
1283 ___
1284 $code.=<<___;
1285         lea             K256+0x80(%rip),$Tbl
1286         movdqu          ($ctx),$ABEF            # DCBA
1287         movdqu          16($ctx),$CDGH          # HGFE
1288         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1289
1290         mov             240($key),$rounds
1291         sub             $in0,$out
1292         movups          ($key),$rndkey0         # $key[0]
1293         movups          16($key),$rndkey[0]     # forward reference
1294         lea             112($key),$key          # size optimization
1295
1296         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1297         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1298         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1299         movdqa          $TMP,$BSWAP             # offload
1300         palignr         \$8,$CDGH,$ABEF         # ABEF
1301         punpcklqdq      $Wi,$CDGH               # CDGH
1302
1303         jmp     .Loop_shaext
1304
1305 .align  16
1306 .Loop_shaext:
1307         movdqu          ($inp),@MSG[0]
1308         movdqu          0x10($inp),@MSG[1]
1309         movdqu          0x20($inp),@MSG[2]
1310         pshufb          $TMP,@MSG[0]
1311         movdqu          0x30($inp),@MSG[3]
1312
1313         movdqa          0*32-0x80($Tbl),$Wi
1314         paddd           @MSG[0],$Wi
1315         pshufb          $TMP,@MSG[1]
1316         movdqa          $CDGH,$CDGH_SAVE        # offload
1317         movdqa          $ABEF,$ABEF_SAVE        # offload
1318 ___
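# Each sha256rnds2 below performs two SHA256 rounds and takes its pair of
# W[i]+K[i] words implicitly from %xmm0 ($Wi); the "pshufd \$0x0e,$Wi,$Wi"
# between the paired instructions shifts the upper two words down for the
# next two rounds.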
1319         &$aesenc();
1320 $code.=<<___;
1321         sha256rnds2     $ABEF,$CDGH             # 0-3
1322         pshufd          \$0x0e,$Wi,$Wi
1323 ___
1324         &$aesenc();
1325 $code.=<<___;
1326         sha256rnds2     $CDGH,$ABEF
1327
1328         movdqa          1*32-0x80($Tbl),$Wi
1329         paddd           @MSG[1],$Wi
1330         pshufb          $TMP,@MSG[2]
1331         lea             0x40($inp),$inp
1332 ___
1333         &$aesenc();
1334 $code.=<<___;
1335         sha256rnds2     $ABEF,$CDGH             # 4-7
1336         pshufd          \$0x0e,$Wi,$Wi
1337 ___
1338         &$aesenc();
1339 $code.=<<___;
1340         sha256rnds2     $CDGH,$ABEF
1341
1342         movdqa          2*32-0x80($Tbl),$Wi
1343         paddd           @MSG[2],$Wi
1344         pshufb          $TMP,@MSG[3]
1345         sha256msg1      @MSG[1],@MSG[0]
1346 ___
1347         &$aesenc();
1348 $code.=<<___;
1349         sha256rnds2     $ABEF,$CDGH             # 8-11
1350         pshufd          \$0x0e,$Wi,$Wi
1351         movdqa          @MSG[3],$TMP
1352         palignr         \$4,@MSG[2],$TMP
1353         paddd           $TMP,@MSG[0]
1354 ___
1355         &$aesenc();
1356 $code.=<<___;
1357         sha256rnds2     $CDGH,$ABEF
1358
1359         movdqa          3*32-0x80($Tbl),$Wi
1360         paddd           @MSG[3],$Wi
1361         sha256msg2      @MSG[3],@MSG[0]
1362         sha256msg1      @MSG[2],@MSG[1]
1363 ___
1364         &$aesenc();
1365 $code.=<<___;
1366         sha256rnds2     $ABEF,$CDGH             # 12-15
1367         pshufd          \$0x0e,$Wi,$Wi
1368 ___
1369         &$aesenc();
1370 $code.=<<___;
1371         movdqa          @MSG[0],$TMP
1372         palignr         \$4,@MSG[3],$TMP
1373         paddd           $TMP,@MSG[1]
1374         sha256rnds2     $CDGH,$ABEF
1375 ___
1376 for($i=4;$i<16-3;$i++) {
1377         &$aesenc()      if (($r%10)==0);
1378 $code.=<<___;
1379         movdqa          $i*32-0x80($Tbl),$Wi
1380         paddd           @MSG[0],$Wi
1381         sha256msg2      @MSG[0],@MSG[1]
1382         sha256msg1      @MSG[3],@MSG[2]
1383 ___
1384         &$aesenc();
1385 $code.=<<___;
1386         sha256rnds2     $ABEF,$CDGH             # 16-19...
1387         pshufd          \$0x0e,$Wi,$Wi
1388         movdqa          @MSG[1],$TMP
1389         palignr         \$4,@MSG[0],$TMP
1390         paddd           $TMP,@MSG[2]
1391 ___
1392         &$aesenc();
1393         &$aesenc()      if ($r==19);
1394 $code.=<<___;
1395         sha256rnds2     $CDGH,$ABEF
1396 ___
1397         push(@MSG,shift(@MSG));
1398 }
1399 $code.=<<___;
1400         movdqa          13*32-0x80($Tbl),$Wi
1401         paddd           @MSG[0],$Wi
1402         sha256msg2      @MSG[0],@MSG[1]
1403         sha256msg1      @MSG[3],@MSG[2]
1404 ___
1405         &$aesenc();
1406 $code.=<<___;
1407         sha256rnds2     $ABEF,$CDGH             # 52-55
1408         pshufd          \$0x0e,$Wi,$Wi
1409         movdqa          @MSG[1],$TMP
1410         palignr         \$4,@MSG[0],$TMP
1411         paddd           $TMP,@MSG[2]
1412 ___
1413         &$aesenc();
1414         &$aesenc();
1415 $code.=<<___;
1416         sha256rnds2     $CDGH,$ABEF
1417
1418         movdqa          14*32-0x80($Tbl),$Wi
1419         paddd           @MSG[1],$Wi
1420         sha256msg2      @MSG[1],@MSG[2]
1421         movdqa          $BSWAP,$TMP
1422 ___
1423         &$aesenc();
1424 $code.=<<___;
1425         sha256rnds2     $ABEF,$CDGH             # 56-59
1426         pshufd          \$0x0e,$Wi,$Wi
1427 ___
1428         &$aesenc();
1429 $code.=<<___;
1430         sha256rnds2     $CDGH,$ABEF
1431
1432         movdqa          15*32-0x80($Tbl),$Wi
1433         paddd           @MSG[2],$Wi
1434 ___
1435         &$aesenc();
1436         &$aesenc();
1437 $code.=<<___;
1438         sha256rnds2     $ABEF,$CDGH             # 60-63
1439         pshufd          \$0x0e,$Wi,$Wi
1440 ___
1441         &$aesenc();
1442 $code.=<<___;
1443         sha256rnds2     $CDGH,$ABEF
1444         #pxor           $CDGH,$rndkey0          # black magic
1445 ___
1446         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1447 $code.=<<___;
1448         #xorps          $CDGH,$rndkey0          # black magic
1449         paddd           $CDGH_SAVE,$CDGH
1450         paddd           $ABEF_SAVE,$ABEF
1451
1452         dec             $len
1453         movups          $iv,48($out,$in0)       # write output
1454         lea             64($in0),$in0
1455         jnz             .Loop_shaext
1456
1457         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1458         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1459         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1460         punpckhqdq      $CDGH,$ABEF             # DCBA
1461         palignr         \$8,$TMP,$CDGH          # HGFE
1462
1463         movups          $iv,($ivp)              # write IV
1464         movdqu          $ABEF,($ctx)
1465         movdqu          $CDGH,16($ctx)
1466 ___
1467 $code.=<<___ if ($win64);
1468         movaps  -8-10*16(%rax),%xmm6
1469         movaps  -8-9*16(%rax),%xmm7
1470         movaps  -8-8*16(%rax),%xmm8
1471         movaps  -8-7*16(%rax),%xmm9
1472         movaps  -8-6*16(%rax),%xmm10
1473         movaps  -8-5*16(%rax),%xmm11
1474         movaps  -8-4*16(%rax),%xmm12
1475         movaps  -8-3*16(%rax),%xmm13
1476         movaps  -8-2*16(%rax),%xmm14
1477         movaps  -8-1*16(%rax),%xmm15
1478 .Lepilogue_shaext:
1479 ___
1480 $code.=<<___;
1481         mov     -8(%rax),%rbx
1482         mov     %rax,%rsp
1483         ret
1484 .size   ${func}_shaext,.-${func}_shaext
1485 ___
1486 }}}}}
1487
1488 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1489 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1490 if ($win64) {
1491 $rec="%rcx";
1492 $frame="%rdx";
1493 $context="%r8";
1494 $disp="%r9";
1495
1496 $code.=<<___ if ($avx);
1497 .extern __imp_RtlVirtualUnwind
1498 .type   se_handler,\@abi-omnipotent
1499 .align  16
1500 se_handler:
1501         push    %rsi
1502         push    %rdi
1503         push    %rbx
1504         push    %rbp
1505         push    %r12
1506         push    %r13
1507         push    %r14
1508         push    %r15
1509         pushfq
1510         sub     \$64,%rsp
1511
1512         mov     120($context),%rax      # pull context->Rax
1513         mov     248($context),%rbx      # pull context->Rip
1514
1515         mov     8($disp),%rsi           # disp->ImageBase
1516         mov     56($disp),%r11          # disp->HandlerData
1517
1518         mov     0(%r11),%r10d           # HandlerData[0]
1519         lea     (%rsi,%r10),%r10        # prologue label
1520         cmp     %r10,%rbx               # context->Rip<prologue label
1521         jb      .Lin_prologue
1522
1523         mov     152($context),%rax      # pull context->Rsp
1524
1525         mov     4(%r11),%r10d           # HandlerData[1]
1526         lea     (%rsi,%r10),%r10        # epilogue label
1527         cmp     %r10,%rbx               # context->Rip>=epilogue label
1528         jae     .Lin_prologue
1529 ___
1530 $code.=<<___ if ($avx>1);
1531         lea     .Lavx2_shortcut(%rip),%r10
1532         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1533         jb      .Lnot_in_avx2
1534
1535         and     \$-256*$SZ,%rax
1536         add     \$`2*$SZ*($rounds-8)`,%rax
1537 .Lnot_in_avx2:
1538 ___
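# The AVX2 prologue re-aligns %rsp to a -256*$SZ boundary and then biases it
# by 2*$SZ*($rounds-8); the adjustment above mirrors that, so $_rsp (the
# caller's stack pointer) can be read from 16*$SZ+7*8 off the recovered frame.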
1539 $code.=<<___;
1540         mov     %rax,%rsi               # put aside Rsp
1541         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1542         lea     48(%rax),%rax
1543
1544         mov     -8(%rax),%rbx
1545         mov     -16(%rax),%rbp
1546         mov     -24(%rax),%r12
1547         mov     -32(%rax),%r13
1548         mov     -40(%rax),%r14
1549         mov     -48(%rax),%r15
1550         mov     %rbx,144($context)      # restore context->Rbx
1551         mov     %rbp,160($context)      # restore context->Rbp
1552         mov     %r12,216($context)      # restore context->R12
1553         mov     %r13,224($context)      # restore context->R13
1554         mov     %r14,232($context)      # restore context->R14
1555         mov     %r15,240($context)      # restore context->R15
1556
1557         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1558         lea     512($context),%rdi      # &context.Xmm6
1559         mov     \$20,%ecx
1560         .long   0xa548f3fc              # cld; rep movsq
1561
1562 .Lin_prologue:
1563         mov     8(%rax),%rdi
1564         mov     16(%rax),%rsi
1565         mov     %rax,152($context)      # restore context->Rsp
1566         mov     %rsi,168($context)      # restore context->Rsi
1567         mov     %rdi,176($context)      # restore context->Rdi
1568
1569         mov     40($disp),%rdi          # disp->ContextRecord
1570         mov     $context,%rsi           # context
1571         mov     \$154,%ecx              # sizeof(CONTEXT)
1572         .long   0xa548f3fc              # cld; rep movsq
1573
1574         mov     $disp,%rsi
1575         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1576         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1577         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1578         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1579         mov     40(%rsi),%r10           # disp->ContextRecord
1580         lea     56(%rsi),%r11           # &disp->HandlerData
1581         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1582         mov     %r10,32(%rsp)           # arg5
1583         mov     %r11,40(%rsp)           # arg6
1584         mov     %r12,48(%rsp)           # arg7
1585         mov     %rcx,56(%rsp)           # arg8, (NULL)
1586         call    *__imp_RtlVirtualUnwind(%rip)
1587
1588         mov     \$1,%eax                # ExceptionContinueSearch
1589         add     \$64,%rsp
1590         popfq
1591         pop     %r15
1592         pop     %r14
1593         pop     %r13
1594         pop     %r12
1595         pop     %rbp
1596         pop     %rbx
1597         pop     %rdi
1598         pop     %rsi
1599         ret
1600 .size   se_handler,.-se_handler
1601
1602 .section        .pdata
1603         .rva    .LSEH_begin_${func}_xop
1604         .rva    .LSEH_end_${func}_xop
1605         .rva    .LSEH_info_${func}_xop
1606
1607         .rva    .LSEH_begin_${func}_avx
1608         .rva    .LSEH_end_${func}_avx
1609         .rva    .LSEH_info_${func}_avx
1610 ___
1611 $code.=<<___ if ($avx>1);
1612         .rva    .LSEH_begin_${func}_avx2
1613         .rva    .LSEH_end_${func}_avx2
1614         .rva    .LSEH_info_${func}_avx2
1615 ___
1616 $code.=<<___ if ($avx);
1617 .section        .xdata
1618 .align  8
1619 .LSEH_info_${func}_xop:
1620         .byte   9,0,0,0
1621         .rva    se_handler
1622         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1623
1624 .LSEH_info_${func}_avx:
1625         .byte   9,0,0,0
1626         .rva    se_handler
1627         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1628 ___
1629 $code.=<<___ if ($avx>1);
1630 .LSEH_info_${func}_avx2:
1631         .byte   9,0,0,0
1632         .rva    se_handler
1633         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1634 ___
1635 }
1636
1637 ####################################################################
1638 sub rex {
1639   local *opcode=shift;
1640   my ($dst,$src)=@_;
1641   my $rex=0;
1642
1643     $rex|=0x04                  if($dst>=8);
1644     $rex|=0x01                  if($src>=8);
1645     unshift @opcode,$rex|0x40   if($rex);
1646 }
1647
1648 {
1649   my %opcodelet = (
1650                 "sha256rnds2" => 0xcb,
1651                 "sha256msg1"  => 0xcc,
1652                 "sha256msg2"  => 0xcd   );
1653
1654   sub sha256op38 {
1655     my $instr = shift;
1656
1657     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1658       my @opcode=(0x0f,0x38);
1659         rex(\@opcode,$2,$1);
1660         push @opcode,$opcodelet{$instr};
1661         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1662         return ".byte\t".join(',',@opcode);
1663     } else {
1664         return $instr."\t".@_[0];
1665     }
1666   }
1667 }
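# For example, "sha256rnds2 %xmm1,%xmm2" as emitted by the code above is
# rewritten by the substitution at the bottom of the file into
# ".byte 0x0f,0x38,0xcb,0xd1", so the module assembles even with toolchains
# that do not know the SHA extension mnemonics.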
1668
1669 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1670 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1671 print $code;
1672 close STDOUT;