1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # January 2013
11 #
12 # This is an AESNI-CBC+SHA256 stitch implementation. The idea, as spelled
13 # out in http://download.intel.com/design/intarch/papers/323686.pdf, is
14 # that since AESNI-CBC encryption exhibits *very* low instruction-level
15 # parallelism, interleaving it with another algorithm allows better
16 # utilization of processor resources and therefore better performance.
17 # SHA256 instruction sequences(*) are taken from sha512-x86_64.pl and
18 # the AESNI code is woven into them. As SHA256 dominates execution time,
19 # stitch performance does not depend on AES key length. Below are
20 # performance numbers in cycles per processed byte, less is better,
21 # for standalone AESNI-CBC encrypt, standalone SHA256, and stitched
22 # subroutine:
23 #
24 #                AES-128/-192/-256+SHA256       this(**)gain
25 # Sandy Bridge      5.05/6.05/7.05+11.6         13.0    +28%/36%/43%
26 # Ivy Bridge        5.05/6.05/7.05+10.3         11.6    +32%/41%/50%
27 # Haswell           4.43/5.29/6.19+7.80         8.79    +39%/49%/59%
28 # Bulldozer         5.77/6.89/8.00+13.7         13.7    +42%/50%/58%
29 #
30 # (*)   there are XOP, AVX1 and AVX2 code paths, meaning that
31 #       Westmere is left out of the loop; the gain was not estimated
32 #       to be high enough to justify the effort;
33 # (**)  these are EVP-free results; results obtained with 'speed
34 #       -evp aes-256-cbc-hmac-sha256' will vary by a percent or two;
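#
# The gain column is simply (standalone AES + standalone SHA256) divided by
# the stitched figure; e.g. for Sandy Bridge with AES-128 it is
# (5.05+11.6)/13.0 = 1.28, i.e. +28%, and with AES-256 it is
# (7.05+11.6)/13.0 = 1.43, i.e. +43%.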
35
36 $flavour = shift;
37 $output  = shift;
38 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
39
40 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
41
42 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
43 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
44 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
45 die "can't locate x86_64-xlate.pl";
46
47 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
48                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
49         $avx = ($1>=2.19) + ($1>=2.22);
50 }
51
52 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
53            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.09) + ($1>=2.10);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
58            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
59         $avx = ($1>=10) + ($1>=11);
60 }
61
62 $shaext=1;      ### set to zero if compiling for 1.0.1
63 $avx=1          if (!$shaext && $avx);
64
65 open OUT,"| \"$^X\" $xlate $flavour $output";
66 *STDOUT=*OUT;
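# A typical stand-alone invocation of this generator looks as follows (an
# illustrative sketch; the OpenSSL build system normally drives it through
# its perlasm rules, with CC set so that the assembler probe above can
# enable the AVX paths):
#
#	perl aesni-sha256-x86_64.pl elf aesni-sha256-x86_64.s
#
# where the first argument is the perlasm flavour (elf, macosx, mingw64,
# nasm, masm, ...) and the second the output file handed to x86_64-xlate.pl.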
67
68 $func="aesni_cbc_sha256_enc";
69 $TABLE="K256";
70 $SZ=4;
71 @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
72                                 "%r8d","%r9d","%r10d","%r11d");
73 ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%esi");
74 @Sigma0=( 2,13,22);
75 @Sigma1=( 6,11,25);
76 @sigma0=( 7,18, 3);
77 @sigma1=(17,19,10);
78 $rounds=64;
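# For reference, the rotation/shift counts above encode the standard SHA-256
# functions of FIPS 180-4, with $SZ=4-byte words and $rounds=64 rounds; a
# minimal scalar sketch, assuming a 32-bit rotate-right helper ror32():
#
#	sub Sigma0 { my $x=shift; ror32($x, 2)^ror32($x,13)^ror32($x,22) }
#	sub Sigma1 { my $x=shift; ror32($x, 6)^ror32($x,11)^ror32($x,25) }
#	sub sigma0 { my $x=shift; ror32($x, 7)^ror32($x,18)^($x>> 3)     }
#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10)     }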
79
80 ########################################################################
81 # void aesni_cbc_sha256_enc(const void *inp,
82 #                       void *out,
83 #                       size_t length,
84 #                       const AES_KEY *key,
85 #                       unsigned char *iv,
86 #                       SHA256_CTX *ctx,
87 #                       const void *in0);
88 ($inp,  $out,  $len,  $key,  $ivp, $ctx, $in0) =
89 ("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
90
91 $Tbl="%rbp";
92
93 $_inp="16*$SZ+0*8(%rsp)";
94 $_out="16*$SZ+1*8(%rsp)";
95 $_end="16*$SZ+2*8(%rsp)";
96 $_key="16*$SZ+3*8(%rsp)";
97 $_ivp="16*$SZ+4*8(%rsp)";
98 $_ctx="16*$SZ+5*8(%rsp)";
99 $_in0="16*$SZ+6*8(%rsp)";
100 $_rsp="16*$SZ+7*8(%rsp)";
101 $framesz=16*$SZ+8*8;
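# With $SZ=4 the frame works out to 16*4+8*8 = 128 bytes: 64 bytes at the
# bottom for the 16 pre-scheduled message words stored at 0x00-0x30(%rsp)
# in the loops below, plus the eight 8-byte save slots $_inp..$_rsp above.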
102
103 $code=<<___;
104 .text
105
106 .extern OPENSSL_ia32cap_P
107 .globl  $func
108 .type   $func,\@abi-omnipotent
109 .align  16
110 $func:
111 ___
112 $code.=<<___ if ($avx);
113         lea     OPENSSL_ia32cap_P(%rip),%r11
114         mov     \$1,%eax
115         cmp     \$0,`$win64?"%rcx":"%rdi"`
116         je      .Lprobe
117         mov     0(%r11),%eax
118         mov     4(%r11),%r10
119 ___
120 $code.=<<___ if ($shaext);
121         bt      \$61,%r10                       # check for SHA
122         jc      ${func}_shaext
123 ___
124 $code.=<<___;
125         mov     %r10,%r11
126         shr     \$32,%r11
127
128         test    \$`1<<11`,%r10d                 # check for XOP
129         jnz     ${func}_xop
130 ___
131 $code.=<<___ if ($avx>1);
132         and     \$`1<<8|1<<5|1<<3`,%r11d        # check for BMI2+AVX2+BMI1
133         cmp     \$`1<<8|1<<5|1<<3`,%r11d
134         je      ${func}_avx2
135 ___
136 $code.=<<___ if ($avx);
137         and     \$`1<<30`,%eax                  # mask "Intel CPU" bit
138         and     \$`1<<28|1<<9`,%r10d            # mask AVX+SSSE3 bits
139         or      %eax,%r10d
140         cmp     \$`1<<28|1<<9|1<<30`,%r10d
141         je      ${func}_avx
142         ud2
143 ___
144 $code.=<<___;
145         xor     %eax,%eax
146         cmp     \$0,`$win64?"%rcx":"%rdi"`
147         je      .Lprobe
148         ud2
149 .Lprobe:
150         ret
151 .size   $func,.-$func
152
153 .align  64
154 .type   $TABLE,\@object
155 $TABLE:
156         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
157         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
158         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
159         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
160         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
161         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
162         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
163         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
164         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
165         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
166         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
167         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
170         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
171         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
172         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
173         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
174         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
175         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
176         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
177         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
178         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
179         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
180         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
181         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
182         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
183         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
184         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
185         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
186         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
187         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
188
189         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
190         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
191         .long   0,0,0,0,   0,0,0,0,   -1,-1,-1,-1
192         .long   0,0,0,0,   0,0,0,0
193         .asciz  "AESNI-CBC+SHA256 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
194 .align  64
195 ___
196
197 ######################################################################
198 # SIMD code paths
199 #
200 {{{
201 ($iv,$inout,$roundkey,$temp,
202  $mask10,$mask12,$mask14,$offload)=map("%xmm$_",(8..15));
203
204 $aesni_cbc_idx=0;
205 @aesni_cbc_block = (
206 ##      &vmovdqu        ($roundkey,"0x00-0x80($inp)");'
207 ##      &vmovdqu        ($inout,($inp));
208 ##      &mov            ($_inp,$inp);
209
210         '&vpxor         ($inout,$inout,$roundkey);'.
211         ' &vmovdqu      ($roundkey,"0x10-0x80($inp)");',
212
213         '&vpxor         ($inout,$inout,$iv);',
214
215         '&vaesenc       ($inout,$inout,$roundkey);'.
216         ' &vmovdqu      ($roundkey,"0x20-0x80($inp)");',
217
218         '&vaesenc       ($inout,$inout,$roundkey);'.
219         ' &vmovdqu      ($roundkey,"0x30-0x80($inp)");',
220
221         '&vaesenc       ($inout,$inout,$roundkey);'.
222         ' &vmovdqu      ($roundkey,"0x40-0x80($inp)");',
223
224         '&vaesenc       ($inout,$inout,$roundkey);'.
225         ' &vmovdqu      ($roundkey,"0x50-0x80($inp)");',
226
227         '&vaesenc       ($inout,$inout,$roundkey);'.
228         ' &vmovdqu      ($roundkey,"0x60-0x80($inp)");',
229
230         '&vaesenc       ($inout,$inout,$roundkey);'.
231         ' &vmovdqu      ($roundkey,"0x70-0x80($inp)");',
232
233         '&vaesenc       ($inout,$inout,$roundkey);'.
234         ' &vmovdqu      ($roundkey,"0x80-0x80($inp)");',
235
236         '&vaesenc       ($inout,$inout,$roundkey);'.
237         ' &vmovdqu      ($roundkey,"0x90-0x80($inp)");',
238
239         '&vaesenc       ($inout,$inout,$roundkey);'.
240         ' &vmovdqu      ($roundkey,"0xa0-0x80($inp)");',
241
242         '&vaesenclast   ($temp,$inout,$roundkey);'.
243         ' &vaesenc      ($inout,$inout,$roundkey);'.
244         ' &vmovdqu      ($roundkey,"0xb0-0x80($inp)");',
245
246         '&vpand         ($iv,$temp,$mask10);'.
247         ' &vaesenc      ($inout,$inout,$roundkey);'.
248         ' &vmovdqu      ($roundkey,"0xc0-0x80($inp)");',
249
250         '&vaesenclast   ($temp,$inout,$roundkey);'.
251         ' &vaesenc      ($inout,$inout,$roundkey);'.
252         ' &vmovdqu      ($roundkey,"0xd0-0x80($inp)");',
253
254         '&vpand         ($temp,$temp,$mask12);'.
255         ' &vaesenc      ($inout,$inout,$roundkey);'.
256          '&vmovdqu      ($roundkey,"0xe0-0x80($inp)");',
257
258         '&vpor          ($iv,$iv,$temp);'.
259         ' &vaesenclast  ($temp,$inout,$roundkey);'.
260         ' &vmovdqu      ($roundkey,"0x00-0x80($inp)");'
261
262 ##      &mov            ($inp,$_inp);
263 ##      &mov            ($out,$_out);
264 ##      &vpand          ($temp,$temp,$mask14);
265 ##      &vpor           ($iv,$iv,$temp);
266 ##      &vmovdqu        ("($out,$inp)",$iv);    # write output
267 ##      &lea            ($inp,"16($inp)");      # inp++
268 );
269
270 my $a4=$T1;
271 my ($a,$b,$c,$d,$e,$f,$g,$h);
272
273 sub AUTOLOAD()          # thunk [simplified] 32-bit style perlasm
274 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
275   my $arg = pop;
276     $arg = "\$$arg" if ($arg*1 eq $arg);
277     $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
278 }
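# For example (illustrative only, using the xmm assignments made above,
# $inout=%xmm9 and $roundkey=%xmm10), a call such as
#
#	&vpxor	($inout,$inout,$roundkey);
#
# is caught by AUTOLOAD and appends "\tvpxor\t%xmm10,%xmm9,%xmm9\n" to $code:
# the Perl argument list is destination-first and gets reversed into AT&T
# operand order, with bare numeric arguments turned into \$immediates.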
279
280 sub body_00_15 () {
281         (
282         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
283
284         '&ror   ($a0,$Sigma1[2]-$Sigma1[1])',
285         '&mov   ($a,$a1)',
286         '&mov   ($a4,$f)',
287
288         '&xor   ($a0,$e)',
289         '&ror   ($a1,$Sigma0[2]-$Sigma0[1])',
290         '&xor   ($a4,$g)',                      # f^g
291
292         '&ror   ($a0,$Sigma1[1]-$Sigma1[0])',
293         '&xor   ($a1,$a)',
294         '&and   ($a4,$e)',                      # (f^g)&e
295
296         @aesni_cbc_block[$aesni_cbc_idx++].
297         '&xor   ($a0,$e)',
298         '&add   ($h,$SZ*($i&15)."(%rsp)")',     # h+=X[i]+K[i]
299         '&mov   ($a2,$a)',
300
301         '&ror   ($a1,$Sigma0[1]-$Sigma0[0])',
302         '&xor   ($a4,$g)',                      # Ch(e,f,g)=((f^g)&e)^g
303         '&xor   ($a2,$b)',                      # a^b, b^c in next round
304
305         '&ror   ($a0,$Sigma1[0])',              # Sigma1(e)
306         '&add   ($h,$a4)',                      # h+=Ch(e,f,g)
307         '&and   ($a3,$a2)',                     # (b^c)&(a^b)
308
309         '&xor   ($a1,$a)',
310         '&add   ($h,$a0)',                      # h+=Sigma1(e)
311         '&xor   ($a3,$b)',                      # Maj(a,b,c)=Ch(a^b,c,b)
312
313         '&add   ($d,$h)',                       # d+=h
314         '&ror   ($a1,$Sigma0[0])',              # Sigma0(a)
315         '&add   ($h,$a3)',                      # h+=Maj(a,b,c)
316
317         '&mov   ($a0,$d)',
318         '&add   ($a1,$h);'.                     # h+=Sigma0(a)
319         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
320         );
321 }
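# Written out as a plain scalar sketch (illustrative, matching the per-line
# comments above), one body_00_15 iteration performs the standard SHA-256
# round
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	d += T1
#	h  = T1 + Sigma0(a) + Maj(a,b,c)
#
# with Ch(e,f,g) computed as ((f^g)&e)^g and Maj(a,b,c) as ((b^c)&(a^b))^b,
# while one @aesni_cbc_block fragment per round keeps the AES unit busy.
# Note the final h+=Sigma0(a) is accumulated in $a1 and picked up at the
# start of the following round.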
322
323 if ($avx) {{
324 ######################################################################
325 # XOP code path
326 #
327 $code.=<<___;
328 .type   ${func}_xop,\@function,6
329 .align  64
330 ${func}_xop:
331 .Lxop_shortcut:
332         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
333         push    %rbx
334         push    %rbp
335         push    %r12
336         push    %r13
337         push    %r14
338         push    %r15
339         mov     %rsp,%r11               # copy %rsp
340         sub     \$`$framesz+$win64*16*10`,%rsp
341         and     \$-64,%rsp              # align stack frame
342
343         shl     \$6,$len
344         sub     $inp,$out               # re-bias
345         sub     $inp,$in0
346         add     $inp,$len               # end of input
347
348         #mov    $inp,$_inp              # saved later
349         mov     $out,$_out
350         mov     $len,$_end
351         #mov    $key,$_key              # remains resident in $inp register
352         mov     $ivp,$_ivp
353         mov     $ctx,$_ctx
354         mov     $in0,$_in0
355         mov     %r11,$_rsp
356 ___
357 $code.=<<___ if ($win64);
358         movaps  %xmm6,`$framesz+16*0`(%rsp)
359         movaps  %xmm7,`$framesz+16*1`(%rsp)
360         movaps  %xmm8,`$framesz+16*2`(%rsp)
361         movaps  %xmm9,`$framesz+16*3`(%rsp)
362         movaps  %xmm10,`$framesz+16*4`(%rsp)
363         movaps  %xmm11,`$framesz+16*5`(%rsp)
364         movaps  %xmm12,`$framesz+16*6`(%rsp)
365         movaps  %xmm13,`$framesz+16*7`(%rsp)
366         movaps  %xmm14,`$framesz+16*8`(%rsp)
367         movaps  %xmm15,`$framesz+16*9`(%rsp)
368 ___
369 $code.=<<___;
370 .Lprologue_xop:
371         vzeroall
372
373         mov     $inp,%r12               # borrow $a4
374         lea     0x80($key),$inp         # size optimization, reassign
375         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
376         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
377         mov     $ctx,%r15               # borrow $a2
378         mov     $in0,%rsi               # borrow $a3
379         vmovdqu ($ivp),$iv              # load IV
380         sub     \$9,%r14
381
382         mov     $SZ*0(%r15),$A
383         mov     $SZ*1(%r15),$B
384         mov     $SZ*2(%r15),$C
385         mov     $SZ*3(%r15),$D
386         mov     $SZ*4(%r15),$E
387         mov     $SZ*5(%r15),$F
388         mov     $SZ*6(%r15),$G
389         mov     $SZ*7(%r15),$H
390
391         vmovdqa 0x00(%r13,%r14,8),$mask14
392         vmovdqa 0x10(%r13,%r14,8),$mask12
393         vmovdqa 0x20(%r13,%r14,8),$mask10
394         vmovdqu 0x00-0x80($inp),$roundkey
395         jmp     .Lloop_xop
396 ___
397                                         if ($SZ==4) {   # SHA256
398     my @X = map("%xmm$_",(0..3));
399     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
400
401 $code.=<<___;
402 .align  16
403 .Lloop_xop:
404         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
405         vmovdqu 0x00(%rsi,%r12),@X[0]
406         vmovdqu 0x10(%rsi,%r12),@X[1]
407         vmovdqu 0x20(%rsi,%r12),@X[2]
408         vmovdqu 0x30(%rsi,%r12),@X[3]
409         vpshufb $t3,@X[0],@X[0]
410         lea     $TABLE(%rip),$Tbl
411         vpshufb $t3,@X[1],@X[1]
412         vpshufb $t3,@X[2],@X[2]
413         vpaddd  0x00($Tbl),@X[0],$t0
414         vpshufb $t3,@X[3],@X[3]
415         vpaddd  0x20($Tbl),@X[1],$t1
416         vpaddd  0x40($Tbl),@X[2],$t2
417         vpaddd  0x60($Tbl),@X[3],$t3
418         vmovdqa $t0,0x00(%rsp)
419         mov     $A,$a1
420         vmovdqa $t1,0x10(%rsp)
421         mov     $B,$a3
422         vmovdqa $t2,0x20(%rsp)
423         xor     $C,$a3                  # magic
424         vmovdqa $t3,0x30(%rsp)
425         mov     $E,$a0
426         jmp     .Lxop_00_47
427
428 .align  16
429 .Lxop_00_47:
430         sub     \$-16*2*$SZ,$Tbl        # size optimization
431         vmovdqu (%r12),$inout           # $a4
432         mov     %r12,$_inp              # $a4
433 ___
434 sub XOP_256_00_47 () {
435 my $j = shift;
436 my $body = shift;
437 my @X = @_;
438 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
439
440         &vpalignr       ($t0,@X[1],@X[0],$SZ);  # X[1..4]
441           eval(shift(@insns));
442           eval(shift(@insns));
443          &vpalignr      ($t3,@X[3],@X[2],$SZ);  # X[9..12]
444           eval(shift(@insns));
445           eval(shift(@insns));
446         &vprotd         ($t1,$t0,8*$SZ-$sigma0[1]);
447           eval(shift(@insns));
448           eval(shift(@insns));
449         &vpsrld         ($t0,$t0,$sigma0[2]);
450           eval(shift(@insns));
451           eval(shift(@insns));
452          &vpaddd        (@X[0],@X[0],$t3);      # X[0..3] += X[9..12]
453           eval(shift(@insns));
454           eval(shift(@insns));
455           eval(shift(@insns));
456           eval(shift(@insns));
457         &vprotd         ($t2,$t1,$sigma0[1]-$sigma0[0]);
458           eval(shift(@insns));
459           eval(shift(@insns));
460         &vpxor          ($t0,$t0,$t1);
461           eval(shift(@insns));
462           eval(shift(@insns));
463           eval(shift(@insns));
464           eval(shift(@insns));
465          &vprotd        ($t3,@X[3],8*$SZ-$sigma1[1]);
466           eval(shift(@insns));
467           eval(shift(@insns));
468         &vpxor          ($t0,$t0,$t2);          # sigma0(X[1..4])
469           eval(shift(@insns));
470           eval(shift(@insns));
471          &vpsrld        ($t2,@X[3],$sigma1[2]);
472           eval(shift(@insns));
473           eval(shift(@insns));
474         &vpaddd         (@X[0],@X[0],$t0);      # X[0..3] += sigma0(X[1..4])
475           eval(shift(@insns));
476           eval(shift(@insns));
477          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
478           eval(shift(@insns));
479           eval(shift(@insns));
480          &vpxor         ($t3,$t3,$t2);
481           eval(shift(@insns));
482           eval(shift(@insns));
483           eval(shift(@insns));
484           eval(shift(@insns));
485          &vpxor         ($t3,$t3,$t1);          # sigma1(X[14..15])
486           eval(shift(@insns));
487           eval(shift(@insns));
488           eval(shift(@insns));
489           eval(shift(@insns));
490         &vpsrldq        ($t3,$t3,8);
491           eval(shift(@insns));
492           eval(shift(@insns));
493           eval(shift(@insns));
494           eval(shift(@insns));
495         &vpaddd         (@X[0],@X[0],$t3);      # X[0..1] += sigma1(X[14..15])
496           eval(shift(@insns));
497           eval(shift(@insns));
498           eval(shift(@insns));
499           eval(shift(@insns));
500          &vprotd        ($t3,@X[0],8*$SZ-$sigma1[1]);
501           eval(shift(@insns));
502           eval(shift(@insns));
503          &vpsrld        ($t2,@X[0],$sigma1[2]);
504           eval(shift(@insns));
505           eval(shift(@insns));
506          &vprotd        ($t1,$t3,$sigma1[1]-$sigma1[0]);
507           eval(shift(@insns));
508           eval(shift(@insns));
509          &vpxor         ($t3,$t3,$t2);
510           eval(shift(@insns));
511           eval(shift(@insns));
512           eval(shift(@insns));
513           eval(shift(@insns));
514          &vpxor         ($t3,$t3,$t1);          # sigma1(X[16..17])
515           eval(shift(@insns));
516           eval(shift(@insns));
517           eval(shift(@insns));
518           eval(shift(@insns));
519         &vpslldq        ($t3,$t3,8);            # 22 instructions
520           eval(shift(@insns));
521           eval(shift(@insns));
522           eval(shift(@insns));
523           eval(shift(@insns));
524         &vpaddd         (@X[0],@X[0],$t3);      # X[2..3] += sigma1(X[16..17])
525           eval(shift(@insns));
526           eval(shift(@insns));
527           eval(shift(@insns));
528           eval(shift(@insns));
529         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
530           foreach (@insns) { eval; }            # remaining instructions
531         &vmovdqa        (16*$j."(%rsp)",$t2);
532 }
533
534     $aesni_cbc_idx=0;
535     for ($i=0,$j=0; $j<4; $j++) {
536         &XOP_256_00_47($j,\&body_00_15,@X);
537         push(@X,shift(@X));                     # rotate(@X)
538     }
539         &mov            ("%r12",$_inp);         # borrow $a4
540         &vpand          ($temp,$temp,$mask14);
541         &mov            ("%r15",$_out);         # borrow $a2
542         &vpor           ($iv,$iv,$temp);
543         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
544         &lea            ("%r12","16(%r12)");    # inp++
545
546         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
547         &jne    (".Lxop_00_47");
548
549         &vmovdqu        ($inout,"(%r12)");
550         &mov            ($_inp,"%r12");
551
552     $aesni_cbc_idx=0;
553     for ($i=0; $i<16; ) {
554         foreach(body_00_15()) { eval; }
555     }
556                                         }
557 $code.=<<___;
558         mov     $_inp,%r12              # borrow $a4
559         mov     $_out,%r13              # borrow $a0
560         mov     $_ctx,%r15              # borrow $a2
561         mov     $_in0,%rsi              # borrow $a3
562
563         vpand   $mask14,$temp,$temp
564         mov     $a1,$A
565         vpor    $temp,$iv,$iv
566         vmovdqu $iv,(%r13,%r12)         # write output
567         lea     16(%r12),%r12           # inp++
568
569         add     $SZ*0(%r15),$A
570         add     $SZ*1(%r15),$B
571         add     $SZ*2(%r15),$C
572         add     $SZ*3(%r15),$D
573         add     $SZ*4(%r15),$E
574         add     $SZ*5(%r15),$F
575         add     $SZ*6(%r15),$G
576         add     $SZ*7(%r15),$H
577
578         cmp     $_end,%r12
579
580         mov     $A,$SZ*0(%r15)
581         mov     $B,$SZ*1(%r15)
582         mov     $C,$SZ*2(%r15)
583         mov     $D,$SZ*3(%r15)
584         mov     $E,$SZ*4(%r15)
585         mov     $F,$SZ*5(%r15)
586         mov     $G,$SZ*6(%r15)
587         mov     $H,$SZ*7(%r15)
588
589         jb      .Lloop_xop
590
591         mov     $_ivp,$ivp
592         mov     $_rsp,%rsi
593         vmovdqu $iv,($ivp)              # output IV
594         vzeroall
595 ___
596 $code.=<<___ if ($win64);
597         movaps  `$framesz+16*0`(%rsp),%xmm6
598         movaps  `$framesz+16*1`(%rsp),%xmm7
599         movaps  `$framesz+16*2`(%rsp),%xmm8
600         movaps  `$framesz+16*3`(%rsp),%xmm9
601         movaps  `$framesz+16*4`(%rsp),%xmm10
602         movaps  `$framesz+16*5`(%rsp),%xmm11
603         movaps  `$framesz+16*6`(%rsp),%xmm12
604         movaps  `$framesz+16*7`(%rsp),%xmm13
605         movaps  `$framesz+16*8`(%rsp),%xmm14
606         movaps  `$framesz+16*9`(%rsp),%xmm15
607 ___
608 $code.=<<___;
609         mov     (%rsi),%r15
610         mov     8(%rsi),%r14
611         mov     16(%rsi),%r13
612         mov     24(%rsi),%r12
613         mov     32(%rsi),%rbp
614         mov     40(%rsi),%rbx
615         lea     48(%rsi),%rsp
616 .Lepilogue_xop:
617         ret
618 .size   ${func}_xop,.-${func}_xop
619 ___
620 ######################################################################
621 # AVX+shrd code path
622 #
623 local *ror = sub { &shrd(@_[0],@_) };
624
625 $code.=<<___;
626 .type   ${func}_avx,\@function,6
627 .align  64
628 ${func}_avx:
629 .Lavx_shortcut:
630         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
631         push    %rbx
632         push    %rbp
633         push    %r12
634         push    %r13
635         push    %r14
636         push    %r15
637         mov     %rsp,%r11               # copy %rsp
638         sub     \$`$framesz+$win64*16*10`,%rsp
639         and     \$-64,%rsp              # align stack frame
640
641         shl     \$6,$len
642         sub     $inp,$out               # re-bias
643         sub     $inp,$in0
644         add     $inp,$len               # end of input
645
646         #mov    $inp,$_inp              # saved later
647         mov     $out,$_out
648         mov     $len,$_end
649         #mov    $key,$_key              # remains resident in $inp register
650         mov     $ivp,$_ivp
651         mov     $ctx,$_ctx
652         mov     $in0,$_in0
653         mov     %r11,$_rsp
654 ___
655 $code.=<<___ if ($win64);
656         movaps  %xmm6,`$framesz+16*0`(%rsp)
657         movaps  %xmm7,`$framesz+16*1`(%rsp)
658         movaps  %xmm8,`$framesz+16*2`(%rsp)
659         movaps  %xmm9,`$framesz+16*3`(%rsp)
660         movaps  %xmm10,`$framesz+16*4`(%rsp)
661         movaps  %xmm11,`$framesz+16*5`(%rsp)
662         movaps  %xmm12,`$framesz+16*6`(%rsp)
663         movaps  %xmm13,`$framesz+16*7`(%rsp)
664         movaps  %xmm14,`$framesz+16*8`(%rsp)
665         movaps  %xmm15,`$framesz+16*9`(%rsp)
666 ___
667 $code.=<<___;
668 .Lprologue_avx:
669         vzeroall
670
671         mov     $inp,%r12               # borrow $a4
672         lea     0x80($key),$inp         # size optimization, reassign
673         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r13    # borrow $a0
674         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
675         mov     $ctx,%r15               # borrow $a2
676         mov     $in0,%rsi               # borrow $a3
677         vmovdqu ($ivp),$iv              # load IV
678         sub     \$9,%r14
679
680         mov     $SZ*0(%r15),$A
681         mov     $SZ*1(%r15),$B
682         mov     $SZ*2(%r15),$C
683         mov     $SZ*3(%r15),$D
684         mov     $SZ*4(%r15),$E
685         mov     $SZ*5(%r15),$F
686         mov     $SZ*6(%r15),$G
687         mov     $SZ*7(%r15),$H
688
689         vmovdqa 0x00(%r13,%r14,8),$mask14
690         vmovdqa 0x10(%r13,%r14,8),$mask12
691         vmovdqa 0x20(%r13,%r14,8),$mask10
692         vmovdqu 0x00-0x80($inp),$roundkey
693 ___
694                                         if ($SZ==4) {   # SHA256
695     my @X = map("%xmm$_",(0..3));
696     my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
697
698 $code.=<<___;
699         jmp     .Lloop_avx
700 .align  16
701 .Lloop_avx:
702         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
703         vmovdqu 0x00(%rsi,%r12),@X[0]
704         vmovdqu 0x10(%rsi,%r12),@X[1]
705         vmovdqu 0x20(%rsi,%r12),@X[2]
706         vmovdqu 0x30(%rsi,%r12),@X[3]
707         vpshufb $t3,@X[0],@X[0]
708         lea     $TABLE(%rip),$Tbl
709         vpshufb $t3,@X[1],@X[1]
710         vpshufb $t3,@X[2],@X[2]
711         vpaddd  0x00($Tbl),@X[0],$t0
712         vpshufb $t3,@X[3],@X[3]
713         vpaddd  0x20($Tbl),@X[1],$t1
714         vpaddd  0x40($Tbl),@X[2],$t2
715         vpaddd  0x60($Tbl),@X[3],$t3
716         vmovdqa $t0,0x00(%rsp)
717         mov     $A,$a1
718         vmovdqa $t1,0x10(%rsp)
719         mov     $B,$a3
720         vmovdqa $t2,0x20(%rsp)
721         xor     $C,$a3                  # magic
722         vmovdqa $t3,0x30(%rsp)
723         mov     $E,$a0
724         jmp     .Lavx_00_47
725
726 .align  16
727 .Lavx_00_47:
728         sub     \$-16*2*$SZ,$Tbl        # size optimization
729         vmovdqu (%r12),$inout           # $a4
730         mov     %r12,$_inp              # $a4
731 ___
732 sub Xupdate_256_AVX () {
733         (
734         '&vpalignr      ($t0,@X[1],@X[0],$SZ)', # X[1..4]
735          '&vpalignr     ($t3,@X[3],@X[2],$SZ)', # X[9..12]
736         '&vpsrld        ($t2,$t0,$sigma0[0]);',
737          '&vpaddd       (@X[0],@X[0],$t3)',     # X[0..3] += X[9..12]
738         '&vpsrld        ($t3,$t0,$sigma0[2])',
739         '&vpslld        ($t1,$t0,8*$SZ-$sigma0[1]);',
740         '&vpxor         ($t0,$t3,$t2)',
741          '&vpshufd      ($t3,@X[3],0b11111010)',# X[14..15]
742         '&vpsrld        ($t2,$t2,$sigma0[1]-$sigma0[0]);',
743         '&vpxor         ($t0,$t0,$t1)',
744         '&vpslld        ($t1,$t1,$sigma0[1]-$sigma0[0]);',
745         '&vpxor         ($t0,$t0,$t2)',
746          '&vpsrld       ($t2,$t3,$sigma1[2]);',
747         '&vpxor         ($t0,$t0,$t1)',         # sigma0(X[1..4])
748          '&vpsrlq       ($t3,$t3,$sigma1[0]);',
749         '&vpaddd        (@X[0],@X[0],$t0)',     # X[0..3] += sigma0(X[1..4])
750          '&vpxor        ($t2,$t2,$t3);',
751          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
752          '&vpxor        ($t2,$t2,$t3)',         # sigma1(X[14..15])
753          '&vpshufd      ($t2,$t2,0b10000100)',
754          '&vpsrldq      ($t2,$t2,8)',
755         '&vpaddd        (@X[0],@X[0],$t2)',     # X[0..1] += sigma1(X[14..15])
756          '&vpshufd      ($t3,@X[0],0b01010000)',# X[16..17]
757          '&vpsrld       ($t2,$t3,$sigma1[2])',
758          '&vpsrlq       ($t3,$t3,$sigma1[0])',
759          '&vpxor        ($t2,$t2,$t3);',
760          '&vpsrlq       ($t3,$t3,$sigma1[1]-$sigma1[0])',
761          '&vpxor        ($t2,$t2,$t3)',
762          '&vpshufd      ($t2,$t2,0b11101000)',
763          '&vpslldq      ($t2,$t2,8)',
764         '&vpaddd        (@X[0],@X[0],$t2)'      # X[2..3] += sigma1(X[16..17])
765         );
766 }
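# The fragment above is the SHA-256 message schedule computed four words at
# a time; as a scalar sketch (illustrative), for i >= 16
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# where vpalignr extracts the X[1..4] and X[9..12] windows from adjacent
# registers and the shift/xor pairs assemble sigma0/sigma1 from the counts
# in @sigma0/@sigma1.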
767
768 sub AVX_256_00_47 () {
769 my $j = shift;
770 my $body = shift;
771 my @X = @_;
772 my @insns = (&$body,&$body,&$body,&$body);      # 104 instructions
773
774         foreach (Xupdate_256_AVX()) {           # 29 instructions
775             eval;
776             eval(shift(@insns));
777             eval(shift(@insns));
778             eval(shift(@insns));
779         }
780         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
781           foreach (@insns) { eval; }            # remaining instructions
782         &vmovdqa        (16*$j."(%rsp)",$t2);
783 }
784
785     $aesni_cbc_idx=0;
786     for ($i=0,$j=0; $j<4; $j++) {
787         &AVX_256_00_47($j,\&body_00_15,@X);
788         push(@X,shift(@X));                     # rotate(@X)
789     }
790         &mov            ("%r12",$_inp);         # borrow $a4
791         &vpand          ($temp,$temp,$mask14);
792         &mov            ("%r15",$_out);         # borrow $a2
793         &vpor           ($iv,$iv,$temp);
794         &vmovdqu        ("(%r15,%r12)",$iv);    # write output
795         &lea            ("%r12","16(%r12)");    # inp++
796
797         &cmpb   ($SZ-1+16*2*$SZ."($Tbl)",0);
798         &jne    (".Lavx_00_47");
799
800         &vmovdqu        ($inout,"(%r12)");
801         &mov            ($_inp,"%r12");
802
803     $aesni_cbc_idx=0;
804     for ($i=0; $i<16; ) {
805         foreach(body_00_15()) { eval; }
806     }
807
808                                         }
809 $code.=<<___;
810         mov     $_inp,%r12              # borrow $a4
811         mov     $_out,%r13              # borrow $a0
812         mov     $_ctx,%r15              # borrow $a2
813         mov     $_in0,%rsi              # borrow $a3
814
815         vpand   $mask14,$temp,$temp
816         mov     $a1,$A
817         vpor    $temp,$iv,$iv
818         vmovdqu $iv,(%r13,%r12)         # write output
819         lea     16(%r12),%r12           # inp++
820
821         add     $SZ*0(%r15),$A
822         add     $SZ*1(%r15),$B
823         add     $SZ*2(%r15),$C
824         add     $SZ*3(%r15),$D
825         add     $SZ*4(%r15),$E
826         add     $SZ*5(%r15),$F
827         add     $SZ*6(%r15),$G
828         add     $SZ*7(%r15),$H
829
830         cmp     $_end,%r12
831
832         mov     $A,$SZ*0(%r15)
833         mov     $B,$SZ*1(%r15)
834         mov     $C,$SZ*2(%r15)
835         mov     $D,$SZ*3(%r15)
836         mov     $E,$SZ*4(%r15)
837         mov     $F,$SZ*5(%r15)
838         mov     $G,$SZ*6(%r15)
839         mov     $H,$SZ*7(%r15)
840         jb      .Lloop_avx
841
842         mov     $_ivp,$ivp
843         mov     $_rsp,%rsi
844         vmovdqu $iv,($ivp)              # output IV
845         vzeroall
846 ___
847 $code.=<<___ if ($win64);
848         movaps  `$framesz+16*0`(%rsp),%xmm6
849         movaps  `$framesz+16*1`(%rsp),%xmm7
850         movaps  `$framesz+16*2`(%rsp),%xmm8
851         movaps  `$framesz+16*3`(%rsp),%xmm9
852         movaps  `$framesz+16*4`(%rsp),%xmm10
853         movaps  `$framesz+16*5`(%rsp),%xmm11
854         movaps  `$framesz+16*6`(%rsp),%xmm12
855         movaps  `$framesz+16*7`(%rsp),%xmm13
856         movaps  `$framesz+16*8`(%rsp),%xmm14
857         movaps  `$framesz+16*9`(%rsp),%xmm15
858 ___
859 $code.=<<___;
860         mov     (%rsi),%r15
861         mov     8(%rsi),%r14
862         mov     16(%rsi),%r13
863         mov     24(%rsi),%r12
864         mov     32(%rsi),%rbp
865         mov     40(%rsi),%rbx
866         lea     48(%rsi),%rsp
867 .Lepilogue_avx:
868         ret
869 .size   ${func}_avx,.-${func}_avx
870 ___
871
872 if ($avx>1) {{
873 ######################################################################
874 # AVX2+BMI code path
875 #
876 my $a5=$SZ==4?"%esi":"%rsi";    # zap $inp 
877 my $PUSH8=8*2*$SZ;
878 use integer;
879
880 sub bodyx_00_15 () {
881         # on entry $a1 must be zero, $a3 must hold $b^$c and $a4 a copy of $f
882         (
883         '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
884
885         '&add   ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)',    # h+=X[i]+K[i]
886         '&and   ($a4,$e)',              # f&e
887         '&rorx  ($a0,$e,$Sigma1[2])',
888         '&rorx  ($a2,$e,$Sigma1[1])',
889
890         '&lea   ($a,"($a,$a1)")',       # h+=Sigma0(a) from the past
891         '&lea   ($h,"($h,$a4)")',
892         '&andn  ($a4,$e,$g)',           # ~e&g
893         '&xor   ($a0,$a2)',
894
895         '&rorx  ($a1,$e,$Sigma1[0])',
896         '&lea   ($h,"($h,$a4)")',       # h+=Ch(e,f,g)=(e&f)+(~e&g)
897         '&xor   ($a0,$a1)',             # Sigma1(e)
898         '&mov   ($a2,$a)',
899
900         '&rorx  ($a4,$a,$Sigma0[2])',
901         '&lea   ($h,"($h,$a0)")',       # h+=Sigma1(e)
902         '&xor   ($a2,$b)',              # a^b, b^c in next round
903         '&rorx  ($a1,$a,$Sigma0[1])',
904
905         '&rorx  ($a0,$a,$Sigma0[0])',
906         '&lea   ($d,"($d,$h)")',        # d+=h
907         '&and   ($a3,$a2)',             # (b^c)&(a^b)
908         @aesni_cbc_block[$aesni_cbc_idx++].
909         '&xor   ($a1,$a4)',
910
911         '&xor   ($a3,$b)',              # Maj(a,b,c)=Ch(a^b,c,b)
912         '&xor   ($a1,$a0)',             # Sigma0(a)
913         '&lea   ($h,"($h,$a3)");'.      # h+=Maj(a,b,c)
914         '&mov   ($a4,$e)',              # copy of f in future
915
916         '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
917         );
918         # and at the finish the caller has to do $a+=$a1
919 }
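# Note on the Ch() form used above: "(e&f)+(~e&g)" is exact because the two
# terms can never have a set bit in common, (e&f)&(~e&g) = 0, so addition
# coincides with XOR; expressing it as an addition lets it be folded into
# the chain of lea/add instructions together with the other h+=... terms.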
920
921 $code.=<<___;
922 .type   ${func}_avx2,\@function,6
923 .align  64
924 ${func}_avx2:
925 .Lavx2_shortcut:
926         mov     `($win64?56:8)`(%rsp),$in0      # load 7th parameter
927         push    %rbx
928         push    %rbp
929         push    %r12
930         push    %r13
931         push    %r14
932         push    %r15
933         mov     %rsp,%r11               # copy %rsp
934         sub     \$`2*$SZ*$rounds+8*8+$win64*16*10`,%rsp
935         and     \$-256*$SZ,%rsp         # align stack frame
936         add     \$`2*$SZ*($rounds-8)`,%rsp
937
938         shl     \$6,$len
939         sub     $inp,$out               # re-bias
940         sub     $inp,$in0
941         add     $inp,$len               # end of input
942
943         #mov    $inp,$_inp              # saved later
944         #mov    $out,$_out              # kept in $offload
945         mov     $len,$_end
946         #mov    $key,$_key              # remains resident in $inp register
947         mov     $ivp,$_ivp
948         mov     $ctx,$_ctx
949         mov     $in0,$_in0
950         mov     %r11,$_rsp
951 ___
952 $code.=<<___ if ($win64);
953         movaps  %xmm6,`$framesz+16*0`(%rsp)
954         movaps  %xmm7,`$framesz+16*1`(%rsp)
955         movaps  %xmm8,`$framesz+16*2`(%rsp)
956         movaps  %xmm9,`$framesz+16*3`(%rsp)
957         movaps  %xmm10,`$framesz+16*4`(%rsp)
958         movaps  %xmm11,`$framesz+16*5`(%rsp)
959         movaps  %xmm12,`$framesz+16*6`(%rsp)
960         movaps  %xmm13,`$framesz+16*7`(%rsp)
961         movaps  %xmm14,`$framesz+16*8`(%rsp)
962         movaps  %xmm15,`$framesz+16*9`(%rsp)
963 ___
964 $code.=<<___;
965 .Lprologue_avx2:
966         vzeroall
967
968         mov     $inp,%r13               # borrow $a0
969         vpinsrq \$1,$out,$offload,$offload
970         lea     0x80($key),$inp         # size optimization, reassign
971         lea     $TABLE+`$SZ*2*$rounds+32`(%rip),%r12    # borrow $a4
972         mov     0xf0-0x80($inp),%r14d   # rounds, borrow $a1
973         mov     $ctx,%r15               # borrow $a2
974         mov     $in0,%rsi               # borrow $a3
975         vmovdqu ($ivp),$iv              # load IV
976         lea     -9(%r14),%r14
977
978         vmovdqa 0x00(%r12,%r14,8),$mask14
979         vmovdqa 0x10(%r12,%r14,8),$mask12
980         vmovdqa 0x20(%r12,%r14,8),$mask10
981
982         sub     \$-16*$SZ,%r13          # inp++, size optimization
983         mov     $SZ*0(%r15),$A
984         lea     (%rsi,%r13),%r12        # borrow $a0
985         mov     $SZ*1(%r15),$B
986         cmp     $len,%r13               # $_end
987         mov     $SZ*2(%r15),$C
988         cmove   %rsp,%r12               # next block or random data
989         mov     $SZ*3(%r15),$D
990         mov     $SZ*4(%r15),$E
991         mov     $SZ*5(%r15),$F
992         mov     $SZ*6(%r15),$G
993         mov     $SZ*7(%r15),$H
994         vmovdqu 0x00-0x80($inp),$roundkey
995 ___
996                                         if ($SZ==4) {   # SHA256
997     my @X = map("%ymm$_",(0..3));
998     my ($t0,$t1,$t2,$t3) = map("%ymm$_",(4..7));
999
1000 $code.=<<___;
1001         jmp     .Loop_avx2
1002 .align  16
1003 .Loop_avx2:
1004         vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
1005         vmovdqu -16*$SZ+0(%rsi,%r13),%xmm0
1006         vmovdqu -16*$SZ+16(%rsi,%r13),%xmm1
1007         vmovdqu -16*$SZ+32(%rsi,%r13),%xmm2
1008         vmovdqu -16*$SZ+48(%rsi,%r13),%xmm3
1009
1010         vinserti128     \$1,(%r12),@X[0],@X[0]
1011         vinserti128     \$1,16(%r12),@X[1],@X[1]
1012          vpshufb        $t3,@X[0],@X[0]
1013         vinserti128     \$1,32(%r12),@X[2],@X[2]
1014          vpshufb        $t3,@X[1],@X[1]
1015         vinserti128     \$1,48(%r12),@X[3],@X[3]
1016
1017         lea     $TABLE(%rip),$Tbl
1018         vpshufb $t3,@X[2],@X[2]
1019         lea     -16*$SZ(%r13),%r13
1020         vpaddd  0x00($Tbl),@X[0],$t0
1021         vpshufb $t3,@X[3],@X[3]
1022         vpaddd  0x20($Tbl),@X[1],$t1
1023         vpaddd  0x40($Tbl),@X[2],$t2
1024         vpaddd  0x60($Tbl),@X[3],$t3
1025         vmovdqa $t0,0x00(%rsp)
1026         xor     $a1,$a1
1027         vmovdqa $t1,0x20(%rsp)
1028         lea     -$PUSH8(%rsp),%rsp
1029         mov     $B,$a3
1030         vmovdqa $t2,0x00(%rsp)
1031         xor     $C,$a3                  # magic
1032         vmovdqa $t3,0x20(%rsp)
1033         mov     $F,$a4
1034         sub     \$-16*2*$SZ,$Tbl        # size optimization
1035         jmp     .Lavx2_00_47
1036
1037 .align  16
1038 .Lavx2_00_47:
1039         vmovdqu (%r13),$inout
1040         vpinsrq \$0,%r13,$offload,$offload
1041 ___
1042
1043 sub AVX2_256_00_47 () {
1044 my $j = shift;
1045 my $body = shift;
1046 my @X = @_;
1047 my @insns = (&$body,&$body,&$body,&$body);      # 96 instructions
1048 my $base = "+2*$PUSH8(%rsp)";
1049
1050         &lea    ("%rsp","-$PUSH8(%rsp)")        if (($j%2)==0);
1051         foreach (Xupdate_256_AVX()) {           # 29 instructions
1052             eval;
1053             eval(shift(@insns));
1054             eval(shift(@insns));
1055             eval(shift(@insns));
1056         }
1057         &vpaddd         ($t2,@X[0],16*2*$j."($Tbl)");
1058           foreach (@insns) { eval; }            # remaining instructions
1059         &vmovdqa        ((32*$j)%$PUSH8."(%rsp)",$t2);
1060 }
1061     $aesni_cbc_idx=0;
1062     for ($i=0,$j=0; $j<4; $j++) {
1063         &AVX2_256_00_47($j,\&bodyx_00_15,@X);
1064         push(@X,shift(@X));                     # rotate(@X)
1065     }
1066         &vmovq          ("%r13",$offload);      # borrow $a0
1067         &vpextrq        ("%r15",$offload,1);    # borrow $a2
1068         &vpand          ($temp,$temp,$mask14);
1069         &vpor           ($iv,$iv,$temp);
1070         &vmovdqu        ("(%r15,%r13)",$iv);    # write output
1071         &lea            ("%r13","16(%r13)");    # inp++
1072
1073         &lea    ($Tbl,16*2*$SZ."($Tbl)");
1074         &cmpb   (($SZ-1)."($Tbl)",0);
1075         &jne    (".Lavx2_00_47");
1076
1077         &vmovdqu        ($inout,"(%r13)");
1078         &vpinsrq        ($offload,$offload,"%r13",0);
1079
1080     $aesni_cbc_idx=0;
1081     for ($i=0; $i<16; ) {
1082         my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
1083         foreach(bodyx_00_15()) { eval; }
1084     }
1085                                         }
1086 $code.=<<___;
1087         vpextrq \$1,$offload,%r12               # $_out, borrow $a4
1088         vmovq   $offload,%r13                   # $_inp, borrow $a0
1089         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1090         add     $a1,$A
1091         lea     `2*$SZ*($rounds-8)`(%rsp),$Tbl
1092
1093         vpand   $mask14,$temp,$temp
1094         vpor    $temp,$iv,$iv
1095         vmovdqu $iv,(%r12,%r13)                 # write output
1096         lea     16(%r13),%r13
1097
1098         add     $SZ*0(%r15),$A
1099         add     $SZ*1(%r15),$B
1100         add     $SZ*2(%r15),$C
1101         add     $SZ*3(%r15),$D
1102         add     $SZ*4(%r15),$E
1103         add     $SZ*5(%r15),$F
1104         add     $SZ*6(%r15),$G
1105         add     $SZ*7(%r15),$H
1106
1107         mov     $A,$SZ*0(%r15)
1108         mov     $B,$SZ*1(%r15)
1109         mov     $C,$SZ*2(%r15)
1110         mov     $D,$SZ*3(%r15)
1111         mov     $E,$SZ*4(%r15)
1112         mov     $F,$SZ*5(%r15)
1113         mov     $G,$SZ*6(%r15)
1114         mov     $H,$SZ*7(%r15)
1115
1116         cmp     `$PUSH8+2*8`($Tbl),%r13         # $_end
1117         je      .Ldone_avx2
1118
1119         xor     $a1,$a1
1120         mov     $B,$a3
1121         mov     $F,$a4
1122         xor     $C,$a3                  # magic
1123         jmp     .Lower_avx2
1124 .align  16
1125 .Lower_avx2:
1126         vmovdqu (%r13),$inout
1127         vpinsrq \$0,%r13,$offload,$offload
1128 ___
1129     $aesni_cbc_idx=0;
1130     for ($i=0; $i<16; ) {
1131         my $base="+16($Tbl)";
1132         foreach(bodyx_00_15()) { eval; }
1133         &lea    ($Tbl,"-$PUSH8($Tbl)")  if ($i==8);
1134     }
1135 $code.=<<___;
1136         vmovq   $offload,%r13                   # borrow $a0
1137         vpextrq \$1,$offload,%r15               # borrow $a2
1138         vpand   $mask14,$temp,$temp
1139         vpor    $temp,$iv,$iv
1140         lea     -$PUSH8($Tbl),$Tbl
1141         vmovdqu $iv,(%r15,%r13)                 # write output
1142         lea     16(%r13),%r13                   # inp++
1143         cmp     %rsp,$Tbl
1144         jae     .Lower_avx2
1145
1146         mov     `2*$SZ*$rounds+5*8`(%rsp),%r15  # $_ctx, borrow $a2
1147         lea     16*$SZ(%r13),%r13
1148         mov     `2*$SZ*$rounds+6*8`(%rsp),%rsi  # $_in0, borrow $a3
1149         add     $a1,$A
1150         lea     `2*$SZ*($rounds-8)`(%rsp),%rsp
1151
1152         add     $SZ*0(%r15),$A
1153         add     $SZ*1(%r15),$B
1154         add     $SZ*2(%r15),$C
1155         add     $SZ*3(%r15),$D
1156         add     $SZ*4(%r15),$E
1157         add     $SZ*5(%r15),$F
1158         add     $SZ*6(%r15),$G
1159         lea     (%rsi,%r13),%r12
1160         add     $SZ*7(%r15),$H
1161
1162         cmp     $_end,%r13
1163
1164         mov     $A,$SZ*0(%r15)
1165         cmove   %rsp,%r12               # next block or stale data
1166         mov     $B,$SZ*1(%r15)
1167         mov     $C,$SZ*2(%r15)
1168         mov     $D,$SZ*3(%r15)
1169         mov     $E,$SZ*4(%r15)
1170         mov     $F,$SZ*5(%r15)
1171         mov     $G,$SZ*6(%r15)
1172         mov     $H,$SZ*7(%r15)
1173
1174         jbe     .Loop_avx2
1175         lea     (%rsp),$Tbl
1176
1177 .Ldone_avx2:
1178         lea     ($Tbl),%rsp
1179         mov     $_ivp,$ivp
1180         mov     $_rsp,%rsi
1181         vmovdqu $iv,($ivp)              # output IV
1182         vzeroall
1183 ___
1184 $code.=<<___ if ($win64);
1185         movaps  `$framesz+16*0`(%rsp),%xmm6
1186         movaps  `$framesz+16*1`(%rsp),%xmm7
1187         movaps  `$framesz+16*2`(%rsp),%xmm8
1188         movaps  `$framesz+16*3`(%rsp),%xmm9
1189         movaps  `$framesz+16*4`(%rsp),%xmm10
1190         movaps  `$framesz+16*5`(%rsp),%xmm11
1191         movaps  `$framesz+16*6`(%rsp),%xmm12
1192         movaps  `$framesz+16*7`(%rsp),%xmm13
1193         movaps  `$framesz+16*8`(%rsp),%xmm14
1194         movaps  `$framesz+16*9`(%rsp),%xmm15
1195 ___
1196 $code.=<<___;
1197         mov     (%rsi),%r15
1198         mov     8(%rsi),%r14
1199         mov     16(%rsi),%r13
1200         mov     24(%rsi),%r12
1201         mov     32(%rsi),%rbp
1202         mov     40(%rsi),%rbx
1203         lea     48(%rsi),%rsp
1204 .Lepilogue_avx2:
1205         ret
1206 .size   ${func}_avx2,.-${func}_avx2
1207 ___
1208 }}
1209 }}
1210 {{
1211 my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
1212
1213 my ($rounds,$Tbl)=("%r11d","%rbx");
1214
1215 my ($iv,$in,$rndkey0)=map("%xmm$_",(6,14,15));
1216 my @rndkey=("%xmm4","%xmm5");
1217 my $r=0;
1218 my $sn=0;
1219
1220 my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..3,7..9));
1221 my @MSG=map("%xmm$_",(10..13));
1222
1223 my $aesenc=sub {
1224   use integer;
1225   my ($n,$k)=($r/10,$r%10);
1226     if ($k==0) {
1227       $code.=<<___;
1228         movups          `16*$n`($in0),$in               # load input
1229         xorps           $rndkey0,$in
1230 ___
1231       $code.=<<___ if ($n);
1232         movups          $iv,`16*($n-1)`($out,$in0)      # write output
1233 ___
1234       $code.=<<___;
1235         xorps           $in,$iv
1236         movups          `32+16*$k-112`($key),$rndkey[1]
1237         aesenc          $rndkey[0],$iv
1238 ___
1239     } elsif ($k==9) {
1240       $sn++;
1241       $code.=<<___;
1242         cmp             \$11,$rounds
1243         jb              .Laesenclast$sn
1244         movups          `32+16*($k+0)-112`($key),$rndkey[1]
1245         aesenc          $rndkey[0],$iv
1246         movups          `32+16*($k+1)-112`($key),$rndkey[0]
1247         aesenc          $rndkey[1],$iv
1248         je              .Laesenclast$sn
1249         movups          `32+16*($k+2)-112`($key),$rndkey[1]
1250         aesenc          $rndkey[0],$iv
1251         movups          `32+16*($k+3)-112`($key),$rndkey[0]
1252         aesenc          $rndkey[1],$iv
1253 .Laesenclast$sn:
1254         aesenclast      $rndkey[0],$iv
1255         movups          16-112($key),$rndkey[1]         # forward reference
1256         nop
1257 ___
1258     } else {
1259       $code.=<<___;
1260         movups          `32+16*$k-112`($key),$rndkey[1]
1261         aesenc          $rndkey[0],$iv
1262 ___
1263     }
1264     $r++;       unshift(@rndkey,pop(@rndkey));
1265 };
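# $aesenc emits exactly one AES round per invocation: $r counts invocations,
# $n=$r/10 is the data block being processed and $k=$r%10 the round position
# within it. $k==0 loads and whitens a fresh input block (and stores the
# previous ciphertext block), $k==9 finishes the block, branching on the
# key's round count to cover 10/12/14-round (AES-128/-192/-256) schedules;
# $sn merely gives each .Laesenclast label a unique suffix.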
1266
1267 if ($shaext) {
1268 my $Tbl="%rax";
1269
1270 $code.=<<___;
1271 .type   ${func}_shaext,\@function,6
1272 .align  32
1273 ${func}_shaext:
1274         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
1275 ___
1276 $code.=<<___ if ($win64);
1277         lea     `-8-10*16`(%rsp),%rsp
1278         movaps  %xmm6,-8-10*16(%rax)
1279         movaps  %xmm7,-8-9*16(%rax)
1280         movaps  %xmm8,-8-8*16(%rax)
1281         movaps  %xmm9,-8-7*16(%rax)
1282         movaps  %xmm10,-8-6*16(%rax)
1283         movaps  %xmm11,-8-5*16(%rax)
1284         movaps  %xmm12,-8-4*16(%rax)
1285         movaps  %xmm13,-8-3*16(%rax)
1286         movaps  %xmm14,-8-2*16(%rax)
1287         movaps  %xmm15,-8-1*16(%rax)
1288 .Lprologue_shaext:
1289 ___
1290 $code.=<<___;
1291         lea             K256+0x80(%rip),$Tbl
1292         movdqu          ($ctx),$ABEF            # DCBA
1293         movdqu          16($ctx),$CDGH          # HGFE
1294         movdqa          0x200-0x80($Tbl),$TMP   # byte swap mask
1295
1296         mov             240($key),$rounds
1297         sub             $in0,$out
1298         movups          ($key),$rndkey0         # $key[0]
1299         movups          16($key),$rndkey[0]     # forward reference
1300         lea             112($key),$key          # size optimization
1301
1302         pshufd          \$0x1b,$ABEF,$Wi        # ABCD
1303         pshufd          \$0xb1,$ABEF,$ABEF      # CDAB
1304         pshufd          \$0x1b,$CDGH,$CDGH      # EFGH
1305         movdqa          $TMP,$BSWAP             # offload
1306         palignr         \$8,$CDGH,$ABEF         # ABEF
1307         punpcklqdq      $Wi,$CDGH               # CDGH
1308
1309         jmp     .Loop_shaext
1310
1311 .align  16
1312 .Loop_shaext:
1313         movdqu          ($inp),@MSG[0]
1314         movdqu          0x10($inp),@MSG[1]
1315         movdqu          0x20($inp),@MSG[2]
1316         pshufb          $TMP,@MSG[0]
1317         movdqu          0x30($inp),@MSG[3]
1318
1319         movdqa          0*32-0x80($Tbl),$Wi
1320         paddd           @MSG[0],$Wi
1321         pshufb          $TMP,@MSG[1]
1322         movdqa          $CDGH,$CDGH_SAVE        # offload
1323         movdqa          $ABEF,$ABEF_SAVE        # offload
1324 ___
1325         &$aesenc();
1326 $code.=<<___;
1327         sha256rnds2     $ABEF,$CDGH             # 0-3
1328         pshufd          \$0x0e,$Wi,$Wi
1329 ___
1330         &$aesenc();
1331 $code.=<<___;
1332         sha256rnds2     $CDGH,$ABEF
1333
1334         movdqa          1*32-0x80($Tbl),$Wi
1335         paddd           @MSG[1],$Wi
1336         pshufb          $TMP,@MSG[2]
1337         lea             0x40($inp),$inp
1338 ___
1339         &$aesenc();
1340 $code.=<<___;
1341         sha256rnds2     $ABEF,$CDGH             # 4-7
1342         pshufd          \$0x0e,$Wi,$Wi
1343 ___
1344         &$aesenc();
1345 $code.=<<___;
1346         sha256rnds2     $CDGH,$ABEF
1347
1348         movdqa          2*32-0x80($Tbl),$Wi
1349         paddd           @MSG[2],$Wi
1350         pshufb          $TMP,@MSG[3]
1351         sha256msg1      @MSG[1],@MSG[0]
1352 ___
1353         &$aesenc();
1354 $code.=<<___;
1355         sha256rnds2     $ABEF,$CDGH             # 8-11
1356         pshufd          \$0x0e,$Wi,$Wi
1357         movdqa          @MSG[3],$TMP
1358         palignr         \$4,@MSG[2],$TMP
1359         paddd           $TMP,@MSG[0]
1360 ___
1361         &$aesenc();
1362 $code.=<<___;
1363         sha256rnds2     $CDGH,$ABEF
1364
1365         movdqa          3*32-0x80($Tbl),$Wi
1366         paddd           @MSG[3],$Wi
1367         sha256msg2      @MSG[3],@MSG[0]
1368         sha256msg1      @MSG[2],@MSG[1]
1369 ___
1370         &$aesenc();
1371 $code.=<<___;
1372         sha256rnds2     $ABEF,$CDGH             # 12-15
1373         pshufd          \$0x0e,$Wi,$Wi
1374 ___
1375         &$aesenc();
1376 $code.=<<___;
1377         movdqa          @MSG[0],$TMP
1378         palignr         \$4,@MSG[3],$TMP
1379         paddd           $TMP,@MSG[1]
1380         sha256rnds2     $CDGH,$ABEF
1381 ___
1382 for($i=4;$i<16-3;$i++) {
1383         &$aesenc()      if (($r%10)==0);
1384 $code.=<<___;
1385         movdqa          $i*32-0x80($Tbl),$Wi
1386         paddd           @MSG[0],$Wi
1387         sha256msg2      @MSG[0],@MSG[1]
1388         sha256msg1      @MSG[3],@MSG[2]
1389 ___
1390         &$aesenc();
1391 $code.=<<___;
1392         sha256rnds2     $ABEF,$CDGH             # 16-19...
1393         pshufd          \$0x0e,$Wi,$Wi
1394         movdqa          @MSG[1],$TMP
1395         palignr         \$4,@MSG[0],$TMP
1396         paddd           $TMP,@MSG[2]
1397 ___
1398         &$aesenc();
1399         &$aesenc()      if ($r==19);
1400 $code.=<<___;
1401         sha256rnds2     $CDGH,$ABEF
1402 ___
1403         push(@MSG,shift(@MSG));
1404 }
1405 $code.=<<___;
1406         movdqa          13*32-0x80($Tbl),$Wi
1407         paddd           @MSG[0],$Wi
1408         sha256msg2      @MSG[0],@MSG[1]
1409         sha256msg1      @MSG[3],@MSG[2]
1410 ___
1411         &$aesenc();
1412 $code.=<<___;
1413         sha256rnds2     $ABEF,$CDGH             # 52-55
1414         pshufd          \$0x0e,$Wi,$Wi
1415         movdqa          @MSG[1],$TMP
1416         palignr         \$4,@MSG[0],$TMP
1417         paddd           $TMP,@MSG[2]
1418 ___
1419         &$aesenc();
1420         &$aesenc();
1421 $code.=<<___;
1422         sha256rnds2     $CDGH,$ABEF
1423
1424         movdqa          14*32-0x80($Tbl),$Wi
1425         paddd           @MSG[1],$Wi
1426         sha256msg2      @MSG[1],@MSG[2]
1427         movdqa          $BSWAP,$TMP
1428 ___
1429         &$aesenc();
1430 $code.=<<___;
1431         sha256rnds2     $ABEF,$CDGH             # 56-59
1432         pshufd          \$0x0e,$Wi,$Wi
1433 ___
1434         &$aesenc();
1435 $code.=<<___;
1436         sha256rnds2     $CDGH,$ABEF
1437
1438         movdqa          15*32-0x80($Tbl),$Wi
1439         paddd           @MSG[2],$Wi
1440 ___
1441         &$aesenc();
1442         &$aesenc();
1443 $code.=<<___;
1444         sha256rnds2     $ABEF,$CDGH             # 60-63
1445         pshufd          \$0x0e,$Wi,$Wi
1446 ___
1447         &$aesenc();
1448 $code.=<<___;
1449         sha256rnds2     $CDGH,$ABEF
1450         #pxor           $CDGH,$rndkey0          # black magic
1451 ___
1452         while ($r<40)   { &$aesenc(); }         # remaining aesenc's
1453 $code.=<<___;
1454         #xorps          $CDGH,$rndkey0          # black magic
1455         paddd           $CDGH_SAVE,$CDGH
1456         paddd           $ABEF_SAVE,$ABEF
1457
1458         dec             $len
1459         movups          $iv,48($out,$in0)       # write output
1460         lea             64($in0),$in0
1461         jnz             .Loop_shaext
1462
1463         pshufd          \$0xb1,$CDGH,$CDGH      # DCHG
1464         pshufd          \$0x1b,$ABEF,$TMP       # FEBA
1465         pshufd          \$0xb1,$ABEF,$ABEF      # BAFE
1466         punpckhqdq      $CDGH,$ABEF             # DCBA
1467         palignr         \$8,$TMP,$CDGH          # HGFE
1468
1469         movups          $iv,($ivp)              # write IV
1470         movdqu          $ABEF,($ctx)
1471         movdqu          $CDGH,16($ctx)
1472 ___
1473 $code.=<<___ if ($win64);
1474         movaps  0*16(%rsp),%xmm6
1475         movaps  1*16(%rsp),%xmm7
1476         movaps  2*16(%rsp),%xmm8
1477         movaps  3*16(%rsp),%xmm9
1478         movaps  4*16(%rsp),%xmm10
1479         movaps  5*16(%rsp),%xmm11
1480         movaps  6*16(%rsp),%xmm12
1481         movaps  7*16(%rsp),%xmm13
1482         movaps  8*16(%rsp),%xmm14
1483         movaps  9*16(%rsp),%xmm15
1484         lea     8+10*16(%rsp),%rsp
1485 .Lepilogue_shaext:
1486 ___
1487 $code.=<<___;
1488         ret
1489 .size   ${func}_shaext,.-${func}_shaext
1490 ___
1491 }
1492 }}}}}
1493
1494 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1495 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1496 if ($win64) {
1497 $rec="%rcx";
1498 $frame="%rdx";
1499 $context="%r8";
1500 $disp="%r9";
1501
1502 $code.=<<___ if ($avx);
1503 .extern __imp_RtlVirtualUnwind
1504 .type   se_handler,\@abi-omnipotent
1505 .align  16
1506 se_handler:
1507         push    %rsi
1508         push    %rdi
1509         push    %rbx
1510         push    %rbp
1511         push    %r12
1512         push    %r13
1513         push    %r14
1514         push    %r15
1515         pushfq
1516         sub     \$64,%rsp
1517
1518         mov     120($context),%rax      # pull context->Rax
1519         mov     248($context),%rbx      # pull context->Rip
1520
1521         mov     8($disp),%rsi           # disp->ImageBase
1522         mov     56($disp),%r11          # disp->HandlerData
1523
1524         mov     0(%r11),%r10d           # HandlerData[0]
1525         lea     (%rsi,%r10),%r10        # prologue label
1526         cmp     %r10,%rbx               # context->Rip<prologue label
1527         jb      .Lin_prologue
1528
1529         mov     152($context),%rax      # pull context->Rsp
1530
1531         mov     4(%r11),%r10d           # HandlerData[1]
1532         lea     (%rsi,%r10),%r10        # epilogue label
1533         cmp     %r10,%rbx               # context->Rip>=epilogue label
1534         jae     .Lin_prologue
1535 ___
1536 $code.=<<___ if ($shaext);
1537         lea     aesni_cbc_sha256_enc_shaext(%rip),%r10
1538         cmp     %r10,%rbx
1539         jb      .Lnot_in_shaext
1540
1541         lea     (%rax),%rsi
1542         lea     512($context),%rdi      # &context.Xmm6
1543         mov     \$20,%ecx
1544         .long   0xa548f3fc              # cld; rep movsq
1545         lea     168(%rax),%rax          # adjust stack pointer
1546         jmp     .Lin_prologue
1547 .Lnot_in_shaext:
1548 ___
1549 $code.=<<___ if ($avx>1);
1550         lea     .Lavx2_shortcut(%rip),%r10
1551         cmp     %r10,%rbx               # context->Rip<avx2_shortcut
1552         jb      .Lnot_in_avx2
1553
1554         and     \$-256*$SZ,%rax
1555         add     \$`2*$SZ*($rounds-8)`,%rax
1556 .Lnot_in_avx2:
1557 ___
1558 $code.=<<___;
1559         mov     %rax,%rsi               # put aside Rsp
1560         mov     16*$SZ+7*8(%rax),%rax   # pull $_rsp
1561         lea     48(%rax),%rax
1562
1563         mov     -8(%rax),%rbx
1564         mov     -16(%rax),%rbp
1565         mov     -24(%rax),%r12
1566         mov     -32(%rax),%r13
1567         mov     -40(%rax),%r14
1568         mov     -48(%rax),%r15
1569         mov     %rbx,144($context)      # restore context->Rbx
1570         mov     %rbp,160($context)      # restore context->Rbp
1571         mov     %r12,216($context)      # restore context->R12
1572         mov     %r13,224($context)      # restore context->R13
1573         mov     %r14,232($context)      # restore context->R14
1574         mov     %r15,240($context)      # restore context->R15
1575
1576         lea     16*$SZ+8*8(%rsi),%rsi   # Xmm6- save area
1577         lea     512($context),%rdi      # &context.Xmm6
1578         mov     \$20,%ecx
1579         .long   0xa548f3fc              # cld; rep movsq
1580
1581 .Lin_prologue:
1582         mov     8(%rax),%rdi
1583         mov     16(%rax),%rsi
1584         mov     %rax,152($context)      # restore context->Rsp
1585         mov     %rsi,168($context)      # restore context->Rsi
1586         mov     %rdi,176($context)      # restore context->Rdi
1587
1588         mov     40($disp),%rdi          # disp->ContextRecord
1589         mov     $context,%rsi           # context
1590         mov     \$154,%ecx              # sizeof(CONTEXT)
1591         .long   0xa548f3fc              # cld; rep movsq
1592
1593         mov     $disp,%rsi
1594         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1595         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1596         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1597         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1598         mov     40(%rsi),%r10           # disp->ContextRecord
1599         lea     56(%rsi),%r11           # &disp->HandlerData
1600         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1601         mov     %r10,32(%rsp)           # arg5
1602         mov     %r11,40(%rsp)           # arg6
1603         mov     %r12,48(%rsp)           # arg7
1604         mov     %rcx,56(%rsp)           # arg8, (NULL)
1605         call    *__imp_RtlVirtualUnwind(%rip)
1606
1607         mov     \$1,%eax                # ExceptionContinueSearch
1608         add     \$64,%rsp
1609         popfq
1610         pop     %r15
1611         pop     %r14
1612         pop     %r13
1613         pop     %r12
1614         pop     %rbp
1615         pop     %rbx
1616         pop     %rdi
1617         pop     %rsi
1618         ret
1619 .size   se_handler,.-se_handler
1620
1621 .section        .pdata
1622         .rva    .LSEH_begin_${func}_xop
1623         .rva    .LSEH_end_${func}_xop
1624         .rva    .LSEH_info_${func}_xop
1625
1626         .rva    .LSEH_begin_${func}_avx
1627         .rva    .LSEH_end_${func}_avx
1628         .rva    .LSEH_info_${func}_avx
1629 ___
1630 $code.=<<___ if ($avx>1);
1631         .rva    .LSEH_begin_${func}_avx2
1632         .rva    .LSEH_end_${func}_avx2
1633         .rva    .LSEH_info_${func}_avx2
1634 ___
1635 $code.=<<___ if ($shaext);
1636         .rva    .LSEH_begin_${func}_shaext
1637         .rva    .LSEH_end_${func}_shaext
1638         .rva    .LSEH_info_${func}_shaext
1639 ___
1640 $code.=<<___ if ($avx);
1641 .section        .xdata
1642 .align  8
1643 .LSEH_info_${func}_xop:
1644         .byte   9,0,0,0
1645         .rva    se_handler
1646         .rva    .Lprologue_xop,.Lepilogue_xop           # HandlerData[]
1647
1648 .LSEH_info_${func}_avx:
1649         .byte   9,0,0,0
1650         .rva    se_handler
1651         .rva    .Lprologue_avx,.Lepilogue_avx           # HandlerData[]
1652 ___
1653 $code.=<<___ if ($avx>1);
1654 .LSEH_info_${func}_avx2:
1655         .byte   9,0,0,0
1656         .rva    se_handler
1657         .rva    .Lprologue_avx2,.Lepilogue_avx2         # HandlerData[]
1658 ___
1659 $code.=<<___ if ($shaext);
1660 .LSEH_info_${func}_shaext:
1661         .byte   9,0,0,0
1662         .rva    se_handler
1663         .rva    .Lprologue_shaext,.Lepilogue_shaext     # HandlerData[]
1664 ___
1665 }
1666
1667 ####################################################################
1668 sub rex {
1669   local *opcode=shift;
1670   my ($dst,$src)=@_;
1671   my $rex=0;
1672
1673     $rex|=0x04                  if($dst>=8);
1674     $rex|=0x01                  if($src>=8);
1675     unshift @opcode,$rex|0x40   if($rex);
1676 }
1677
1678 {
1679   my %opcodelet = (
1680                 "sha256rnds2" => 0xcb,
1681                 "sha256msg1"  => 0xcc,
1682                 "sha256msg2"  => 0xcd   );
1683
1684   sub sha256op38 {
1685     my $instr = shift;
1686
1687     if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1688       my @opcode=(0x0f,0x38);
1689         rex(\@opcode,$2,$1);
1690         push @opcode,$opcodelet{$instr};
1691         push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
1692         return ".byte\t".join(',',@opcode);
1693     } else {
1694         return $instr."\t".@_[0];
1695     }
1696   }
1697 }
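# Example of what the substitution below produces (an illustrative sketch;
# byte values follow the NP 0F 38 CB/CC/CD encodings of the SHA extensions,
# with rex() above supplying a REX prefix when %xmm8-15 are involved): the
# AT&T form
#
#	sha256rnds2	%xmm1,%xmm2
#
# is rewritten by sha256op38() to ".byte 0x0f,0x38,0xcb,0xd1" (ModR/M =
# 0xc0|1|2<<3), so the module still assembles with toolchains that do not
# know the SHA mnemonics.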
1698
1699 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1700 $code =~ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/gem;
1701 print $code;
1702 close STDOUT;