#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing buffer data in designated lanes of SIMD registers. n is
# naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#		this	+aesni(i)	sha256	aesni-sha256	gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)	23.3/n	+1.28=7.11(n=4)	12.3	+3.75=16.1	+126%
# Atom(ii)	38.7/n	+3.93=13.6(n=4)	20.8	+5.69=26.5	+95%
# Sandy Bridge	(20.5	+5.15=25.7)/n	11.6	13.0		+103%
# Ivy Bridge	(20.4	+5.14=25.5)/n	10.3	11.6		+82%
# Haswell(iii)	(21.0	+5.00=26.0)/n	7.80	8.79		+170%
# Skylake	(18.9	+5.00=23.9)/n	7.70	8.17		+170%
# Bulldozer	(21.6	+5.76=27.4)/n	13.6	13.7		+100%
#
# (i)	multi-block CBC encrypt with 128-bit key;
# (ii)	(HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#	because of lower AES-NI instruction throughput, nor is there
#	AES-NI-SHA256 stitch for these processors;
# (iii)	"this" is for n=8, when we gather twice as much data, result
#	for n=4 is 20.3+4.44=24.7;
# (iv)	presented improvement coefficients are asymptotic limits and
#	in real-life application are somewhat lower, e.g. for 2KB
#	fragments they range from 75% to 130% (on Haswell);

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
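# At this point $avx is 0 (only the SSE2 code path is emitted), 1 (the AVX
# path is also emitted) or 2 (the AVX2 path is also emitted), depending on
# the assembler capabilities detected above.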

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {  unsigned int A[8];
#               unsigned int B[8];
#               unsigned int C[8];
#               unsigned int D[8];
#               unsigned int E[8];
#               unsigned int F[8];
#               unsigned int G[8];
#               unsigned int H[8];      } *ctx,
#     struct {  void *ptr; int blocks;  } inp[8],
#     int num);         /* 1 or 2 */
#
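# A minimal caller sketch (illustrative only, not part of the original;
# buffer names and block counts are hypothetical). On the non-AVX2 code
# paths one num unit covers four lanes; unused lanes carry blocks<=0 and
# are cancelled and masked out below:
#
#     unsigned int ctx[8][8];                 /* row j = variable A..H,     */
#                                             /* column i = lane i's state; */
#                                             /* seed each lane with the    */
#                                             /* SHA-256 IVs                */
#     struct { void *ptr; int blocks; } inp[4] = {
#         { buf0, 4 }, { buf1, 4 },           /* 4 x 64-byte blocks each */
#         { NULL, 0 }, { NULL, 0 }            /* lanes 2 and 3 idle      */
#     };
#     sha256_multi_block((void *)ctx, inp, 1);
#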
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;
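# bytes of message data held per SIMD register (xmm); the AVX2 path below
# resets this to 32 for ymm registers.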

sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}
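# The 16-entry message ring is kept in two 256-byte halves anchored at %rax
# and %rbx respectively, each biased by 128 so that every displacement fits
# in a signed byte.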

sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
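# Round-to-round renaming of $a..$h is done by the caller rotating @V, so no
# data ever moves between registers; only the a^b/b^c accumulators swap here.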

sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(X[i+1])
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma1(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
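# Taken together, the two subs compute the standard SHA-256 schedule
# W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16], with all
# indices reduced modulo 16 by Xi_off.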

$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
.cfi_startproc
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
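# Cancelled lanes (blocks<=0) are pointed at the K256 table, a harmless
# readable dummy, so the gather loads stay valid; their results are masked
# off after each block below.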
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue:
	ret
.cfi_endproc
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
.cfi_startproc
_shaext_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
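# In the SHAEXT path cancelled lanes read from scratch space at %rsp instead;
# their state is likewise masked out by the counter comparison below.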
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_shaext:
	ret
.cfi_endproc
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
						if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[1]),$t1
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t1,$t1
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	vmovd		`4*$i`(@ptr[4]),$t1
	vmovd		`4*$i`(@ptr[1]),$t2
	vmovd		`4*$i`(@ptr[5]),$t3
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
	vmovd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	vmovd		`4*$i`(@ptr[4]),$t1
	 lea		`16*4`(@ptr[4]),@ptr[4]
	vmovd		`4*$i`(@ptr[1]),$t2
	 lea		`16*4`(@ptr[1]),@ptr[1]
	vmovd		`4*$i`(@ptr[5]),$t3
	 lea		`16*4`(@ptr[5]),@ptr[5]
	vpinsrd		\$1,`4*$i`(@ptr[2]),$Xi,$Xi
	 lea		`16*4`(@ptr[2]),@ptr[2]
	vpinsrd		\$1,`4*$i`(@ptr[6]),$t1,$t1
	 lea		`16*4`(@ptr[6]),@ptr[6]
	vpinsrd		\$1,`4*$i`(@ptr[3]),$t2,$t2
	 lea		`16*4`(@ptr[3]),@ptr[3]
	vpunpckldq	$t2,$Xi,$Xi
	vpinsrd		\$1,`4*$i`(@ptr[7]),$t3,$t3
	 lea		`16*4`(@ptr[7]),@ptr[7]
	vpunpckldq	$t3,$t1,$t1
	vinserti128	$t1,$Xi,$Xi
	vpshufb		$Xn,$Xi,$Xi
___
$code.=<<___;
	vpsrld	\$6,$e,$sigma
	vpslld	\$26,$e,$t3
	vmovdqu	$Xi,`&Xi_off($i)`
	 vpaddd	$h,$Xi,$Xi			# Xi+=h

	vpsrld	\$11,$e,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$21,$e,$t3
	 vpaddd	`32*($i%8)-128`($Tbl),$Xi,$Xi	# Xi+=K[round]
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$25,$e,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	vpslld	\$7,$e,$t3
	 vpandn	$g,$e,$t1
	 vpand	$f,$e,$axb			# borrow $axb
	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	vpxor	$t2,$sigma,$sigma

	vpsrld	\$2,$a,$h			# borrow $h
	vpxor	$t3,$sigma,$sigma		# Sigma1(e)
	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	vpslld	\$30,$a,$t2
	 vpxor	$axb,$t1,$t1			# Ch(e,f,g)
	 vpxor	$a,$b,$axb			# a^b, b^c in next round
	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	vpxor	$t2,$h,$h
	vpaddd	$sigma,$Xi,$Xi			# Xi+=Sigma1(e)

	vpsrld	\$13,$a,$t2
	 `"prefetcht0	63(@ptr[4])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$19,$a,$t3
	 vpaddd	$t1,$Xi,$Xi			# Xi+=Ch(e,f,g)
	 vpand	$axb,$bxc,$bxc
	 `"prefetcht0	63(@ptr[5])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$h,$sigma

	vpsrld	\$22,$a,$t2
	vpxor	$t3,$sigma,$sigma
	 `"prefetcht0	63(@ptr[6])"		if ($i==15 && $REG_SZ==32)`
	vpslld	\$10,$a,$t3
	 vpxor	$bxc,$b,$h			# h=Maj(a,b,c)=Ch(a^b,c,b)
	 vpaddd	$Xi,$d,$d			# d+=Xi
	 `"prefetcht0	63(@ptr[7])"		if ($i==15 && $REG_SZ==32)`
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# Sigma0(a)

	vpaddd	$Xi,$h,$h			# h+=Xi
	vpaddd	$sigma,$h,$h			# h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
	add	\$`32*8`,$Tbl
___
	($axb,$bxc)=($bxc,$axb);
}
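# The AVX rounds mirror ROUND_00_15 above; three-operand syntax removes most
# of the register copies, and with $REG_SZ==32 the same code runs eight lanes
# in %ymm registers.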

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
.cfi_startproc
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx:
	ret
.cfi_endproc
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
.cfi_startproc
_avx2_shortcut:
	mov	%rsp,%rax
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.cfi_cfa_expression	%rsp+`$REG_SZ*17`,deref,+8
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
.cfi_def_cfa	%rax,8
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp
.cfi_def_cfa_register	%rsp
.Lepilogue_avx2:
	ret
.cfi_endproc
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
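# Each round constant is replicated eight times so that every lane of the
# widest (AVX2) vector sees the same K[i]; the narrower paths read the low
# half of each 32-byte entry.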
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`($context),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);
    $rex|=0x01			if ($src>=8);
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}
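# sha256op38() hand-assembles the SHA-NI instructions for toolchains whose
# assemblers predate them; e.g. "sha256rnds2 %xmm0,%xmm1" is emitted as
# ".byte 0x0f,0x38,0xcb,0xc8".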

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;