Fix grammar in certificates.txt
[openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
21 #
22 #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
25 # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
26 # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
27 # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
28 # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
29 # Skylake       (18.9   +5.00=23.9)/n   7.70    8.17            +170%
30 # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
31 #
32 # (i)   multi-block CBC encrypt with 128-bit key;
33 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 #       because of lower AES-NI instruction throughput, nor is there
35 #       AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 #       for n=4 is 20.3+4.44=24.7;
38 # (iv)  presented improvement coefficients are asymptotic limits and
39 #       in real-life application are somewhat lower, e.g. for 2KB
40 #       fragments they range from 75% to 130% (on Haswell);
41
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Win64-specific code (XMM register save/restore) is emitted for the
# nasm/masm/mingw64 flavours, or whenever the output file ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: first next to this script, then in
# ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx describes what the assembler is able to encode. 0 means the AVX
# dispatch and AVX code path are not generated at all; each probe below
# yields 0, 1 or 2 depending on the tool version it finds.
$avx=0;

# GNU assembler, reached through the C compiler driver.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

# NASM, only consulted for Win64 builds that ask for it.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

# Microsoft ml64, only consulted for Win64 builds that ask for it.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

# clang/LLVM. The version capture accepts any number of major-version
# digits, so "clang version 10.0.0" and later are recognised, and "clang"
# is not anchored to the start of the banner, so vendor-prefixed output
# such as "Apple clang version 12.0.0" is recognised as well.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator,
# which receives the flavour and the output file name as its arguments.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
78
79 # void sha256_multi_block (
80 #     struct {  unsigned int A[8];
81 #               unsigned int B[8];
82 #               unsigned int C[8];
83 #               unsigned int D[8];
84 #               unsigned int E[8];
85 #               unsigned int F[8];
86 #               unsigned int G[8];
87 #               unsigned int H[8];      } *ctx,
88 #     struct {  void *ptr; int blocks;  } inp[8],
89 #     int num);         /* 1 or 2 */
90 #
# General-purpose registers: the three function arguments, one input
# pointer per lane, and the pointer into the K256 constant table.
($ctx,$inp,$num)=("%rdi","%rsi","%edx");        # 1st, 2nd, 3rd arg
@ptr=map { "%r$_" } 8..11;
$Tbl="%rbp";

# SIMD registers: %xmm8-%xmm15 carry the working variables A..H (@V keeps
# the same eight names as a list so callers can rotate them), %xmm0-%xmm7
# are scratch.
($A,$B,$C,$D,$E,$F,$G,$H)=map { "%xmm$_" } 8..15;
@V=($A,$B,$C,$D,$E,$F,$G,$H);
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map { "%xmm$_" } 0..7;

# Size in bytes of one saved message-schedule slot (one %xmm register).
$REG_SZ=16;
101
# Return the memory operand for message-schedule slot $off.
# The slot index is reduced modulo 16 and scaled by $REG_SZ; byte offsets
# below 256 are expressed relative to %rax (biased by -128), larger ones
# relative to %rbx (biased by -256-128).
sub Xi_off {
    my ($slot) = @_;
    my $bytes = ($slot % 16) * $REG_SZ;

    return "$bytes-128(%rax)" if ($bytes < 256);
    return "$bytes-256-128(%rbx)";
}
108
# Append the assembly text for one SHA-256 round to $code.
#
#   $i            round index
#   $a .. $h      register names currently holding the working variables
#
# For $i below 15 the text first loads one 32-bit word at offset 4*$i from
# each of the four @ptr registers and interleaves them into $Xi; for $i
# equal to 15 it does the same and also advances every pointer by 16*4
# bytes. For larger $i nothing is loaded, so $Xi must already hold the
# schedule word. The common part stores $Xi to its schedule slot, adds h,
# K[round], Sigma1(e) and Ch(e,f,g) to it, then updates d and h. After
# every eighth round $Tbl is advanced by 32*8 bytes.
#
# Backtick-quoted expressions are written into $code verbatim with $i
# already substituted; they are presumably evaluated by a later pass that
# is not part of this sub.
#
# Side effect: the globals $axb and $bxc swap names on every call, so the
# a^b computed in this round serves as b^c in the next one.
sub ROUND_00_15 {
    my $i = shift;
    my ($a,$b,$c,$d,$e,$f,$g,$h) = @_;

    if ($i<15) {
        # gather one word per lane
        $code .= <<___;
        movd            `4*$i`(@ptr[0]),$Xi
        movd            `4*$i`(@ptr[1]),$t1
        movd            `4*$i`(@ptr[2]),$t2
        movd            `4*$i`(@ptr[3]),$t3
        punpckldq       $t2,$Xi
        punpckldq       $t3,$t1
        punpckldq       $t1,$Xi
___
    } elsif ($i==15) {
        # last word of the block: gather and step the input pointers
        $code .= <<___;
        movd            `4*$i`(@ptr[0]),$Xi
         lea            `16*4`(@ptr[0]),@ptr[0]
        movd            `4*$i`(@ptr[1]),$t1
         lea            `16*4`(@ptr[1]),@ptr[1]
        movd            `4*$i`(@ptr[2]),$t2
         lea            `16*4`(@ptr[2]),@ptr[2]
        movd            `4*$i`(@ptr[3]),$t3
         lea            `16*4`(@ptr[3]),@ptr[3]
        punpckldq       $t2,$Xi
        punpckldq       $t3,$t1
        punpckldq       $t1,$Xi
___
    }

    # the round proper
    $code .= <<___;
        movdqa  $e,$sigma
        `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==0)`
        movdqa  $e,$t3
        `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==1)`
        psrld   \$6,$sigma
        movdqa  $e,$t2
        pslld   \$7,$t3
        movdqa  $Xi,`&Xi_off($i)`
         paddd  $h,$Xi                          # Xi+=h

        psrld   \$11,$t2
        pxor    $t3,$sigma
        pslld   \$21-7,$t3
         paddd  `32*($i%8)-128`($Tbl),$Xi       # Xi+=K[round]
        pxor    $t2,$sigma

        psrld   \$25-11,$t2
         movdqa $e,$t1
         `"prefetcht0   63(@ptr[0])"            if ($i==15)`
        pxor    $t3,$sigma
         movdqa $e,$axb                         # borrow $axb
        pslld   \$26-21,$t3
         pandn  $g,$t1
         pand   $f,$axb
        pxor    $t2,$sigma

         `"prefetcht0   63(@ptr[1])"            if ($i==15)`
        movdqa  $a,$t2
        pxor    $t3,$sigma                      # Sigma1(e)
        movdqa  $a,$t3
        psrld   \$2,$t2
        paddd   $sigma,$Xi                      # Xi+=Sigma1(e)
         pxor   $axb,$t1                        # Ch(e,f,g)
         movdqa $b,$axb
        movdqa  $a,$sigma
        pslld   \$10,$t3
         pxor   $a,$axb                         # a^b, b^c in next round

         `"prefetcht0   63(@ptr[2])"            if ($i==15)`
        psrld   \$13,$sigma
        pxor    $t3,$t2
         paddd  $t1,$Xi                         # Xi+=Ch(e,f,g)
        pslld   \$19-10,$t3
         pand   $axb,$bxc
        pxor    $sigma,$t2

         `"prefetcht0   63(@ptr[3])"            if ($i==15)`
        psrld   \$22-13,$sigma
        pxor    $t3,$t2
         movdqa $b,$h
        pslld   \$30-19,$t3
        pxor    $t2,$sigma
         pxor   $bxc,$h                         # h=Maj(a,b,c)=Ch(a^b,c,b)
         paddd  $Xi,$d                          # d+=Xi
        pxor    $t3,$sigma                      # Sigma0(a)

        paddd   $Xi,$h                          # h+=Xi
        paddd   $sigma,$h                       # h+=Sigma0(a)
___

    if (($i%8)==7) {
        # eight constants consumed: move on to the next group
        $code .= <<___;
        lea     `32*8`($Tbl),$Tbl
___
    }

    # this round's a^b is the next round's b^c
    ($axb,$bxc)=($bxc,$axb);
}
199
# Append the assembly text for one round with index 16 or above to $code.
#
#   $i            round index (schedule slots are addressed modulo 16)
#   remaining     register names holding a..h, passed through unchanged
#                 to ROUND_00_15
#
# The emitted text extends the message schedule in place: it loads X[i+1]
# into $Xn, then accumulates X[i+9], sigma0(X[i+1]) and sigma1(X[i+14])
# into $Xi. sigma0 is built from shifts by 3/7/18 (and 14/25 left), sigma1
# from shifts by 10/17/19 (and 13/15 left). The ordinary round body is
# then appended via ROUND_00_15.
#
# Backtick-quoted `&Xi_off(...)` expressions are written verbatim with $i
# substituted; they are presumably evaluated by a later pass outside this
# sub.
#
# Side effect: the globals $Xi and $Xn swap names afterwards, because the
# X[i+1] just loaded into $Xn is the value the next round accumulates into.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
        movdqa  `&Xi_off($i+1)`,$Xn
        paddd   `&Xi_off($i+9)`,$Xi             # Xi+=X[i+9]

        movdqa  $Xn,$sigma
        movdqa  $Xn,$t2
        psrld   \$3,$sigma
        movdqa  $Xn,$t3

        psrld   \$7,$t2
        movdqa  `&Xi_off($i+14)`,$t1
        pslld   \$14,$t3
        pxor    $t2,$sigma
        psrld   \$18-7,$t2
        movdqa  $t1,$axb                        # borrow $axb
        pxor    $t3,$sigma
        pslld   \$25-14,$t3
        pxor    $t2,$sigma
        psrld   \$10,$t1
        movdqa  $axb,$t2

        psrld   \$17,$axb
        pxor    $t3,$sigma                      # sigma0(X[i+1])
        pslld   \$13,$t2
         paddd  $sigma,$Xi                      # Xi+=sigma0(X[i+1])
        pxor    $axb,$t1
        psrld   \$19-17,$axb
        pxor    $t2,$t1
        pslld   \$15-13,$t2
        pxor    $axb,$t1
        pxor    $t2,$t1                         # sigma1(X[i+14])
        paddd   $t1,$Xi                         # Xi+=sigma1(X[i+14])
___
        ROUND_00_15($i,@_);
        ($Xi,$Xn)=($Xn,$Xi);
}
239
240 $code.=<<___;
241 .text
242
243 .extern OPENSSL_ia32cap_P
244
245 .globl  sha256_multi_block
246 .type   sha256_multi_block,\@function,3
247 .align  32
248 sha256_multi_block:
249 .cfi_startproc
250         mov     OPENSSL_ia32cap_P+4(%rip),%rcx
251         bt      \$61,%rcx                       # check SHA bit
252         jc      _shaext_shortcut
253 ___
254 $code.=<<___ if ($avx);
255         test    \$`1<<28`,%ecx
256         jnz     _avx_shortcut
257 ___
258 $code.=<<___;
259         mov     %rsp,%rax
260 .cfi_def_cfa_register   %rax
261         push    %rbx
262 .cfi_push       %rbx
263         push    %rbp
264 .cfi_push       %rbp
265 ___
266 $code.=<<___ if ($win64);
267         lea     -0xa8(%rsp),%rsp
268         movaps  %xmm6,(%rsp)
269         movaps  %xmm7,0x10(%rsp)
270         movaps  %xmm8,0x20(%rsp)
271         movaps  %xmm9,0x30(%rsp)
272         movaps  %xmm10,-0x78(%rax)
273         movaps  %xmm11,-0x68(%rax)
274         movaps  %xmm12,-0x58(%rax)
275         movaps  %xmm13,-0x48(%rax)
276         movaps  %xmm14,-0x38(%rax)
277         movaps  %xmm15,-0x28(%rax)
278 ___
279 $code.=<<___;
280         sub     \$`$REG_SZ*18`, %rsp
281         and     \$-256,%rsp
282         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
283 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
284 .Lbody:
285         lea     K256+128(%rip),$Tbl
286         lea     `$REG_SZ*16`(%rsp),%rbx
287         lea     0x80($ctx),$ctx                 # size optimization
288
289 .Loop_grande:
290         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
291         xor     $num,$num
292 ___
293 for($i=0;$i<4;$i++) {
294     $code.=<<___;
295         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
296         mov     `16*$i+8`($inp),%ecx            # number of blocks
297         cmp     $num,%ecx
298         cmovg   %ecx,$num                       # find maximum
299         test    %ecx,%ecx
300         mov     %ecx,`4*$i`(%rbx)               # initialize counters
301         cmovle  $Tbl,@ptr[$i]                   # cancel input
302 ___
303 }
304 $code.=<<___;
305         test    $num,$num
306         jz      .Ldone
307
308         movdqu  0x00-0x80($ctx),$A              # load context
309          lea    128(%rsp),%rax
310         movdqu  0x20-0x80($ctx),$B
311         movdqu  0x40-0x80($ctx),$C
312         movdqu  0x60-0x80($ctx),$D
313         movdqu  0x80-0x80($ctx),$E
314         movdqu  0xa0-0x80($ctx),$F
315         movdqu  0xc0-0x80($ctx),$G
316         movdqu  0xe0-0x80($ctx),$H
317         movdqu  .Lpbswap(%rip),$Xn
318         jmp     .Loop
319
320 .align  32
321 .Loop:
322         movdqa  $C,$bxc
323         pxor    $B,$bxc                         # magic seed
324 ___
325 for($i=0;$i<16;$i++)    { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
326 $code.=<<___;
327         movdqu  `&Xi_off($i)`,$Xi
328         mov     \$3,%ecx
329         jmp     .Loop_16_xx
330 .align  32
331 .Loop_16_xx:
332 ___
333 for(;$i<32;$i++)        { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
334 $code.=<<___;
335         dec     %ecx
336         jnz     .Loop_16_xx
337
338         mov     \$1,%ecx
339         lea     K256+128(%rip),$Tbl
340
341         movdqa  (%rbx),$sigma                   # pull counters
342         cmp     4*0(%rbx),%ecx                  # examine counters
343         pxor    $t1,$t1
344         cmovge  $Tbl,@ptr[0]                    # cancel input
345         cmp     4*1(%rbx),%ecx
346         movdqa  $sigma,$Xn
347         cmovge  $Tbl,@ptr[1]
348         cmp     4*2(%rbx),%ecx
349         pcmpgtd $t1,$Xn                         # mask value
350         cmovge  $Tbl,@ptr[2]
351         cmp     4*3(%rbx),%ecx
352         paddd   $Xn,$sigma                      # counters--
353         cmovge  $Tbl,@ptr[3]
354
355         movdqu  0x00-0x80($ctx),$t1
356         pand    $Xn,$A
357         movdqu  0x20-0x80($ctx),$t2
358         pand    $Xn,$B
359         movdqu  0x40-0x80($ctx),$t3
360         pand    $Xn,$C
361         movdqu  0x60-0x80($ctx),$Xi
362         pand    $Xn,$D
363         paddd   $t1,$A
364         movdqu  0x80-0x80($ctx),$t1
365         pand    $Xn,$E
366         paddd   $t2,$B
367         movdqu  0xa0-0x80($ctx),$t2
368         pand    $Xn,$F
369         paddd   $t3,$C
370         movdqu  0xc0-0x80($ctx),$t3
371         pand    $Xn,$G
372         paddd   $Xi,$D
373         movdqu  0xe0-0x80($ctx),$Xi
374         pand    $Xn,$H
375         paddd   $t1,$E
376         paddd   $t2,$F
377         movdqu  $A,0x00-0x80($ctx)
378         paddd   $t3,$G
379         movdqu  $B,0x20-0x80($ctx)
380         paddd   $Xi,$H
381         movdqu  $C,0x40-0x80($ctx)
382         movdqu  $D,0x60-0x80($ctx)
383         movdqu  $E,0x80-0x80($ctx)
384         movdqu  $F,0xa0-0x80($ctx)
385         movdqu  $G,0xc0-0x80($ctx)
386         movdqu  $H,0xe0-0x80($ctx)
387
388         movdqa  $sigma,(%rbx)                   # save counters
389         movdqa  .Lpbswap(%rip),$Xn
390         dec     $num
391         jnz     .Loop
392
393         mov     `$REG_SZ*17+8`(%rsp),$num
394         lea     $REG_SZ($ctx),$ctx
395         lea     `16*$REG_SZ/4`($inp),$inp
396         dec     $num
397         jnz     .Loop_grande
398
399 .Ldone:
400         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
401 .cfi_def_cfa    %rax,8
402 ___
403 $code.=<<___ if ($win64);
404         movaps  -0xb8(%rax),%xmm6
405         movaps  -0xa8(%rax),%xmm7
406         movaps  -0x98(%rax),%xmm8
407         movaps  -0x88(%rax),%xmm9
408         movaps  -0x78(%rax),%xmm10
409         movaps  -0x68(%rax),%xmm11
410         movaps  -0x58(%rax),%xmm12
411         movaps  -0x48(%rax),%xmm13
412         movaps  -0x38(%rax),%xmm14
413         movaps  -0x28(%rax),%xmm15
414 ___
415 $code.=<<___;
416         mov     -16(%rax),%rbp
417 .cfi_restore    %rbp
418         mov     -8(%rax),%rbx
419 .cfi_restore    %rbx
420         lea     (%rax),%rsp
421 .cfi_def_cfa_register   %rsp
422 .Lepilogue:
423         ret
424 .cfi_endproc
425 .size   sha256_multi_block,.-sha256_multi_block
426 ___
427                                                 {{{
428 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
429 my @MSG0=map("%xmm$_",(4..7));
430 my @MSG1=map("%xmm$_",(8..11));
431
432 $code.=<<___;
433 .type   sha256_multi_block_shaext,\@function,3
434 .align  32
435 sha256_multi_block_shaext:
436 .cfi_startproc
437 _shaext_shortcut:
438         mov     %rsp,%rax
439 .cfi_def_cfa_register   %rax
440         push    %rbx
441 .cfi_push       %rbx
442         push    %rbp
443 .cfi_push       %rbp
444 ___
445 $code.=<<___ if ($win64);
446         lea     -0xa8(%rsp),%rsp
447         movaps  %xmm6,(%rsp)
448         movaps  %xmm7,0x10(%rsp)
449         movaps  %xmm8,0x20(%rsp)
450         movaps  %xmm9,0x30(%rsp)
451         movaps  %xmm10,-0x78(%rax)
452         movaps  %xmm11,-0x68(%rax)
453         movaps  %xmm12,-0x58(%rax)
454         movaps  %xmm13,-0x48(%rax)
455         movaps  %xmm14,-0x38(%rax)
456         movaps  %xmm15,-0x28(%rax)
457 ___
458 $code.=<<___;
459         sub     \$`$REG_SZ*18`,%rsp
460         shl     \$1,$num                        # we process pair at a time
461         and     \$-256,%rsp
462         lea     0x80($ctx),$ctx                 # size optimization
463         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
464 .Lbody_shaext:
465         lea     `$REG_SZ*16`(%rsp),%rbx
466         lea     K256_shaext+0x80(%rip),$Tbl
467
468 .Loop_grande_shaext:
469         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
470         xor     $num,$num
471 ___
472 for($i=0;$i<2;$i++) {
473     $code.=<<___;
474         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
475         mov     `16*$i+8`($inp),%ecx            # number of blocks
476         cmp     $num,%ecx
477         cmovg   %ecx,$num                       # find maximum
478         test    %ecx,%ecx
479         mov     %ecx,`4*$i`(%rbx)               # initialize counters
480         cmovle  %rsp,@ptr[$i]                   # cancel input
481 ___
482 }
483 $code.=<<___;
484         test    $num,$num
485         jz      .Ldone_shaext
486
487         movq            0x00-0x80($ctx),$ABEF0          # A1.A0
488         movq            0x20-0x80($ctx),@MSG0[0]        # B1.B0
489         movq            0x40-0x80($ctx),$CDGH0          # C1.C0
490         movq            0x60-0x80($ctx),@MSG0[1]        # D1.D0
491         movq            0x80-0x80($ctx),@MSG1[0]        # E1.E0
492         movq            0xa0-0x80($ctx),@MSG1[1]        # F1.F0
493         movq            0xc0-0x80($ctx),@MSG1[2]        # G1.G0
494         movq            0xe0-0x80($ctx),@MSG1[3]        # H1.H0
495
496         punpckldq       @MSG0[0],$ABEF0                 # B1.A1.B0.A0
497         punpckldq       @MSG0[1],$CDGH0                 # D1.C1.D0.C0
498         punpckldq       @MSG1[1],@MSG1[0]               # F1.E1.F0.E0
499         punpckldq       @MSG1[3],@MSG1[2]               # H1.G1.H0.G0
500         movdqa          K256_shaext-0x10(%rip),$TMPx    # byte swap
501
502         movdqa          $ABEF0,$ABEF1
503         movdqa          $CDGH0,$CDGH1
504         punpcklqdq      @MSG1[0],$ABEF0                 # F0.E0.B0.A0
505         punpcklqdq      @MSG1[2],$CDGH0                 # H0.G0.D0.C0
506         punpckhqdq      @MSG1[0],$ABEF1                 # F1.E1.B1.A1
507         punpckhqdq      @MSG1[2],$CDGH1                 # H1.G1.D1.C1
508
509         pshufd          \$0b00011011,$ABEF0,$ABEF0
510         pshufd          \$0b00011011,$CDGH0,$CDGH0
511         pshufd          \$0b00011011,$ABEF1,$ABEF1
512         pshufd          \$0b00011011,$CDGH1,$CDGH1
513         jmp             .Loop_shaext
514
515 .align  32
516 .Loop_shaext:
517         movdqu          0x00(@ptr[0]),@MSG0[0]
518          movdqu         0x00(@ptr[1]),@MSG1[0]
519         movdqu          0x10(@ptr[0]),@MSG0[1]
520          movdqu         0x10(@ptr[1]),@MSG1[1]
521         movdqu          0x20(@ptr[0]),@MSG0[2]
522         pshufb          $TMPx,@MSG0[0]
523          movdqu         0x20(@ptr[1]),@MSG1[2]
524          pshufb         $TMPx,@MSG1[0]
525         movdqu          0x30(@ptr[0]),@MSG0[3]
526         lea             0x40(@ptr[0]),@ptr[0]
527          movdqu         0x30(@ptr[1]),@MSG1[3]
528          lea            0x40(@ptr[1]),@ptr[1]
529
530         movdqa          0*16-0x80($Tbl),$Wi
531         pshufb          $TMPx,@MSG0[1]
532         paddd           @MSG0[0],$Wi
533         pxor            $ABEF0,@MSG0[0]         # black magic
534         movdqa          $Wi,$TMP0
535          movdqa         0*16-0x80($Tbl),$TMP1
536          pshufb         $TMPx,@MSG1[1]
537          paddd          @MSG1[0],$TMP1
538         movdqa          $CDGH0,0x50(%rsp)       # offload
539         sha256rnds2     $ABEF0,$CDGH0           # 0-3
540          pxor           $ABEF1,@MSG1[0]         # black magic
541          movdqa         $TMP1,$Wi
542          movdqa         $CDGH1,0x70(%rsp)
543          sha256rnds2    $ABEF1,$CDGH1           # 0-3
544         pshufd          \$0x0e,$TMP0,$Wi
545         pxor            $ABEF0,@MSG0[0]         # black magic
546         movdqa          $ABEF0,0x40(%rsp)       # offload
547         sha256rnds2     $CDGH0,$ABEF0
548          pshufd         \$0x0e,$TMP1,$Wi
549          pxor           $ABEF1,@MSG1[0]         # black magic
550          movdqa         $ABEF1,0x60(%rsp)
551         movdqa          1*16-0x80($Tbl),$TMP0
552         paddd           @MSG0[1],$TMP0
553         pshufb          $TMPx,@MSG0[2]
554          sha256rnds2    $CDGH1,$ABEF1
555
556         movdqa          $TMP0,$Wi
557          movdqa         1*16-0x80($Tbl),$TMP1
558          paddd          @MSG1[1],$TMP1
559         sha256rnds2     $ABEF0,$CDGH0           # 4-7
560          movdqa         $TMP1,$Wi
561         prefetcht0      127(@ptr[0])
562         pshufb          $TMPx,@MSG0[3]
563          pshufb         $TMPx,@MSG1[2]
564          prefetcht0     127(@ptr[1])
565          sha256rnds2    $ABEF1,$CDGH1           # 4-7
566         pshufd          \$0x0e,$TMP0,$Wi
567          pshufb         $TMPx,@MSG1[3]
568         sha256msg1      @MSG0[1],@MSG0[0]
569         sha256rnds2     $CDGH0,$ABEF0
570          pshufd         \$0x0e,$TMP1,$Wi
571         movdqa          2*16-0x80($Tbl),$TMP0
572         paddd           @MSG0[2],$TMP0
573          sha256rnds2    $CDGH1,$ABEF1
574
575         movdqa          $TMP0,$Wi
576          movdqa         2*16-0x80($Tbl),$TMP1
577          paddd          @MSG1[2],$TMP1
578         sha256rnds2     $ABEF0,$CDGH0           # 8-11
579          sha256msg1     @MSG1[1],@MSG1[0]
580          movdqa         $TMP1,$Wi
581         movdqa          @MSG0[3],$TMPx
582          sha256rnds2    $ABEF1,$CDGH1           # 8-11
583         pshufd          \$0x0e,$TMP0,$Wi
584         palignr         \$4,@MSG0[2],$TMPx
585         paddd           $TMPx,@MSG0[0]
586          movdqa         @MSG1[3],$TMPx
587          palignr        \$4,@MSG1[2],$TMPx
588         sha256msg1      @MSG0[2],@MSG0[1]
589         sha256rnds2     $CDGH0,$ABEF0
590          pshufd         \$0x0e,$TMP1,$Wi
591         movdqa          3*16-0x80($Tbl),$TMP0
592         paddd           @MSG0[3],$TMP0
593          sha256rnds2    $CDGH1,$ABEF1
594          sha256msg1     @MSG1[2],@MSG1[1]
595
596         movdqa          $TMP0,$Wi
597          movdqa         3*16-0x80($Tbl),$TMP1
598          paddd          $TMPx,@MSG1[0]
599          paddd          @MSG1[3],$TMP1
600         sha256msg2      @MSG0[3],@MSG0[0]
601         sha256rnds2     $ABEF0,$CDGH0           # 12-15
602          movdqa         $TMP1,$Wi
603         movdqa          @MSG0[0],$TMPx
604         palignr         \$4,@MSG0[3],$TMPx
605          sha256rnds2    $ABEF1,$CDGH1           # 12-15
606          sha256msg2     @MSG1[3],@MSG1[0]
607         pshufd          \$0x0e,$TMP0,$Wi
608         paddd           $TMPx,@MSG0[1]
609          movdqa         @MSG1[0],$TMPx
610          palignr        \$4,@MSG1[3],$TMPx
611         sha256msg1      @MSG0[3],@MSG0[2]
612         sha256rnds2     $CDGH0,$ABEF0
613          pshufd         \$0x0e,$TMP1,$Wi
614         movdqa          4*16-0x80($Tbl),$TMP0
615         paddd           @MSG0[0],$TMP0
616          sha256rnds2    $CDGH1,$ABEF1
617          sha256msg1     @MSG1[3],@MSG1[2]
618 ___
619 for($i=4;$i<16-3;$i++) {
620 $code.=<<___;
621         movdqa          $TMP0,$Wi
622          movdqa         $i*16-0x80($Tbl),$TMP1
623          paddd          $TMPx,@MSG1[1]
624          paddd          @MSG1[0],$TMP1
625         sha256msg2      @MSG0[0],@MSG0[1]
626         sha256rnds2     $ABEF0,$CDGH0           # 16-19...
627          movdqa         $TMP1,$Wi
628         movdqa          @MSG0[1],$TMPx
629         palignr         \$4,@MSG0[0],$TMPx
630          sha256rnds2    $ABEF1,$CDGH1           # 16-19...
631          sha256msg2     @MSG1[0],@MSG1[1]
632         pshufd          \$0x0e,$TMP0,$Wi
633         paddd           $TMPx,@MSG0[2]
634          movdqa         @MSG1[1],$TMPx
635          palignr        \$4,@MSG1[0],$TMPx
636         sha256msg1      @MSG0[0],@MSG0[3]
637         sha256rnds2     $CDGH0,$ABEF0
638          pshufd         \$0x0e,$TMP1,$Wi
639         movdqa          `($i+1)*16`-0x80($Tbl),$TMP0
640         paddd           @MSG0[1],$TMP0
641          sha256rnds2    $CDGH1,$ABEF1
642          sha256msg1     @MSG1[0],@MSG1[3]
643 ___
644         push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
645 }
646 $code.=<<___;
647         movdqa          $TMP0,$Wi
648          movdqa         13*16-0x80($Tbl),$TMP1
649          paddd          $TMPx,@MSG1[1]
650          paddd          @MSG1[0],$TMP1
651         sha256msg2      @MSG0[0],@MSG0[1]
652         sha256rnds2     $ABEF0,$CDGH0           # 52-55
653          movdqa         $TMP1,$Wi
654         movdqa          @MSG0[1],$TMPx
655         palignr         \$4,@MSG0[0],$TMPx
656          sha256rnds2    $ABEF1,$CDGH1           # 52-55
657          sha256msg2     @MSG1[0],@MSG1[1]
658         pshufd          \$0x0e,$TMP0,$Wi
659         paddd           $TMPx,@MSG0[2]
660          movdqa         @MSG1[1],$TMPx
661          palignr        \$4,@MSG1[0],$TMPx
662         nop
663         sha256rnds2     $CDGH0,$ABEF0
664          pshufd         \$0x0e,$TMP1,$Wi
665         movdqa          14*16-0x80($Tbl),$TMP0
666         paddd           @MSG0[1],$TMP0
667          sha256rnds2    $CDGH1,$ABEF1
668
669         movdqa          $TMP0,$Wi
670          movdqa         14*16-0x80($Tbl),$TMP1
671          paddd          $TMPx,@MSG1[2]
672          paddd          @MSG1[1],$TMP1
673         sha256msg2      @MSG0[1],@MSG0[2]
674         nop
675         sha256rnds2     $ABEF0,$CDGH0           # 56-59
676          movdqa         $TMP1,$Wi
677           mov           \$1,%ecx
678           pxor          @MSG0[1],@MSG0[1]       # zero
679          sha256rnds2    $ABEF1,$CDGH1           # 56-59
680          sha256msg2     @MSG1[1],@MSG1[2]
681         pshufd          \$0x0e,$TMP0,$Wi
682         movdqa          15*16-0x80($Tbl),$TMP0
683         paddd           @MSG0[2],$TMP0
684           movq          (%rbx),@MSG0[2]         # pull counters
685           nop
686         sha256rnds2     $CDGH0,$ABEF0
687          pshufd         \$0x0e,$TMP1,$Wi
688          movdqa         15*16-0x80($Tbl),$TMP1
689          paddd          @MSG1[2],$TMP1
690          sha256rnds2    $CDGH1,$ABEF1
691
692         movdqa          $TMP0,$Wi
693           cmp           4*0(%rbx),%ecx          # examine counters
694           cmovge        %rsp,@ptr[0]            # cancel input
695           cmp           4*1(%rbx),%ecx
696           cmovge        %rsp,@ptr[1]
697           pshufd        \$0x00,@MSG0[2],@MSG1[0]
698         sha256rnds2     $ABEF0,$CDGH0           # 60-63
699          movdqa         $TMP1,$Wi
700           pshufd        \$0x55,@MSG0[2],@MSG1[1]
701           movdqa        @MSG0[2],@MSG1[2]
702          sha256rnds2    $ABEF1,$CDGH1           # 60-63
703         pshufd          \$0x0e,$TMP0,$Wi
704           pcmpgtd       @MSG0[1],@MSG1[0]
705           pcmpgtd       @MSG0[1],@MSG1[1]
706         sha256rnds2     $CDGH0,$ABEF0
707          pshufd         \$0x0e,$TMP1,$Wi
708           pcmpgtd       @MSG0[1],@MSG1[2]       # counter mask
709           movdqa        K256_shaext-0x10(%rip),$TMPx
710          sha256rnds2    $CDGH1,$ABEF1
711
712         pand            @MSG1[0],$CDGH0
713          pand           @MSG1[1],$CDGH1
714         pand            @MSG1[0],$ABEF0
715          pand           @MSG1[1],$ABEF1
716         paddd           @MSG0[2],@MSG1[2]       # counters--
717
718         paddd           0x50(%rsp),$CDGH0
719          paddd          0x70(%rsp),$CDGH1
720         paddd           0x40(%rsp),$ABEF0
721          paddd          0x60(%rsp),$ABEF1
722
723         movq            @MSG1[2],(%rbx)         # save counters
724         dec             $num
725         jnz             .Loop_shaext
726
727         mov             `$REG_SZ*17+8`(%rsp),$num
728
729         pshufd          \$0b00011011,$ABEF0,$ABEF0
730         pshufd          \$0b00011011,$CDGH0,$CDGH0
731         pshufd          \$0b00011011,$ABEF1,$ABEF1
732         pshufd          \$0b00011011,$CDGH1,$CDGH1
733
734         movdqa          $ABEF0,@MSG0[0]
735         movdqa          $CDGH0,@MSG0[1]
736         punpckldq       $ABEF1,$ABEF0                   # B1.B0.A1.A0
737         punpckhdq       $ABEF1,@MSG0[0]                 # F1.F0.E1.E0
738         punpckldq       $CDGH1,$CDGH0                   # D1.D0.C1.C0
739         punpckhdq       $CDGH1,@MSG0[1]                 # H1.H0.G1.G0
740
741         movq            $ABEF0,0x00-0x80($ctx)          # A1.A0
742         psrldq          \$8,$ABEF0
743         movq            @MSG0[0],0x80-0x80($ctx)        # E1.E0
744         psrldq          \$8,@MSG0[0]
745         movq            $ABEF0,0x20-0x80($ctx)          # B1.B0
746         movq            @MSG0[0],0xa0-0x80($ctx)        # F1.F0
747
748         movq            $CDGH0,0x40-0x80($ctx)          # C1.C0
749         psrldq          \$8,$CDGH0
750         movq            @MSG0[1],0xc0-0x80($ctx)        # G1.G0
751         psrldq          \$8,@MSG0[1]
752         movq            $CDGH0,0x60-0x80($ctx)          # D1.D0
753         movq            @MSG0[1],0xe0-0x80($ctx)        # H1.H0
754
755         lea     `$REG_SZ/2`($ctx),$ctx
756         lea     `16*2`($inp),$inp
757         dec     $num
758         jnz     .Loop_grande_shaext
759
760 .Ldone_shaext:
761         #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
762 ___
763 $code.=<<___ if ($win64);
764         movaps  -0xb8(%rax),%xmm6
765         movaps  -0xa8(%rax),%xmm7
766         movaps  -0x98(%rax),%xmm8
767         movaps  -0x88(%rax),%xmm9
768         movaps  -0x78(%rax),%xmm10
769         movaps  -0x68(%rax),%xmm11
770         movaps  -0x58(%rax),%xmm12
771         movaps  -0x48(%rax),%xmm13
772         movaps  -0x38(%rax),%xmm14
773         movaps  -0x28(%rax),%xmm15
774 ___
775 $code.=<<___;
776         mov     -16(%rax),%rbp
777 .cfi_restore    %rbp
778         mov     -8(%rax),%rbx
779 .cfi_restore    %rbx
780         lea     (%rax),%rsp
781 .cfi_def_cfa_register   %rsp
782 .Lepilogue_shaext:
783         ret
784 .cfi_endproc
785 .size   sha256_multi_block_shaext,.-sha256_multi_block_shaext
786 ___
787                                                 }}}
788                                                 if ($avx) {{{
# ROUND_00_15_avx($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to $code the AVX/AVX2 instructions for one of the first sixteen
# SHA-256 rounds, processing all lanes in parallel.
#   $i       round index, 0..15 (also reused by ROUND_16_XX_avx for later
#            rounds, in which case none of the message-load heredocs fire)
#   $a..$h   register names holding the eight working variables
# The register width is selected by the file-level $REG_SZ (16 => 4 lanes,
# 32 => 8 lanes).  Uses the file-level registers $Xi,$Xn,$t1..$t3,$sigma,
# $axb,$bxc,$Tbl and @ptr, and the helper Xi_off() defined outside this
# section (presumably the stack offset of schedule slot $i -- confirm there).
# On exit the Perl variables $axb and $bxc are swapped, so the a^b computed
# in this round is used as b^c by the next one.
789 sub ROUND_00_15_avx {
790 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
791
# 4-lane message load: gather the dword at offset 4*$i from each of the four
# input pointers into $Xi and byte-swap it with the mask held in $Xn.
792 $code.=<<___ if ($i<15 && $REG_SZ==16);
793         vmovd           `4*$i`(@ptr[0]),$Xi
794         vmovd           `4*$i`(@ptr[1]),$t1
795         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
796         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
797         vpunpckldq      $t1,$Xi,$Xi
798         vpshufb         $Xn,$Xi,$Xi
799 ___
# Same load for the last word of the block; each input pointer is also
# advanced by 64 bytes (16*4) to the next block.
800 $code.=<<___ if ($i==15 && $REG_SZ==16);
801         vmovd           `4*$i`(@ptr[0]),$Xi
802          lea            `16*4`(@ptr[0]),@ptr[0]
803         vmovd           `4*$i`(@ptr[1]),$t1
804          lea            `16*4`(@ptr[1]),@ptr[1]
805         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
806          lea            `16*4`(@ptr[2]),@ptr[2]
807         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
808          lea            `16*4`(@ptr[3]),@ptr[3]
809         vpunpckldq      $t1,$Xi,$Xi
810         vpshufb         $Xn,$Xi,$Xi
811 ___
# 8-lane message load: pointers 0..3 and 4..7 are gathered separately and
# then combined with vinserti128 (the immediate is added by the
# post-processing loop at the end of the file).
812 $code.=<<___ if ($i<15 && $REG_SZ==32);
813         vmovd           `4*$i`(@ptr[0]),$Xi
814         vmovd           `4*$i`(@ptr[4]),$t1
815         vmovd           `4*$i`(@ptr[1]),$t2
816         vmovd           `4*$i`(@ptr[5]),$t3
817         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
818         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
819         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
820         vpunpckldq      $t2,$Xi,$Xi
821         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
822         vpunpckldq      $t3,$t1,$t1
823         vinserti128     $t1,$Xi,$Xi
824         vpshufb         $Xn,$Xi,$Xi
825 ___
# 8-lane load of the last word, advancing all eight input pointers by 64.
826 $code.=<<___ if ($i==15 && $REG_SZ==32);
827         vmovd           `4*$i`(@ptr[0]),$Xi
828          lea            `16*4`(@ptr[0]),@ptr[0]
829         vmovd           `4*$i`(@ptr[4]),$t1
830          lea            `16*4`(@ptr[4]),@ptr[4]
831         vmovd           `4*$i`(@ptr[1]),$t2
832          lea            `16*4`(@ptr[1]),@ptr[1]
833         vmovd           `4*$i`(@ptr[5]),$t3
834          lea            `16*4`(@ptr[5]),@ptr[5]
835         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
836          lea            `16*4`(@ptr[2]),@ptr[2]
837         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
838          lea            `16*4`(@ptr[6]),@ptr[6]
839         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
840          lea            `16*4`(@ptr[3]),@ptr[3]
841         vpunpckldq      $t2,$Xi,$Xi
842         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
843          lea            `16*4`(@ptr[7]),@ptr[7]
844         vpunpckldq      $t3,$t1,$t1
845         vinserti128     $t1,$Xi,$Xi
846         vpshufb         $Xn,$Xi,$Xi
847 ___
# Round body proper.  Rotations are synthesised from shift-right/shift-left
# pairs xored together (e.g. 6/26, 11/21, 25/7 for Sigma1; 2/30, 13/19,
# 22/10 for Sigma0).  $Xi is stored to its schedule slot before being used
# as the running sum Xi+h+K+Sigma1(e)+Ch(e,f,g).  $h is reused as scratch
# and ends up holding Maj(a,b,c)+Xi+Sigma0(a); $d receives d+Xi.  The
# backticked prefetcht0 lines only produce an instruction when $i==15.
848 $code.=<<___;
849         vpsrld  \$6,$e,$sigma
850         vpslld  \$26,$e,$t3
851         vmovdqu $Xi,`&Xi_off($i)`
852          vpaddd $h,$Xi,$Xi                      # Xi+=h
853
854         vpsrld  \$11,$e,$t2
855         vpxor   $t3,$sigma,$sigma
856         vpslld  \$21,$e,$t3
857          vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
858         vpxor   $t2,$sigma,$sigma
859
860         vpsrld  \$25,$e,$t2
861         vpxor   $t3,$sigma,$sigma
862          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
863         vpslld  \$7,$e,$t3
864          vpandn $g,$e,$t1
865          vpand  $f,$e,$axb                      # borrow $axb
866          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
867         vpxor   $t2,$sigma,$sigma
868
869         vpsrld  \$2,$a,$h                       # borrow $h
870         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
871          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
872         vpslld  \$30,$a,$t2
873          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
874          vpxor  $a,$b,$axb                      # a^b, b^c in next round
875          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
876         vpxor   $t2,$h,$h
877         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
878
879         vpsrld  \$13,$a,$t2
880          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
881         vpslld  \$19,$a,$t3
882          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
883          vpand  $axb,$bxc,$bxc
884          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
885         vpxor   $t2,$h,$sigma
886
887         vpsrld  \$22,$a,$t2
888         vpxor   $t3,$sigma,$sigma
889          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
890         vpslld  \$10,$a,$t3
891          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
892          vpaddd $Xi,$d,$d                       # d+=Xi
893          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
894         vpxor   $t2,$sigma,$sigma
895         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
896
897         vpaddd  $Xi,$h,$h                       # h+=Xi
898         vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
899 ___
# K[round] is addressed as 32*($i%8)-128 off $Tbl, so after every eighth
# round the table pointer is moved on by eight 32-byte rows.
900 $code.=<<___ if (($i%8)==7);
901         add     \$`32*8`,$Tbl
902 ___
# Swap roles: this round's a^b becomes next round's b^c.
903         ($axb,$bxc)=($bxc,$axb);
904 }
905
# ROUND_16_XX_avx($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to $code the message-schedule update for round $i (16 and up)
# and then the round itself via ROUND_00_15_avx.
#   $i       round index; the remaining arguments are the register names
#            of the working variables and are passed through unchanged.
# On entry $Xi must already hold schedule word X[i]; the emitted code adds
# X[i+9], sigma0(X[i+1]) and sigma1(X[i+14]) to it.  X[i+1] is loaded into
# $Xn, and because the Perl variables $Xi and $Xn are swapped on exit, that
# value is the next round's $Xi without a reload.  Xi_off() is defined
# outside this section.
906 sub ROUND_16_XX_avx {
907 my $i=shift;
908
# sigma0 is built from shifts 3, 7/25 and 18/14; sigma1 from shifts 10,
# 17/15 and 19/13.  $axb is borrowed as scratch here; ROUND_00_15_avx
# recomputes it before use.
909 $code.=<<___;
910         vmovdqu `&Xi_off($i+1)`,$Xn
911         vpaddd  `&Xi_off($i+9)`,$Xi,$Xi         # Xi+=X[i+9]
912
913         vpsrld  \$3,$Xn,$sigma
914         vpsrld  \$7,$Xn,$t2
915         vpslld  \$25,$Xn,$t3
916         vpxor   $t2,$sigma,$sigma
917         vpsrld  \$18,$Xn,$t2
918         vpxor   $t3,$sigma,$sigma
919         vpslld  \$14,$Xn,$t3
920         vmovdqu `&Xi_off($i+14)`,$t1
921         vpsrld  \$10,$t1,$axb                   # borrow $axb
922
923         vpxor   $t2,$sigma,$sigma
924         vpsrld  \$17,$t1,$t2
925         vpxor   $t3,$sigma,$sigma               # sigma0(X[i+1])
926         vpslld  \$15,$t1,$t3
927          vpaddd $sigma,$Xi,$Xi                  # Xi+=sigma0(X[i+1])
928         vpxor   $t2,$axb,$sigma
929         vpsrld  \$19,$t1,$t2
930         vpxor   $t3,$sigma,$sigma
931         vpslld  \$13,$t1,$t3
932         vpxor   $t2,$sigma,$sigma
933         vpxor   $t3,$sigma,$sigma               # sigma1(X[i+14])
934         vpaddd  $sigma,$Xi,$Xi                  # Xi+=sigma1(X[i+14])
935 ___
936         &ROUND_00_15_avx($i,@_);
# The register that now holds X[i+1] becomes $Xi for the next round.
937         ($Xi,$Xn)=($Xn,$Xi);
938 }
939
# Emit sha256_multi_block_avx, the 4-lane AVX code path.  The generated
# function declares three arguments; the register names $ctx, $inp, $num,
# $Tbl and @ptr are assigned outside this section.
940 $code.=<<___;
941 .type   sha256_multi_block_avx,\@function,3
942 .align  32
943 sha256_multi_block_avx:
944 .cfi_startproc
945 _avx_shortcut:
946 ___
# When the AVX2 path is also generated, hand over to it if $num is at
# least 2 and bit 5 of the upper half of %rcx is set.  NOTE(review): %rcx
# is loaded before _avx_shortcut, outside this section -- presumably the
# capability word carrying the AVX2 flag; confirm at the jump site.
947 $code.=<<___ if ($avx>1);
948         shr     \$32,%rcx
949         cmp     \$2,$num
950         jb      .Lavx
951         test    \$`1<<5`,%ecx
952         jnz     _avx2_shortcut
953         jmp     .Lavx
954 .align  32
955 .Lavx:
956 ___
# Prologue: %rax keeps the entry %rsp; %rbx and %rbp are pushed.
957 $code.=<<___;
958         mov     %rsp,%rax
959 .cfi_def_cfa_register   %rax
960         push    %rbx
961 .cfi_push       %rbx
962         push    %rbp
963 .cfi_push       %rbp
964 ___
# Win64 only: save %xmm6-%xmm15 in the 0xa8 bytes directly below the two
# pushed registers (the first four relative to the new %rsp, the rest
# relative to %rax; both address the same contiguous area).
965 $code.=<<___ if ($win64);
966         lea     -0xa8(%rsp),%rsp
967         movaps  %xmm6,(%rsp)
968         movaps  %xmm7,0x10(%rsp)
969         movaps  %xmm8,0x20(%rsp)
970         movaps  %xmm9,0x30(%rsp)
971         movaps  %xmm10,-0x78(%rax)
972         movaps  %xmm11,-0x68(%rax)
973         movaps  %xmm12,-0x58(%rax)
974         movaps  %xmm13,-0x48(%rax)
975         movaps  %xmm14,-0x38(%rax)
976         movaps  %xmm15,-0x28(%rax)
977 ___
# Frame: $REG_SZ*18 bytes, 256-byte aligned.  The entry %rsp is stored at
# $REG_SZ*17(%rsp) and the original $num at $REG_SZ*17+8(%rsp); %rbx points
# at the per-lane block counters at $REG_SZ*16(%rsp).  $ctx is biased by
# 0x80 so that later accesses use short displacements.
978 $code.=<<___;
979         sub     \$`$REG_SZ*18`, %rsp
980         and     \$-256,%rsp
981         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
982 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
983 .Lbody_avx:
984         lea     K256+128(%rip),$Tbl
985         lea     `$REG_SZ*16`(%rsp),%rbx
986         lea     0x80($ctx),$ctx                 # size optimization
987
988 .Loop_grande_avx:
989         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
990         xor     $num,$num
991 ___
# For each of the four lanes read a 16-byte descriptor from $inp (pointer
# at +0, 32-bit block count at +8), keep the largest count in $num, store
# the count in the counter array, and point lanes with a non-positive
# count at $Tbl so that they read harmless data.
992 for($i=0;$i<4;$i++) {
993     $code.=<<___;
994         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
995         mov     `16*$i+8`($inp),%ecx            # number of blocks
996         cmp     $num,%ecx
997         cmovg   %ecx,$num                       # find maximum
998         test    %ecx,%ecx
999         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1000         cmovle  $Tbl,@ptr[$i]                   # cancel input
1001 ___
1002 }
# Nothing to do if every lane is empty; otherwise load the eight state
# vectors and the byte-swap mask and enter the per-block loop.
1003 $code.=<<___;
1004         test    $num,$num
1005         jz      .Ldone_avx
1006
1007         vmovdqu 0x00-0x80($ctx),$A              # load context
1008          lea    128(%rsp),%rax
1009         vmovdqu 0x20-0x80($ctx),$B
1010         vmovdqu 0x40-0x80($ctx),$C
1011         vmovdqu 0x60-0x80($ctx),$D
1012         vmovdqu 0x80-0x80($ctx),$E
1013         vmovdqu 0xa0-0x80($ctx),$F
1014         vmovdqu 0xc0-0x80($ctx),$G
1015         vmovdqu 0xe0-0x80($ctx),$H
1016         vmovdqu .Lpbswap(%rip),$Xn
1017         jmp     .Loop_avx
1018
1019 .align  32
1020 .Loop_avx:
1021         vpxor   $B,$C,$bxc                      # magic seed
1022 ___
# Rounds 0..15, fully unrolled; @V is rotated after each round so the
# register roles shift without any data movement.
1023 for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
# Rounds 16..63: sixteen unrolled rounds executed three times (%ecx=3).
1024 $code.=<<___;
1025         vmovdqu `&Xi_off($i)`,$Xi
1026         mov     \$3,%ecx
1027         jmp     .Loop_16_xx_avx
1028 .align  32
1029 .Loop_16_xx_avx:
1030 ___
1031 for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
# Reset $Tbl, which the rounds advanced.
1032 $code.=<<___;
1033         dec     %ecx
1034         jnz     .Loop_16_xx_avx
1035
1036         mov     \$1,%ecx
1037         lea     K256+128(%rip),$Tbl
1038 ___
# Lanes whose counter is 1 or less have just consumed their last block (or
# were already idle): redirect their input pointer to $Tbl.
1039 for($i=0;$i<4;$i++) {
1040     $code.=<<___;
1041         cmp     `4*$i`(%rbx),%ecx               # examine counters
1042         cmovge  $Tbl,@ptr[$i]                   # cancel input
1043 ___
1044 }
# Build an all-ones mask for lanes whose counter was still positive, add it
# to the counters (i.e. subtract one from active lanes), zero the freshly
# computed state in inactive lanes, add the previous context to it and
# write the result back.  Then reload the byte-swap mask and continue
# until the longest lane is exhausted; the outer loop advances $ctx and
# $inp and repeats for the original $num.
1045 $code.=<<___;
1046         vmovdqa (%rbx),$sigma                   # pull counters
1047         vpxor   $t1,$t1,$t1
1048         vmovdqa $sigma,$Xn
1049         vpcmpgtd $t1,$Xn,$Xn                    # mask value
1050         vpaddd  $Xn,$sigma,$sigma               # counters--
1051
1052         vmovdqu 0x00-0x80($ctx),$t1
1053         vpand   $Xn,$A,$A
1054         vmovdqu 0x20-0x80($ctx),$t2
1055         vpand   $Xn,$B,$B
1056         vmovdqu 0x40-0x80($ctx),$t3
1057         vpand   $Xn,$C,$C
1058         vmovdqu 0x60-0x80($ctx),$Xi
1059         vpand   $Xn,$D,$D
1060         vpaddd  $t1,$A,$A
1061         vmovdqu 0x80-0x80($ctx),$t1
1062         vpand   $Xn,$E,$E
1063         vpaddd  $t2,$B,$B
1064         vmovdqu 0xa0-0x80($ctx),$t2
1065         vpand   $Xn,$F,$F
1066         vpaddd  $t3,$C,$C
1067         vmovdqu 0xc0-0x80($ctx),$t3
1068         vpand   $Xn,$G,$G
1069         vpaddd  $Xi,$D,$D
1070         vmovdqu 0xe0-0x80($ctx),$Xi
1071         vpand   $Xn,$H,$H
1072         vpaddd  $t1,$E,$E
1073         vpaddd  $t2,$F,$F
1074         vmovdqu $A,0x00-0x80($ctx)
1075         vpaddd  $t3,$G,$G
1076         vmovdqu $B,0x20-0x80($ctx)
1077         vpaddd  $Xi,$H,$H
1078         vmovdqu $C,0x40-0x80($ctx)
1079         vmovdqu $D,0x60-0x80($ctx)
1080         vmovdqu $E,0x80-0x80($ctx)
1081         vmovdqu $F,0xa0-0x80($ctx)
1082         vmovdqu $G,0xc0-0x80($ctx)
1083         vmovdqu $H,0xe0-0x80($ctx)
1084
1085         vmovdqu $sigma,(%rbx)                   # save counters
1086         vmovdqu .Lpbswap(%rip),$Xn
1087         dec     $num
1088         jnz     .Loop_avx
1089
1090         mov     `$REG_SZ*17+8`(%rsp),$num
1091         lea     $REG_SZ($ctx),$ctx
1092         lea     `16*$REG_SZ/4`($inp),$inp
1093         dec     $num
1094         jnz     .Loop_grande_avx
1095
1096 .Ldone_avx:
1097         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1098 .cfi_def_cfa    %rax,8
1099         vzeroupper
1100 ___
# Win64 only: restore %xmm6-%xmm15 from the area written in the prologue
# (-0xb8 = two pushes (0x10) + 0xa8 below the entry %rsp).
1101 $code.=<<___ if ($win64);
1102         movaps  -0xb8(%rax),%xmm6
1103         movaps  -0xa8(%rax),%xmm7
1104         movaps  -0x98(%rax),%xmm8
1105         movaps  -0x88(%rax),%xmm9
1106         movaps  -0x78(%rax),%xmm10
1107         movaps  -0x68(%rax),%xmm11
1108         movaps  -0x58(%rax),%xmm12
1109         movaps  -0x48(%rax),%xmm13
1110         movaps  -0x38(%rax),%xmm14
1111         movaps  -0x28(%rax),%xmm15
1112 ___
# Epilogue: restore the callee-saved registers relative to the entry %rsp.
1113 $code.=<<___;
1114         mov     -16(%rax),%rbp
1115 .cfi_restore    %rbp
1116         mov     -8(%rax),%rbx
1117 .cfi_restore    %rbx
1118         lea     (%rax),%rsp
1119 .cfi_def_cfa_register   %rsp
1120 .Lepilogue_avx:
1121         ret
1122 .cfi_endproc
1123 .size   sha256_multi_block_avx,.-sha256_multi_block_avx
1124 ___
# Emit sha256_multi_block_avx2, the 8-lane AVX2 code path.  It reuses
# ROUND_00_15_avx/ROUND_16_XX_avx with $REG_SZ==32 and %ymm registers.
1125                                                 if ($avx>1) {
# Evaluate every backticked expression generated so far while $REG_SZ is
# still the value the preceding code was written for; $REG_SZ changes on
# the next statement.
1126 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1127
1128 $REG_SZ=32;
# Eight input pointers: %r12-%r15 for lanes 0..3, %r8-%r11 for lanes 4..7.
1129 @ptr=map("%r$_",(12..15,8..11));
1130
# Working variables live in %ymm8-%ymm15, temporaries in %ymm0-%ymm7.
1131 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1132 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1133
# Prologue: six callee-saved registers are pushed because %r12-%r15 are
# used as input pointers.
1134 $code.=<<___;
1135 .type   sha256_multi_block_avx2,\@function,3
1136 .align  32
1137 sha256_multi_block_avx2:
1138 .cfi_startproc
1139 _avx2_shortcut:
1140         mov     %rsp,%rax
1141 .cfi_def_cfa_register   %rax
1142         push    %rbx
1143 .cfi_push       %rbx
1144         push    %rbp
1145 .cfi_push       %rbp
1146         push    %r12
1147 .cfi_push       %r12
1148         push    %r13
1149 .cfi_push       %r13
1150         push    %r14
1151 .cfi_push       %r14
1152         push    %r15
1153 .cfi_push       %r15
1154 ___
# Win64 only: save %xmm6-%xmm15 in the 0xa8 bytes below the six pushes.
1155 $code.=<<___ if ($win64);
1156         lea     -0xa8(%rsp),%rsp
1157         movaps  %xmm6,(%rsp)
1158         movaps  %xmm7,0x10(%rsp)
1159         movaps  %xmm8,0x20(%rsp)
1160         movaps  %xmm9,0x30(%rsp)
1161         movaps  %xmm10,0x40(%rsp)
1162         movaps  %xmm11,0x50(%rsp)
1163         movaps  %xmm12,-0x78(%rax)
1164         movaps  %xmm13,-0x68(%rax)
1165         movaps  %xmm14,-0x58(%rax)
1166         movaps  %xmm15,-0x48(%rax)
1167 ___
# Frame layout mirrors the AVX path with $REG_SZ==32: entry %rsp saved at
# $REG_SZ*17(%rsp), original $num at $REG_SZ*17+8(%rsp), lane counters at
# $REG_SZ*16(%rsp).
1168 $code.=<<___;
1169         sub     \$`$REG_SZ*18`, %rsp
1170         and     \$-256,%rsp
1171         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1172 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1173 .Lbody_avx2:
1174         lea     K256+128(%rip),$Tbl
1175         lea     0x80($ctx),$ctx                 # size optimization
1176
1177 .Loop_grande_avx2:
1178         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1179         xor     $num,$num
1180         lea     `$REG_SZ*16`(%rsp),%rbx
1181 ___
# Read the eight lane descriptors (pointer at +0, block count at +8 of each
# 16-byte record), track the maximum count in $num, and neutralise lanes
# with a non-positive count by pointing them at $Tbl.
1182 for($i=0;$i<8;$i++) {
1183     $code.=<<___;
1184         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
1185         mov     `16*$i+8`($inp),%ecx            # number of blocks
1186         cmp     $num,%ecx
1187         cmovg   %ecx,$num                       # find maximum
1188         test    %ecx,%ecx
1189         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1190         cmovle  $Tbl,@ptr[$i]                   # cancel input
1191 ___
1192 }
# Load the state.  Unlike the AVX path there is no "all lanes empty" test
# here.  %rbx is repointed at 256+128(%rsp) for the duration of the rounds.
# NOTE(review): presumably %rax/%rbx serve as base registers for Xi_off();
# confirm against its definition, which is outside this section.
1193 $code.=<<___;
1194         vmovdqu 0x00-0x80($ctx),$A              # load context
1195          lea    128(%rsp),%rax
1196         vmovdqu 0x20-0x80($ctx),$B
1197          lea    256+128(%rsp),%rbx
1198         vmovdqu 0x40-0x80($ctx),$C
1199         vmovdqu 0x60-0x80($ctx),$D
1200         vmovdqu 0x80-0x80($ctx),$E
1201         vmovdqu 0xa0-0x80($ctx),$F
1202         vmovdqu 0xc0-0x80($ctx),$G
1203         vmovdqu 0xe0-0x80($ctx),$H
1204         vmovdqu .Lpbswap(%rip),$Xn
1205         jmp     .Loop_avx2
1206
1207 .align  32
1208 .Loop_avx2:
1209         vpxor   $B,$C,$bxc                      # magic seed
1210 ___
# Rounds 0..15 unrolled, then sixteen unrolled rounds run three times.
1211 for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
1212 $code.=<<___;
1213         vmovdqu `&Xi_off($i)`,$Xi
1214         mov     \$3,%ecx
1215         jmp     .Loop_16_xx_avx2
1216 .align  32
1217 .Loop_16_xx_avx2:
1218 ___
1219 for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
# Point %rbx back at the lane counters and reset $Tbl.
1220 $code.=<<___;
1221         dec     %ecx
1222         jnz     .Loop_16_xx_avx2
1223
1224         mov     \$1,%ecx
1225         lea     `$REG_SZ*16`(%rsp),%rbx
1226         lea     K256+128(%rip),$Tbl
1227 ___
# Redirect the input of lanes that have no further blocks.
1228 for($i=0;$i<8;$i++) {
1229     $code.=<<___;
1230         cmp     `4*$i`(%rbx),%ecx               # examine counters
1231         cmovge  $Tbl,@ptr[$i]                   # cancel input
1232 ___
1233 }
# Mask out inactive lanes, decrement the active counters, fold the new
# state into the context and loop while $num blocks remain.  The outer
# "grande" iteration is commented out in the emitted code, so only a
# single group of eight lanes is processed per call.
1234 $code.=<<___;
1235         vmovdqa (%rbx),$sigma                   # pull counters
1236         vpxor   $t1,$t1,$t1
1237         vmovdqa $sigma,$Xn
1238         vpcmpgtd $t1,$Xn,$Xn                    # mask value
1239         vpaddd  $Xn,$sigma,$sigma               # counters--
1240
1241         vmovdqu 0x00-0x80($ctx),$t1
1242         vpand   $Xn,$A,$A
1243         vmovdqu 0x20-0x80($ctx),$t2
1244         vpand   $Xn,$B,$B
1245         vmovdqu 0x40-0x80($ctx),$t3
1246         vpand   $Xn,$C,$C
1247         vmovdqu 0x60-0x80($ctx),$Xi
1248         vpand   $Xn,$D,$D
1249         vpaddd  $t1,$A,$A
1250         vmovdqu 0x80-0x80($ctx),$t1
1251         vpand   $Xn,$E,$E
1252         vpaddd  $t2,$B,$B
1253         vmovdqu 0xa0-0x80($ctx),$t2
1254         vpand   $Xn,$F,$F
1255         vpaddd  $t3,$C,$C
1256         vmovdqu 0xc0-0x80($ctx),$t3
1257         vpand   $Xn,$G,$G
1258         vpaddd  $Xi,$D,$D
1259         vmovdqu 0xe0-0x80($ctx),$Xi
1260         vpand   $Xn,$H,$H
1261         vpaddd  $t1,$E,$E
1262         vpaddd  $t2,$F,$F
1263         vmovdqu $A,0x00-0x80($ctx)
1264         vpaddd  $t3,$G,$G
1265         vmovdqu $B,0x20-0x80($ctx)
1266         vpaddd  $Xi,$H,$H
1267         vmovdqu $C,0x40-0x80($ctx)
1268         vmovdqu $D,0x60-0x80($ctx)
1269         vmovdqu $E,0x80-0x80($ctx)
1270         vmovdqu $F,0xa0-0x80($ctx)
1271         vmovdqu $G,0xc0-0x80($ctx)
1272         vmovdqu $H,0xe0-0x80($ctx)
1273
1274         vmovdqu $sigma,(%rbx)                   # save counters
1275         lea     256+128(%rsp),%rbx
1276         vmovdqu .Lpbswap(%rip),$Xn
1277         dec     $num
1278         jnz     .Loop_avx2
1279
1280         #mov    `$REG_SZ*17+8`(%rsp),$num
1281         #lea    $REG_SZ($ctx),$ctx
1282         #lea    `16*$REG_SZ/4`($inp),$inp
1283         #dec    $num
1284         #jnz    .Loop_grande_avx2
1285
1286 .Ldone_avx2:
1287         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1288 .cfi_def_cfa    %rax,8
1289         vzeroupper
1290 ___
# Win64 only: restore %xmm6-%xmm15 (-0xd8 = six pushes (0x30) + 0xa8).
1291 $code.=<<___ if ($win64);
1292         movaps  -0xd8(%rax),%xmm6
1293         movaps  -0xc8(%rax),%xmm7
1294         movaps  -0xb8(%rax),%xmm8
1295         movaps  -0xa8(%rax),%xmm9
1296         movaps  -0x98(%rax),%xmm10
1297         movaps  -0x88(%rax),%xmm11
1298         movaps  -0x78(%rax),%xmm12
1299         movaps  -0x68(%rax),%xmm13
1300         movaps  -0x58(%rax),%xmm14
1301         movaps  -0x48(%rax),%xmm15
1302 ___
# Epilogue: restore the six pushed registers relative to the entry %rsp.
1303 $code.=<<___;
1304         mov     -48(%rax),%r15
1305 .cfi_restore    %r15
1306         mov     -40(%rax),%r14
1307 .cfi_restore    %r14
1308         mov     -32(%rax),%r13
1309 .cfi_restore    %r13
1310         mov     -24(%rax),%r12
1311 .cfi_restore    %r12
1312         mov     -16(%rax),%rbp
1313 .cfi_restore    %rbp
1314         mov     -8(%rax),%rbx
1315 .cfi_restore    %rbx
1316         lea     (%rax),%rsp
1317 .cfi_def_cfa_register   %rsp
1318 .Lepilogue_avx2:
1319         ret
1320 .cfi_endproc
1321 .size   sha256_multi_block_avx2,.-sha256_multi_block_avx2
1322 ___
1323                                         }       }}}
# Open the lane-replicated round-constant table: emit the K256 label on a
# 256-byte boundary.  The rows themselves are appended by the TABLE() call
# that follows.
1324 $code.=<<___;
1325 .align  256
1326 K256:
1327 ___
# TABLE(@constants)
# Appends one table entry per argument to the global $code.  An entry is
# two ".long" rows of four copies each, i.e. the constant repeated eight
# times, so that a single 32-byte load yields it in every 32-bit lane.
# Numeric arguments are interpolated in Perl's default (decimal) form.
sub TABLE {
    for my $k (@_) {
        my $row = "        .long   $k,$k,$k,$k\n";
        $code .= $row x 2;
    }
}
# The 64 SHA-256 round constants, expanded by TABLE() into lane-replicated
# rows under the K256 label.
1336 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1337         0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1338         0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1339         0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1340         0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1341         0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1342         0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1343         0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1344         0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1345         0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1346         0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1347         0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1348         0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1349         0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1350         0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1351         0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# .Lpbswap is the 32-byte vpshufb mask the AVX paths load into $Xn to
# byte-swap each message dword.  K256_shaext is a second, non-replicated
# copy of the same 64 constants (four per row), followed by the
# identification string.
1352 $code.=<<___;
1353 .Lpbswap:
1354         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1355         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1356 K256_shaext:
1357         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1358         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1359         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1360         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1361         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1362         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1363         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1364         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1365         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1366         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1367         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1368         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1369         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1370         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1371         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1372         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1373         .asciz  "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1374 ___
1375
# Win64 structured-exception support.  The handlers emitted here repair
# the CONTEXT record when an exception hits inside one of the generated
# functions, so that RtlVirtualUnwind can continue from the caller.
1376 if ($win64) {
1377 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1378 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers in which the four handler arguments arrive.
1379 $rec="%rcx";
1380 $frame="%rdx";
1381 $context="%r8";
1382 $disp="%r9";
1383
# se_handler serves the functions that push only %rbx and %rbp.
# HandlerData[0]/[1] are the RVAs of the function's body and epilogue
# labels (see the .xdata records).  Three cases:
#   Rip <  body label      still in the prologue: %rax (context->Rax)
#                          holds the entry %rsp, nothing else to restore;
#   Rip >= epilogue label  frame already torn down: context->Rsp is used;
#   otherwise              the entry %rsp is fetched from 16*17 bytes above
#                          the faulting %rsp, then %rbx, %rbp and the ten
#                          saved %xmm registers (20 quadwords) are copied
#                          back into the CONTEXT.
# The common tail at .Lin_prologue restores Rsp/Rsi/Rdi, copies the CONTEXT
# (154 quadwords) to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.
1384 $code.=<<___;
1385 .extern __imp_RtlVirtualUnwind
1386 .type   se_handler,\@abi-omnipotent
1387 .align  16
1388 se_handler:
1389         push    %rsi
1390         push    %rdi
1391         push    %rbx
1392         push    %rbp
1393         push    %r12
1394         push    %r13
1395         push    %r14
1396         push    %r15
1397         pushfq
1398         sub     \$64,%rsp
1399
1400         mov     120($context),%rax      # pull context->Rax
1401         mov     248($context),%rbx      # pull context->Rip
1402
1403         mov     8($disp),%rsi           # disp->ImageBase
1404         mov     56($disp),%r11          # disp->HandlerData
1405
1406         mov     0(%r11),%r10d           # HandlerData[0]
1407         lea     (%rsi,%r10),%r10        # end of prologue label
1408         cmp     %r10,%rbx               # context->Rip<.Lbody
1409         jb      .Lin_prologue
1410
1411         mov     152($context),%rax      # pull context->Rsp
1412
1413         mov     4(%r11),%r10d           # HandlerData[1]
1414         lea     (%rsi,%r10),%r10        # epilogue label
1415         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
1416         jae     .Lin_prologue
1417
1418         mov     `16*17`(%rax),%rax      # pull saved stack pointer
1419
1420         mov     -8(%rax),%rbx
1421         mov     -16(%rax),%rbp
1422         mov     %rbx,144($context)      # restore context->Rbx
1423         mov     %rbp,160($context)      # restore context->Rbp
1424
1425         lea     -24-10*16(%rax),%rsi
1426         lea     512($context),%rdi      # &context.Xmm6
1427         mov     \$20,%ecx
1428         .long   0xa548f3fc              # cld; rep movsq
1429
1430 .Lin_prologue:
1431         mov     8(%rax),%rdi
1432         mov     16(%rax),%rsi
1433         mov     %rax,152($context)      # restore context->Rsp
1434         mov     %rsi,168($context)      # restore context->Rsi
1435         mov     %rdi,176($context)      # restore context->Rdi
1436
1437         mov     40($disp),%rdi          # disp->ContextRecord
1438         mov     $context,%rsi           # context
1439         mov     \$154,%ecx              # sizeof(CONTEXT)
1440         .long   0xa548f3fc              # cld; rep movsq
1441
1442         mov     $disp,%rsi
1443         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1444         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1445         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1446         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1447         mov     40(%rsi),%r10           # disp->ContextRecord
1448         lea     56(%rsi),%r11           # &disp->HandlerData
1449         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1450         mov     %r10,32(%rsp)           # arg5
1451         mov     %r11,40(%rsp)           # arg6
1452         mov     %r12,48(%rsp)           # arg7
1453         mov     %rcx,56(%rsp)           # arg8, (NULL)
1454         call    *__imp_RtlVirtualUnwind(%rip)
1455
1456         mov     \$1,%eax                # ExceptionContinueSearch
1457         add     \$64,%rsp
1458         popfq
1459         pop     %r15
1460         pop     %r14
1461         pop     %r13
1462         pop     %r12
1463         pop     %rbp
1464         pop     %rbx
1465         pop     %rdi
1466         pop     %rsi
1467         ret
1468 .size   se_handler,.-se_handler
1469 ___
# avx2_handler: Win64 exception handler for sha256_multi_block_avx2, which
# pushes six registers (%rbx,%rbp,%r12-%r15) instead of two.  It performs
# the same prologue/epilogue range checks as se_handler and shares that
# handler's tail at .Lin_prologue.
#
# Inside the function body the entry %rsp is stored in the function's own
# frame at 32*17(%rsp), so it has to be read relative to the faulting stack
# pointer, which is in %rax at that point.  Reading it at that offset from
# $context would instead pick up bytes from the middle of the CONTEXT
# record and unwind through a bogus stack address.
1470 $code.=<<___ if ($avx>1);
1471 .type   avx2_handler,\@abi-omnipotent
1472 .align  16
1473 avx2_handler:
1474         push    %rsi
1475         push    %rdi
1476         push    %rbx
1477         push    %rbp
1478         push    %r12
1479         push    %r13
1480         push    %r14
1481         push    %r15
1482         pushfq
1483         sub     \$64,%rsp
1484
1485         mov     120($context),%rax      # pull context->Rax
1486         mov     248($context),%rbx      # pull context->Rip
1487
1488         mov     8($disp),%rsi           # disp->ImageBase
1489         mov     56($disp),%r11          # disp->HandlerData
1490
1491         mov     0(%r11),%r10d           # HandlerData[0]
1492         lea     (%rsi,%r10),%r10        # end of prologue label
1493         cmp     %r10,%rbx               # context->Rip<body label
1494         jb      .Lin_prologue
1495
1496         mov     152($context),%rax      # pull context->Rsp
1497
1498         mov     4(%r11),%r10d           # HandlerData[1]
1499         lea     (%rsi,%r10),%r10        # epilogue label
1500         cmp     %r10,%rbx               # context->Rip>=epilogue label
1501         jae     .Lin_prologue
1502
1503         mov     `32*17`(%rax),%rax      # pull saved stack pointer
1504
1505         mov     -8(%rax),%rbx
1506         mov     -16(%rax),%rbp
1507         mov     -24(%rax),%r12
1508         mov     -32(%rax),%r13
1509         mov     -40(%rax),%r14
1510         mov     -48(%rax),%r15
1511         mov     %rbx,144($context)      # restore context->Rbx
1512         mov     %rbp,160($context)      # restore context->Rbp
1513         mov     %r12,216($context)      # restore context->R12
1514         mov     %r13,224($context)      # restore context->R13
1515         mov     %r14,232($context)      # restore context->R14
1516         mov     %r15,240($context)      # restore context->R15
1517
1518         lea     -56-10*16(%rax),%rsi
1519         lea     512($context),%rdi      # &context.Xmm6
1520         mov     \$20,%ecx
1521         .long   0xa548f3fc              # cld; rep movsq
1522
1523         jmp     .Lin_prologue
1524 .size   avx2_handler,.-avx2_handler
1525 ___
# .pdata: one (begin, end, unwind-info) RVA triple per generated function.
# The .LSEH_begin_*/.LSEH_end_* labels are defined outside this section.
1526 $code.=<<___;
1527 .section        .pdata
1528 .align  4
1529         .rva    .LSEH_begin_sha256_multi_block
1530         .rva    .LSEH_end_sha256_multi_block
1531         .rva    .LSEH_info_sha256_multi_block
1532         .rva    .LSEH_begin_sha256_multi_block_shaext
1533         .rva    .LSEH_end_sha256_multi_block_shaext
1534         .rva    .LSEH_info_sha256_multi_block_shaext
1535 ___
# Entries for the AVX and AVX2 functions exist only when those code paths
# were generated.
1536 $code.=<<___ if ($avx);
1537         .rva    .LSEH_begin_sha256_multi_block_avx
1538         .rva    .LSEH_end_sha256_multi_block_avx
1539         .rva    .LSEH_info_sha256_multi_block_avx
1540 ___
1541 $code.=<<___ if ($avx>1);
1542         .rva    .LSEH_begin_sha256_multi_block_avx2
1543         .rva    .LSEH_end_sha256_multi_block_avx2
1544         .rva    .LSEH_info_sha256_multi_block_avx2
1545 ___
# .xdata: each unwind-info record names its handler and supplies two RVAs
# as HandlerData[]: the body label and the epilogue label, which the
# handlers compare against the faulting Rip.
1546 $code.=<<___;
1547 .section        .xdata
1548 .align  8
1549 .LSEH_info_sha256_multi_block:
1550         .byte   9,0,0,0
1551         .rva    se_handler
1552         .rva    .Lbody,.Lepilogue                       # HandlerData[]
1553 .LSEH_info_sha256_multi_block_shaext:
1554         .byte   9,0,0,0
1555         .rva    se_handler
1556         .rva    .Lbody_shaext,.Lepilogue_shaext         # HandlerData[]
1557 ___
1558 $code.=<<___ if ($avx);
1559 .LSEH_info_sha256_multi_block_avx:
1560         .byte   9,0,0,0
1561         .rva    se_handler
1562         .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
1563 ___
# The AVX2 function saves more registers and therefore has its own handler.
1564 $code.=<<___ if ($avx>1);
1565 .LSEH_info_sha256_multi_block_avx2:
1566         .byte   9,0,0,0
1567         .rva    avx2_handler
1568         .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
1569 ___
1570 }
1571 ####################################################################
1572
# rex(\@opcode,$dst,$src)
# Prepends a REX prefix byte to the referenced opcode array when either
# register number is 8 or higher: bit 0x04 is set for $dst, bit 0x01 for
# $src, and the result is or-ed with 0x40.  When both numbers are below 8
# the array is left untouched.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = 0;

    $prefix |= 0x04 if ($dst >= 8);
    $prefix |= 0x01 if ($src >= 8);
    unshift @{$bytes}, $prefix | 0x40 if ($prefix);
}
1582
# sha256op38($instr,$operands)
# Hand-assembles a SHA-256 extension instruction so that the output does
# not depend on assembler support for it.  When $instr is one of
# sha256rnds2/sha256msg1/sha256msg2 and $operands contains a
# "%xmmN,%xmmM" register pair, returns a ".byte" directive with the
# encoding: optional REX prefix (via rex()), 0x0f 0x38, the opcode byte,
# and a register-direct ModR/M byte with the first operand in the low
# three bits and the second in bits 3..5.  Anything else is returned
# unchanged as "$instr\t$operands".
sub sha256op38 {
    my ($instr, $operands) = @_;
    my %opcodelet = (
        "sha256rnds2" => 0xcb,
        "sha256msg1"  => 0xcc,
        "sha256msg2"  => 0xcd,
    );
    my $opbyte = $opcodelet{$instr};

    # Unknown mnemonic or a non register-register form: pass through.
    return $instr."\t".$operands
        unless (defined($opbyte)
                && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/);

    my ($first, $second) = ($1, $2);
    my @opcode = (0x0f, 0x38);
    rex(\@opcode, $second, $first);
    push @opcode, $opbyte, 0xc0|($first&7)|(($second&7)<<3);   # ModR/M
    return ".byte\t".join(',', @opcode);
}
1600
# Final pass: post-process the generated text line by line and print it.
#  1. Evaluate any remaining backticked Perl expression.
#  2. Apply at most one of the following rewrites (the "or" chain stops at
#     the first substitution that succeeds):
#       - sha256* instructions are handed to sha256op38() for encoding;
#       - vmovd/vmovq, vpinsrd/vpinsrq, vpextrd/vpextrq and vpbroadcastd/q
#         get their %ymm operands renamed to %xmm, since the shared round
#         code emits %ymm names for them in the 8-lane path;
#       - vinserti128 gets the immediate \$1 inserted and its first operand
#         renamed to %xmm.
#     NOTE(review): the vmovdqu rule only fires on a literal "%x%ymmN"
#     operand; no such text is produced in the part of the file visible
#     here -- confirm whether it is still needed.
1601 foreach (split("\n",$code)) {
1602         s/\`([^\`]*)\`/eval($1)/ge;
1603
1604         s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo         or
1605
1606         s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
1607         s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
1608         s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
1609         s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
1610         s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
1611         s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1612
1613         print $_,"\n";
1614 }
1615
# Buffered write errors (full disk, broken pipe to a downstream filter)
# only surface when the handle is closed, so fail loudly instead of
# leaving a truncated assembly file behind.
1616 close STDOUT or die "error closing STDOUT: $!";