x86_64: Don't assume 8-byte pointer size
[openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # The multi-buffer SHA256 procedure processes n buffers in parallel by
18 # placing each buffer's data in a designated lane of a SIMD register.
19 # n is naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
21 #
22 #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
25 # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
26 # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
27 # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
28 # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
29 # Skylake       (18.9   +5.00=23.9)/n   7.70    8.17            +170%
30 # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
31 #
32 # (i)   multi-block CBC encrypt with 128-bit key;
33 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 #       because of lower AES-NI instruction throughput, nor is there
35 #       AES-NI-SHA256 stitch for these processors;
36 # (iii) "this" is for n=8, when we gather twice as much data, result
37 #       for n=4 is 20.3+4.44=24.7;
38 # (iv)  presented improvement coefficients are asymptotic limits and
39 #       in real-life application are somewhat lower, e.g. for 2KB
40 #       fragments they range from 75% to 130% (on Haswell);
41
42 # $output is the last argument if it looks like a file (it has an extension)
43 # $flavour is the first argument if it doesn't look like a file
44 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
45 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
46
# $win64 is set for nasm/masm/mingw64 flavours or when the output file ends
# in .asm; later it selects the prologue/epilogue code that saves and
# restores %xmm6-%xmm15.
47 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
48
# Take the directory part of this script's path and look for the
# x86_64-xlate.pl translator next to the script, then in ../../perlasm.
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
51 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
52 die "can't locate x86_64-xlate.pl";
53
# x86_64-support.pl is searched for in the same two directories.
54 push(@INC,"${dir}","${dir}../../perlasm");
55 require "x86_64-support.pl";
56
# pointer_size() comes from x86_64-support.pl (not visible here); presumably
# it returns the pointer width in bytes for $flavour -- confirm there.
57 $ptr_size=&pointer_size($flavour);
58
# $avx ends up as 0, 1 or 2: each probe below adds two version comparisons.
# 0 suppresses the AVX code paths (see the "if ($avx)" guards further down).
# NOTE(review): level 2 presumably enables the AVX2 variant; that code is not
# fully visible in this chunk.
59 $avx=0;
60
# Probe 1: GNU assembler version as reported through $ENV{CC}.
61 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
62                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
63         $avx = ($1>=2.19) + ($1>=2.22);
64 }
65
# Probe 2: NASM, only tried on win64 targets when $avx is still 0.
66 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
67            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
68         $avx = ($1>=2.09) + ($1>=2.10);
69 }
70
# Probe 3: ml64, only tried on win64 targets when $avx is still 0.
71 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
72            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
73         $avx = ($1>=10) + ($1>=11);
74 }
75
# Probe 4: clang/LLVM version reported by "$ENV{CC} -v".
76 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
77         $avx = ($2>=3.0) + ($2>3.0);
78 }
79
# Everything printed to STDOUT from here on is piped through the translator
# script, which is run with the current perl interpreter ($^X).
80 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
81     or die "can't call $xlate: $!";
82 *STDOUT=*OUT;
83
84 # void sha256_multi_block (
85 #     struct {  unsigned int A[8];
86 #               unsigned int B[8];
87 #               unsigned int C[8];
88 #               unsigned int D[8];
89 #               unsigned int E[8];
90 #               unsigned int F[8];
91 #               unsigned int G[8];
92 #               unsigned int H[8];      } *ctx,
93 #     struct {  void *ptr; int blocks;  } inp[8],
94 #     int num);         /* 1 or 2 */
95 #
# Register assignments shared by the SSE code generators below.
96 $ctx="%rdi";    # 1st arg
97 $inp="%rsi";    # 2nd arg
98 $num="%edx";    # 3rd arg
# %r8..%r11: one input data pointer per lane.
99 @ptr=map("%r$_",(8..11));
# $Tbl addresses the K256 constant table (it is loaded with K256+128).
100 $Tbl="%rbp";
# Byte stride between consecutive inp[] elements: two pointer-sized slots.
101 $inp_elm_size=2*$ptr_size;
102
# Working variables A..H live in %xmm8..%xmm15; %xmm0..%xmm7 are temporaries.
103 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
104 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));
105
# Byte size of one SIMD register slot; Xi_off() and the frame layout scale by it.
106 $REG_SZ=16;
107
# Xi_off(index) -- return the assembler memory operand of the message
# schedule slot for the given word index.  The index wraps modulo 16 and is
# scaled by $REG_SZ; byte offsets below 256 are addressed relative to %rax,
# the remainder relative to %rbx.
sub Xi_off {
    my ($index) = @_;
    my $byte_off = ($index % 16) * $REG_SZ;

    return "$byte_off-128(%rax)" if ($byte_off < 256);
    return "$byte_off-256-128(%rbx)";
}
114
# ROUND_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h) -- append the SSE code for one
# SHA-256 round to $code.
#
#   $i       round number
#   $a..$h   names of the registers holding the working variables for this
#            round
#
# For $i<=15 the generated code first gathers the 32-bit word at byte offset
# 4*$i from each of the four @ptr lanes into $Xi and byte-shuffles it with
# $Xn (the pshufb is emitted at one of two positions depending on the parity
# of $i).  On round 15 it also advances every lane pointer by 64 bytes and
# emits prefetches.  For $i>15 no load is emitted, so $Xi must already hold
# the schedule word (see ROUND_16_XX).
#
# The round itself stores $Xi to its Xi_off($i) slot, accumulates
# Xi += h + K[round] + Sigma1(e) + Ch(e,f,g), then d += Xi and
# h = Maj(a,b,c) + Xi + Sigma0(a).  After every eighth round $Tbl is advanced
# by 32*8 bytes.
#
# Side effect on the generator: the globals $axb and $bxc are swapped at the
# end, so the a^b computed in this round serves as b^c in the next one.
115 sub ROUND_00_15 {
116 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
117
118 $code.=<<___ if ($i<15);
119         movd            `4*$i`(@ptr[0]),$Xi
120         movd            `4*$i`(@ptr[1]),$t1
121         movd            `4*$i`(@ptr[2]),$t2
122         movd            `4*$i`(@ptr[3]),$t3
123         punpckldq       $t2,$Xi
124         punpckldq       $t3,$t1
125         punpckldq       $t1,$Xi
126 ___
127 $code.=<<___ if ($i==15);
128         movd            `4*$i`(@ptr[0]),$Xi
129          lea            `16*4`(@ptr[0]),@ptr[0]
130         movd            `4*$i`(@ptr[1]),$t1
131          lea            `16*4`(@ptr[1]),@ptr[1]
132         movd            `4*$i`(@ptr[2]),$t2
133          lea            `16*4`(@ptr[2]),@ptr[2]
134         movd            `4*$i`(@ptr[3]),$t3
135          lea            `16*4`(@ptr[3]),@ptr[3]
136         punpckldq       $t2,$Xi
137         punpckldq       $t3,$t1
138         punpckldq       $t1,$Xi
139 ___
140 $code.=<<___;
141         movdqa  $e,$sigma
142         `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==0)`
143         movdqa  $e,$t3
144         `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==1)`
145         psrld   \$6,$sigma
146         movdqa  $e,$t2
147         pslld   \$7,$t3
148         movdqa  $Xi,`&Xi_off($i)`
149          paddd  $h,$Xi                          # Xi+=h
150
151         psrld   \$11,$t2
152         pxor    $t3,$sigma
153         pslld   \$21-7,$t3
154          paddd  `32*($i%8)-128`($Tbl),$Xi       # Xi+=K[round]
155         pxor    $t2,$sigma
156
157         psrld   \$25-11,$t2
158          movdqa $e,$t1
159          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
160         pxor    $t3,$sigma
161          movdqa $e,$axb                         # borrow $axb
162         pslld   \$26-21,$t3
163          pandn  $g,$t1
164          pand   $f,$axb
165         pxor    $t2,$sigma
166
167          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
168         movdqa  $a,$t2
169         pxor    $t3,$sigma                      # Sigma1(e)
170         movdqa  $a,$t3
171         psrld   \$2,$t2
172         paddd   $sigma,$Xi                      # Xi+=Sigma1(e)
173          pxor   $axb,$t1                        # Ch(e,f,g)
174          movdqa $b,$axb
175         movdqa  $a,$sigma
176         pslld   \$10,$t3
177          pxor   $a,$axb                         # a^b, b^c in next round
178
179          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
180         psrld   \$13,$sigma
181         pxor    $t3,$t2
182          paddd  $t1,$Xi                         # Xi+=Ch(e,f,g)
183         pslld   \$19-10,$t3
184          pand   $axb,$bxc
185         pxor    $sigma,$t2
186
187          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
188         psrld   \$22-13,$sigma
189         pxor    $t3,$t2
190          movdqa $b,$h
191         pslld   \$30-19,$t3
192         pxor    $t2,$sigma
193          pxor   $bxc,$h                         # h=Maj(a,b,c)=Ch(a^b,c,b)
194          paddd  $Xi,$d                          # d+=Xi
195         pxor    $t3,$sigma                      # Sigma0(a)
196
197         paddd   $Xi,$h                          # h+=Xi
198         paddd   $sigma,$h                       # h+=Sigma0(a)
199 ___
200 $code.=<<___ if (($i%8)==7);
201         lea     `32*8`($Tbl),$Tbl
202 ___
203         ($axb,$bxc)=($bxc,$axb);
204 }
205
# ROUND_16_XX($i,$a,...,$h) -- append the message-schedule update for round
# $i followed by the round itself (delegated to ROUND_00_15 with the same
# arguments).
#
# The generated code loads X[i+1] into $Xn, adds X[i+9] to $Xi, then adds
# sigma0(X[i+1]) (shift/rotate amounts 3, 7 and 18, built from psrld/pslld
# pairs) and sigma1(X[i+14]) (amounts 10, 17 and 19).  Schedule slots are
# addressed through Xi_off(), which wraps the index modulo 16.
#
# NOTE: two comments inside the emitted assembly are mislabelled:
# "Xi+=sigma0(e)" adds the sigma0(X[i+1]) value just computed, and
# "sigma0(X[i+14])" is really sigma1(X[i+14]), as the following line says.
#
# Side effect on the generator: $Xi and $Xn are swapped afterwards, so the
# X[i+1] loaded here is the current word of the next round.
206 sub ROUND_16_XX {
207 my $i=shift;
208
209 $code.=<<___;
210         movdqa  `&Xi_off($i+1)`,$Xn
211         paddd   `&Xi_off($i+9)`,$Xi             # Xi+=X[i+9]
212
213         movdqa  $Xn,$sigma
214         movdqa  $Xn,$t2
215         psrld   \$3,$sigma
216         movdqa  $Xn,$t3
217
218         psrld   \$7,$t2
219         movdqa  `&Xi_off($i+14)`,$t1
220         pslld   \$14,$t3
221         pxor    $t2,$sigma
222         psrld   \$18-7,$t2
223         movdqa  $t1,$axb                        # borrow $axb
224         pxor    $t3,$sigma
225         pslld   \$25-14,$t3
226         pxor    $t2,$sigma
227         psrld   \$10,$t1
228         movdqa  $axb,$t2
229
230         psrld   \$17,$axb
231         pxor    $t3,$sigma                      # sigma0(X[i+1])
232         pslld   \$13,$t2
233          paddd  $sigma,$Xi                      # Xi+=sigma0(e)
234         pxor    $axb,$t1
235         psrld   \$19-17,$axb
236         pxor    $t2,$t1
237         pslld   \$15-13,$t2
238         pxor    $axb,$t1
239         pxor    $t2,$t1                         # sigma0(X[i+14])
240         paddd   $t1,$Xi                         # Xi+=sigma1(X[i+14])
241 ___
242         &ROUND_00_15($i,@_);
243         ($Xi,$Xn)=($Xn,$Xi);
244 }
245
# Entry point of sha256_multi_block.  The generated code loads the
# capability word at OPENSSL_ia32cap_P+4 and jumps to _shaext_shortcut when
# bit 61 is set.
246 $code.=<<___;
247 .text
248
249 .extern OPENSSL_ia32cap_P
250
251 .globl  sha256_multi_block
252 .type   sha256_multi_block,\@function,3
253 .align  32
254 sha256_multi_block:
255 .cfi_startproc
256         mov     OPENSSL_ia32cap_P+4(%rip),%rcx
257         bt      \$61,%rcx                       # check SHA bit
258         jc      _shaext_shortcut
259 ___
# The dispatch to _avx_shortcut (bit 28 of %ecx) is emitted only when the
# assembler probe found AVX support.
260 $code.=<<___ if ($avx);
261         test    \$`1<<28`,%ecx
262         jnz     _avx_shortcut
263 ___
# Keep the caller's %rsp in %rax and save the callee-saved %rbx/%rbp.
264 $code.=<<___;
265         mov     %rsp,%rax
266 .cfi_def_cfa_register   %rax
267         push    %rbx
268 .cfi_push       %rbx
269         push    %rbp
270 .cfi_push       %rbp
271 ___
# win64 targets only: reserve 0xa8 bytes and save %xmm6..%xmm15 there.
272 $code.=<<___ if ($win64);
273         lea     -0xa8(%rsp),%rsp
274         movaps  %xmm6,(%rsp)
275         movaps  %xmm7,0x10(%rsp)
276         movaps  %xmm8,0x20(%rsp)
277         movaps  %xmm9,0x30(%rsp)
278         movaps  %xmm10,-0x78(%rax)
279         movaps  %xmm11,-0x68(%rax)
280         movaps  %xmm12,-0x58(%rax)
281         movaps  %xmm13,-0x48(%rax)
282         movaps  %xmm14,-0x38(%rax)
283         movaps  %xmm15,-0x28(%rax)
284 ___
# Frame: $REG_SZ*18 bytes, aligned down to 256.  The original %rsp is kept at
# $REG_SZ*17(%rsp), the original $num at $REG_SZ*17+8(%rsp), and %rbx points
# at $REG_SZ*16(%rsp), where the per-lane counters are stored.  $ctx is
# biased by 0x80.  Each .Loop_grande pass starts with $num zeroed so that it
# can collect the largest block count.
285 $code.=<<___;
286         sub     \$`$REG_SZ*18`, %rsp
287         and     \$-256,%rsp
288         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
289 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
290 .Lbody:
291         lea     K256+128(%rip),$Tbl
292         lea     `$REG_SZ*16`(%rsp),%rbx
293         lea     0x80($ctx),$ctx                 # size optimization
294
295 .Loop_grande:
296         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
297         xor     $num,$num
298 ___
# For each of the four lanes: load the data pointer and the block count from
# inp[$i], keep the largest count in $num, store the count in the counter
# array at (%rbx), and point the lane at $Tbl instead of its input when the
# count is <= 0 ("cancel input").
# pointer_register() comes from x86_64-support.pl (not visible here);
# presumably it returns the register name sized for the target's pointer
# width -- confirm there.
299 for($i=0;$i<4;$i++) {
300     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
301     $code.=<<___;
302         # input pointer
303         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
304         # number of blocks
305         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
306         cmp     $num,%ecx
307         cmovg   %ecx,$num                       # find maximum
308         test    %ecx,%ecx
309         mov     %ecx,`4*$i`(%rbx)               # initialize counters
310         cmovle  $Tbl,@ptr[$i]                   # cancel input
311 ___
312 }
# Leave through .Ldone when no lane has any blocks; otherwise load the eight
# context rows into A..H, the byte-swap mask into $Xn, and point %rax at
# 128(%rsp), the base Xi_off() uses for the lower schedule slots.
313 $code.=<<___;
314         test    $num,$num
315         jz      .Ldone
316
317         movdqu  0x00-0x80($ctx),$A              # load context
318          lea    128(%rsp),%rax
319         movdqu  0x20-0x80($ctx),$B
320         movdqu  0x40-0x80($ctx),$C
321         movdqu  0x60-0x80($ctx),$D
322         movdqu  0x80-0x80($ctx),$E
323         movdqu  0xa0-0x80($ctx),$F
324         movdqu  0xc0-0x80($ctx),$G
325         movdqu  0xe0-0x80($ctx),$H
326         movdqu  .Lpbswap(%rip),$Xn
327         jmp     .Loop
328
329 .align  32
330 .Loop:
331         movdqa  $C,$bxc
332         pxor    $B,$bxc                         # magic seed
333 ___
# Rounds 0..15, fully unrolled.  @V is rotated after every round so that each
# call sees the working variables shifted by one position.
334 for($i=0;$i<16;$i++)    { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
# $i is 16 here, so Xi_off($i) addresses slot 0.  %ecx counts the three
# passes through the 16-round body generated below.
335 $code.=<<___;
336         movdqu  `&Xi_off($i)`,$Xi
337         mov     \$3,%ecx
338         jmp     .Loop_16_xx
339 .align  32
340 .Loop_16_xx:
341 ___
# Sixteen rounds of generated code; the loop continues from $i == 16.
342 for(;$i<32;$i++)        { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
# Close the .Loop_16_xx loop, then finish one block per lane:
#  - reload $Tbl and cancel the input of every lane whose counter is <= 1;
#  - build a per-lane mask (counter > 0) in $Xn and add it to the counters,
#    which decrements the counters of the active lanes;
#  - AND the working variables with the mask, add the stored context and
#    write the result back;
#  - repeat .Loop while $num (the largest block count) is non-zero.
# Afterwards the original $num is restored, $ctx advances by $REG_SZ bytes and
# $inp by $REG_SZ/4 elements, and .Loop_grande repeats until $num reaches zero.
# .Ldone reloads the original %rsp into %rax for the epilogue.
343 $code.=<<___;
344         dec     %ecx
345         jnz     .Loop_16_xx
346
347         mov     \$1,%ecx
348         lea     K256+128(%rip),$Tbl
349
350         movdqa  (%rbx),$sigma                   # pull counters
351         cmp     4*0(%rbx),%ecx                  # examine counters
352         pxor    $t1,$t1
353         cmovge  $Tbl,@ptr[0]                    # cancel input
354         cmp     4*1(%rbx),%ecx
355         movdqa  $sigma,$Xn
356         cmovge  $Tbl,@ptr[1]
357         cmp     4*2(%rbx),%ecx
358         pcmpgtd $t1,$Xn                         # mask value
359         cmovge  $Tbl,@ptr[2]
360         cmp     4*3(%rbx),%ecx
361         paddd   $Xn,$sigma                      # counters--
362         cmovge  $Tbl,@ptr[3]
363
364         movdqu  0x00-0x80($ctx),$t1
365         pand    $Xn,$A
366         movdqu  0x20-0x80($ctx),$t2
367         pand    $Xn,$B
368         movdqu  0x40-0x80($ctx),$t3
369         pand    $Xn,$C
370         movdqu  0x60-0x80($ctx),$Xi
371         pand    $Xn,$D
372         paddd   $t1,$A
373         movdqu  0x80-0x80($ctx),$t1
374         pand    $Xn,$E
375         paddd   $t2,$B
376         movdqu  0xa0-0x80($ctx),$t2
377         pand    $Xn,$F
378         paddd   $t3,$C
379         movdqu  0xc0-0x80($ctx),$t3
380         pand    $Xn,$G
381         paddd   $Xi,$D
382         movdqu  0xe0-0x80($ctx),$Xi
383         pand    $Xn,$H
384         paddd   $t1,$E
385         paddd   $t2,$F
386         movdqu  $A,0x00-0x80($ctx)
387         paddd   $t3,$G
388         movdqu  $B,0x20-0x80($ctx)
389         paddd   $Xi,$H
390         movdqu  $C,0x40-0x80($ctx)
391         movdqu  $D,0x60-0x80($ctx)
392         movdqu  $E,0x80-0x80($ctx)
393         movdqu  $F,0xa0-0x80($ctx)
394         movdqu  $G,0xc0-0x80($ctx)
395         movdqu  $H,0xe0-0x80($ctx)
396
397         movdqa  $sigma,(%rbx)                   # save counters
398         movdqa  .Lpbswap(%rip),$Xn
399         dec     $num
400         jnz     .Loop
401
402         mov     `$REG_SZ*17+8`(%rsp),$num
403         lea     $REG_SZ($ctx),$ctx
404         lea     `$inp_elm_size*$REG_SZ/4`($inp),$inp
405         dec     $num
406         jnz     .Loop_grande
407
408 .Ldone:
409         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
410 .cfi_def_cfa    %rax,8
411 ___
# win64 targets only: restore %xmm6..%xmm15 from the save area below the
# original stack pointer (held in %rax).
412 $code.=<<___ if ($win64);
413         movaps  -0xb8(%rax),%xmm6
414         movaps  -0xa8(%rax),%xmm7
415         movaps  -0x98(%rax),%xmm8
416         movaps  -0x88(%rax),%xmm9
417         movaps  -0x78(%rax),%xmm10
418         movaps  -0x68(%rax),%xmm11
419         movaps  -0x58(%rax),%xmm12
420         movaps  -0x48(%rax),%xmm13
421         movaps  -0x38(%rax),%xmm14
422         movaps  -0x28(%rax),%xmm15
423 ___
# Restore %rbp/%rbx from the slots written by the two pushes in the prologue,
# switch back to the caller's stack and return.
424 $code.=<<___;
425         mov     -16(%rax),%rbp
426 .cfi_restore    %rbp
427         mov     -8(%rax),%rbx
428 .cfi_restore    %rbx
429         lea     (%rax),%rsp
430 .cfi_def_cfa_register   %rsp
431 .Lepilogue:
432         ret
433 .cfi_endproc
434 .size   sha256_multi_block,.-sha256_multi_block
435 ___
436                                                 {{{
# Register map for the SHA-extension code path, which handles two lanes per
# pass: %xmm0..%xmm3 are temporaries, %xmm12..%xmm15 hold the ABEF/CDGH state
# of lanes 0 and 1, @MSG0 (%xmm4..%xmm7) and @MSG1 (%xmm8..%xmm11) hold the
# message words of lane 0 and lane 1.
437 my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
438 my @MSG0=map("%xmm$_",(4..7));
439 my @MSG1=map("%xmm$_",(8..11));
440
# Entry: same register saves as sha256_multi_block; _shaext_shortcut is the
# label its dispatch code jumps to.
441 $code.=<<___;
442 .type   sha256_multi_block_shaext,\@function,3
443 .align  32
444 sha256_multi_block_shaext:
445 .cfi_startproc
446 _shaext_shortcut:
447         mov     %rsp,%rax
448 .cfi_def_cfa_register   %rax
449         push    %rbx
450 .cfi_push       %rbx
451         push    %rbp
452 .cfi_push       %rbp
453 ___
# win64 targets only: reserve 0xa8 bytes and save %xmm6..%xmm15 there.
454 $code.=<<___ if ($win64);
455         lea     -0xa8(%rsp),%rsp
456         movaps  %xmm6,(%rsp)
457         movaps  %xmm7,0x10(%rsp)
458         movaps  %xmm8,0x20(%rsp)
459         movaps  %xmm9,0x30(%rsp)
460         movaps  %xmm10,-0x78(%rax)
461         movaps  %xmm11,-0x68(%rax)
462         movaps  %xmm12,-0x58(%rax)
463         movaps  %xmm13,-0x48(%rax)
464         movaps  %xmm14,-0x38(%rax)
465         movaps  %xmm15,-0x28(%rax)
466 ___
# Same frame layout as the SSE path.  $num is doubled because each
# .Loop_grande_shaext pass handles a pair of lanes, and $Tbl points into
# K256_shaext.
467 $code.=<<___;
468         sub     \$`$REG_SZ*18`,%rsp
469         shl     \$1,$num                        # we process pair at a time
470         and     \$-256,%rsp
471         lea     0x80($ctx),$ctx                 # size optimization
472         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
473 .Lbody_shaext:
474         lea     `$REG_SZ*16`(%rsp),%rbx
475         lea     K256_shaext+0x80(%rip),$Tbl
476
477 .Loop_grande_shaext:
478         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
479         xor     $num,$num
480 ___
# For each of the two lanes: load the data pointer and the block count from
# inp[$i], keep the largest count in $num, store the count at (%rbx), and
# point the lane at %rsp instead of its input when the count is <= 0.
# pointer_register() comes from x86_64-support.pl (not visible here);
# presumably it returns the register name sized for the target's pointer
# width -- confirm there.
481 for($i=0;$i<2;$i++) {
482     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
483     $code.=<<___;
484         # input pointer
485         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
486         # number of blocks
487         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
488         cmp     $num,%ecx
489         cmovg   %ecx,$num                       # find maximum
490         test    %ecx,%ecx
491         mov     %ecx,`4*$i`(%rbx)               # initialize counters
492         cmovle  %rsp,@ptr[$i]                   # cancel input
493 ___
494 }
# Leave through .Ldone_shaext when neither lane has blocks.  Otherwise load
# the two-lane context rows, rearrange them into the ABEF/CDGH register pairs
# for each lane, and enter .Loop_shaext, which loads one 64-byte block per
# lane, byte-swaps it with $TMPx and runs rounds 0..15 for both lanes
# interleaved.  The state at block entry is offloaded to 0x40..0x70(%rsp).
495 $code.=<<___;
496         test    $num,$num
497         jz      .Ldone_shaext
498
499         movq            0x00-0x80($ctx),$ABEF0          # A1.A0
500         movq            0x20-0x80($ctx),@MSG0[0]        # B1.B0
501         movq            0x40-0x80($ctx),$CDGH0          # C1.C0
502         movq            0x60-0x80($ctx),@MSG0[1]        # D1.D0
503         movq            0x80-0x80($ctx),@MSG1[0]        # E1.E0
504         movq            0xa0-0x80($ctx),@MSG1[1]        # F1.F0
505         movq            0xc0-0x80($ctx),@MSG1[2]        # G1.G0
506         movq            0xe0-0x80($ctx),@MSG1[3]        # H1.H0
507
508         punpckldq       @MSG0[0],$ABEF0                 # B1.A1.B0.A0
509         punpckldq       @MSG0[1],$CDGH0                 # D1.C1.D0.C0
510         punpckldq       @MSG1[1],@MSG1[0]               # F1.E1.F0.E0
511         punpckldq       @MSG1[3],@MSG1[2]               # H1.G1.H0.G0
512         movdqa          K256_shaext-0x10(%rip),$TMPx    # byte swap
513
514         movdqa          $ABEF0,$ABEF1
515         movdqa          $CDGH0,$CDGH1
516         punpcklqdq      @MSG1[0],$ABEF0                 # F0.E0.B0.A0
517         punpcklqdq      @MSG1[2],$CDGH0                 # H0.G0.D0.C0
518         punpckhqdq      @MSG1[0],$ABEF1                 # F1.E1.B1.A1
519         punpckhqdq      @MSG1[2],$CDGH1                 # H1.G1.D1.C1
520
521         pshufd          \$0b00011011,$ABEF0,$ABEF0
522         pshufd          \$0b00011011,$CDGH0,$CDGH0
523         pshufd          \$0b00011011,$ABEF1,$ABEF1
524         pshufd          \$0b00011011,$CDGH1,$CDGH1
525         jmp             .Loop_shaext
526
527 .align  32
528 .Loop_shaext:
529         movdqu          0x00(@ptr[0]),@MSG0[0]
530          movdqu         0x00(@ptr[1]),@MSG1[0]
531         movdqu          0x10(@ptr[0]),@MSG0[1]
532          movdqu         0x10(@ptr[1]),@MSG1[1]
533         movdqu          0x20(@ptr[0]),@MSG0[2]
534         pshufb          $TMPx,@MSG0[0]
535          movdqu         0x20(@ptr[1]),@MSG1[2]
536          pshufb         $TMPx,@MSG1[0]
537         movdqu          0x30(@ptr[0]),@MSG0[3]
538         lea             0x40(@ptr[0]),@ptr[0]
539          movdqu         0x30(@ptr[1]),@MSG1[3]
540          lea            0x40(@ptr[1]),@ptr[1]
541
542         movdqa          0*16-0x80($Tbl),$Wi
543         pshufb          $TMPx,@MSG0[1]
544         paddd           @MSG0[0],$Wi
545         pxor            $ABEF0,@MSG0[0]         # black magic
546         movdqa          $Wi,$TMP0
547          movdqa         0*16-0x80($Tbl),$TMP1
548          pshufb         $TMPx,@MSG1[1]
549          paddd          @MSG1[0],$TMP1
550         movdqa          $CDGH0,0x50(%rsp)       # offload
551         sha256rnds2     $ABEF0,$CDGH0           # 0-3
552          pxor           $ABEF1,@MSG1[0]         # black magic
553          movdqa         $TMP1,$Wi
554          movdqa         $CDGH1,0x70(%rsp)
555          sha256rnds2    $ABEF1,$CDGH1           # 0-3
556         pshufd          \$0x0e,$TMP0,$Wi
557         pxor            $ABEF0,@MSG0[0]         # black magic
558         movdqa          $ABEF0,0x40(%rsp)       # offload
559         sha256rnds2     $CDGH0,$ABEF0
560          pshufd         \$0x0e,$TMP1,$Wi
561          pxor           $ABEF1,@MSG1[0]         # black magic
562          movdqa         $ABEF1,0x60(%rsp)
563         movdqa          1*16-0x80($Tbl),$TMP0
564         paddd           @MSG0[1],$TMP0
565         pshufb          $TMPx,@MSG0[2]
566          sha256rnds2    $CDGH1,$ABEF1
567
568         movdqa          $TMP0,$Wi
569          movdqa         1*16-0x80($Tbl),$TMP1
570          paddd          @MSG1[1],$TMP1
571         sha256rnds2     $ABEF0,$CDGH0           # 4-7
572          movdqa         $TMP1,$Wi
573         prefetcht0      127(@ptr[0])
574         pshufb          $TMPx,@MSG0[3]
575          pshufb         $TMPx,@MSG1[2]
576          prefetcht0     127(@ptr[1])
577          sha256rnds2    $ABEF1,$CDGH1           # 4-7
578         pshufd          \$0x0e,$TMP0,$Wi
579          pshufb         $TMPx,@MSG1[3]
580         sha256msg1      @MSG0[1],@MSG0[0]
581         sha256rnds2     $CDGH0,$ABEF0
582          pshufd         \$0x0e,$TMP1,$Wi
583         movdqa          2*16-0x80($Tbl),$TMP0
584         paddd           @MSG0[2],$TMP0
585          sha256rnds2    $CDGH1,$ABEF1
586
587         movdqa          $TMP0,$Wi
588          movdqa         2*16-0x80($Tbl),$TMP1
589          paddd          @MSG1[2],$TMP1
590         sha256rnds2     $ABEF0,$CDGH0           # 8-11
591          sha256msg1     @MSG1[1],@MSG1[0]
592          movdqa         $TMP1,$Wi
593         movdqa          @MSG0[3],$TMPx
594          sha256rnds2    $ABEF1,$CDGH1           # 8-11
595         pshufd          \$0x0e,$TMP0,$Wi
596         palignr         \$4,@MSG0[2],$TMPx
597         paddd           $TMPx,@MSG0[0]
598          movdqa         @MSG1[3],$TMPx
599          palignr        \$4,@MSG1[2],$TMPx
600         sha256msg1      @MSG0[2],@MSG0[1]
601         sha256rnds2     $CDGH0,$ABEF0
602          pshufd         \$0x0e,$TMP1,$Wi
603         movdqa          3*16-0x80($Tbl),$TMP0
604         paddd           @MSG0[3],$TMP0
605          sha256rnds2    $CDGH1,$ABEF1
606          sha256msg1     @MSG1[2],@MSG1[1]
607
608         movdqa          $TMP0,$Wi
609          movdqa         3*16-0x80($Tbl),$TMP1
610          paddd          $TMPx,@MSG1[0]
611          paddd          @MSG1[3],$TMP1
612         sha256msg2      @MSG0[3],@MSG0[0]
613         sha256rnds2     $ABEF0,$CDGH0           # 12-15
614          movdqa         $TMP1,$Wi
615         movdqa          @MSG0[0],$TMPx
616         palignr         \$4,@MSG0[3],$TMPx
617          sha256rnds2    $ABEF1,$CDGH1           # 12-15
618          sha256msg2     @MSG1[3],@MSG1[0]
619         pshufd          \$0x0e,$TMP0,$Wi
620         paddd           $TMPx,@MSG0[1]
621          movdqa         @MSG1[0],$TMPx
622          palignr        \$4,@MSG1[3],$TMPx
623         sha256msg1      @MSG0[3],@MSG0[2]
624         sha256rnds2     $CDGH0,$ABEF0
625          pshufd         \$0x0e,$TMP1,$Wi
626         movdqa          4*16-0x80($Tbl),$TMP0
627         paddd           @MSG0[0],$TMP0
628          sha256rnds2    $CDGH1,$ABEF1
629          sha256msg1     @MSG1[3],@MSG1[2]
630 ___
# Rounds 16..51: nine iterations ($i = 4..12), each emitting four rounds for
# both lanes.  $i selects the 16-byte row of the constant table.  @MSG0 and
# @MSG1 are rotated after every iteration, so the same register indices in
# the template address the next message words each time.
631 for($i=4;$i<16-3;$i++) {
632 $code.=<<___;
633         movdqa          $TMP0,$Wi
634          movdqa         $i*16-0x80($Tbl),$TMP1
635          paddd          $TMPx,@MSG1[1]
636          paddd          @MSG1[0],$TMP1
637         sha256msg2      @MSG0[0],@MSG0[1]
638         sha256rnds2     $ABEF0,$CDGH0           # 16-19...
639          movdqa         $TMP1,$Wi
640         movdqa          @MSG0[1],$TMPx
641         palignr         \$4,@MSG0[0],$TMPx
642          sha256rnds2    $ABEF1,$CDGH1           # 16-19...
643          sha256msg2     @MSG1[0],@MSG1[1]
644         pshufd          \$0x0e,$TMP0,$Wi
645         paddd           $TMPx,@MSG0[2]
646          movdqa         @MSG1[1],$TMPx
647          palignr        \$4,@MSG1[0],$TMPx
648         sha256msg1      @MSG0[0],@MSG0[3]
649         sha256rnds2     $CDGH0,$ABEF0
650          pshufd         \$0x0e,$TMP1,$Wi
651         movdqa          `($i+1)*16`-0x80($Tbl),$TMP0
652         paddd           @MSG0[1],$TMP0
653          sha256rnds2    $CDGH1,$ABEF1
654          sha256msg1     @MSG1[0],@MSG1[3]
655 ___
656         push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
657 }
# Rounds 52..63 for both lanes, interleaved with the per-lane bookkeeping:
# the two counters are pulled from (%rbx), a lane whose counter is <= 1 has
# its pointer redirected to %rsp, and a (counter > 0) mask is built per lane.
# The new state is ANDed with the mask and added to the state offloaded at
# block entry; the counters are decremented for active lanes and saved.
# After .Loop_shaext ends, the state is rearranged back into the context
# layout and stored, $ctx advances by $REG_SZ/2 bytes and $inp by two
# elements, and .Loop_grande_shaext repeats until $num reaches zero.
# Note that the reload of %rax at .Ldone_shaext is commented out in the
# emitted text.
658 $code.=<<___;
659         movdqa          $TMP0,$Wi
660          movdqa         13*16-0x80($Tbl),$TMP1
661          paddd          $TMPx,@MSG1[1]
662          paddd          @MSG1[0],$TMP1
663         sha256msg2      @MSG0[0],@MSG0[1]
664         sha256rnds2     $ABEF0,$CDGH0           # 52-55
665          movdqa         $TMP1,$Wi
666         movdqa          @MSG0[1],$TMPx
667         palignr         \$4,@MSG0[0],$TMPx
668          sha256rnds2    $ABEF1,$CDGH1           # 52-55
669          sha256msg2     @MSG1[0],@MSG1[1]
670         pshufd          \$0x0e,$TMP0,$Wi
671         paddd           $TMPx,@MSG0[2]
672          movdqa         @MSG1[1],$TMPx
673          palignr        \$4,@MSG1[0],$TMPx
674         nop
675         sha256rnds2     $CDGH0,$ABEF0
676          pshufd         \$0x0e,$TMP1,$Wi
677         movdqa          14*16-0x80($Tbl),$TMP0
678         paddd           @MSG0[1],$TMP0
679          sha256rnds2    $CDGH1,$ABEF1
680
681         movdqa          $TMP0,$Wi
682          movdqa         14*16-0x80($Tbl),$TMP1
683          paddd          $TMPx,@MSG1[2]
684          paddd          @MSG1[1],$TMP1
685         sha256msg2      @MSG0[1],@MSG0[2]
686         nop
687         sha256rnds2     $ABEF0,$CDGH0           # 56-59
688          movdqa         $TMP1,$Wi
689           mov           \$1,%ecx
690           pxor          @MSG0[1],@MSG0[1]       # zero
691          sha256rnds2    $ABEF1,$CDGH1           # 56-59
692          sha256msg2     @MSG1[1],@MSG1[2]
693         pshufd          \$0x0e,$TMP0,$Wi
694         movdqa          15*16-0x80($Tbl),$TMP0
695         paddd           @MSG0[2],$TMP0
696           movq          (%rbx),@MSG0[2]         # pull counters
697           nop
698         sha256rnds2     $CDGH0,$ABEF0
699          pshufd         \$0x0e,$TMP1,$Wi
700          movdqa         15*16-0x80($Tbl),$TMP1
701          paddd          @MSG1[2],$TMP1
702          sha256rnds2    $CDGH1,$ABEF1
703
704         movdqa          $TMP0,$Wi
705           cmp           4*0(%rbx),%ecx          # examine counters
706           cmovge        %rsp,@ptr[0]            # cancel input
707           cmp           4*1(%rbx),%ecx
708           cmovge        %rsp,@ptr[1]
709           pshufd        \$0x00,@MSG0[2],@MSG1[0]
710         sha256rnds2     $ABEF0,$CDGH0           # 60-63
711          movdqa         $TMP1,$Wi
712           pshufd        \$0x55,@MSG0[2],@MSG1[1]
713           movdqa        @MSG0[2],@MSG1[2]
714          sha256rnds2    $ABEF1,$CDGH1           # 60-63
715         pshufd          \$0x0e,$TMP0,$Wi
716           pcmpgtd       @MSG0[1],@MSG1[0]
717           pcmpgtd       @MSG0[1],@MSG1[1]
718         sha256rnds2     $CDGH0,$ABEF0
719          pshufd         \$0x0e,$TMP1,$Wi
720           pcmpgtd       @MSG0[1],@MSG1[2]       # counter mask
721           movdqa        K256_shaext-0x10(%rip),$TMPx
722          sha256rnds2    $CDGH1,$ABEF1
723
724         pand            @MSG1[0],$CDGH0
725          pand           @MSG1[1],$CDGH1
726         pand            @MSG1[0],$ABEF0
727          pand           @MSG1[1],$ABEF1
728         paddd           @MSG0[2],@MSG1[2]       # counters--
729
730         paddd           0x50(%rsp),$CDGH0
731          paddd          0x70(%rsp),$CDGH1
732         paddd           0x40(%rsp),$ABEF0
733          paddd          0x60(%rsp),$ABEF1
734
735         movq            @MSG1[2],(%rbx)         # save counters
736         dec             $num
737         jnz             .Loop_shaext
738
739         mov             `$REG_SZ*17+8`(%rsp),$num
740
741         pshufd          \$0b00011011,$ABEF0,$ABEF0
742         pshufd          \$0b00011011,$CDGH0,$CDGH0
743         pshufd          \$0b00011011,$ABEF1,$ABEF1
744         pshufd          \$0b00011011,$CDGH1,$CDGH1
745
746         movdqa          $ABEF0,@MSG0[0]
747         movdqa          $CDGH0,@MSG0[1]
748         punpckldq       $ABEF1,$ABEF0                   # B1.B0.A1.A0
749         punpckhdq       $ABEF1,@MSG0[0]                 # F1.F0.E1.E0
750         punpckldq       $CDGH1,$CDGH0                   # D1.D0.C1.C0
751         punpckhdq       $CDGH1,@MSG0[1]                 # H1.H0.G1.G0
752
753         movq            $ABEF0,0x00-0x80($ctx)          # A1.A0
754         psrldq          \$8,$ABEF0
755         movq            @MSG0[0],0x80-0x80($ctx)        # E1.E0
756         psrldq          \$8,@MSG0[0]
757         movq            $ABEF0,0x20-0x80($ctx)          # B1.B0
758         movq            @MSG0[0],0xa0-0x80($ctx)        # F1.F0
759
760         movq            $CDGH0,0x40-0x80($ctx)          # C1.C0
761         psrldq          \$8,$CDGH0
762         movq            @MSG0[1],0xc0-0x80($ctx)        # G1.G0
763         psrldq          \$8,@MSG0[1]
764         movq            $CDGH0,0x60-0x80($ctx)          # D1.D0
765         movq            @MSG0[1],0xe0-0x80($ctx)        # H1.H0
766
767         lea     `$REG_SZ/2`($ctx),$ctx
768         lea     `$inp_elm_size*2`($inp),$inp
769         dec     $num
770         jnz     .Loop_grande_shaext
771
772 .Ldone_shaext:
773         #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
774 ___
# win64 targets only: restore %xmm6..%xmm15 relative to %rax.
# NOTE(review): no reload of %rax is emitted at .Ldone_shaext (the mov there
# is commented out), so this relies on %rax still holding the stack pointer
# captured at function entry.
775 $code.=<<___ if ($win64);
776         movaps  -0xb8(%rax),%xmm6
777         movaps  -0xa8(%rax),%xmm7
778         movaps  -0x98(%rax),%xmm8
779         movaps  -0x88(%rax),%xmm9
780         movaps  -0x78(%rax),%xmm10
781         movaps  -0x68(%rax),%xmm11
782         movaps  -0x58(%rax),%xmm12
783         movaps  -0x48(%rax),%xmm13
784         movaps  -0x38(%rax),%xmm14
785         movaps  -0x28(%rax),%xmm15
786 ___
# Restore %rbp/%rbx from the slots written by the two pushes in the prologue,
# switch back to the caller's stack and return.
787 $code.=<<___;
788         mov     -16(%rax),%rbp
789 .cfi_restore    %rbp
790         mov     -8(%rax),%rbx
791 .cfi_restore    %rbx
792         lea     (%rax),%rsp
793 .cfi_def_cfa_register   %rsp
794 .Lepilogue_shaext:
795         ret
796 .cfi_endproc
797 .size   sha256_multi_block_shaext,.-sha256_multi_block_shaext
798 ___
799                                                 }}}
800                                                 if ($avx) {{{
# ROUND_00_15_avx($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends to $code the AVX/AVX2 instructions for one SHA-256 round on all
# lanes at once.  $i is the round index; $a..$h name the vector registers
# holding the eight working variables for this round.  Called directly for
# rounds 0..15 and as the tail of ROUND_16_XX_avx for later rounds.
# Reads the file-level $REG_SZ, @ptr, $Tbl, $Xi, $Xn, $t1..$t3, $sigma,
# $axb and $bxc; on return $axb and $bxc have traded places.
801 sub ROUND_00_15_avx {
802 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
803
# Rounds 0..14, 4 lanes ($REG_SZ==16): gather the 32-bit word at byte offset
# 4*$i from each of the four input streams into $Xi, then reorder its bytes
# with the shuffle mask held in $Xn.
804 $code.=<<___ if ($i<15 && $REG_SZ==16);
805         vmovd           `4*$i`(@ptr[0]),$Xi
806         vmovd           `4*$i`(@ptr[1]),$t1
807         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
808         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
809         vpunpckldq      $t1,$Xi,$Xi
810         vpshufb         $Xn,$Xi,$Xi
811 ___
# Round 15, 4 lanes: same gather, interleaved with stepping every input
# pointer forward by 16*4 bytes (the loads still use the pre-increment
# offset 4*$i because each lea follows the load from that pointer).
812 $code.=<<___ if ($i==15 && $REG_SZ==16);
813         vmovd           `4*$i`(@ptr[0]),$Xi
814          lea            `16*4`(@ptr[0]),@ptr[0]
815         vmovd           `4*$i`(@ptr[1]),$t1
816          lea            `16*4`(@ptr[1]),@ptr[1]
817         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
818          lea            `16*4`(@ptr[2]),@ptr[2]
819         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
820          lea            `16*4`(@ptr[3]),@ptr[3]
821         vpunpckldq      $t1,$Xi,$Xi
822         vpshufb         $Xn,$Xi,$Xi
823 ___
# Rounds 0..14, 8 lanes ($REG_SZ==32): streams 0..3 are gathered into $Xi,
# streams 4..7 into $t1, and vinserti128 joins the two halves.  The
# immediate of vinserti128 and the xmm spelling of the vmovd/vpinsrd
# operands are supplied by the substitutions in the final output loop.
824 $code.=<<___ if ($i<15 && $REG_SZ==32);
825         vmovd           `4*$i`(@ptr[0]),$Xi
826         vmovd           `4*$i`(@ptr[4]),$t1
827         vmovd           `4*$i`(@ptr[1]),$t2
828         vmovd           `4*$i`(@ptr[5]),$t3
829         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
830         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
831         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
832         vpunpckldq      $t2,$Xi,$Xi
833         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
834         vpunpckldq      $t3,$t1,$t1
835         vinserti128     $t1,$Xi,$Xi
836         vpshufb         $Xn,$Xi,$Xi
837 ___
# Round 15, 8 lanes: as above, plus the 16*4-byte advance of all eight
# input pointers.
838 $code.=<<___ if ($i==15 && $REG_SZ==32);
839         vmovd           `4*$i`(@ptr[0]),$Xi
840          lea            `16*4`(@ptr[0]),@ptr[0]
841         vmovd           `4*$i`(@ptr[4]),$t1
842          lea            `16*4`(@ptr[4]),@ptr[4]
843         vmovd           `4*$i`(@ptr[1]),$t2
844          lea            `16*4`(@ptr[1]),@ptr[1]
845         vmovd           `4*$i`(@ptr[5]),$t3
846          lea            `16*4`(@ptr[5]),@ptr[5]
847         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
848          lea            `16*4`(@ptr[2]),@ptr[2]
849         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
850          lea            `16*4`(@ptr[6]),@ptr[6]
851         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
852          lea            `16*4`(@ptr[3]),@ptr[3]
853         vpunpckldq      $t2,$Xi,$Xi
854         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
855          lea            `16*4`(@ptr[7]),@ptr[7]
856         vpunpckldq      $t3,$t1,$t1
857         vinserti128     $t1,$Xi,$Xi
858         vpshufb         $Xn,$Xi,$Xi
859 ___
# Round body shared by every flavour.  For $i>15 none of the loads above is
# emitted, so $Xi is whatever the caller left in it.  The message word is
# stored at Xi_off($i) (helper defined outside this chunk), then
# Xi += h + K[round] + Sigma1(e) + Ch(e,f,g); d += Xi; and h is rebuilt as
# Maj(a,b,c) + Xi + Sigma0(a).  Rotations are composed from shift pairs
# (right by n, left by 32-n) xor-ed together.  The backtick "prefetcht0"
# expressions yield an instruction only when $i==15.
860 $code.=<<___;
861         vpsrld  \$6,$e,$sigma
862         vpslld  \$26,$e,$t3
863         vmovdqu $Xi,`&Xi_off($i)`
864          vpaddd $h,$Xi,$Xi                      # Xi+=h
865
866         vpsrld  \$11,$e,$t2
867         vpxor   $t3,$sigma,$sigma
868         vpslld  \$21,$e,$t3
869          vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
870         vpxor   $t2,$sigma,$sigma
871
872         vpsrld  \$25,$e,$t2
873         vpxor   $t3,$sigma,$sigma
874          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
875         vpslld  \$7,$e,$t3
876          vpandn $g,$e,$t1
877          vpand  $f,$e,$axb                      # borrow $axb
878          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
879         vpxor   $t2,$sigma,$sigma
880
881         vpsrld  \$2,$a,$h                       # borrow $h
882         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
883          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
884         vpslld  \$30,$a,$t2
885          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
886          vpxor  $a,$b,$axb                      # a^b, b^c in next round
887          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
888         vpxor   $t2,$h,$h
889         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
890
891         vpsrld  \$13,$a,$t2
892          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
893         vpslld  \$19,$a,$t3
894          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
895          vpand  $axb,$bxc,$bxc
896          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
897         vpxor   $t2,$h,$sigma
898
899         vpsrld  \$22,$a,$t2
900         vpxor   $t3,$sigma,$sigma
901          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
902         vpslld  \$10,$a,$t3
903          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
904          vpaddd $Xi,$d,$d                       # d+=Xi
905          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
906         vpxor   $t2,$sigma,$sigma
907         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
908
909         vpaddd  $Xi,$h,$h                       # h+=Xi
910         vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
911 ___
# K256 is addressed as 32*($i%8)-128 off $Tbl, so after every eighth round
# $Tbl is stepped forward by eight 32-byte rows.
912 $code.=<<___ if (($i%8)==7);
913         add     \$`32*8`,$Tbl
914 ___
# This round's a^b is the next round's b^c: swap the two register names
# instead of emitting a move.
915         ($axb,$bxc)=($bxc,$axb);
916 }
917
# ROUND_16_XX_avx($i,@V)
#
# Appends to $code the message-schedule update for round $i followed by the
# round itself.  The emitted code loads X[i+1] into $Xn and adds to $Xi
# (which already holds the oldest schedule word) X[i+9], the small sigma of
# X[i+1] built from shifts 3/7/18, and the small sigma of X[i+14] built from
# shifts 10/17/19.  The remaining arguments are passed straight on to
# ROUND_00_15_avx together with $i.  Afterwards $Xi and $Xn swap names, so
# the X[i+1] just loaded is the next call's $Xi.
#
# NOTE(review): the 10/17/19 combination is SHA-256's sigma1, so the inline
# "# sigma0(X[i+14])" label looks like a typo for sigma1, and
# "# Xi+=sigma0(e)" appears to mean sigma0(X[i+1]).  Slot indices beyond 15
# are presumably wrapped by Xi_off(), which is defined outside this chunk.
918 sub ROUND_16_XX_avx {
919 my $i=shift;
920
921 $code.=<<___;
922         vmovdqu `&Xi_off($i+1)`,$Xn
923         vpaddd  `&Xi_off($i+9)`,$Xi,$Xi         # Xi+=X[i+9]
924
925         vpsrld  \$3,$Xn,$sigma
926         vpsrld  \$7,$Xn,$t2
927         vpslld  \$25,$Xn,$t3
928         vpxor   $t2,$sigma,$sigma
929         vpsrld  \$18,$Xn,$t2
930         vpxor   $t3,$sigma,$sigma
931         vpslld  \$14,$Xn,$t3
932         vmovdqu `&Xi_off($i+14)`,$t1
933         vpsrld  \$10,$t1,$axb                   # borrow $axb
934
935         vpxor   $t2,$sigma,$sigma
936         vpsrld  \$17,$t1,$t2
937         vpxor   $t3,$sigma,$sigma               # sigma0(X[i+1])
938         vpslld  \$15,$t1,$t3
939          vpaddd $sigma,$Xi,$Xi                  # Xi+=sigma0(e)
940         vpxor   $t2,$axb,$sigma
941         vpsrld  \$19,$t1,$t2
942         vpxor   $t3,$sigma,$sigma
943         vpslld  \$13,$t1,$t3
944         vpxor   $t2,$sigma,$sigma
945         vpxor   $t3,$sigma,$sigma               # sigma0(X[i+14])
946         vpaddd  $sigma,$Xi,$Xi                  # Xi+=sigma1(X[i+14])
947 ___
# @_ now holds only the eight working-variable registers ($i was shifted off).
948         &ROUND_00_15_avx($i,@_);
949         ($Xi,$Xn)=($Xn,$Xi);
950 }
951
# Emit sha256_multi_block_avx, the flavour that walks four input streams
# per batch (the loops below cover @ptr[0..3]).  $ctx, $inp, $num, $Tbl,
# @ptr, @V and the temporaries are file-level variables set up outside this
# chunk.
952 $code.=<<___;
953 .type   sha256_multi_block_avx,\@function,3
954 .align  32
955 sha256_multi_block_avx:
956 .cfi_startproc
957 _avx_shortcut:
958 ___
# Only when the AVX2 flavour is generated as well: shift %rcx right by 32
# and, if $num is at least 2 and bit 5 of the result is set, continue at the
# AVX2 entry point instead.  (%rcx is presumably a capability word loaded by
# the code that jumps to _avx_shortcut -- confirm against the dispatcher.)
959 $code.=<<___ if ($avx>1);
960         shr     \$32,%rcx
961         cmp     \$2,$num
962         jb      .Lavx
963         test    \$`1<<5`,%ecx
964         jnz     _avx2_shortcut
965         jmp     .Lavx
966 .align  32
967 .Lavx:
968 ___
# Prologue: keep the entry %rsp in %rax and save %rbx/%rbp.
969 $code.=<<___;
970         mov     %rsp,%rax
971 .cfi_def_cfa_register   %rax
972         push    %rbx
973 .cfi_push       %rbx
974         push    %rbp
975 .cfi_push       %rbp
976 ___
# Win64 only: save %xmm6-%xmm15 directly below the two pushed registers.
# The %rsp-relative and %rax-relative stores address one contiguous area
# (%xmm6 at -0xb8(%rax) through %xmm15 at -0x28(%rax)), which is what the
# epilogue and se_handler read back.
977 $code.=<<___ if ($win64);
978         lea     -0xa8(%rsp),%rsp
979         movaps  %xmm6,(%rsp)
980         movaps  %xmm7,0x10(%rsp)
981         movaps  %xmm8,0x20(%rsp)
982         movaps  %xmm9,0x30(%rsp)
983         movaps  %xmm10,-0x78(%rax)
984         movaps  %xmm11,-0x68(%rax)
985         movaps  %xmm12,-0x58(%rax)
986         movaps  %xmm13,-0x48(%rax)
987         movaps  %xmm14,-0x38(%rax)
988         movaps  %xmm15,-0x28(%rax)
989 ___
# Frame: $REG_SZ*18 bytes aligned to 256.  The entry %rsp is kept at
# $REG_SZ*17(%rsp), the caller's $num at $REG_SZ*17+8(%rsp), and %rbx points
# at the per-lane block counters at $REG_SZ*16(%rsp).  $ctx is biased by
# 0x80 so the context accesses below use short displacements.
990 $code.=<<___;
991         sub     \$`$REG_SZ*18`, %rsp
992         and     \$-256,%rsp
993         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
994 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
995 .Lbody_avx:
996         lea     K256+128(%rip),$Tbl
997         lea     `$REG_SZ*16`(%rsp),%rbx
998         lea     0x80($ctx),$ctx                 # size optimization
999
1000 .Loop_grande_avx:
1001         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1002         xor     $num,$num
1003 ___
# For each of the four lanes: load the input pointer and block count from
# the $inp descriptor array, track the largest count in $num, record the
# count in the counter slot, and point lanes whose count is <= 0 at $Tbl so
# their loads stay within readable memory.  pointer_register(), $flavour,
# $inp_elm_size and $ptr_size come from outside this chunk.
1004 for($i=0;$i<4;$i++) {
1005     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1006     $code.=<<___;
1007         # input pointer
1008         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
1009         # number of blocks
1010         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1011         cmp     $num,%ecx
1012         cmovg   %ecx,$num                       # find maximum
1013         test    %ecx,%ecx
1014         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1015         cmovle  $Tbl,@ptr[$i]                   # cancel input
1016 ___
1017 }
# Nothing to do when every lane has a zero count; otherwise load the eight
# state vectors and the byte-swap mask and enter the per-block loop.
1018 $code.=<<___;
1019         test    $num,$num
1020         jz      .Ldone_avx
1021
1022         vmovdqu 0x00-0x80($ctx),$A              # load context
1023          lea    128(%rsp),%rax
1024         vmovdqu 0x20-0x80($ctx),$B
1025         vmovdqu 0x40-0x80($ctx),$C
1026         vmovdqu 0x60-0x80($ctx),$D
1027         vmovdqu 0x80-0x80($ctx),$E
1028         vmovdqu 0xa0-0x80($ctx),$F
1029         vmovdqu 0xc0-0x80($ctx),$G
1030         vmovdqu 0xe0-0x80($ctx),$H
1031         vmovdqu .Lpbswap(%rip),$Xn
1032         jmp     .Loop_avx
1033
1034 .align  32
1035 .Loop_avx:
1036         vpxor   $B,$C,$bxc                      # magic seed
1037 ___
# Rounds 0..15 are unrolled; @V is rotated after each so that the register
# roles shift by one position for the next round.
1038 for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
# $i is 16 here: preload that schedule slot, then run the 16-round body
# generated below three times at run time for rounds 16..63.
1039 $code.=<<___;
1040         vmovdqu `&Xi_off($i)`,$Xi
1041         mov     \$3,%ecx
1042         jmp     .Loop_16_xx_avx
1043 .align  32
1044 .Loop_16_xx_avx:
1045 ___
1046 for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
# After the 64 rounds: reset $Tbl (the rounds advanced it) and prepare the
# constant 1 for the counter comparison.
1047 $code.=<<___;
1048         dec     %ecx
1049         jnz     .Loop_16_xx_avx
1050
1051         mov     \$1,%ecx
1052         lea     K256+128(%rip),$Tbl
1053 ___
# Lanes whose remaining count is <= 1 have just consumed their last block
# (or were already idle): redirect their pointer to $Tbl.
1054 for($i=0;$i<4;$i++) {
1055     $code.=<<___;
1056         cmp     `4*$i`(%rbx),%ecx               # examine counters
1057         cmovge  $Tbl,@ptr[$i]                   # cancel input
1058 ___
1059 }
# Build an all-ones mask for lanes whose counter is > 0, decrement those
# counters by adding the mask, zero the working variables of idle lanes,
# and add the previous context before storing it back.  Then either run
# another block or move on to the next group of descriptors: $ctx advances
# by $REG_SZ and $inp by $REG_SZ/4 descriptors, and the saved $num is
# decremented.
1060 $code.=<<___;
1061         vmovdqa (%rbx),$sigma                   # pull counters
1062         vpxor   $t1,$t1,$t1
1063         vmovdqa $sigma,$Xn
1064         vpcmpgtd $t1,$Xn,$Xn                    # mask value
1065         vpaddd  $Xn,$sigma,$sigma               # counters--
1066
1067         vmovdqu 0x00-0x80($ctx),$t1
1068         vpand   $Xn,$A,$A
1069         vmovdqu 0x20-0x80($ctx),$t2
1070         vpand   $Xn,$B,$B
1071         vmovdqu 0x40-0x80($ctx),$t3
1072         vpand   $Xn,$C,$C
1073         vmovdqu 0x60-0x80($ctx),$Xi
1074         vpand   $Xn,$D,$D
1075         vpaddd  $t1,$A,$A
1076         vmovdqu 0x80-0x80($ctx),$t1
1077         vpand   $Xn,$E,$E
1078         vpaddd  $t2,$B,$B
1079         vmovdqu 0xa0-0x80($ctx),$t2
1080         vpand   $Xn,$F,$F
1081         vpaddd  $t3,$C,$C
1082         vmovdqu 0xc0-0x80($ctx),$t3
1083         vpand   $Xn,$G,$G
1084         vpaddd  $Xi,$D,$D
1085         vmovdqu 0xe0-0x80($ctx),$Xi
1086         vpand   $Xn,$H,$H
1087         vpaddd  $t1,$E,$E
1088         vpaddd  $t2,$F,$F
1089         vmovdqu $A,0x00-0x80($ctx)
1090         vpaddd  $t3,$G,$G
1091         vmovdqu $B,0x20-0x80($ctx)
1092         vpaddd  $Xi,$H,$H
1093         vmovdqu $C,0x40-0x80($ctx)
1094         vmovdqu $D,0x60-0x80($ctx)
1095         vmovdqu $E,0x80-0x80($ctx)
1096         vmovdqu $F,0xa0-0x80($ctx)
1097         vmovdqu $G,0xc0-0x80($ctx)
1098         vmovdqu $H,0xe0-0x80($ctx)
1099
1100         vmovdqu $sigma,(%rbx)                   # save counters
1101         vmovdqu .Lpbswap(%rip),$Xn
1102         dec     $num
1103         jnz     .Loop_avx
1104
1105         mov     `$REG_SZ*17+8`(%rsp),$num
1106         lea     $REG_SZ($ctx),$ctx
1107         lea     `$inp_elm_size*$REG_SZ/4`($inp),$inp
1108         dec     $num
1109         jnz     .Loop_grande_avx
1110
1111 .Ldone_avx:
1112         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1113 .cfi_def_cfa    %rax,8
1114         vzeroupper
1115 ___
# Win64 only: restore %xmm6-%xmm15 from the area filled in the prologue.
1116 $code.=<<___ if ($win64);
1117         movaps  -0xb8(%rax),%xmm6
1118         movaps  -0xa8(%rax),%xmm7
1119         movaps  -0x98(%rax),%xmm8
1120         movaps  -0x88(%rax),%xmm9
1121         movaps  -0x78(%rax),%xmm10
1122         movaps  -0x68(%rax),%xmm11
1123         movaps  -0x58(%rax),%xmm12
1124         movaps  -0x48(%rax),%xmm13
1125         movaps  -0x38(%rax),%xmm14
1126         movaps  -0x28(%rax),%xmm15
1127 ___
# Epilogue: reload %rbp/%rbx from below the original %rsp and return.
1128 $code.=<<___;
1129         mov     -16(%rax),%rbp
1130 .cfi_restore    %rbp
1131         mov     -8(%rax),%rbx
1132 .cfi_restore    %rbx
1133         lea     (%rax),%rsp
1134 .cfi_def_cfa_register   %rsp
1135 .Lepilogue_avx:
1136         ret
1137 .cfi_endproc
1138 .size   sha256_multi_block_avx,.-sha256_multi_block_avx
1139 ___
1140                                                 if ($avx>1) {
# Emit sha256_multi_block_avx2, the eight-lane flavour.
#
# Every backtick expression accumulated in $code so far is evaluated first,
# because $REG_SZ and the register aliases are redefined immediately below
# and the earlier text must be expanded with the old values.
1141 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1142
# 32-byte vectors and eight input pointers; working variables live in
# %ymm8-15 and temporaries in %ymm0-7.
1143 $REG_SZ=32;
1144 @ptr=map("%r$_",(12..15,8..11));
1145
1146 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
1147 ($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1148
# Prologue: keep the entry %rsp in %rax and save the six callee-saved
# registers this flavour uses (%rbx, %rbp, %r12-%r15).
1149 $code.=<<___;
1150 .type   sha256_multi_block_avx2,\@function,3
1151 .align  32
1152 sha256_multi_block_avx2:
1153 .cfi_startproc
1154 _avx2_shortcut:
1155         mov     %rsp,%rax
1156 .cfi_def_cfa_register   %rax
1157         push    %rbx
1158 .cfi_push       %rbx
1159         push    %rbp
1160 .cfi_push       %rbp
1161         push    %r12
1162 .cfi_push       %r12
1163         push    %r13
1164 .cfi_push       %r13
1165         push    %r14
1166 .cfi_push       %r14
1167         push    %r15
1168 .cfi_push       %r15
1169 ___
# Win64 only: save %xmm6-%xmm15 below the six pushed registers; the area
# runs from -0xd8(%rax) (%xmm6) to -0x48(%rax) (%xmm15), matching the
# epilogue below and avx2_handler.
1170 $code.=<<___ if ($win64);
1171         lea     -0xa8(%rsp),%rsp
1172         movaps  %xmm6,(%rsp)
1173         movaps  %xmm7,0x10(%rsp)
1174         movaps  %xmm8,0x20(%rsp)
1175         movaps  %xmm9,0x30(%rsp)
1176         movaps  %xmm10,0x40(%rsp)
1177         movaps  %xmm11,0x50(%rsp)
1178         movaps  %xmm12,-0x78(%rax)
1179         movaps  %xmm13,-0x68(%rax)
1180         movaps  %xmm14,-0x58(%rax)
1181         movaps  %xmm15,-0x48(%rax)
1182 ___
# Same frame layout as the AVX flavour, scaled by $REG_SZ==32: entry %rsp
# at $REG_SZ*17(%rsp), caller's $num right after it, lane counters at
# $REG_SZ*16(%rsp).
1183 $code.=<<___;
1184         sub     \$`$REG_SZ*18`, %rsp
1185         and     \$-256,%rsp
1186         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1187 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1188 .Lbody_avx2:
1189         lea     K256+128(%rip),$Tbl
1190         lea     0x80($ctx),$ctx                 # size optimization
1191
1192 .Loop_grande_avx2:
1193         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1194         xor     $num,$num
1195         lea     `$REG_SZ*16`(%rsp),%rbx
1196 ___
# For each of the eight lanes: load pointer and block count from the $inp
# descriptor array, track the maximum count in $num, store the count in its
# counter slot, and point lanes with a count <= 0 at $Tbl.
1197 for($i=0;$i<8;$i++) {
1198     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1199     $code.=<<___;
1200         # input pointer
1201         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
1202         # number of blocks
1203         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1204         cmp     $num,%ecx
1205         cmovg   %ecx,$num                       # find maximum
1206         test    %ecx,%ecx
1207         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1208         cmovle  $Tbl,@ptr[$i]                   # cancel input
1209 ___
1210 }
# Load the state and the byte-swap mask.  %rbx is re-pointed at
# 256+128(%rsp) for the duration of the rounds.
# NOTE(review): unlike the AVX flavour there is no "test $num / jz done"
# here, so a call in which every lane has a zero block count would enter
# .Loop_avx2 with $num==0 -- confirm callers never do that.
1211 $code.=<<___;
1212         vmovdqu 0x00-0x80($ctx),$A              # load context
1213          lea    128(%rsp),%rax
1214         vmovdqu 0x20-0x80($ctx),$B
1215          lea    256+128(%rsp),%rbx
1216         vmovdqu 0x40-0x80($ctx),$C
1217         vmovdqu 0x60-0x80($ctx),$D
1218         vmovdqu 0x80-0x80($ctx),$E
1219         vmovdqu 0xa0-0x80($ctx),$F
1220         vmovdqu 0xc0-0x80($ctx),$G
1221         vmovdqu 0xe0-0x80($ctx),$H
1222         vmovdqu .Lpbswap(%rip),$Xn
1223         jmp     .Loop_avx2
1224
1225 .align  32
1226 .Loop_avx2:
1227         vpxor   $B,$C,$bxc                      # magic seed
1228 ___
# Rounds 0..15 unrolled, rotating @V after each round.
1229 for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
# $i is 16 here; the 16-round body generated below runs three times at run
# time for rounds 16..63.
1230 $code.=<<___;
1231         vmovdqu `&Xi_off($i)`,$Xi
1232         mov     \$3,%ecx
1233         jmp     .Loop_16_xx_avx2
1234 .align  32
1235 .Loop_16_xx_avx2:
1236 ___
1237 for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
# After the 64 rounds: point %rbx back at the counters and reset $Tbl.
1238 $code.=<<___;
1239         dec     %ecx
1240         jnz     .Loop_16_xx_avx2
1241
1242         mov     \$1,%ecx
1243         lea     `$REG_SZ*16`(%rsp),%rbx
1244         lea     K256+128(%rip),$Tbl
1245 ___
# Lanes whose remaining count is <= 1 get their pointer redirected to $Tbl.
1246 for($i=0;$i<8;$i++) {
1247     $code.=<<___;
1248         cmp     `4*$i`(%rbx),%ecx               # examine counters
1249         cmovge  $Tbl,@ptr[$i]                   # cancel input
1250 ___
1251 }
# Mask idle lanes, decrement the active counters, fold the previous context
# in and store it back, then loop while $num is non-zero.  The outer
# "grande" iteration is present only as commented-out assembly, so the
# emitted function processes a single group of eight descriptors per call.
1252 $code.=<<___;
1253         vmovdqa (%rbx),$sigma                   # pull counters
1254         vpxor   $t1,$t1,$t1
1255         vmovdqa $sigma,$Xn
1256         vpcmpgtd $t1,$Xn,$Xn                    # mask value
1257         vpaddd  $Xn,$sigma,$sigma               # counters--
1258
1259         vmovdqu 0x00-0x80($ctx),$t1
1260         vpand   $Xn,$A,$A
1261         vmovdqu 0x20-0x80($ctx),$t2
1262         vpand   $Xn,$B,$B
1263         vmovdqu 0x40-0x80($ctx),$t3
1264         vpand   $Xn,$C,$C
1265         vmovdqu 0x60-0x80($ctx),$Xi
1266         vpand   $Xn,$D,$D
1267         vpaddd  $t1,$A,$A
1268         vmovdqu 0x80-0x80($ctx),$t1
1269         vpand   $Xn,$E,$E
1270         vpaddd  $t2,$B,$B
1271         vmovdqu 0xa0-0x80($ctx),$t2
1272         vpand   $Xn,$F,$F
1273         vpaddd  $t3,$C,$C
1274         vmovdqu 0xc0-0x80($ctx),$t3
1275         vpand   $Xn,$G,$G
1276         vpaddd  $Xi,$D,$D
1277         vmovdqu 0xe0-0x80($ctx),$Xi
1278         vpand   $Xn,$H,$H
1279         vpaddd  $t1,$E,$E
1280         vpaddd  $t2,$F,$F
1281         vmovdqu $A,0x00-0x80($ctx)
1282         vpaddd  $t3,$G,$G
1283         vmovdqu $B,0x20-0x80($ctx)
1284         vpaddd  $Xi,$H,$H
1285         vmovdqu $C,0x40-0x80($ctx)
1286         vmovdqu $D,0x60-0x80($ctx)
1287         vmovdqu $E,0x80-0x80($ctx)
1288         vmovdqu $F,0xa0-0x80($ctx)
1289         vmovdqu $G,0xc0-0x80($ctx)
1290         vmovdqu $H,0xe0-0x80($ctx)
1291
1292         vmovdqu $sigma,(%rbx)                   # save counters
1293         lea     256+128(%rsp),%rbx
1294         vmovdqu .Lpbswap(%rip),$Xn
1295         dec     $num
1296         jnz     .Loop_avx2
1297
1298         #mov    `$REG_SZ*17+8`(%rsp),$num
1299         #lea    $REG_SZ($ctx),$ctx
1300         #lea    `$inp_elm_size*$REG_SZ/4`($inp),$inp
1301         #dec    $num
1302         #jnz    .Loop_grande_avx2
1303
1304 .Ldone_avx2:
1305         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1306 .cfi_def_cfa    %rax,8
1307         vzeroupper
1308 ___
# Win64 only: restore %xmm6-%xmm15.
1309 $code.=<<___ if ($win64);
1310         movaps  -0xd8(%rax),%xmm6
1311         movaps  -0xc8(%rax),%xmm7
1312         movaps  -0xb8(%rax),%xmm8
1313         movaps  -0xa8(%rax),%xmm9
1314         movaps  -0x98(%rax),%xmm10
1315         movaps  -0x88(%rax),%xmm11
1316         movaps  -0x78(%rax),%xmm12
1317         movaps  -0x68(%rax),%xmm13
1318         movaps  -0x58(%rax),%xmm14
1319         movaps  -0x48(%rax),%xmm15
1320 ___
# Epilogue: reload the six saved registers from below the original %rsp.
1321 $code.=<<___;
1322         mov     -48(%rax),%r15
1323 .cfi_restore    %r15
1324         mov     -40(%rax),%r14
1325 .cfi_restore    %r14
1326         mov     -32(%rax),%r13
1327 .cfi_restore    %r13
1328         mov     -24(%rax),%r12
1329 .cfi_restore    %r12
1330         mov     -16(%rax),%rbp
1331 .cfi_restore    %rbp
1332         mov     -8(%rax),%rbx
1333 .cfi_restore    %rbx
1334         lea     (%rax),%rsp
1335 .cfi_def_cfa_register   %rsp
1336 .Lepilogue_avx2:
1337         ret
1338 .cfi_endproc
1339 .size   sha256_multi_block_avx2,.-sha256_multi_block_avx2
1340 ___
1341                                         }       }}}
# Start of the round-constant table.  It is aligned to 256 bytes and
# labelled K256; the rows themselves are appended by TABLE() below.
1342 $code.=<<___;
1343 .align  256
1344 K256:
1345 ___
# TABLE(@constants)
#
# Appends one table row per argument to $code.  A row is the constant
# repeated eight times, written as two ".long" lines of four values each
# (32 bytes), which is the row stride the AVX round code uses when it
# indexes the table.  Numeric arguments are interpolated in Perl's default
# (decimal) form.
sub TABLE {
    for my $k (@_) {
        my $half_row = "        .long   $k,$k,$k,$k\n";
        $code .= $half_row . $half_row;
    }
}
# The 64 SHA-256 round constants, each expanded by TABLE() into a full
# 32-byte row under the K256 label.
1354 &TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
1355         0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
1356         0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
1357         0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
1358         0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
1359         0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
1360         0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
1361         0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
1362         0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
1363         0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
1364         0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
1365         0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
1366         0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
1367         0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
1368         0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
1369         0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Data that follows the replicated table: the 32-byte shuffle mask loaded
# into $Xn by the AVX code (.Lpbswap), then the same 64 constants stored
# once each, four per line, under K256_shaext (presumably for the SHA
# extension flavour emitted earlier in the file), then an identification
# string.
1370 $code.=<<___;
1371 .Lpbswap:
1372         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1373         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1374 K256_shaext:
1375         .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
1376         .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
1377         .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
1378         .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
1379         .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
1380         .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
1381         .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
1382         .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
1383         .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
1384         .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
1385         .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
1386         .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
1387         .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
1388         .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
1389         .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
1390         .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
1391         .asciz  "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1392 ___
1393
1394 if ($win64) {
# Win64 structured-exception handler shared by the flavours that save only
# %rbx/%rbp (see the .xdata entries that reference se_handler).  The
# emitted routine:
#   - compares context->Rip with the two labels in HandlerData[]; outside
#     [body, epilogue) it skips the frame-specific restore,
#   - otherwise fetches the function's saved entry %rsp from 16*17 bytes
#     above context->Rsp, restores Rbx/Rbp into the context and copies the
#     ten saved XMM registers (20 quadwords) to context->Xmm6..Xmm15,
#   - at .Lin_prologue writes Rsp/Rsi/Rdi back into the context (Rdi/Rsi
#     are read from 8 and 16 bytes above the recovered stack pointer --
#     presumably where the generated prologue parks them; confirm against
#     the x86_64 translator), copies the context to disp->ContextRecord,
#     calls RtlVirtualUnwind and returns ExceptionContinueSearch.
1395 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1396 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1397 $rec="%rcx";
1398 $frame="%rdx";
1399 $context="%r8";
1400 $disp="%r9";
1401
1402 $code.=<<___;
1403 .extern __imp_RtlVirtualUnwind
1404 .type   se_handler,\@abi-omnipotent
1405 .align  16
1406 se_handler:
1407         push    %rsi
1408         push    %rdi
1409         push    %rbx
1410         push    %rbp
1411         push    %r12
1412         push    %r13
1413         push    %r14
1414         push    %r15
1415         pushfq
1416         sub     \$64,%rsp
1417
1418         mov     120($context),%rax      # pull context->Rax
1419         mov     248($context),%rbx      # pull context->Rip
1420
1421         mov     8($disp),%rsi           # disp->ImageBase
1422         mov     56($disp),%r11          # disp->HandlerData
1423
1424         mov     0(%r11),%r10d           # HandlerData[0]
1425         lea     (%rsi,%r10),%r10        # end of prologue label
1426         cmp     %r10,%rbx               # context->Rip<.Lbody
1427         jb      .Lin_prologue
1428
1429         mov     152($context),%rax      # pull context->Rsp
1430
1431         mov     4(%r11),%r10d           # HandlerData[1]
1432         lea     (%rsi,%r10),%r10        # epilogue label
1433         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
1434         jae     .Lin_prologue
1435
1436         mov     `16*17`(%rax),%rax      # pull saved stack pointer
1437
1438         mov     -8(%rax),%rbx
1439         mov     -16(%rax),%rbp
1440         mov     %rbx,144($context)      # restore context->Rbx
1441         mov     %rbp,160($context)      # restore context->Rbp
1442
1443         lea     -24-10*16(%rax),%rsi
1444         lea     512($context),%rdi      # &context.Xmm6
1445         mov     \$20,%ecx
1446         .long   0xa548f3fc              # cld; rep movsq
1447
1448 .Lin_prologue:
1449         mov     8(%rax),%rdi
1450         mov     16(%rax),%rsi
1451         mov     %rax,152($context)      # restore context->Rsp
1452         mov     %rsi,168($context)      # restore context->Rsi
1453         mov     %rdi,176($context)      # restore context->Rdi
1454
1455         mov     40($disp),%rdi          # disp->ContextRecord
1456         mov     $context,%rsi           # context
1457         mov     \$154,%ecx              # sizeof(CONTEXT)
1458         .long   0xa548f3fc              # cld; rep movsq
1459
1460         mov     $disp,%rsi
1461         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1462         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1463         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1464         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1465         mov     40(%rsi),%r10           # disp->ContextRecord
1466         lea     56(%rsi),%r11           # &disp->HandlerData
1467         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1468         mov     %r10,32(%rsp)           # arg5
1469         mov     %r11,40(%rsp)           # arg6
1470         mov     %r12,48(%rsp)           # arg7
1471         mov     %rcx,56(%rsp)           # arg8, (NULL)
1472         call    *__imp_RtlVirtualUnwind(%rip)
1473
1474         mov     \$1,%eax                # ExceptionContinueSearch
1475         add     \$64,%rsp
1476         popfq
1477         pop     %r15
1478         pop     %r14
1479         pop     %r13
1480         pop     %r12
1481         pop     %rbp
1482         pop     %rbx
1483         pop     %rdi
1484         pop     %rsi
1485         ret
1486 .size   se_handler,.-se_handler
1487 ___
# Win64 structured-exception handler for sha256_multi_block_avx2, emitted
# only when the AVX2 flavour is generated.  It mirrors se_handler but
# restores the six registers that flavour pushes (%rbx, %rbp, %r12-%r15)
# and copies the XMM save area located below them, then joins se_handler's
# common tail at .Lin_prologue.
#
# The function stores its entry %rsp at 32*17 bytes above its aligned
# %rsp, so the saved stack pointer must be read relative to context->Rsp
# (held in %rax at that point), exactly as se_handler does with 16*17.
# Reading it at that displacement from the CONTEXT pointer instead would
# pick up bytes from the record's XMM save area.
$code.=<<___ if ($avx>1);
.type   avx2_handler,\@abi-omnipotent
.align  16
avx2_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<body label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        mov     `32*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     -56-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

        jmp     .Lin_prologue
.size   avx2_handler,.-avx2_handler
___
# Win64 unwind tables.  .pdata lists, per function, the RVAs of its
# .LSEH_begin_* / .LSEH_end_* labels (defined outside this chunk) and of
# its .xdata record.
1544 $code.=<<___;
1545 .section        .pdata
1546 .align  4
1547         .rva    .LSEH_begin_sha256_multi_block
1548         .rva    .LSEH_end_sha256_multi_block
1549         .rva    .LSEH_info_sha256_multi_block
1550         .rva    .LSEH_begin_sha256_multi_block_shaext
1551         .rva    .LSEH_end_sha256_multi_block_shaext
1552         .rva    .LSEH_info_sha256_multi_block_shaext
1553 ___
# Entries for the AVX and AVX2 functions exist only when those flavours are
# generated.
1554 $code.=<<___ if ($avx);
1555         .rva    .LSEH_begin_sha256_multi_block_avx
1556         .rva    .LSEH_end_sha256_multi_block_avx
1557         .rva    .LSEH_info_sha256_multi_block_avx
1558 ___
1559 $code.=<<___ if ($avx>1);
1560         .rva    .LSEH_begin_sha256_multi_block_avx2
1561         .rva    .LSEH_end_sha256_multi_block_avx2
1562         .rva    .LSEH_info_sha256_multi_block_avx2
1563 ___
# .xdata: each record names the language-specific handler and passes it the
# pair of labels (body start, epilogue) that the handler reads as
# HandlerData[0] and HandlerData[1].
1564 $code.=<<___;
1565 .section        .xdata
1566 .align  8
1567 .LSEH_info_sha256_multi_block:
1568         .byte   9,0,0,0
1569         .rva    se_handler
1570         .rva    .Lbody,.Lepilogue                       # HandlerData[]
1571 .LSEH_info_sha256_multi_block_shaext:
1572         .byte   9,0,0,0
1573         .rva    se_handler
1574         .rva    .Lbody_shaext,.Lepilogue_shaext         # HandlerData[]
1575 ___
1576 $code.=<<___ if ($avx);
1577 .LSEH_info_sha256_multi_block_avx:
1578         .byte   9,0,0,0
1579         .rva    se_handler
1580         .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
1581 ___
# The AVX2 function saves more registers, hence its dedicated handler.
1582 $code.=<<___ if ($avx>1);
1583 .LSEH_info_sha256_multi_block_avx2:
1584         .byte   9,0,0,0
1585         .rva    avx2_handler
1586         .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
1587 ___
1588 }
1589 ####################################################################
1590
# rex(\@opcode,$dst,$src)
#
# Prepends a REX prefix byte to the referenced opcode array when either
# register number is 8 or higher: 0x04 is set for $dst, 0x01 for $src, and
# the result is or-ed with 0x40.  The array is left untouched when both
# register numbers are below 8.
sub rex {
    my ($bytes,$dst,$src)=@_;
    my $prefix = ($dst>=8 ? 0x04 : 0) | ($src>=8 ? 0x01 : 0);

    unshift @{$bytes},0x40|$prefix      if ($prefix);
}
1600
# sha256op38($instr,$operands)
#
# Hand-assembles the register-to-register forms of sha256rnds2, sha256msg1
# and sha256msg2.  When $instr is one of those mnemonics and $operands
# contains "%xmmS,%xmmD", the result is a ".byte" directive holding an
# optional REX prefix, the 0x0f,0x38 escape, the opcode byte and a ModR/M
# byte with the destination in the reg field and the source in the r/m
# field.  Anything else (unknown mnemonic, memory operand) is returned
# unchanged as "$instr\t$operands".
sub sha256op38 {
    my ($instr,$operands)=@_;
    my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );
    my $opbyte = $opcodelet{$instr};

    # Not encodable here: hand the text back to the assembler as written.
    return $instr."\t".$operands
        unless (defined($opbyte) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/);

    my ($src,$dst)=($1,$2);
    my @opcode=(0x0f,0x38);
    rex(\@opcode,$dst,$src);
    push @opcode,$opbyte,0xc0|($src&7)|(($dst&7)<<3);   # opcode, ModR/M
    return ".byte\t".join(',',@opcode);
}
1618
# Final pass: print the accumulated assembly line by line.
#   1. Evaluate any backtick expression still present in the line.
#   2. Apply at most one of the rewrites below (they are chained with "or",
#      so the first substitution that succeeds ends the chain):
#      - sha256* instructions go through sha256op38(), which may replace
#        them with raw .byte encodings;
#      - vmovd/vmovq, vpinsr[dq], vpextr[dq] and vpbroadcast[dq] operands
#        written with %ymm names are respelled as the %xmm register of the
#        same number;
#      - vinserti128 gets the immediate \$1 inserted and its first register
#        operand respelled as %xmm.
# NOTE(review): the vmovdqu rule matches the literal text "%x%ymm<N>",
# presumably produced when a template writes "%x" in front of a ymm alias
# to force the xmm form; no such template is visible in this part of the
# file -- confirm before changing it.
1619 foreach (split("\n",$code)) {
1620         s/\`([^\`]*)\`/eval($1)/ge;
1621
1622         s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo         or
1623
1624         s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
1625         s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
1626         s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
1627         s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
1628         s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
1629         s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
1630
1631         print $_,"\n";
1632 }
1633
# Buffered write errors only surface at close, so treat a failed close as
# fatal.
1634 close STDOUT or die "error closing STDOUT: $!";