ec870170320d3725b7715dcbd5e230bb668310cc
[openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
14 #
15 #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
18 # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
19 # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
20 # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
21 # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
22 # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
23 #
24 # (i)   multi-block CBC encrypt with 128-bit key;
25 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26 #       because of lower AES-NI instruction throughput, nor is there
27 #       AES-NI-SHA256 stitch for these processors;
28 # (iii) "this" is for n=8, when we gather twice as much data, result
29 #       for n=4 is 20.3+4.44=24.7;
30 # (iv)  presented improvement coefficients are asymptotic limits and
31 #       in real-life application are somewhat lower, e.g. for 2KB 
32 #       fragments they range from 75% to 130% (on Haswell);
33
# Command line: [flavour] output — flavour (elf, macosx, mingw64, nasm,
# masm, ...) selects the perlasm dialect; a single argument containing a
# "." is treated as the output file name instead.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator relative to this script.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the available assembler to decide how much AVX code to emit:
# 0 - none, 1 - AVX, 2 - AVX2.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# Pipe all subsequent output through the translator.  Check the open:
# an unchecked failure here would silently generate an empty/absent
# assembly file instead of reporting the error.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
64
# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
# Register assignment for the generated code (x86_64 SysV calling
# convention for the three arguments).
$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%edx";	# 3rd arg
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# round-constant table pointer

# Hash state A..H lives in %xmm8-15, one lane per 32-bit element;
# %xmm0-7 are scratch/message registers.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes (16 here; AVX2 path uses 32)
# Return the memory operand addressing entry $idx of the circular
# 16-entry message schedule kept on the stack.  The first 256 bytes sit
# below %rax, any remainder below %rbx; both bases carry a +128 bias so
# displacements fit in a signed byte.
sub Xi_off {
	my $idx  = shift;
	my $byte = ($idx % 16) * $REG_SZ;

	return $byte < 256 ? "$byte-128(%rax)"
			   : "$byte-256-128(%rbx)";
}
94
# Emit one SHA-256 round operating on 4 interleaved lanes (one lane per
# 32-bit element of an XMM register).  For rounds 0..15 the next message
# word is first gathered from each of the four input streams; round 15
# additionally advances all input pointers past the 64-byte block and
# prefetches ahead.  Instructions indented by an extra space belong to
# an interleaved second dependency chain (latency hiding) — their order
# is deliberate, do not reshuffle.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

# Rounds 0..14: gather one dword per lane and interleave into $Xi.
$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Round 15: same gather, plus advance the four input pointers.
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# The round proper: Sigma1(e), Ch(e,f,g), Sigma0(a) and Maj(a,b,c) are
# built from shift/xor sequences; the message word is byte-swapped
# (pshufb, scheduled on alternate slots depending on round parity),
# stored to the stack schedule via Xi_off, and combined with K[round].
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
# K256 is laid out 8 rounds per 32*8-byte stride; bump $Tbl each 8 rounds.
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	($axb,$bxc)=($bxc,$axb);	# this round's a^b becomes next round's b^c
}
185
# Emit one SHA-256 round for rounds 16..63: first extend the message
# schedule, Xi += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14]), with the
# schedule kept on the stack (see Xi_off), then delegate the actual
# round computation to ROUND_00_15.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	($Xi,$Xn)=($Xn,$Xi);	# X[i+1], preloaded into $Xn, becomes next round's Xi
}
225
# Assemble the SSE sha256_multi_block() entry point: dispatch at run
# time to the SHA-extension (and, if assembled, AVX) code paths, save
# callee-saved state, then run the 4-lane "grande" loop — gather input
# descriptors, hash 64 rounds per block, and merge finished lanes back
# into the context under a per-lane mask.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
# AVX shortcut is only emitted when the assembler supports AVX.
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out an aligned frame: 16 slots of message schedule, the lane
# counters, and the saved original %rsp / $num.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Per lane: load input pointer and block count, track the maximum count
# in $num, and point exhausted lanes at $Tbl so their loads are harmless.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
# Rounds 0..15, rotating the state registers after each round.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
# Rounds 16..63 are emitted as a 16-round body executed 3 times (%ecx).
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
# Win64 epilogue: restore xmm6-15 relative to the original %rsp.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
# SHA-extension (SHA-NI) code path: processes two buffers at a time
# using sha256rnds2/sha256msg1/sha256msg2.  Each sha256rnds2 consumes
# the state as ABEF/CDGH register pairs, so the context's A..H layout
# is transposed on load and transposed back on store.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
	xor	$num,$num
___
# Per lane (2 lanes here): load descriptor, track max block count,
# point exhausted lanes at %rsp so their loads are harmless.
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
# Load both contexts and transpose A..H into the ABEF/CDGH register
# layout that sha256rnds2 expects.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
# Steady-state rounds 16..51: identical 4-round body with the message
# register arrays rotated after each iteration.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Tail rounds 52..63, interleaved with counter bookkeeping and the
# per-lane cancel/mask computation for the next iteration.
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
# Win64 epilogue; NOTE(review): %rax appears to still hold the original
# %rsp here (it is not clobbered by the shaext body), which is
# presumably why the reload above is commented out — confirm.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
755                                                 }}}
756                                                 if ($avx) {{{
757 sub ROUND_00_15_avx {
758 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
759
760 $code.=<<___ if ($i<15 && $REG_SZ==16);
761         vmovd           `4*$i`(@ptr[0]),$Xi
762         vmovd           `4*$i`(@ptr[1]),$t1
763         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
764         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
765         vpunpckldq      $t1,$Xi,$Xi
766         vpshufb         $Xn,$Xi,$Xi
767 ___
768 $code.=<<___ if ($i==15 && $REG_SZ==16);
769         vmovd           `4*$i`(@ptr[0]),$Xi
770          lea            `16*4`(@ptr[0]),@ptr[0]
771         vmovd           `4*$i`(@ptr[1]),$t1
772          lea            `16*4`(@ptr[1]),@ptr[1]
773         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
774          lea            `16*4`(@ptr[2]),@ptr[2]
775         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
776          lea            `16*4`(@ptr[3]),@ptr[3]
777         vpunpckldq      $t1,$Xi,$Xi
778         vpshufb         $Xn,$Xi,$Xi
779 ___
780 $code.=<<___ if ($i<15 && $REG_SZ==32);
781         vmovd           `4*$i`(@ptr[0]),$Xi
782         vmovd           `4*$i`(@ptr[4]),$t1
783         vmovd           `4*$i`(@ptr[1]),$t2
784         vmovd           `4*$i`(@ptr[5]),$t3
785         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
786         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
787         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
788         vpunpckldq      $t2,$Xi,$Xi
789         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
790         vpunpckldq      $t3,$t1,$t1
791         vinserti128     $t1,$Xi,$Xi
792         vpshufb         $Xn,$Xi,$Xi
793 ___
794 $code.=<<___ if ($i==15 && $REG_SZ==32);
795         vmovd           `4*$i`(@ptr[0]),$Xi
796          lea            `16*4`(@ptr[0]),@ptr[0]
797         vmovd           `4*$i`(@ptr[4]),$t1
798          lea            `16*4`(@ptr[4]),@ptr[4]
799         vmovd           `4*$i`(@ptr[1]),$t2
800          lea            `16*4`(@ptr[1]),@ptr[1]
801         vmovd           `4*$i`(@ptr[5]),$t3
802          lea            `16*4`(@ptr[5]),@ptr[5]
803         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
804          lea            `16*4`(@ptr[2]),@ptr[2]
805         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
806          lea            `16*4`(@ptr[6]),@ptr[6]
807         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
808          lea            `16*4`(@ptr[3]),@ptr[3]
809         vpunpckldq      $t2,$Xi,$Xi
810         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
811          lea            `16*4`(@ptr[7]),@ptr[7]
812         vpunpckldq      $t3,$t1,$t1
813         vinserti128     $t1,$Xi,$Xi
814         vpshufb         $Xn,$Xi,$Xi
815 ___
816 $code.=<<___;
817         vpsrld  \$6,$e,$sigma
818         vpslld  \$26,$e,$t3
819         vmovdqu $Xi,`&Xi_off($i)`
820          vpaddd $h,$Xi,$Xi                      # Xi+=h
821
822         vpsrld  \$11,$e,$t2
823         vpxor   $t3,$sigma,$sigma
824         vpslld  \$21,$e,$t3
825          vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
826         vpxor   $t2,$sigma,$sigma
827
828         vpsrld  \$25,$e,$t2
829         vpxor   $t3,$sigma,$sigma
830          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
831         vpslld  \$7,$e,$t3
832          vpandn $g,$e,$t1
833          vpand  $f,$e,$axb                      # borrow $axb
834          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
835         vpxor   $t2,$sigma,$sigma
836
837         vpsrld  \$2,$a,$h                       # borrow $h
838         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
839          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
840         vpslld  \$30,$a,$t2
841          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
842          vpxor  $a,$b,$axb                      # a^b, b^c in next round
843          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
844         vpxor   $t2,$h,$h
845         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
846
847         vpsrld  \$13,$a,$t2
848          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
849         vpslld  \$19,$a,$t3
850          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
851          vpand  $axb,$bxc,$bxc
852          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
853         vpxor   $t2,$h,$sigma
854
855         vpsrld  \$22,$a,$t2
856         vpxor   $t3,$sigma,$sigma
857          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
858         vpslld  \$10,$a,$t3
859          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
860          vpaddd $Xi,$d,$d                       # d+=Xi
861          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
862         vpxor   $t2,$sigma,$sigma
863         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
864
865         vpaddd  $Xi,$h,$h                       # h+=Xi
866         vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
867 ___
868 $code.=<<___ if (($i%8)==7);
869         add     \$`32*8`,$Tbl
870 ___
871         ($axb,$bxc)=($bxc,$axb);
872 }
873
sub ROUND_16_XX_avx {
my $i=shift;

# Message-schedule expansion for rounds 16..63 on the AVX path:
# Xi += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14]), computed with
# shift/xor pairs ($t2/$t3 scratch, $axb borrowed), then fall through
# to the common per-round body.  Fixed two misleading emitted comments:
# the first vpaddd adds sigma0(X[i+1]) (not "sigma0(e)"), and the final
# xor chain over X[i+14] is sigma1 (10/17/19 rotates), not sigma0.
$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(X[i+1])
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma1(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);	# rotate schedule registers for the next round
}
907
# Entry point for the AVX (xmm, 4-lane) multi-block transform.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
# When AVX2 code is also generated, dispatch to the 8-lane ymm path:
# the upper half of %rcx carries capability bits set up by the dispatch
# code earlier in this file (not visible here); bit 5 is tested
# (presumably the AVX2 feature bit -- confirm against the
# OPENSSL_ia32cap layout), and at least 2 buffers must be supplied
# before the wider path is taken.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
# Prologue: remember the caller's %rsp in %rax, save callee-saved GPRs.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-xmm15 are callee-saved and must be preserved.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Aligned frame: 16 register-sized slots for the message schedule, slot
# $REG_SZ*16 (addressed via %rbx) for the per-lane block counters, and
# slot $REG_SZ*17 for the original %rsp.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Gather the four (pointer, block-count) descriptor pairs: track the
# maximum count in $num, record each lane's counter, and redirect lanes
# with no work to the constant table so their loads stay harmless.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load the lane-interleaved A..H state vectors ($ctx was advanced by 0x80
# above, hence the -0x80 bias) plus the byte-swap shuffle mask, then run
# the compression rounds.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0..15: message words are gathered and byte-swapped inline.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
# Rounds 16..63: three passes (%ecx=3) over a 16-round scheduled body.
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
# Lanes whose counter has dropped to 1 just finished their last block:
# point them at the constant table for subsequent iterations.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Masked Davies-Meyer feed-forward: build a mask of still-active lanes
# (counter > 0) and decrement those counters; active lanes get
# state += stored context, inactive lanes are reset to the stored
# context (their working state is ANDed away first), then everything is
# written back.  Fixed a typo in the emitted comment: "orignal" ->
# "original".
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
# Win64: restore the callee-saved xmm registers from the save area below
# the original %rsp (now back in %rax).
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Restore GPRs and the caller's stack pointer, then return.
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
# The code emitted so far still contains `...` expressions that depend on
# $REG_SZ and friends; evaluate them now, BEFORE those globals are
# redefined for the 8-lane ymm code below.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;	# ymm registers: 8 lanes of 32-bit words
@ptr=map("%r$_",(12..15,8..11));

# State in %ymm8-15, scratch/schedule registers in %ymm0-7.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));
1091
# AVX2 (ymm, 8-lane) entry point: saves all callee-saved GPRs (this path
# needs %r8-%r15 for the eight input pointers).
$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 ABI: preserve callee-saved xmm6-xmm15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
# Same frame layout as the AVX path, with 32-byte slots: schedule in
# slots 0..15, per-lane counters at $REG_SZ*16 (via %rbx), original %rsp
# at $REG_SZ*17.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Gather the eight (pointer, block-count) descriptor pairs; track the
# maximum in $num and redirect idle lanes to the constant table.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load lane-interleaved A..H and the byte-swap mask; %rbx is repurposed
# as a second schedule base (256+128(%rsp)) until the counters are
# needed again.
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0..15: inline gather + byte swap.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
# Rounds 16..63: three passes over a 16-round scheduled body.
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
# Cancel lanes that just processed their final block.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Masked feed-forward, as in the AVX path: active lanes (counter > 0) get
# state += stored context and their counter decremented; finished lanes
# are reset to the stored context.  The multi-pass "grande" outer loop is
# deliberately disabled (commented out) on this path.  Fixed a typo in
# the emitted comment: "orignal" -> "original".
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
	vzeroupper
___
# Win64: restore callee-saved xmm registers (offsets differ from the AVX
# path because six extra GPR pushes sit above the save area).
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Restore the six callee-saved GPRs and the caller's stack pointer.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
1263                                         }       }}}
# Round-constant table header; the constants themselves are emitted
# lane-replicated by TABLE() below.
$code.=<<___;
.align	256
K256:
___
sub TABLE {
    # Append one table entry per argument to $code: each 32-bit constant
    # is replicated across eight .long slots (two rows of four) so every
    # SIMD lane of an xmm/ymm register sees the same round constant.
    for my $val (@_) {
        $code.=<<___;
	.long	$val,$val,$val,$val
	.long	$val,$val,$val,$val
___
    }
}
# The 64 SHA-256 round constants K[0..63] (FIPS 180-4), emitted
# lane-replicated via TABLE().
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Trailing data: the byte-swap shuffle mask used with vpshufb to load
# big-endian message words, and a plain (non-replicated) copy of K256
# for the SHAEXT path, followed by the module banner string.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1315
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
# The four handler arguments arrive in the Win64 integer argument
# registers, aliased here for readability in the handler bodies below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
1323
# Common SEH handler for the non-AVX2 functions: if the fault address is
# inside the function body (between the HandlerData labels), recover the
# saved stack pointer from frame slot 16*17, restore %rbx/%rbp and
# xmm6-xmm15 into the CONTEXT, then hand the adjusted CONTEXT to
# RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# SEH handler for sha256_multi_block_avx2.  Same structure as se_handler,
# but this frame additionally preserves %r12-%r15, keeps the saved %rsp
# at slot 32*17 of the aligned frame, and places xmm6-xmm15 at -56-10*16.
# Bug fix: the saved stack pointer was loaded from `32*17`($context) --
# offset 544 inside the CONTEXT record (its xmm save area) -- instead of
# `32*17`(%rax) off the function's own frame, matching se_handler's
# `16*17`(%rax).  Also fixed "cotnext" typos in the emitted comments.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# SEH function tables: .pdata maps each function's begin/end range to its
# unwind info, .xdata binds the language-specific handler and its
# HandlerData (body/epilogue labels) for each variant.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
# Unwind info: flag 9 (UNW_FLAG_EHANDLER|UNW_FLAG_UHANDLER form used by
# perlasm modules) followed by the handler and its data.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1511 ####################################################################
1512
sub rex {
  # Prepend a REX prefix byte to the opcode byte list when either operand
  # is an extended register (number >= 8).  The first argument is a
  # reference to the opcode array, aliased via a typeglob so that the
  # unshift modifies the caller's array in place.
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);	# REX.R (extends ModR/M reg)
    $rex|=0x01			if ($src>=8);	# REX.B (extends ModR/M r/m)
    unshift @opcode,$rex|0x40	if ($rex);
}

sub sha256op38 {
    # Hand-assemble the SHA-NI "0F 38"-map instructions
    # (sha256rnds2/msg1/msg2) to raw .byte sequences for assemblers that
    # do not know them; any other instruction, or a non reg,reg operand
    # form, is passed through unchanged.
    # Fix: use $_[0] (scalar element) rather than the one-element slice
    # @_[0], which triggers a warning under "use warnings" -- behavior
    # is otherwise identical.
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);		# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
1540
# Final pass over the accumulated code: evaluate any remaining `...`
# expressions, translate SHA-NI mnemonics to .byte form, and narrow %ymm
# operands to %xmm where the instruction form requires it.  The "or"
# chain applies at most one rewrite per line; order matters, so the
# substitutions must not be reordered.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;