# x86_64 assembly pack: refine clang detection.
# [openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
14 #
15 #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
18 # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
19 # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
20 # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
21 # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
22 # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
23 #
24 # (i)   multi-block CBC encrypt with 128-bit key;
25 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26 #       because of lower AES-NI instruction throughput, nor is there
27 #       AES-NI-SHA256 stitch for these processors;
28 # (iii) "this" is for n=8, when we gather twice as much data, result
29 #       for n=4 is 20.3+4.44=24.7;
30 # (iv)  presented improvement coefficients are asymptotic limits and
31 #       in real-life application are somewhat lower, e.g. for 2KB 
32 #       fragments they range from 75% to 130% (on Haswell);
33
# Command-line handling shared by all perlasm scripts: first argument is
# the "flavour" (elf, macosx, mingw64, nasm, masm, ...), second is the
# output file.  A lone argument containing a dot is taken as the output
# file name instead of a flavour.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows targets need different ABI/SEH treatment in the round bodies.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either next to this script or in
# the sibling crypto/perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
44
# Probe the local toolchain to decide which code paths the assembler can
# digest: $avx==0 - SSE only, $avx==1 - AVX1, $avx==2 - AVX2.
$avx=0;

# GNU as: 2.19 understands AVX, 2.22 understands AVX2.
# NOTE(review): versions such as "2.9" would also match this pattern and
# compare numerically greater than 2.19 — assumed not to occur in practice.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

# nasm: 2.09 understands AVX, 2.10 understands AVX2.
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

# MS ml64: version 10 understands AVX, 11 understands AVX2.
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

# clang: keyed on the advertised (upstream) LLVM version; 3.0 understands
# AVX, anything beyond 3.0 understands AVX2 as well.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}
65
# Pipe everything we generate through the flavour-specific translator and
# alias STDOUT to that pipe, so the final "print $code" lands in $output.
# The open was previously unchecked — a missing/broken translator made the
# script die much later with a confusing error; report the failure here.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;
68
69 # void sha256_multi_block (
70 #     struct {  unsigned int A[8];
71 #               unsigned int B[8];
72 #               unsigned int C[8];
73 #               unsigned int D[8];
74 #               unsigned int E[8];
75 #               unsigned int F[8];
76 #               unsigned int G[8];
77 #               unsigned int H[8];      } *ctx,
78 #     struct {  void *ptr; int blocks;  } inp[8],
79 #     int num);         /* 1 or 2 */
80 #
# Register allocation for the SSE (4-lane) path.
$ctx="%rdi";	# 1st arg: transposed state array (A[8]..H[8])
$inp="%rsi";	# 2nd arg: array of {ptr,blocks} descriptors
$num="%edx";	# 3rd arg: number of descriptor pairs
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# points into the K256 constant table

# State words A..H rotate through %xmm8-15; %xmm0-7 are temporaries,
# message-schedule words and the byte-swap mask.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes (xmm)
91
# Return the memory operand string for message-schedule word X[$idx].
# Sixteen $REG_SZ-byte slots form a ring; offsets below 256 are addressed
# off %rax, the remainder off %rbx, both carrying a -128 bias so that
# short (disp8) addressing can be used throughout the round bodies.
sub Xi_off {
    my ($idx) = @_;

    my $off = ($idx % 16) * $REG_SZ;
    return $off < 256 ? "$off-128(%rax)"
                      : "$off-256-128(%rbx)";
}
98
# Emit one SHA-256 round operating on four lanes in parallel; used
# directly for rounds 0-15 and as the tail of ROUND_16_XX thereafter.
# $i is the round index, $a..$h the xmm registers currently holding the
# rotating state.  For $i<16 the message word is gathered from the four
# input pointers; instructions written with an extra leading blank are
# the interleaved "second stream" for better port utilization.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

# Rounds 0-14: gather one 32-bit word from each lane and transpose into $Xi.
$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Round 15: same gather, but also advance all four input pointers to the
# next 64-byte block.
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Common round body: Sigma1(e), Ch(e,f,g), K[round], Sigma0(a) and the
# Maj-via-Ch trick, all interleaved across the two instruction streams.
# The pshufb byte-swap of the freshly gathered $Xi is slotted into
# whichever stream has a spare cycle (even vs. odd rounds).
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
# Every 8 rounds advance $Tbl past the constants just consumed.
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	# Swap the roles of $axb/$bxc: the "a^b" computed this round is
	# consumed as "b^c" by the Maj trick in the next round.
	($axb,$bxc)=($bxc,$axb);
}
189
# Emit one SHA-256 round for rounds 16..63: extend the message schedule
# in-register (Xi += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])) and then
# fall through to the shared round body in ROUND_00_15.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	# $Xn now holds next round's X[i+1]; rotate it into $Xi's role.
	($Xi,$Xn)=($Xn,$Xi);
}
229
# Entry point of the SSE (4-lane) sha256_multi_block: dispatch to the
# SHA-extension or AVX variants when the CPU supports them.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
# AVX shortcut is only emitted when the assembler can digest AVX code.
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Save the original stack pointer in %rax and the callee-saved GPRs.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-15 are callee-saved and must be preserved; first four
# relative to the new %rsp, the rest relative to the saved %rax.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out an aligned frame: 16 slots for the message schedule, one for
# the per-lane counters (%rbx) and one for the saved %rsp/$num.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# For each of the four lanes: load its input pointer and block count,
# track the maximum block count in $num, and point exhausted lanes at the
# constant table so their (discarded) loads stay within mapped memory.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load the transposed state (each xmm holds one word for all 4 lanes)
# and run the 64 rounds: 16 gather rounds followed by 3 passes of 16
# schedule-extending rounds.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
# Rounds 0-15 (message gathered from inputs), state registers rotated
# after every round.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
# Rounds 16-31 emitted once, executed three times (rounds 16-63).
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
# After 64 rounds: build a per-lane mask from the counters so that lanes
# already finished do not update their state, add the saved state back in
# (masked), store it, decrement the counters and loop until the longest
# lane is done; then move on to the next group of four inputs.
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
# Restore Win64 callee-saved xmm registers from the frame.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# Restore GPRs and the original stack pointer, then return.
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
# SHA-extension (SHA-NI) variant: processes two buffers per iteration
# using sha256rnds2/sha256msg1/sha256msg2, two interleaved streams.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: preserve callee-saved xmm6-15 (same layout as the SSE path).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Aligned frame as in the SSE path; $num is doubled because the caller's
# unit is a pair of buffers while this loop consumes one buffer per lane.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
	xor	$num,$num
___
# Load the two lanes' input pointers and block counts; exhausted lanes
# are redirected to %rsp so their discarded loads hit mapped memory.
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
# Repack the transposed state into the ABEF/CDGH layout required by
# sha256rnds2 (one register pair per lane), then run rounds 0-15 for
# both lanes interleaved; blank-indented instructions are lane 1.
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
# Rounds 16-51: one loop iteration emits four rounds for both lanes,
# rotating the @MSG0/@MSG1 register rings after each iteration.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Rounds 52-63, masked state update via the per-lane counters (double-
# blank-indented instructions handle the counters), counter decrement,
# and finally unpacking ABEF/CDGH back into the transposed context.
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
# %rax still holds the original %rsp here (never clobbered in this path),
# hence the commented-out reload above.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
760                                                 if ($avx) {{{
761 sub ROUND_00_15_avx {
762 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
763
764 $code.=<<___ if ($i<15 && $REG_SZ==16);
765         vmovd           `4*$i`(@ptr[0]),$Xi
766         vmovd           `4*$i`(@ptr[1]),$t1
767         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
768         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
769         vpunpckldq      $t1,$Xi,$Xi
770         vpshufb         $Xn,$Xi,$Xi
771 ___
772 $code.=<<___ if ($i==15 && $REG_SZ==16);
773         vmovd           `4*$i`(@ptr[0]),$Xi
774          lea            `16*4`(@ptr[0]),@ptr[0]
775         vmovd           `4*$i`(@ptr[1]),$t1
776          lea            `16*4`(@ptr[1]),@ptr[1]
777         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
778          lea            `16*4`(@ptr[2]),@ptr[2]
779         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
780          lea            `16*4`(@ptr[3]),@ptr[3]
781         vpunpckldq      $t1,$Xi,$Xi
782         vpshufb         $Xn,$Xi,$Xi
783 ___
784 $code.=<<___ if ($i<15 && $REG_SZ==32);
785         vmovd           `4*$i`(@ptr[0]),$Xi
786         vmovd           `4*$i`(@ptr[4]),$t1
787         vmovd           `4*$i`(@ptr[1]),$t2
788         vmovd           `4*$i`(@ptr[5]),$t3
789         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
790         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
791         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
792         vpunpckldq      $t2,$Xi,$Xi
793         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
794         vpunpckldq      $t3,$t1,$t1
795         vinserti128     $t1,$Xi,$Xi
796         vpshufb         $Xn,$Xi,$Xi
797 ___
798 $code.=<<___ if ($i==15 && $REG_SZ==32);
799         vmovd           `4*$i`(@ptr[0]),$Xi
800          lea            `16*4`(@ptr[0]),@ptr[0]
801         vmovd           `4*$i`(@ptr[4]),$t1
802          lea            `16*4`(@ptr[4]),@ptr[4]
803         vmovd           `4*$i`(@ptr[1]),$t2
804          lea            `16*4`(@ptr[1]),@ptr[1]
805         vmovd           `4*$i`(@ptr[5]),$t3
806          lea            `16*4`(@ptr[5]),@ptr[5]
807         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
808          lea            `16*4`(@ptr[2]),@ptr[2]
809         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
810          lea            `16*4`(@ptr[6]),@ptr[6]
811         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
812          lea            `16*4`(@ptr[3]),@ptr[3]
813         vpunpckldq      $t2,$Xi,$Xi
814         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
815          lea            `16*4`(@ptr[7]),@ptr[7]
816         vpunpckldq      $t3,$t1,$t1
817         vinserti128     $t1,$Xi,$Xi
818         vpshufb         $Xn,$Xi,$Xi
819 ___
820 $code.=<<___;
821         vpsrld  \$6,$e,$sigma
822         vpslld  \$26,$e,$t3
823         vmovdqu $Xi,`&Xi_off($i)`
824          vpaddd $h,$Xi,$Xi                      # Xi+=h
825
826         vpsrld  \$11,$e,$t2
827         vpxor   $t3,$sigma,$sigma
828         vpslld  \$21,$e,$t3
829          vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
830         vpxor   $t2,$sigma,$sigma
831
832         vpsrld  \$25,$e,$t2
833         vpxor   $t3,$sigma,$sigma
834          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
835         vpslld  \$7,$e,$t3
836          vpandn $g,$e,$t1
837          vpand  $f,$e,$axb                      # borrow $axb
838          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
839         vpxor   $t2,$sigma,$sigma
840
841         vpsrld  \$2,$a,$h                       # borrow $h
842         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
843          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
844         vpslld  \$30,$a,$t2
845          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
846          vpxor  $a,$b,$axb                      # a^b, b^c in next round
847          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
848         vpxor   $t2,$h,$h
849         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
850
851         vpsrld  \$13,$a,$t2
852          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
853         vpslld  \$19,$a,$t3
854          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
855          vpand  $axb,$bxc,$bxc
856          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
857         vpxor   $t2,$h,$sigma
858
859         vpsrld  \$22,$a,$t2
860         vpxor   $t3,$sigma,$sigma
861          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
862         vpslld  \$10,$a,$t3
863          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
864          vpaddd $Xi,$d,$d                       # d+=Xi
865          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
866         vpxor   $t2,$sigma,$sigma
867         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
868
869         vpaddd  $Xi,$h,$h                       # h+=Xi
870         vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
871 ___
872 $code.=<<___ if (($i%8)==7);
873         add     \$`32*8`,$Tbl
874 ___
875         ($axb,$bxc)=($bxc,$axb);
876 }
877
# Emit one schedule-expanding SHA-256 round (rounds 16..63) for the AVX
# code paths: compute sigma0(X[i+1]) and sigma1(X[i+14]), fold them plus
# X[i+9] into the current message word $Xi, then emit the shared round
# body via ROUND_00_15_avx.  $Xi and $Xn are swapped at the end so the
# word loaded here becomes "current" for the next invocation.
sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	# Shared round arithmetic, then rotate the current/next word registers.
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);
}
911
# AVX (xmm, 4-lane) flavour of the multi-block transform.
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
# When AVX2 code is compiled in, look at the capability bits in the upper
# half of %rcx and divert to the AVX2 entry point if bit 5 is set and
# there are at least two buffers to process; otherwise fall into .Lavx.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: xmm6-xmm15 are callee-saved and must be preserved/restored.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out an aligned frame; the pre-adjustment %rsp is kept in the frame
# so the epilogue (and the SEH handler) can recover it.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Load the four (pointer, block-count) descriptor pairs; $num becomes the
# maximum block count, and lanes with no work get their pointer redirected
# to the constant table so subsequent loads stay harmless.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0-15 consume message input; @V is rotated after each round so the
# register names line up with the round formula.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
# Sixteen schedule-expanding rounds, executed three times (%ecx) to cover
# rounds 16-63.
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
# Lanes whose block counter has dropped below 1 are "cancelled" by pointing
# them back at the constant table for the next block iteration.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Merge the round results into the hash context: a per-lane mask derived
# from the counters ensures only still-active lanes contribute; counters
# are decremented and the updated A..H are written back.
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
# Evaluate the backticked expressions emitted so far NOW, before $REG_SZ
# and the register map are redefined below for the 8-lane AVX2 flavour.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

# Working set moves to ymm registers: state in %ymm8..15, scratch in 0..7.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64 ABI: preserve callee-saved xmm6-xmm15.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Load eight (pointer, block-count) descriptor pairs; $num is the maximum
# count, and idle lanes are redirected to the constant table.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0-15 consume message input; @V rotates after each round.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
# Sixteen schedule-expanding rounds, run three times for rounds 16-63.
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
# Cancel lanes whose counters have run out.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Masked merge of the round results into the context, as in the AVX path.
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
					}	}}}
# Emit the 256-byte-aligned label for the lane-replicated round-constant
# table; the constants themselves are appended by the TABLE() call below.
$code.=<<___;
.align	256
K256:
___
# Append one table row per argument: each 32-bit constant is emitted eight
# times (two .long quadruplets), filling every lane of the widest (256-bit)
# register used by the AVX2 path.
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
# The 64 SHA-256 round constants, lane-replicated by TABLE().
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# Byte-shuffle mask used for message byte order, a non-replicated copy of
# the round constants for the SHA-NI code path, and the version string.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1319
# Win64 structured-exception-handling support: unwind handlers plus the
# .pdata/.xdata tables that bind them to the generated functions.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Generic handler for the SSE/AVX entry points: if the fault lies between
# the body and epilogue labels (HandlerData[0]/[1]), recover the saved
# stack pointer from the frame and restore %rbx/%rbp and xmm6-15 into the
# CONTEXT before continuing the unwind via RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# AVX2 variant: the avx2 prologue additionally pushes %r12-%r15, so this
# handler restores those registers too before tail-merging into
# .Lin_prologue inside se_handler.
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`($context),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# Function table entries (.pdata) for each generated entry point.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
# Unwind info (.xdata): each entry names its handler and passes the
# body/epilogue labels as HandlerData.
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1515 ####################################################################
1516
# Compute the REX prefix for a two-xmm-register instruction and prepend it
# to the partially-assembled opcode byte list when either register index is
# 8 or above (xmm8..xmm15 need REX.R/REX.B).
#
# Arguments: a reference to the opcode byte array (modified in place),
# followed by the numeric destination and source register indices.
# Replaces the archaic `local *opcode` typeglob aliasing with a plain
# lexical array reference; behaviour is unchanged.
sub rex {
  my ($opcode,$dst,$src)=@_;
  my $rex=0;

    $rex|=0x04			if ($dst>=8);	# REX.R
    $rex|=0x01			if ($src>=8);	# REX.B
    unshift @$opcode,$rex|0x40	if ($rex);	# emit prefix only when needed
}
1526
# Translate a SHA-NI mnemonic (sha256rnds2/sha256msg1/sha256msg2) with a
# two-xmm-register operand form into a raw ".byte" encoding, for assemblers
# that do not know the SHA extension.  Any other mnemonic, or a non
# register-to-register operand form, is passed through verbatim.
#
# Fix: the argument string was read with `@_[0]`, a one-element array
# slice where the scalar `$_[0]` is meant (it works but warns under
# `use warnings`); the arguments are now unpacked into named lexicals.
sub sha256op38 {
    my ($instr,$args) = @_;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
	rex(\@opcode,$2,$1);			# REX prefix for xmm8..xmm15
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M, register-direct
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
}
1544
# Final pass over the accumulated assembly: evaluate remaining backticked
# Perl expressions, encode SHA-NI mnemonics as .byte sequences, and narrow
# ymm operands to xmm for instructions that operate on 128-bit halves.
# The `or`-chained substitutions apply at most one rewrite per line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

close STDOUT;