Skylake performance results.
[openssl.git] / crypto / sha / asm / sha256-mb-x86_64.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Multi-buffer SHA256 procedure processes n buffers in parallel by
11 # placing buffer data to designated lane of SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
14 #
15 #               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
18 # Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
19 # Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
20 # Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
21 # Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
22 # Skylake       (18.9   +5.00=23.9)/n   7.70    8.17            +170%
23 # Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
24 #
25 # (i)   multi-block CBC encrypt with 128-bit key;
26 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
27 #       because of lower AES-NI instruction throughput, nor is there
28 #       AES-NI-SHA256 stitch for these processors;
29 # (iii) "this" is for n=8, when we gather twice as much data, result
30 #       for n=4 is 20.3+4.44=24.7;
31 # (iv)  presented improvement coefficients are asymptotic limits and
32 #       in real-life application are somewhat lower, e.g. for 2KB 
33 #       fragments they range from 75% to 130% (on Haswell);
34
# Command line: either "perlasm-flavour output-file" or just an output
# file, in which case the flavour is left for x86_64-xlate.pl to infer.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator next to this script or in the
# sibling perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain for AVX/AVX2 support; $avx ends up 0 (SSE-only
# output), 1 (AVX) or 2 (AVX2), gating the optional code paths below.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	   `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	   `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe all generated code through the translator.  Check the open:
# an unchecked failure here used to yield an empty output file.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
69
# void sha256_multi_block (
#     struct {	unsigned int A[8];
#		unsigned int B[8];
#		unsigned int C[8];
#		unsigned int D[8];
#		unsigned int E[8];
#		unsigned int F[8];
#		unsigned int G[8];
#		unsigned int H[8];	} *ctx,
#     struct {	void *ptr; int blocks;	} inp[8],
#     int num);		/* 1 or 2 */
#
$ctx="%rdi";	# 1st arg: transposed state, 8 words per variable
$inp="%rsi";	# 2nd arg: array of {pointer,blocks} descriptors
$num="%edx";	# 3rd arg: 1 or 2 (outer "grande" iterations)
@ptr=map("%r$_",(8..11));	# per-lane input pointers
$Tbl="%rbp";			# round-constant table pointer

# Working variables A..H in %xmm8..15, one 32-bit lane per buffer.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
# Scratch: temporaries, the a^b/b^c carry pair, message words, sigma.
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;	# SIMD register width in bytes (SSE).  NOTE(review): the
		# AVX code path tests $REG_SZ==32 -- presumably reassigned
		# in the AVX2 section outside this view; confirm there.
92
# Memory operand for message-schedule slot $idx (taken mod 16).
# The ring of 16 slots is addressed through two anchors: offsets below
# 256 are relative to %rax, the rest relative to %rbx.
sub Xi_off {
my $idx = shift;
my $off = ($idx % 16) * $REG_SZ;

    return $off < 256 ? sprintf("%d-128(%%rax)", $off)
                      : sprintf("%d-256-128(%%rbx)", $off);
}
99
# Emit one SHA-256 round for the 4-way SSE path.  $i is the round
# number; the remaining args are the eight working variables, each an
# %xmm register holding one 32-bit lane per input buffer.  Rounds 0..15
# also gather message word $i from the four input streams; round 15
# additionally advances all four input pointers past the 64-byte block.
# Instruction order inside the heredocs is deliberate scheduling --
# leading-space instructions belong to an interleaved dependency chain.
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

# Rounds 0..14: transpose word $i of each stream into the lanes of $Xi.
$code.=<<___ if ($i<15);
	movd		`4*$i`(@ptr[0]),$Xi
	movd		`4*$i`(@ptr[1]),$t1
	movd		`4*$i`(@ptr[2]),$t2
	movd		`4*$i`(@ptr[3]),$t3
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Round 15: same gather, interleaved with advancing the pointers.
$code.=<<___ if ($i==15);
	movd		`4*$i`(@ptr[0]),$Xi
	 lea		`16*4`(@ptr[0]),@ptr[0]
	movd		`4*$i`(@ptr[1]),$t1
	 lea		`16*4`(@ptr[1]),@ptr[1]
	movd		`4*$i`(@ptr[2]),$t2
	 lea		`16*4`(@ptr[2]),@ptr[2]
	movd		`4*$i`(@ptr[3]),$t3
	 lea		`16*4`(@ptr[3]),@ptr[3]
	punpckldq	$t2,$Xi
	punpckldq	$t3,$t1
	punpckldq	$t1,$Xi
___
# Round body.  The pshufb applies the mask the caller loaded into $Xn
# (.Lpbswap) to the freshly gathered word, alternating its slot between
# even and odd rounds for scheduling.
$code.=<<___;
	movdqa	$e,$sigma
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==0)`
	movdqa	$e,$t3
	`"pshufb	$Xn,$Xi"		if ($i<=15 && ($i&1)==1)`
	psrld	\$6,$sigma
	movdqa	$e,$t2
	pslld	\$7,$t3
	movdqa	$Xi,`&Xi_off($i)`
	 paddd	$h,$Xi				# Xi+=h

	psrld	\$11,$t2
	pxor	$t3,$sigma
	pslld	\$21-7,$t3
	 paddd	`32*($i%8)-128`($Tbl),$Xi	# Xi+=K[round]
	pxor	$t2,$sigma

	psrld	\$25-11,$t2
	 movdqa	$e,$t1
	 `"prefetcht0	63(@ptr[0])"		if ($i==15)`
	pxor	$t3,$sigma
	 movdqa	$e,$axb				# borrow $axb
	pslld	\$26-21,$t3
	 pandn	$g,$t1
	 pand	$f,$axb
	pxor	$t2,$sigma

	 `"prefetcht0	63(@ptr[1])"		if ($i==15)`
	movdqa	$a,$t2
	pxor	$t3,$sigma			# Sigma1(e)
	movdqa	$a,$t3
	psrld	\$2,$t2
	paddd	$sigma,$Xi			# Xi+=Sigma1(e)
	 pxor	$axb,$t1			# Ch(e,f,g)
	 movdqa	$b,$axb
	movdqa	$a,$sigma
	pslld	\$10,$t3
	 pxor	$a,$axb				# a^b, b^c in next round

	 `"prefetcht0	63(@ptr[2])"		if ($i==15)`
	psrld	\$13,$sigma
	pxor	$t3,$t2
	 paddd	$t1,$Xi				# Xi+=Ch(e,f,g)
	pslld	\$19-10,$t3
	 pand	$axb,$bxc
	pxor	$sigma,$t2

	 `"prefetcht0	63(@ptr[3])"		if ($i==15)`
	psrld	\$22-13,$sigma
	pxor	$t3,$t2
	 movdqa	$b,$h
	pslld	\$30-19,$t3
	pxor	$t2,$sigma
	 pxor	$bxc,$h				# h=Maj(a,b,c)=Ch(a^b,c,b)
	 paddd	$Xi,$d				# d+=Xi
	pxor	$t3,$sigma			# Sigma0(a)

	paddd	$Xi,$h				# h+=Xi
	paddd	$sigma,$h			# h+=Sigma0(a)
___
# Every 8 rounds, step $Tbl to the next group of round constants.
$code.=<<___ if (($i%8)==7);
	lea	`32*8`($Tbl),$Tbl
___
	# This round's a^b becomes next round's b^c.
	($axb,$bxc)=($bxc,$axb);
}
190
# Emit one SHA-256 round for rounds 16..63: first extend the message
# schedule -- Xi += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14]), with
# X[] slots read/written via Xi_off -- then delegate the round
# computation itself to ROUND_00_15.
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
	movdqa	`&Xi_off($i+1)`,$Xn
	paddd	`&Xi_off($i+9)`,$Xi		# Xi+=X[i+9]

	movdqa	$Xn,$sigma
	movdqa	$Xn,$t2
	psrld	\$3,$sigma
	movdqa	$Xn,$t3

	psrld	\$7,$t2
	movdqa	`&Xi_off($i+14)`,$t1
	pslld	\$14,$t3
	pxor	$t2,$sigma
	psrld	\$18-7,$t2
	movdqa	$t1,$axb			# borrow $axb
	pxor	$t3,$sigma
	pslld	\$25-14,$t3
	pxor	$t2,$sigma
	psrld	\$10,$t1
	movdqa	$axb,$t2

	psrld	\$17,$axb
	pxor	$t3,$sigma			# sigma0(X[i+1])
	pslld	\$13,$t2
	 paddd	$sigma,$Xi			# Xi+=sigma0(e)
	pxor	$axb,$t1
	psrld	\$19-17,$axb
	pxor	$t2,$t1
	pslld	\$15-13,$t2
	pxor	$axb,$t1
	pxor	$t2,$t1				# sigma0(X[i+14])
	paddd	$t1,$Xi				# Xi+=sigma1(X[i+14])
___
	&ROUND_00_15($i,@_);
	# $Xn now carries the next round's X[i+1]; swap it into $Xi.
	($Xi,$Xn)=($Xn,$Xi);
}
230
# Emit the 4-way SSE2 entry point.  It first dispatches to the SHAEXT
# shortcut when the SHA bit (bit 61 of the second OPENSSL_ia32cap_P
# word) is set, and to the AVX shortcut when compiled with $avx.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	sha256_multi_block
.type	sha256_multi_block,\@function,3
.align	32
sha256_multi_block:
	mov	OPENSSL_ia32cap_P+4(%rip),%rcx
	bt	\$61,%rcx			# check SHA bit
	jc	_shaext_shortcut
___
$code.=<<___ if ($avx);
	test	\$`1<<28`,%ecx
	jnz	_avx_shortcut
___
# Prologue: save callee-saved registers, then (Win64 only) the xmm6-15
# non-volatile SIMD registers.
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Carve out an aligned frame: 16 X[] slots plus saved %rsp/$num, with
# %rbx pointing at the per-lane block counters.
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Load the 4 {pointer,blocks} descriptors; $num becomes the maximum
# block count, and lanes with no blocks get their pointer redirected
# to $Tbl so their (ignored) loads stay valid.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
# Load the transposed state (8 xmm registers, one lane per buffer) and
# the byte-swap mask, then enter the per-block loop.
$code.=<<___;
	test	$num,$num
	jz	.Ldone

	movdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	movdqu	0x20-0x80($ctx),$B
	movdqu	0x40-0x80($ctx),$C
	movdqu	0x60-0x80($ctx),$D
	movdqu	0x80-0x80($ctx),$E
	movdqu	0xa0-0x80($ctx),$F
	movdqu	0xc0-0x80($ctx),$G
	movdqu	0xe0-0x80($ctx),$H
	movdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop

.align	32
.Loop:
	movdqa	$C,$bxc
	pxor	$B,$bxc				# magic seed
___
# Rounds 0..15 straight off input data, rotating the variable list so
# each generated round sees (a..h) in the right registers.
for($i=0;$i<16;$i++)	{ &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	movdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx
.align	32
.Loop_16_xx:
___
# Rounds 16..63: 16 schedule-extending rounds, executed 3 times.
for(;$i<32;$i++)	{ &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
# Per-block epilogue: decrement counters, mask out finished lanes
# (cancelling their input pointers), and fold the round output into
# the saved state.
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl

	movdqa	(%rbx),$sigma			# pull counters
	cmp	4*0(%rbx),%ecx			# examine counters
	pxor	$t1,$t1
	cmovge	$Tbl,@ptr[0]			# cancel input
	cmp	4*1(%rbx),%ecx
	movdqa	$sigma,$Xn
	cmovge	$Tbl,@ptr[1]
	cmp	4*2(%rbx),%ecx
	pcmpgtd	$t1,$Xn				# mask value
	cmovge	$Tbl,@ptr[2]
	cmp	4*3(%rbx),%ecx
	paddd	$Xn,$sigma			# counters--
	cmovge	$Tbl,@ptr[3]

	movdqu	0x00-0x80($ctx),$t1
	pand	$Xn,$A
	movdqu	0x20-0x80($ctx),$t2
	pand	$Xn,$B
	movdqu	0x40-0x80($ctx),$t3
	pand	$Xn,$C
	movdqu	0x60-0x80($ctx),$Xi
	pand	$Xn,$D
	paddd	$t1,$A
	movdqu	0x80-0x80($ctx),$t1
	pand	$Xn,$E
	paddd	$t2,$B
	movdqu	0xa0-0x80($ctx),$t2
	pand	$Xn,$F
	paddd	$t3,$C
	movdqu	0xc0-0x80($ctx),$t3
	pand	$Xn,$G
	paddd	$Xi,$D
	movdqu	0xe0-0x80($ctx),$Xi
	pand	$Xn,$H
	paddd	$t1,$E
	paddd	$t2,$F
	movdqu	$A,0x00-0x80($ctx)
	paddd	$t3,$G
	movdqu	$B,0x20-0x80($ctx)
	paddd	$Xi,$H
	movdqu	$C,0x40-0x80($ctx)
	movdqu	$D,0x60-0x80($ctx)
	movdqu	$E,0x80-0x80($ctx)
	movdqu	$F,0xa0-0x80($ctx)
	movdqu	$G,0xc0-0x80($ctx)
	movdqu	$H,0xe0-0x80($ctx)

	movdqa	$sigma,(%rbx)			# save counters
	movdqa	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande

.Ldone:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
___
# Epilogue: restore Win64 SIMD registers and callee-saved registers.
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue:
	ret
.size	sha256_multi_block,.-sha256_multi_block
___
						{{{
# SHA-extension (SHA-NI) code path: two interleaved single-buffer
# computations using sha256rnds2/sha256msg1/sha256msg2.  Register
# layout: $Wi is the round-key operand, $TMP0/$TMP1/$TMPx scratch,
# $ABEF*/$CDGH* the packed state of buffers 0 and 1, @MSG0/@MSG1 the
# two message schedules.
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

$code.=<<___;
.type	sha256_multi_block_shaext,\@function,3
.align	32
sha256_multi_block_shaext:
_shaext_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame setup.  $num is doubled because each outer iteration here
# consumes two input descriptors instead of four.
$code.=<<___;
	sub	\$`$REG_SZ*18`,%rsp
	shl	\$1,$num			# we process pair at a time
	and	\$-256,%rsp
	lea	0x80($ctx),$ctx			# size optimization
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_shaext:
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# orignal $num
	xor	$num,$num
___
# Load the 2 {pointer,blocks} descriptors; empty lanes point at %rsp
# so their loads stay harmless.
for($i=0;$i<2;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	%rsp,@ptr[$i]			# cancel input
___
}
# Gather the transposed state of both buffers and repack it into the
# ABEF/CDGH layout sha256rnds2 expects (high-to-low word order, hence
# the 0b00011011 shuffles).
$code.=<<___;
	test	$num,$num
	jz	.Ldone_shaext

	movq		0x00-0x80($ctx),$ABEF0		# A1.A0
	movq		0x20-0x80($ctx),@MSG0[0]	# B1.B0
	movq		0x40-0x80($ctx),$CDGH0		# C1.C0
	movq		0x60-0x80($ctx),@MSG0[1]	# D1.D0
	movq		0x80-0x80($ctx),@MSG1[0]	# E1.E0
	movq		0xa0-0x80($ctx),@MSG1[1]	# F1.F0
	movq		0xc0-0x80($ctx),@MSG1[2]	# G1.G0
	movq		0xe0-0x80($ctx),@MSG1[3]	# H1.H0

	punpckldq	@MSG0[0],$ABEF0			# B1.A1.B0.A0
	punpckldq	@MSG0[1],$CDGH0			# D1.C1.D0.C0
	punpckldq	@MSG1[1],@MSG1[0]		# F1.E1.F0.E0
	punpckldq	@MSG1[3],@MSG1[2]		# H1.G1.H0.G0
	movdqa		K256_shaext-0x10(%rip),$TMPx	# byte swap

	movdqa		$ABEF0,$ABEF1
	movdqa		$CDGH0,$CDGH1
	punpcklqdq	@MSG1[0],$ABEF0			# F0.E0.B0.A0
	punpcklqdq	@MSG1[2],$CDGH0			# H0.G0.D0.C0
	punpckhqdq	@MSG1[0],$ABEF1			# F1.E1.B1.A1
	punpckhqdq	@MSG1[2],$CDGH1			# H1.G1.D1.C1

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1
	jmp		.Loop_shaext

.align	32
.Loop_shaext:
	movdqu		0x00(@ptr[0]),@MSG0[0]
	 movdqu		0x00(@ptr[1]),@MSG1[0]
	movdqu		0x10(@ptr[0]),@MSG0[1]
	 movdqu		0x10(@ptr[1]),@MSG1[1]
	movdqu		0x20(@ptr[0]),@MSG0[2]
	pshufb		$TMPx,@MSG0[0]
	 movdqu		0x20(@ptr[1]),@MSG1[2]
	 pshufb		$TMPx,@MSG1[0]
	movdqu		0x30(@ptr[0]),@MSG0[3]
	lea		0x40(@ptr[0]),@ptr[0]
	 movdqu		0x30(@ptr[1]),@MSG1[3]
	 lea		0x40(@ptr[1]),@ptr[1]

	movdqa		0*16-0x80($Tbl),$Wi
	pshufb		$TMPx,@MSG0[1]
	paddd		@MSG0[0],$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$Wi,$TMP0
	 movdqa		0*16-0x80($Tbl),$TMP1
	 pshufb		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	movdqa		$CDGH0,0x50(%rsp)	# offload
	sha256rnds2	$ABEF0,$CDGH0		# 0-3
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$TMP1,$Wi
	 movdqa		$CDGH1,0x70(%rsp)
	 sha256rnds2	$ABEF1,$CDGH1		# 0-3
	pshufd		\$0x0e,$TMP0,$Wi
	pxor		$ABEF0,@MSG0[0]		# black magic
	movdqa		$ABEF0,0x40(%rsp)	# offload
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 pxor		$ABEF1,@MSG1[0]		# black magic
	 movdqa		$ABEF1,0x60(%rsp)
	movdqa		1*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	pshufb		$TMPx,@MSG0[2]
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		1*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[1],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 4-7
	 movdqa		$TMP1,$Wi
	prefetcht0	127(@ptr[0])
	pshufb		$TMPx,@MSG0[3]
	 pshufb		$TMPx,@MSG1[2]
	 prefetcht0	127(@ptr[1])
	 sha256rnds2	$ABEF1,$CDGH1		# 4-7
	pshufd		\$0x0e,$TMP0,$Wi
	 pshufb		$TMPx,@MSG1[3]
	sha256msg1	@MSG0[1],@MSG0[0]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		2*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		2*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	sha256rnds2	$ABEF0,$CDGH0		# 8-11
	 sha256msg1	@MSG1[1],@MSG1[0]
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 8-11
	pshufd		\$0x0e,$TMP0,$Wi
	palignr		\$4,@MSG0[2],$TMPx
	paddd		$TMPx,@MSG0[0]
	 movdqa		@MSG1[3],$TMPx
	 palignr	\$4,@MSG1[2],$TMPx
	sha256msg1	@MSG0[2],@MSG0[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		3*16-0x80($Tbl),$TMP0
	paddd		@MSG0[3],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[2],@MSG1[1]

	movdqa		$TMP0,$Wi
	 movdqa		3*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[0]
	 paddd		@MSG1[3],$TMP1
	sha256msg2	@MSG0[3],@MSG0[0]
	sha256rnds2	$ABEF0,$CDGH0		# 12-15
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[0],$TMPx
	palignr		\$4,@MSG0[3],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 12-15
	 sha256msg2	@MSG1[3],@MSG1[0]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[1]
	 movdqa		@MSG1[0],$TMPx
	 palignr	\$4,@MSG1[3],$TMPx
	sha256msg1	@MSG0[3],@MSG0[2]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		4*16-0x80($Tbl),$TMP0
	paddd		@MSG0[0],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[3],@MSG1[2]
___
# Middle rounds 16..51: one steady-state iteration per 4 rounds, with
# the @MSG arrays rotated after each emission.
for($i=4;$i<16-3;$i++) {
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		$i*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 16-19...
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 16-19...
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	sha256msg1	@MSG0[0],@MSG0[3]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		`($i+1)*16`-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1
	 sha256msg1	@MSG1[0],@MSG1[3]
___
	push(@MSG0,shift(@MSG0));	push(@MSG1,shift(@MSG1));
}
# Final rounds 52..63, interleaved (double-space instructions) with
# pulling/updating the per-lane block counters and building the lane
# masks used to freeze finished buffers.
$code.=<<___;
	movdqa		$TMP0,$Wi
	 movdqa		13*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[1]
	 paddd		@MSG1[0],$TMP1
	sha256msg2	@MSG0[0],@MSG0[1]
	sha256rnds2	$ABEF0,$CDGH0		# 52-55
	 movdqa		$TMP1,$Wi
	movdqa		@MSG0[1],$TMPx
	palignr		\$4,@MSG0[0],$TMPx
	 sha256rnds2	$ABEF1,$CDGH1		# 52-55
	 sha256msg2	@MSG1[0],@MSG1[1]
	pshufd		\$0x0e,$TMP0,$Wi
	paddd		$TMPx,@MSG0[2]
	 movdqa		@MSG1[1],$TMPx
	 palignr	\$4,@MSG1[0],$TMPx
	nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	movdqa		14*16-0x80($Tbl),$TMP0
	paddd		@MSG0[1],$TMP0
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	 movdqa		14*16-0x80($Tbl),$TMP1
	 paddd		$TMPx,@MSG1[2]
	 paddd		@MSG1[1],$TMP1
	sha256msg2	@MSG0[1],@MSG0[2]
	nop
	sha256rnds2	$ABEF0,$CDGH0		# 56-59
	 movdqa		$TMP1,$Wi
	  mov		\$1,%ecx
	  pxor		@MSG0[1],@MSG0[1]	# zero
	 sha256rnds2	$ABEF1,$CDGH1		# 56-59
	 sha256msg2	@MSG1[1],@MSG1[2]
	pshufd		\$0x0e,$TMP0,$Wi
	movdqa		15*16-0x80($Tbl),$TMP0
	paddd		@MSG0[2],$TMP0
	  movq		(%rbx),@MSG0[2]		# pull counters
	  nop
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	 movdqa		15*16-0x80($Tbl),$TMP1
	 paddd		@MSG1[2],$TMP1
	 sha256rnds2	$CDGH1,$ABEF1

	movdqa		$TMP0,$Wi
	  cmp		4*0(%rbx),%ecx		# examine counters
	  cmovge	%rsp,@ptr[0]		# cancel input
	  cmp		4*1(%rbx),%ecx
	  cmovge	%rsp,@ptr[1]
	  pshufd	\$0x00,@MSG0[2],@MSG1[0]
	sha256rnds2	$ABEF0,$CDGH0		# 60-63
	 movdqa		$TMP1,$Wi
	  pshufd	\$0x55,@MSG0[2],@MSG1[1]
	  movdqa	@MSG0[2],@MSG1[2]
	 sha256rnds2	$ABEF1,$CDGH1		# 60-63
	pshufd		\$0x0e,$TMP0,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[0]
	  pcmpgtd	@MSG0[1],@MSG1[1]
	sha256rnds2	$CDGH0,$ABEF0
	 pshufd		\$0x0e,$TMP1,$Wi
	  pcmpgtd	@MSG0[1],@MSG1[2]	# counter mask
	  movdqa	K256_shaext-0x10(%rip),$TMPx
	 sha256rnds2	$CDGH1,$ABEF1

	pand		@MSG1[0],$CDGH0
	 pand		@MSG1[1],$CDGH1
	pand		@MSG1[0],$ABEF0
	 pand		@MSG1[1],$ABEF1
	paddd		@MSG0[2],@MSG1[2]	# counters--

	paddd		0x50(%rsp),$CDGH0
	 paddd		0x70(%rsp),$CDGH1
	paddd		0x40(%rsp),$ABEF0
	 paddd		0x60(%rsp),$ABEF1

	movq		@MSG1[2],(%rbx)		# save counters
	dec		$num
	jnz		.Loop_shaext

	mov		`$REG_SZ*17+8`(%rsp),$num

	pshufd		\$0b00011011,$ABEF0,$ABEF0
	pshufd		\$0b00011011,$CDGH0,$CDGH0
	pshufd		\$0b00011011,$ABEF1,$ABEF1
	pshufd		\$0b00011011,$CDGH1,$CDGH1

	movdqa		$ABEF0,@MSG0[0]
	movdqa		$CDGH0,@MSG0[1]
	punpckldq	$ABEF1,$ABEF0			# B1.B0.A1.A0
	punpckhdq	$ABEF1,@MSG0[0]			# F1.F0.E1.E0
	punpckldq	$CDGH1,$CDGH0			# D1.D0.C1.C0
	punpckhdq	$CDGH1,@MSG0[1]			# H1.H0.G1.G0

	movq		$ABEF0,0x00-0x80($ctx)		# A1.A0
	psrldq		\$8,$ABEF0
	movq		@MSG0[0],0x80-0x80($ctx)	# E1.E0
	psrldq		\$8,@MSG0[0]
	movq		$ABEF0,0x20-0x80($ctx)		# B1.B0
	movq		@MSG0[0],0xa0-0x80($ctx)	# F1.F0

	movq		$CDGH0,0x40-0x80($ctx)		# C1.C0
	psrldq		\$8,$CDGH0
	movq		@MSG0[1],0xc0-0x80($ctx)	# G1.G0
	psrldq		\$8,@MSG0[1]
	movq		$CDGH0,0x60-0x80($ctx)		# D1.D0
	movq		@MSG0[1],0xe0-0x80($ctx)	# H1.H0

	lea	`$REG_SZ/2`($ctx),$ctx
	lea	`16*2`($inp),$inp
	dec	$num
	jnz	.Loop_grande_shaext

.Ldone_shaext:
	#mov	`$REG_SZ*17`(%rsp),%rax		# original %rsp
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
# NOTE(review): the epilogue restores through %rax, which still holds
# the entry %rsp (the reload above is deliberately commented out);
# confirm nothing in the loop clobbers %rax.
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_shaext:
	ret
.size	sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
						}}}
761                                                 if ($avx) {{{
762 sub ROUND_00_15_avx {
763 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
764
765 $code.=<<___ if ($i<15 && $REG_SZ==16);
766         vmovd           `4*$i`(@ptr[0]),$Xi
767         vmovd           `4*$i`(@ptr[1]),$t1
768         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
769         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
770         vpunpckldq      $t1,$Xi,$Xi
771         vpshufb         $Xn,$Xi,$Xi
772 ___
773 $code.=<<___ if ($i==15 && $REG_SZ==16);
774         vmovd           `4*$i`(@ptr[0]),$Xi
775          lea            `16*4`(@ptr[0]),@ptr[0]
776         vmovd           `4*$i`(@ptr[1]),$t1
777          lea            `16*4`(@ptr[1]),@ptr[1]
778         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
779          lea            `16*4`(@ptr[2]),@ptr[2]
780         vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
781          lea            `16*4`(@ptr[3]),@ptr[3]
782         vpunpckldq      $t1,$Xi,$Xi
783         vpshufb         $Xn,$Xi,$Xi
784 ___
785 $code.=<<___ if ($i<15 && $REG_SZ==32);
786         vmovd           `4*$i`(@ptr[0]),$Xi
787         vmovd           `4*$i`(@ptr[4]),$t1
788         vmovd           `4*$i`(@ptr[1]),$t2
789         vmovd           `4*$i`(@ptr[5]),$t3
790         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
791         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
792         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
793         vpunpckldq      $t2,$Xi,$Xi
794         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
795         vpunpckldq      $t3,$t1,$t1
796         vinserti128     $t1,$Xi,$Xi
797         vpshufb         $Xn,$Xi,$Xi
798 ___
799 $code.=<<___ if ($i==15 && $REG_SZ==32);
800         vmovd           `4*$i`(@ptr[0]),$Xi
801          lea            `16*4`(@ptr[0]),@ptr[0]
802         vmovd           `4*$i`(@ptr[4]),$t1
803          lea            `16*4`(@ptr[4]),@ptr[4]
804         vmovd           `4*$i`(@ptr[1]),$t2
805          lea            `16*4`(@ptr[1]),@ptr[1]
806         vmovd           `4*$i`(@ptr[5]),$t3
807          lea            `16*4`(@ptr[5]),@ptr[5]
808         vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
809          lea            `16*4`(@ptr[2]),@ptr[2]
810         vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
811          lea            `16*4`(@ptr[6]),@ptr[6]
812         vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
813          lea            `16*4`(@ptr[3]),@ptr[3]
814         vpunpckldq      $t2,$Xi,$Xi
815         vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
816          lea            `16*4`(@ptr[7]),@ptr[7]
817         vpunpckldq      $t3,$t1,$t1
818         vinserti128     $t1,$Xi,$Xi
819         vpshufb         $Xn,$Xi,$Xi
820 ___
821 $code.=<<___;
822         vpsrld  \$6,$e,$sigma
823         vpslld  \$26,$e,$t3
824         vmovdqu $Xi,`&Xi_off($i)`
825          vpaddd $h,$Xi,$Xi                      # Xi+=h
826
827         vpsrld  \$11,$e,$t2
828         vpxor   $t3,$sigma,$sigma
829         vpslld  \$21,$e,$t3
830          vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
831         vpxor   $t2,$sigma,$sigma
832
833         vpsrld  \$25,$e,$t2
834         vpxor   $t3,$sigma,$sigma
835          `"prefetcht0   63(@ptr[0])"            if ($i==15)`
836         vpslld  \$7,$e,$t3
837          vpandn $g,$e,$t1
838          vpand  $f,$e,$axb                      # borrow $axb
839          `"prefetcht0   63(@ptr[1])"            if ($i==15)`
840         vpxor   $t2,$sigma,$sigma
841
842         vpsrld  \$2,$a,$h                       # borrow $h
843         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
844          `"prefetcht0   63(@ptr[2])"            if ($i==15)`
845         vpslld  \$30,$a,$t2
846          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
847          vpxor  $a,$b,$axb                      # a^b, b^c in next round
848          `"prefetcht0   63(@ptr[3])"            if ($i==15)`
849         vpxor   $t2,$h,$h
850         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
851
852         vpsrld  \$13,$a,$t2
853          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
854         vpslld  \$19,$a,$t3
855          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
856          vpand  $axb,$bxc,$bxc
857          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
858         vpxor   $t2,$h,$sigma
859
860         vpsrld  \$22,$a,$t2
861         vpxor   $t3,$sigma,$sigma
862          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
863         vpslld  \$10,$a,$t3
864          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
865          vpaddd $Xi,$d,$d                       # d+=Xi
866          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
867         vpxor   $t2,$sigma,$sigma
868         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
869
870         vpaddd  $Xi,$h,$h                       # h+=Xi
871         vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
872 ___
873 $code.=<<___ if (($i%8)==7);
874         add     \$`32*8`,$Tbl
875 ___
876         ($axb,$bxc)=($bxc,$axb);
877 }
878
# Emit one message-schedule round (rounds 16..63) for the AVX code path:
#   X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])
# Rotates are emulated with vpsrld/vpslld/vpxor pairs (AVX has no vector
# rotate).  The freshly scheduled word is then fed through the plain
# compression round, and $Xi/$Xn are swapped so the next call sees the
# updated word in $Xi.
# NOTE(review): the in-heredoc comment "sigma0(X[i+14])" near the end looks
# like a typo for sigma1 — the shift counts (10/17/19) are sigma1's.
sub ROUND_16_XX_avx {
my $i=shift;		# round index (16..63, modulo the 16-entry X[] window)

$code.=<<___;
	vmovdqu	`&Xi_off($i+1)`,$Xn
	vpaddd	`&Xi_off($i+9)`,$Xi,$Xi		# Xi+=X[i+9]

	vpsrld	\$3,$Xn,$sigma
	vpsrld	\$7,$Xn,$t2
	vpslld	\$25,$Xn,$t3
	vpxor	$t2,$sigma,$sigma
	vpsrld	\$18,$Xn,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$14,$Xn,$t3
	vmovdqu	`&Xi_off($i+14)`,$t1
	vpsrld	\$10,$t1,$axb			# borrow $axb

	vpxor	$t2,$sigma,$sigma
	vpsrld	\$17,$t1,$t2
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+1])
	vpslld	\$15,$t1,$t3
	 vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma0(e)
	vpxor	$t2,$axb,$sigma
	vpsrld	\$19,$t1,$t2
	vpxor	$t3,$sigma,$sigma
	vpslld	\$13,$t1,$t3
	vpxor	$t2,$sigma,$sigma
	vpxor	$t3,$sigma,$sigma		# sigma0(X[i+14])
	vpaddd	$sigma,$Xi,$Xi			# Xi+=sigma1(X[i+14])
___
	# run the ordinary round function on the word just scheduled
	&ROUND_00_15_avx($i,@_);
	($Xi,$Xn)=($Xn,$Xi);	# rotate Xi/Xn for the next round
}
912
# ---------------------------------------------------------------------------
# sha256_multi_block_avx(CTX *ctx, const HASH_DESC *inp, int num)
#
# 4-lane (xmm) AVX flavour of the multi-block transform.  Each 32-bit SIMD
# lane processes an independent buffer; per-lane block counters in (%rbx)
# let lanes of different lengths be masked out as they finish.  If the CPU
# is AVX2-capable ($avx>1 at generation time), a run-time dispatch at the
# top falls through to the 8-lane ymm code instead.
# ---------------------------------------------------------------------------
$code.=<<___;
.type	sha256_multi_block_avx,\@function,3
.align	32
sha256_multi_block_avx:
_avx_shortcut:
___
# Run-time AVX2 dispatch: bit 5 of the upper half of the capability word
# (passed in %rcx) selects the ymm path when at least 2 buffers are given.
$code.=<<___ if ($avx>1);
	shr	\$32,%rcx
	cmp	\$2,$num
	jb	.Lavx
	test	\$`1<<5`,%ecx
	jnz	_avx2_shortcut
	jmp	.Lavx
.align	32
.Lavx:
___
$code.=<<___;
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
___
# Win64 ABI: %xmm6-%xmm15 are callee-saved and must be spilled.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,-0x78(%rax)
	movaps	%xmm11,-0x68(%rax)
	movaps	%xmm12,-0x58(%rax)
	movaps	%xmm13,-0x48(%rax)
	movaps	%xmm14,-0x38(%rax)
	movaps	%xmm15,-0x28(%rax)
___
# Frame: 16 slots of $REG_SZ for the X[] window, one slot of counters,
# original %rsp saved at $REG_SZ*17 (this is what the SEH handler reads).
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx:
	lea	K256+128(%rip),$Tbl
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
___
# Gather the four per-lane input pointers and block counts; an empty lane
# is pointed at the K256 table so its loads stay harmless.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	test	$num,$num
	jz	.Ldone_avx

	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx

.align	32
.Loop_avx:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0..15: load+byte-swap message words on the fly.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx
.align	32
.Loop_16_xx_avx:
___
# Rounds 16..63: three passes of 16 scheduled rounds.
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx

	mov	\$1,%ecx
	lea	K256+128(%rip),$Tbl
___
# Park finished lanes (counter < 1) on the K256 table.
for($i=0;$i<4;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Merge this block's result into the context, but only for lanes whose
# counter is still positive ($Xn is the per-lane active mask); then
# decrement the counters.
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx

	mov	`$REG_SZ*17+8`(%rsp),$num
	lea	$REG_SZ($ctx),$ctx
	lea	`16*$REG_SZ/4`($inp),$inp
	dec	$num
	jnz	.Loop_grande_avx

.Ldone_avx:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xb8(%rax),%xmm6
	movaps	-0xa8(%rax),%xmm7
	movaps	-0x98(%rax),%xmm8
	movaps	-0x88(%rax),%xmm9
	movaps	-0x78(%rax),%xmm10
	movaps	-0x68(%rax),%xmm11
	movaps	-0x58(%rax),%xmm12
	movaps	-0x48(%rax),%xmm13
	movaps	-0x38(%rax),%xmm14
	movaps	-0x28(%rax),%xmm15
___
$code.=<<___;
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx:
	ret
.size	sha256_multi_block_avx,.-sha256_multi_block_avx
___
						if ($avx>1) {
# The `...` expressions emitted so far were written against $REG_SZ==16
# (xmm lanes).  They must be evaluated NOW, before $REG_SZ and the register
# maps are switched to the 8-lane AVX2 (ymm) configuration below; the final
# output loop evaluates whatever backticks remain with the new values.
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
# Eight input pointers now needed, so the r8-r15 range is used.
@ptr=map("%r$_",(12..15,8..11));

# Same variable roles as the xmm path, but in ymm registers.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

# ---------------------------------------------------------------------------
# sha256_multi_block_avx2(CTX *ctx, const HASH_DESC *inp, int num)
# 8-lane (ymm) flavour.  Reached via _avx2_shortcut from the AVX entry
# point; all 8 lanes are consumed in a single grande pass (the outer loop
# is commented out in the heredoc below).
# ---------------------------------------------------------------------------
$code.=<<___;
.type	sha256_multi_block_avx2,\@function,3
.align	32
sha256_multi_block_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,(%rsp)
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
	movaps	%xmm10,0x40(%rsp)
	movaps	%xmm11,0x50(%rsp)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
___
$code.=<<___;
	sub	\$`$REG_SZ*18`, %rsp
	and	\$-256,%rsp
	mov	%rax,`$REG_SZ*17`(%rsp)		# original %rsp
.Lbody_avx2:
	lea	K256+128(%rip),$Tbl
	lea	0x80($ctx),$ctx			# size optimization

.Loop_grande_avx2:
	mov	$num,`$REG_SZ*17+8`(%rsp)	# original $num
	xor	$num,$num
	lea	`$REG_SZ*16`(%rsp),%rbx
___
# Gather all eight per-lane input pointers and block counts.
for($i=0;$i<8;$i++) {
    $code.=<<___;
	mov	`16*$i+0`($inp),@ptr[$i]	# input pointer
	mov	`16*$i+8`($inp),%ecx		# number of blocks
	cmp	$num,%ecx
	cmovg	%ecx,$num			# find maximum
	test	%ecx,%ecx
	mov	%ecx,`4*$i`(%rbx)		# initialize counters
	cmovle	$Tbl,@ptr[$i]			# cancel input
___
}
$code.=<<___;
	vmovdqu	0x00-0x80($ctx),$A		# load context
	 lea	128(%rsp),%rax
	vmovdqu	0x20-0x80($ctx),$B
	 lea	256+128(%rsp),%rbx
	vmovdqu	0x40-0x80($ctx),$C
	vmovdqu	0x60-0x80($ctx),$D
	vmovdqu	0x80-0x80($ctx),$E
	vmovdqu	0xa0-0x80($ctx),$F
	vmovdqu	0xc0-0x80($ctx),$G
	vmovdqu	0xe0-0x80($ctx),$H
	vmovdqu	.Lpbswap(%rip),$Xn
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	vpxor	$B,$C,$bxc			# magic seed
___
# Rounds 0..15 and the three 16-round scheduled passes share the same
# generators as the xmm path; only the register width differs.
for($i=0;$i<16;$i++)	{ &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	vmovdqu	`&Xi_off($i)`,$Xi
	mov	\$3,%ecx
	jmp	.Loop_16_xx_avx2
.align	32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)	{ &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	dec	%ecx
	jnz	.Loop_16_xx_avx2

	mov	\$1,%ecx
	lea	`$REG_SZ*16`(%rsp),%rbx
	lea	K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
	cmp	`4*$i`(%rbx),%ecx		# examine counters
	cmovge	$Tbl,@ptr[$i]			# cancel input
___
}
# Masked merge of the block result into the context, as in the xmm path.
$code.=<<___;
	vmovdqa	(%rbx),$sigma			# pull counters
	vpxor	$t1,$t1,$t1
	vmovdqa	$sigma,$Xn
	vpcmpgtd $t1,$Xn,$Xn			# mask value
	vpaddd	$Xn,$sigma,$sigma		# counters--

	vmovdqu	0x00-0x80($ctx),$t1
	vpand	$Xn,$A,$A
	vmovdqu	0x20-0x80($ctx),$t2
	vpand	$Xn,$B,$B
	vmovdqu	0x40-0x80($ctx),$t3
	vpand	$Xn,$C,$C
	vmovdqu	0x60-0x80($ctx),$Xi
	vpand	$Xn,$D,$D
	vpaddd	$t1,$A,$A
	vmovdqu	0x80-0x80($ctx),$t1
	vpand	$Xn,$E,$E
	vpaddd	$t2,$B,$B
	vmovdqu	0xa0-0x80($ctx),$t2
	vpand	$Xn,$F,$F
	vpaddd	$t3,$C,$C
	vmovdqu	0xc0-0x80($ctx),$t3
	vpand	$Xn,$G,$G
	vpaddd	$Xi,$D,$D
	vmovdqu	0xe0-0x80($ctx),$Xi
	vpand	$Xn,$H,$H
	vpaddd	$t1,$E,$E
	vpaddd	$t2,$F,$F
	vmovdqu	$A,0x00-0x80($ctx)
	vpaddd	$t3,$G,$G
	vmovdqu	$B,0x20-0x80($ctx)
	vpaddd	$Xi,$H,$H
	vmovdqu	$C,0x40-0x80($ctx)
	vmovdqu	$D,0x60-0x80($ctx)
	vmovdqu	$E,0x80-0x80($ctx)
	vmovdqu	$F,0xa0-0x80($ctx)
	vmovdqu	$G,0xc0-0x80($ctx)
	vmovdqu	$H,0xe0-0x80($ctx)

	vmovdqu	$sigma,(%rbx)			# save counters
	lea	256+128(%rsp),%rbx
	vmovdqu	.Lpbswap(%rip),$Xn
	dec	$num
	jnz	.Loop_avx2

	#mov	`$REG_SZ*17+8`(%rsp),$num
	#lea	$REG_SZ($ctx),$ctx
	#lea	`16*$REG_SZ/4`($inp),$inp
	#dec	$num
	#jnz	.Loop_grande_avx2

.Ldone_avx2:
	mov	`$REG_SZ*17`(%rsp),%rax		# orignal %rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp
.Lepilogue_avx2:
	ret
.size	sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
						}	}}}
# Constant data: K256 holds each SHA-256 round constant replicated 8 times
# (one copy per SIMD lane; the xmm code uses the low 4).  The table is
# 256-byte aligned and the code addresses it at K256+128 so that all round
# offsets fit in signed 8-bit displacements.
$code.=<<___;
.align	256
K256:
___
# Append one K256 entry per argument: the 32-bit constant repeated 8 times.
sub TABLE {
    foreach (@_) {
	$code.=<<___;
	.long	$_,$_,$_,$_
	.long	$_,$_,$_,$_
___
    }
}
&TABLE(	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
	0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
# .Lpbswap is the vpshufb byte-swap mask (big-endian load); K256_shaext is
# the plain one-copy-per-constant table used by the SHA-extension code path.
$code.=<<___;
.Lpbswap:
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
	.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap
K256_shaext:
	.long	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
	.long	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
	.long	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
	.long	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
	.long	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
	.long	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
	.long	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
	.long	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
	.long	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
	.long	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
	.long	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
	.long	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
	.long	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
	.long	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
	.long	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
	.long	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
	.asciz	"SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___
1320
# Win64 structured-exception-handling support: hand-written unwind handlers
# that restore the non-volatile registers spilled by the prologues above.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# Handler for the non-AVX2 entry points: if the fault lies between the
# .Lbody*/.Lepilogue* labels passed in HandlerData[], recover the original
# %rsp saved at 16*17 off the aligned frame, then restore %rbx/%rbp and
# the ten saved %xmm registers into the CONTEXT before resuming the unwind
# via RtlVirtualUnwind.
$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lbody
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lin_prologue

	mov	`16*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp

	lea	-24-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
# Handler for sha256_multi_block_avx2: same structure as se_handler, but the
# AVX2 prologue additionally pushes %r12-%r15 and uses a $REG_SZ==32 frame,
# so six GPRs are recovered and the saved %rsp lives at 32*17 off the
# aligned frame.  Fix: the saved stack pointer must be read through the
# faulting frame's %rsp (just loaded into %rax from context->Rsp), not at
# offset 32*17 inside the CONTEXT record itself — the latter would read a
# stale XMM slot of the CONTEXT and derail the unwind (cf. se_handler's
# `16*17`(%rax) above).
$code.=<<___ if ($avx>1);
.type	avx2_handler,\@abi-omnipotent
.align	16
avx2_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<body label
	jb	.Lin_prologue

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue

	mov	`32*17`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore cotnext->R12
	mov	%r13,224($context)	# restore cotnext->R13
	mov	%r14,232($context)	# restore cotnext->R14
	mov	%r15,240($context)	# restore cotnext->R15

	lea	-56-10*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lin_prologue
.size	avx2_handler,.-avx2_handler
___
# .pdata: one begin/end/unwind-info triple per exported entry point.
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_sha256_multi_block
	.rva	.LSEH_end_sha256_multi_block
	.rva	.LSEH_info_sha256_multi_block
	.rva	.LSEH_begin_sha256_multi_block_shaext
	.rva	.LSEH_end_sha256_multi_block_shaext
	.rva	.LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha256_multi_block_avx
	.rva	.LSEH_end_sha256_multi_block_avx
	.rva	.LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha256_multi_block_avx2
	.rva	.LSEH_end_sha256_multi_block_avx2
	.rva	.LSEH_info_sha256_multi_block_avx2
___
# .xdata: UNW_FLAG_EHANDLER records (.byte 9,0,0,0) routing each function
# to its handler, with body/epilogue labels as HandlerData[].
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha256_multi_block:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody,.Lepilogue			# HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_shaext,.Lepilogue_shaext		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbody_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
	.byte	9,0,0,0
	.rva	avx2_handler
	.rva	.Lbody_avx2,.Lepilogue_avx2		# HandlerData[]
___
}
1516 ####################################################################
1517
# Prepend a REX prefix byte to the opcode list (an array ref) when either
# register index is >= 8: 0x04 is REX.R (extends the ModR/M reg field, the
# destination here) and 0x01 is REX.B (extends the r/m field, the source).
# No byte is emitted when neither register needs extension.
# (Rewritten from `local *opcode=shift` typeglob aliasing to a plain array
# ref — same interface, no symbol-table games.)
sub rex {
  my ($opcode,$dst,$src)=@_;	# $opcode: array ref to prepend to
  my $rex=0;

    $rex|=0x04			if ($dst>=8);	# REX.R
    $rex|=0x01			if ($src>=8);	# REX.B
    unshift @$opcode,$rex|0x40	if ($rex);
}

# Assemble a SHA-NI instruction ("sha256rnds2"/"sha256msg1"/"sha256msg2"
# with "%xmmN,%xmmM" operands) into an explicit ".byte" sequence so the
# output works with assemblers that predate the SHA extensions.  Anything
# else is passed through verbatim as "instr\targs".
# Fix: use $_[0] (scalar element) instead of the one-element slice @_[0],
# which triggers a warning under "use warnings" — behavior is otherwise
# identical.
sub sha256op38 {
    my ($instr,$args)=($_[0],$_[1]);
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $args =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);			# two-byte 0F 38 escape
	rex(\@opcode,$2,$1);			# $2 is dst (reg), $1 is src (r/m)
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M, register-direct
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$args;
    }
}
1545
# Post-process and emit the generated assembly line by line:
#  - evaluate any remaining `...` Perl expressions;
#  - encode SHA-NI mnemonics as .byte sequences for old assemblers;
#  - for instructions that must take xmm operands even in the AVX2 build
#    (moves, insert/extract, broadcast, vinserti128), rewrite the %ymm
#    register names the generators produced into their %xmm forms.
# The `or` chain ensures at most one rewrite rule fires per line.
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo		or

	s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go		or
	s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go	or
	s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go	or
	s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go		or
	s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

	print $_,"\n";
}

# STDOUT is a write handle; buffered write errors only surface at close,
# so a failed close must not be ignored.
close STDOUT or die "error closing STDOUT: $!";