#! /usr/bin/env perl
# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# Multi-buffer SHA256 procedure processes n buffers in parallel by
# placing each buffer's data in a designated lane of a SIMD register.
# n is naturally limited to 4 on pre-AVX2 processors and to 8 on
# AVX2-capable processors such as Haswell.
#
#               this    +aesni(i)       sha256  aesni-sha256    gain(iv)
# -------------------------------------------------------------------
# Westmere(ii)  23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
# Atom(ii)      38.7/n  +3.93=13.6(n=4) 20.8    +5.69=26.5      +95%
# Sandy Bridge  (20.5   +5.15=25.7)/n   11.6    13.0            +103%
# Ivy Bridge    (20.4   +5.14=25.5)/n   10.3    11.6            +82%
# Haswell(iii)  (21.0   +5.00=26.0)/n   7.80    8.79            +170%
# Skylake       (18.9   +5.00=23.9)/n   7.70    8.17            +170%
# Bulldozer     (21.6   +5.76=27.4)/n   13.6    13.7            +100%
#
# (i)   multi-block CBC encrypt with 128-bit key;
# (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
#       because of lower AES-NI instruction throughput, nor is there
#       an AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data; the
#       result for n=4 is 20.3+4.44=24.7;
# (iv)  the improvement coefficients presented are asymptotic limits
#       and in real-life applications are somewhat lower, e.g. for 2KB
#       fragments they range from 75% to 130% (on Haswell);

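# Data layout (an informal sketch): with n=4, the j-th 32-bit message
# word of every input buffer occupies one lane of an XMM register, so a
# single SIMD instruction advances all four hash computations at once:
#
#	Xi = { W[j](inp[0]), W[j](inp[1]), W[j](inp[2]), W[j](inp[3]) }
#
# The eight state variables A..H are held in eight SIMD registers with
# the same per-buffer lane assignment.
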
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# void sha256_multi_block (
#     struct {  unsigned int A[8];
#               unsigned int B[8];
#               unsigned int C[8];
#               unsigned int D[8];
#               unsigned int E[8];
#               unsigned int F[8];
#               unsigned int G[8];
#               unsigned int H[8];      } *ctx,
#     struct {  void *ptr; int blocks;  } inp[8],
#     int num);         /* 1 or 2 */
#
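# num is not a buffer count: judging from the code below, each pass of
# the "grande" loop consumes a group of 4 (SSE/AVX) or 8 (AVX2) inp[]
# descriptors and the matching lanes of *ctx, after which $ctx and $inp
# are advanced and num is decremented; num therefore selects how many
# such groups are processed, 1 or 2 as noted above. Buffers with
# blocks<=0 have their input pointer "cancelled" (redirected to scratch
# memory) and their stored state is left untouched.
#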
$ctx="%rdi";    # 1st arg
$inp="%rsi";    # 2nd arg
$num="%edx";    # 3rd arg
@ptr=map("%r$_",(8..11));
$Tbl="%rbp";

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%xmm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%xmm$_",(0..7));

$REG_SZ=16;

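# Xi_off maps a message-schedule index to its slot in a 16-entry ring
# buffer on the stack. %rax points 128 bytes into the buffer so that
# all slots are reachable with signed 8-bit displacements; with
# $REG_SZ=32 (AVX2) slots 8..15 spill past the first 256 bytes and are
# reached via %rbx instead.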
sub Xi_off {
my $off = shift;

    $off %= 16; $off *= $REG_SZ;
    $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
}

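# Each call emits one SHA-256 round, carried out in all lanes at once:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	d += T1; h = T1 + T2	(followed by renaming the variables)
#
# Maj(a,b,c) is computed as Ch(a^b,c,b) = ((a^b)&(b^c))^b, which allows
# this round's b^c to be recycled as the next round's a^b; hence the
# swap of $axb and $bxc at the bottom of the subroutine.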
sub ROUND_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15);
        movd            `4*$i`(@ptr[0]),$Xi
        movd            `4*$i`(@ptr[1]),$t1
        movd            `4*$i`(@ptr[2]),$t2
        movd            `4*$i`(@ptr[3]),$t3
        punpckldq       $t2,$Xi
        punpckldq       $t3,$t1
        punpckldq       $t1,$Xi
___
$code.=<<___ if ($i==15);
        movd            `4*$i`(@ptr[0]),$Xi
         lea            `16*4`(@ptr[0]),@ptr[0]
        movd            `4*$i`(@ptr[1]),$t1
         lea            `16*4`(@ptr[1]),@ptr[1]
        movd            `4*$i`(@ptr[2]),$t2
         lea            `16*4`(@ptr[2]),@ptr[2]
        movd            `4*$i`(@ptr[3]),$t3
         lea            `16*4`(@ptr[3]),@ptr[3]
        punpckldq       $t2,$Xi
        punpckldq       $t3,$t1
        punpckldq       $t1,$Xi
___
$code.=<<___;
        movdqa  $e,$sigma
        `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==0)`
        movdqa  $e,$t3
        `"pshufb        $Xn,$Xi"                if ($i<=15 && ($i&1)==1)`
        psrld   \$6,$sigma
        movdqa  $e,$t2
        pslld   \$7,$t3
        movdqa  $Xi,`&Xi_off($i)`
         paddd  $h,$Xi                          # Xi+=h

        psrld   \$11,$t2
        pxor    $t3,$sigma
        pslld   \$21-7,$t3
         paddd  `32*($i%8)-128`($Tbl),$Xi       # Xi+=K[round]
        pxor    $t2,$sigma

        psrld   \$25-11,$t2
         movdqa $e,$t1
         `"prefetcht0   63(@ptr[0])"            if ($i==15)`
        pxor    $t3,$sigma
         movdqa $e,$axb                         # borrow $axb
        pslld   \$26-21,$t3
         pandn  $g,$t1
         pand   $f,$axb
        pxor    $t2,$sigma

         `"prefetcht0   63(@ptr[1])"            if ($i==15)`
        movdqa  $a,$t2
        pxor    $t3,$sigma                      # Sigma1(e)
        movdqa  $a,$t3
        psrld   \$2,$t2
        paddd   $sigma,$Xi                      # Xi+=Sigma1(e)
         pxor   $axb,$t1                        # Ch(e,f,g)
         movdqa $b,$axb
        movdqa  $a,$sigma
        pslld   \$10,$t3
         pxor   $a,$axb                         # a^b, b^c in next round

         `"prefetcht0   63(@ptr[2])"            if ($i==15)`
        psrld   \$13,$sigma
        pxor    $t3,$t2
         paddd  $t1,$Xi                         # Xi+=Ch(e,f,g)
        pslld   \$19-10,$t3
         pand   $axb,$bxc
        pxor    $sigma,$t2

         `"prefetcht0   63(@ptr[3])"            if ($i==15)`
        psrld   \$22-13,$sigma
        pxor    $t3,$t2
         movdqa $b,$h
        pslld   \$30-19,$t3
        pxor    $t2,$sigma
         pxor   $bxc,$h                         # h=Maj(a,b,c)=Ch(a^b,c,b)
         paddd  $Xi,$d                          # d+=Xi
        pxor    $t3,$sigma                      # Sigma0(a)

        paddd   $Xi,$h                          # h+=Xi
        paddd   $sigma,$h                       # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
        lea     `32*8`($Tbl),$Tbl
___
        ($axb,$bxc)=($bxc,$axb);
}

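# Rounds 16..63 first extend the message schedule,
#
#	X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
#
# which in the ring-buffer indexing used here reads X[i+16] = X[i] +
# sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14]), and then fall through to
# ROUND_00_15.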
sub ROUND_16_XX {
my $i=shift;

$code.=<<___;
        movdqa  `&Xi_off($i+1)`,$Xn
        paddd   `&Xi_off($i+9)`,$Xi             # Xi+=X[i+9]

        movdqa  $Xn,$sigma
        movdqa  $Xn,$t2
        psrld   \$3,$sigma
        movdqa  $Xn,$t3

        psrld   \$7,$t2
        movdqa  `&Xi_off($i+14)`,$t1
        pslld   \$14,$t3
        pxor    $t2,$sigma
        psrld   \$18-7,$t2
        movdqa  $t1,$axb                        # borrow $axb
        pxor    $t3,$sigma
        pslld   \$25-14,$t3
        pxor    $t2,$sigma
        psrld   \$10,$t1
        movdqa  $axb,$t2

        psrld   \$17,$axb
        pxor    $t3,$sigma                      # sigma0(X[i+1])
        pslld   \$13,$t2
         paddd  $sigma,$Xi                      # Xi+=sigma0(X[i+1])
        pxor    $axb,$t1
        psrld   \$19-17,$axb
        pxor    $t2,$t1
        pslld   \$15-13,$t2
        pxor    $axb,$t1
        pxor    $t2,$t1                         # sigma1(X[i+14])
        paddd   $t1,$Xi                         # Xi+=sigma1(X[i+14])
___
        &ROUND_00_15($i,@_);
        ($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.text

.extern OPENSSL_ia32cap_P

.globl  sha256_multi_block
.type   sha256_multi_block,\@function,3
.align  32
sha256_multi_block:
        mov     OPENSSL_ia32cap_P+4(%rip),%rcx
        bt      \$61,%rcx                       # check SHA bit
        jc      _shaext_shortcut
___
$code.=<<___ if ($avx);
        test    \$`1<<28`,%ecx
        jnz     _avx_shortcut
___
$code.=<<___;
        mov     %rsp,%rax
        push    %rbx
        push    %rbp
___
$code.=<<___ if ($win64);
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,-0x78(%rax)
        movaps  %xmm11,-0x68(%rax)
        movaps  %xmm12,-0x58(%rax)
        movaps  %xmm13,-0x48(%rax)
        movaps  %xmm14,-0x38(%rax)
        movaps  %xmm15,-0x28(%rax)
___
$code.=<<___;
        sub     \$`$REG_SZ*18`, %rsp
        and     \$-256,%rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
.Lbody:
        lea     K256+128(%rip),$Tbl
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     0x80($ctx),$ctx                 # size optimization

.Loop_grande:
        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
        xor     $num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmp     $num,%ecx
        cmovg   %ecx,$num                       # find maximum
        test    %ecx,%ecx
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        test    $num,$num
        jz      .Ldone

        movdqu  0x00-0x80($ctx),$A              # load context
         lea    128(%rsp),%rax
        movdqu  0x20-0x80($ctx),$B
        movdqu  0x40-0x80($ctx),$C
        movdqu  0x60-0x80($ctx),$D
        movdqu  0x80-0x80($ctx),$E
        movdqu  0xa0-0x80($ctx),$F
        movdqu  0xc0-0x80($ctx),$G
        movdqu  0xe0-0x80($ctx),$H
        movdqu  .Lpbswap(%rip),$Xn
        jmp     .Loop

.align  32
.Loop:
        movdqa  $C,$bxc
        pxor    $B,$bxc                         # magic seed
___
for($i=0;$i<16;$i++)    { &ROUND_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        movdqu  `&Xi_off($i)`,$Xi
        mov     \$3,%ecx
        jmp     .Loop_16_xx
.align  32
.Loop_16_xx:
___
for(;$i<32;$i++)        { &ROUND_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        dec     %ecx
        jnz     .Loop_16_xx

        mov     \$1,%ecx
        lea     K256+128(%rip),$Tbl

        movdqa  (%rbx),$sigma                   # pull counters
        cmp     4*0(%rbx),%ecx                  # examine counters
        pxor    $t1,$t1
        cmovge  $Tbl,@ptr[0]                    # cancel input
        cmp     4*1(%rbx),%ecx
        movdqa  $sigma,$Xn
        cmovge  $Tbl,@ptr[1]
        cmp     4*2(%rbx),%ecx
        pcmpgtd $t1,$Xn                         # mask value
        cmovge  $Tbl,@ptr[2]
        cmp     4*3(%rbx),%ecx
        paddd   $Xn,$sigma                      # counters--
        cmovge  $Tbl,@ptr[3]

        movdqu  0x00-0x80($ctx),$t1
        pand    $Xn,$A
        movdqu  0x20-0x80($ctx),$t2
        pand    $Xn,$B
        movdqu  0x40-0x80($ctx),$t3
        pand    $Xn,$C
        movdqu  0x60-0x80($ctx),$Xi
        pand    $Xn,$D
        paddd   $t1,$A
        movdqu  0x80-0x80($ctx),$t1
        pand    $Xn,$E
        paddd   $t2,$B
        movdqu  0xa0-0x80($ctx),$t2
        pand    $Xn,$F
        paddd   $t3,$C
        movdqu  0xc0-0x80($ctx),$t3
        pand    $Xn,$G
        paddd   $Xi,$D
        movdqu  0xe0-0x80($ctx),$Xi
        pand    $Xn,$H
        paddd   $t1,$E
        paddd   $t2,$F
        movdqu  $A,0x00-0x80($ctx)
        paddd   $t3,$G
        movdqu  $B,0x20-0x80($ctx)
        paddd   $Xi,$H
        movdqu  $C,0x40-0x80($ctx)
        movdqu  $D,0x60-0x80($ctx)
        movdqu  $E,0x80-0x80($ctx)
        movdqu  $F,0xa0-0x80($ctx)
        movdqu  $G,0xc0-0x80($ctx)
        movdqu  $H,0xe0-0x80($ctx)

        movdqa  $sigma,(%rbx)                   # save counters
        movdqa  .Lpbswap(%rip),$Xn
        dec     $num
        jnz     .Loop

        mov     `$REG_SZ*17+8`(%rsp),$num
        lea     $REG_SZ($ctx),$ctx
        lea     `16*$REG_SZ/4`($inp),$inp
        dec     $num
        jnz     .Loop_grande

.Ldone:
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
___
$code.=<<___ if ($win64);
        movaps  -0xb8(%rax),%xmm6
        movaps  -0xa8(%rax),%xmm7
        movaps  -0x98(%rax),%xmm8
        movaps  -0x88(%rax),%xmm9
        movaps  -0x78(%rax),%xmm10
        movaps  -0x68(%rax),%xmm11
        movaps  -0x58(%rax),%xmm12
        movaps  -0x48(%rax),%xmm13
        movaps  -0x38(%rax),%xmm14
        movaps  -0x28(%rax),%xmm15
___
$code.=<<___;
        mov     -16(%rax),%rbp
        mov     -8(%rax),%rbx
        lea     (%rax),%rsp
.Lepilogue:
        ret
.size   sha256_multi_block,.-sha256_multi_block
___
                                                {{{
my ($Wi,$TMP0,$TMP1,$TMPx,$ABEF0,$CDGH0,$ABEF1,$CDGH1)=map("%xmm$_",(0..3,12..15));
my @MSG0=map("%xmm$_",(4..7));
my @MSG1=map("%xmm$_",(8..11));

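# The SHA-extension path interleaves two independent buffers. Buffer 0's
# state lives in the $ABEF0/$CDGH0 pair and buffer 1's in $ABEF1/$CDGH1,
# in the word order sha256rnds2 expects. Each sha256rnds2 performs two
# rounds, taking the two pre-added W+K words from the low quadword of
# the implicit operand %xmm0 ($Wi); the pshufd \$0x0e shuffles move the
# upper pair of W+K words down for the following two rounds.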
$code.=<<___;
.type   sha256_multi_block_shaext,\@function,3
.align  32
sha256_multi_block_shaext:
_shaext_shortcut:
        mov     %rsp,%rax
        push    %rbx
        push    %rbp
___
$code.=<<___ if ($win64);
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,-0x78(%rax)
        movaps  %xmm11,-0x68(%rax)
        movaps  %xmm12,-0x58(%rax)
        movaps  %xmm13,-0x48(%rax)
        movaps  %xmm14,-0x38(%rax)
        movaps  %xmm15,-0x28(%rax)
___
$code.=<<___;
        sub     \$`$REG_SZ*18`,%rsp
        shl     \$1,$num                        # we process a pair at a time
        and     \$-256,%rsp
        lea     0x80($ctx),$ctx                 # size optimization
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
.Lbody_shaext:
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     K256_shaext+0x80(%rip),$Tbl

.Loop_grande_shaext:
        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
        xor     $num,$num
___
for($i=0;$i<2;$i++) {
    $code.=<<___;
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmp     $num,%ecx
        cmovg   %ecx,$num                       # find maximum
        test    %ecx,%ecx
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  %rsp,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        test    $num,$num
        jz      .Ldone_shaext

        movq            0x00-0x80($ctx),$ABEF0          # A1.A0
        movq            0x20-0x80($ctx),@MSG0[0]        # B1.B0
        movq            0x40-0x80($ctx),$CDGH0          # C1.C0
        movq            0x60-0x80($ctx),@MSG0[1]        # D1.D0
        movq            0x80-0x80($ctx),@MSG1[0]        # E1.E0
        movq            0xa0-0x80($ctx),@MSG1[1]        # F1.F0
        movq            0xc0-0x80($ctx),@MSG1[2]        # G1.G0
        movq            0xe0-0x80($ctx),@MSG1[3]        # H1.H0

        punpckldq       @MSG0[0],$ABEF0                 # B1.A1.B0.A0
        punpckldq       @MSG0[1],$CDGH0                 # D1.C1.D0.C0
        punpckldq       @MSG1[1],@MSG1[0]               # F1.E1.F0.E0
        punpckldq       @MSG1[3],@MSG1[2]               # H1.G1.H0.G0
        movdqa          K256_shaext-0x10(%rip),$TMPx    # byte swap

        movdqa          $ABEF0,$ABEF1
        movdqa          $CDGH0,$CDGH1
        punpcklqdq      @MSG1[0],$ABEF0                 # F0.E0.B0.A0
        punpcklqdq      @MSG1[2],$CDGH0                 # H0.G0.D0.C0
        punpckhqdq      @MSG1[0],$ABEF1                 # F1.E1.B1.A1
        punpckhqdq      @MSG1[2],$CDGH1                 # H1.G1.D1.C1

        pshufd          \$0b00011011,$ABEF0,$ABEF0
        pshufd          \$0b00011011,$CDGH0,$CDGH0
        pshufd          \$0b00011011,$ABEF1,$ABEF1
        pshufd          \$0b00011011,$CDGH1,$CDGH1
        jmp             .Loop_shaext

.align  32
.Loop_shaext:
        movdqu          0x00(@ptr[0]),@MSG0[0]
         movdqu         0x00(@ptr[1]),@MSG1[0]
        movdqu          0x10(@ptr[0]),@MSG0[1]
         movdqu         0x10(@ptr[1]),@MSG1[1]
        movdqu          0x20(@ptr[0]),@MSG0[2]
        pshufb          $TMPx,@MSG0[0]
         movdqu         0x20(@ptr[1]),@MSG1[2]
         pshufb         $TMPx,@MSG1[0]
        movdqu          0x30(@ptr[0]),@MSG0[3]
        lea             0x40(@ptr[0]),@ptr[0]
         movdqu         0x30(@ptr[1]),@MSG1[3]
         lea            0x40(@ptr[1]),@ptr[1]

        movdqa          0*16-0x80($Tbl),$Wi
        pshufb          $TMPx,@MSG0[1]
        paddd           @MSG0[0],$Wi
        pxor            $ABEF0,@MSG0[0]         # black magic
        movdqa          $Wi,$TMP0
         movdqa         0*16-0x80($Tbl),$TMP1
         pshufb         $TMPx,@MSG1[1]
         paddd          @MSG1[0],$TMP1
        movdqa          $CDGH0,0x50(%rsp)       # offload
        sha256rnds2     $ABEF0,$CDGH0           # 0-3
         pxor           $ABEF1,@MSG1[0]         # black magic
         movdqa         $TMP1,$Wi
         movdqa         $CDGH1,0x70(%rsp)
         sha256rnds2    $ABEF1,$CDGH1           # 0-3
        pshufd          \$0x0e,$TMP0,$Wi
        pxor            $ABEF0,@MSG0[0]         # black magic
        movdqa          $ABEF0,0x40(%rsp)       # offload
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
         pxor           $ABEF1,@MSG1[0]         # black magic
         movdqa         $ABEF1,0x60(%rsp)
        movdqa          1*16-0x80($Tbl),$TMP0
        paddd           @MSG0[1],$TMP0
        pshufb          $TMPx,@MSG0[2]
         sha256rnds2    $CDGH1,$ABEF1

        movdqa          $TMP0,$Wi
         movdqa         1*16-0x80($Tbl),$TMP1
         paddd          @MSG1[1],$TMP1
        sha256rnds2     $ABEF0,$CDGH0           # 4-7
         movdqa         $TMP1,$Wi
        prefetcht0      127(@ptr[0])
        pshufb          $TMPx,@MSG0[3]
         pshufb         $TMPx,@MSG1[2]
         prefetcht0     127(@ptr[1])
         sha256rnds2    $ABEF1,$CDGH1           # 4-7
        pshufd          \$0x0e,$TMP0,$Wi
         pshufb         $TMPx,@MSG1[3]
        sha256msg1      @MSG0[1],@MSG0[0]
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
        movdqa          2*16-0x80($Tbl),$TMP0
        paddd           @MSG0[2],$TMP0
         sha256rnds2    $CDGH1,$ABEF1

        movdqa          $TMP0,$Wi
         movdqa         2*16-0x80($Tbl),$TMP1
         paddd          @MSG1[2],$TMP1
        sha256rnds2     $ABEF0,$CDGH0           # 8-11
         sha256msg1     @MSG1[1],@MSG1[0]
         movdqa         $TMP1,$Wi
        movdqa          @MSG0[3],$TMPx
         sha256rnds2    $ABEF1,$CDGH1           # 8-11
        pshufd          \$0x0e,$TMP0,$Wi
        palignr         \$4,@MSG0[2],$TMPx
        paddd           $TMPx,@MSG0[0]
         movdqa         @MSG1[3],$TMPx
         palignr        \$4,@MSG1[2],$TMPx
        sha256msg1      @MSG0[2],@MSG0[1]
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
        movdqa          3*16-0x80($Tbl),$TMP0
        paddd           @MSG0[3],$TMP0
         sha256rnds2    $CDGH1,$ABEF1
         sha256msg1     @MSG1[2],@MSG1[1]

        movdqa          $TMP0,$Wi
         movdqa         3*16-0x80($Tbl),$TMP1
         paddd          $TMPx,@MSG1[0]
         paddd          @MSG1[3],$TMP1
        sha256msg2      @MSG0[3],@MSG0[0]
        sha256rnds2     $ABEF0,$CDGH0           # 12-15
         movdqa         $TMP1,$Wi
        movdqa          @MSG0[0],$TMPx
        palignr         \$4,@MSG0[3],$TMPx
         sha256rnds2    $ABEF1,$CDGH1           # 12-15
         sha256msg2     @MSG1[3],@MSG1[0]
        pshufd          \$0x0e,$TMP0,$Wi
        paddd           $TMPx,@MSG0[1]
         movdqa         @MSG1[0],$TMPx
         palignr        \$4,@MSG1[3],$TMPx
        sha256msg1      @MSG0[3],@MSG0[2]
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
        movdqa          4*16-0x80($Tbl),$TMP0
        paddd           @MSG0[0],$TMP0
         sha256rnds2    $CDGH1,$ABEF1
         sha256msg1     @MSG1[3],@MSG1[2]
___
for($i=4;$i<16-3;$i++) {
$code.=<<___;
        movdqa          $TMP0,$Wi
         movdqa         $i*16-0x80($Tbl),$TMP1
         paddd          $TMPx,@MSG1[1]
         paddd          @MSG1[0],$TMP1
        sha256msg2      @MSG0[0],@MSG0[1]
        sha256rnds2     $ABEF0,$CDGH0           # 16-19...
         movdqa         $TMP1,$Wi
        movdqa          @MSG0[1],$TMPx
        palignr         \$4,@MSG0[0],$TMPx
         sha256rnds2    $ABEF1,$CDGH1           # 16-19...
         sha256msg2     @MSG1[0],@MSG1[1]
        pshufd          \$0x0e,$TMP0,$Wi
        paddd           $TMPx,@MSG0[2]
         movdqa         @MSG1[1],$TMPx
         palignr        \$4,@MSG1[0],$TMPx
        sha256msg1      @MSG0[0],@MSG0[3]
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
        movdqa          `($i+1)*16`-0x80($Tbl),$TMP0
        paddd           @MSG0[1],$TMP0
         sha256rnds2    $CDGH1,$ABEF1
         sha256msg1     @MSG1[0],@MSG1[3]
___
        push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
}
$code.=<<___;
        movdqa          $TMP0,$Wi
         movdqa         13*16-0x80($Tbl),$TMP1
         paddd          $TMPx,@MSG1[1]
         paddd          @MSG1[0],$TMP1
        sha256msg2      @MSG0[0],@MSG0[1]
        sha256rnds2     $ABEF0,$CDGH0           # 52-55
         movdqa         $TMP1,$Wi
        movdqa          @MSG0[1],$TMPx
        palignr         \$4,@MSG0[0],$TMPx
         sha256rnds2    $ABEF1,$CDGH1           # 52-55
         sha256msg2     @MSG1[0],@MSG1[1]
        pshufd          \$0x0e,$TMP0,$Wi
        paddd           $TMPx,@MSG0[2]
         movdqa         @MSG1[1],$TMPx
         palignr        \$4,@MSG1[0],$TMPx
        nop
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
        movdqa          14*16-0x80($Tbl),$TMP0
        paddd           @MSG0[1],$TMP0
         sha256rnds2    $CDGH1,$ABEF1

        movdqa          $TMP0,$Wi
         movdqa         14*16-0x80($Tbl),$TMP1
         paddd          $TMPx,@MSG1[2]
         paddd          @MSG1[1],$TMP1
        sha256msg2      @MSG0[1],@MSG0[2]
        nop
        sha256rnds2     $ABEF0,$CDGH0           # 56-59
         movdqa         $TMP1,$Wi
          mov           \$1,%ecx
          pxor          @MSG0[1],@MSG0[1]       # zero
         sha256rnds2    $ABEF1,$CDGH1           # 56-59
         sha256msg2     @MSG1[1],@MSG1[2]
        pshufd          \$0x0e,$TMP0,$Wi
        movdqa          15*16-0x80($Tbl),$TMP0
        paddd           @MSG0[2],$TMP0
          movq          (%rbx),@MSG0[2]         # pull counters
          nop
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
         movdqa         15*16-0x80($Tbl),$TMP1
         paddd          @MSG1[2],$TMP1
         sha256rnds2    $CDGH1,$ABEF1

        movdqa          $TMP0,$Wi
          cmp           4*0(%rbx),%ecx          # examine counters
          cmovge        %rsp,@ptr[0]            # cancel input
          cmp           4*1(%rbx),%ecx
          cmovge        %rsp,@ptr[1]
          pshufd        \$0x00,@MSG0[2],@MSG1[0]
        sha256rnds2     $ABEF0,$CDGH0           # 60-63
         movdqa         $TMP1,$Wi
          pshufd        \$0x55,@MSG0[2],@MSG1[1]
          movdqa        @MSG0[2],@MSG1[2]
         sha256rnds2    $ABEF1,$CDGH1           # 60-63
        pshufd          \$0x0e,$TMP0,$Wi
          pcmpgtd       @MSG0[1],@MSG1[0]
          pcmpgtd       @MSG0[1],@MSG1[1]
        sha256rnds2     $CDGH0,$ABEF0
         pshufd         \$0x0e,$TMP1,$Wi
          pcmpgtd       @MSG0[1],@MSG1[2]       # counter mask
          movdqa        K256_shaext-0x10(%rip),$TMPx
         sha256rnds2    $CDGH1,$ABEF1

        pand            @MSG1[0],$CDGH0
         pand           @MSG1[1],$CDGH1
        pand            @MSG1[0],$ABEF0
         pand           @MSG1[1],$ABEF1
        paddd           @MSG0[2],@MSG1[2]       # counters--

        paddd           0x50(%rsp),$CDGH0
         paddd          0x70(%rsp),$CDGH1
        paddd           0x40(%rsp),$ABEF0
         paddd          0x60(%rsp),$ABEF1

        movq            @MSG1[2],(%rbx)         # save counters
        dec             $num
        jnz             .Loop_shaext

        mov             `$REG_SZ*17+8`(%rsp),$num

        pshufd          \$0b00011011,$ABEF0,$ABEF0
        pshufd          \$0b00011011,$CDGH0,$CDGH0
        pshufd          \$0b00011011,$ABEF1,$ABEF1
        pshufd          \$0b00011011,$CDGH1,$CDGH1

        movdqa          $ABEF0,@MSG0[0]
        movdqa          $CDGH0,@MSG0[1]
        punpckldq       $ABEF1,$ABEF0                   # B1.B0.A1.A0
        punpckhdq       $ABEF1,@MSG0[0]                 # F1.F0.E1.E0
        punpckldq       $CDGH1,$CDGH0                   # D1.D0.C1.C0
        punpckhdq       $CDGH1,@MSG0[1]                 # H1.H0.G1.G0

        movq            $ABEF0,0x00-0x80($ctx)          # A1.A0
        psrldq          \$8,$ABEF0
        movq            @MSG0[0],0x80-0x80($ctx)        # E1.E0
        psrldq          \$8,@MSG0[0]
        movq            $ABEF0,0x20-0x80($ctx)          # B1.B0
        movq            @MSG0[0],0xa0-0x80($ctx)        # F1.F0

        movq            $CDGH0,0x40-0x80($ctx)          # C1.C0
        psrldq          \$8,$CDGH0
        movq            @MSG0[1],0xc0-0x80($ctx)        # G1.G0
        psrldq          \$8,@MSG0[1]
        movq            $CDGH0,0x60-0x80($ctx)          # D1.D0
        movq            @MSG0[1],0xe0-0x80($ctx)        # H1.H0

        lea     `$REG_SZ/2`($ctx),$ctx
        lea     `16*2`($inp),$inp
        dec     $num
        jnz     .Loop_grande_shaext

.Ldone_shaext:
        #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
___
$code.=<<___ if ($win64);
        movaps  -0xb8(%rax),%xmm6
        movaps  -0xa8(%rax),%xmm7
        movaps  -0x98(%rax),%xmm8
        movaps  -0x88(%rax),%xmm9
        movaps  -0x78(%rax),%xmm10
        movaps  -0x68(%rax),%xmm11
        movaps  -0x58(%rax),%xmm12
        movaps  -0x48(%rax),%xmm13
        movaps  -0x38(%rax),%xmm14
        movaps  -0x28(%rax),%xmm15
___
$code.=<<___;
        mov     -16(%rax),%rbp
        mov     -8(%rax),%rbx
        lea     (%rax),%rsp
.Lepilogue_shaext:
        ret
.size   sha256_multi_block_shaext,.-sha256_multi_block_shaext
___
                                                }}}
                                                if ($avx) {{{
sub ROUND_00_15_avx {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

$code.=<<___ if ($i<15 && $REG_SZ==16);
        vmovd           `4*$i`(@ptr[0]),$Xi
        vmovd           `4*$i`(@ptr[1]),$t1
        vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
        vpunpckldq      $t1,$Xi,$Xi
        vpshufb         $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==16);
        vmovd           `4*$i`(@ptr[0]),$Xi
         lea            `16*4`(@ptr[0]),@ptr[0]
        vmovd           `4*$i`(@ptr[1]),$t1
         lea            `16*4`(@ptr[1]),@ptr[1]
        vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
         lea            `16*4`(@ptr[2]),@ptr[2]
        vpinsrd         \$1,`4*$i`(@ptr[3]),$t1,$t1
         lea            `16*4`(@ptr[3]),@ptr[3]
        vpunpckldq      $t1,$Xi,$Xi
        vpshufb         $Xn,$Xi,$Xi
___
$code.=<<___ if ($i<15 && $REG_SZ==32);
        vmovd           `4*$i`(@ptr[0]),$Xi
        vmovd           `4*$i`(@ptr[4]),$t1
        vmovd           `4*$i`(@ptr[1]),$t2
        vmovd           `4*$i`(@ptr[5]),$t3
        vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
        vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
        vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
        vpunpckldq      $t2,$Xi,$Xi
        vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
        vpunpckldq      $t3,$t1,$t1
        vinserti128     $t1,$Xi,$Xi
        vpshufb         $Xn,$Xi,$Xi
___
$code.=<<___ if ($i==15 && $REG_SZ==32);
        vmovd           `4*$i`(@ptr[0]),$Xi
         lea            `16*4`(@ptr[0]),@ptr[0]
        vmovd           `4*$i`(@ptr[4]),$t1
         lea            `16*4`(@ptr[4]),@ptr[4]
        vmovd           `4*$i`(@ptr[1]),$t2
         lea            `16*4`(@ptr[1]),@ptr[1]
        vmovd           `4*$i`(@ptr[5]),$t3
         lea            `16*4`(@ptr[5]),@ptr[5]
        vpinsrd         \$1,`4*$i`(@ptr[2]),$Xi,$Xi
         lea            `16*4`(@ptr[2]),@ptr[2]
        vpinsrd         \$1,`4*$i`(@ptr[6]),$t1,$t1
         lea            `16*4`(@ptr[6]),@ptr[6]
        vpinsrd         \$1,`4*$i`(@ptr[3]),$t2,$t2
         lea            `16*4`(@ptr[3]),@ptr[3]
        vpunpckldq      $t2,$Xi,$Xi
        vpinsrd         \$1,`4*$i`(@ptr[7]),$t3,$t3
         lea            `16*4`(@ptr[7]),@ptr[7]
        vpunpckldq      $t3,$t1,$t1
        vinserti128     $t1,$Xi,$Xi
        vpshufb         $Xn,$Xi,$Xi
___
$code.=<<___;
        vpsrld  \$6,$e,$sigma
        vpslld  \$26,$e,$t3
        vmovdqu $Xi,`&Xi_off($i)`
         vpaddd $h,$Xi,$Xi                      # Xi+=h

        vpsrld  \$11,$e,$t2
        vpxor   $t3,$sigma,$sigma
        vpslld  \$21,$e,$t3
         vpaddd `32*($i%8)-128`($Tbl),$Xi,$Xi   # Xi+=K[round]
        vpxor   $t2,$sigma,$sigma

        vpsrld  \$25,$e,$t2
        vpxor   $t3,$sigma,$sigma
         `"prefetcht0   63(@ptr[0])"            if ($i==15)`
        vpslld  \$7,$e,$t3
         vpandn $g,$e,$t1
         vpand  $f,$e,$axb                      # borrow $axb
         `"prefetcht0   63(@ptr[1])"            if ($i==15)`
        vpxor   $t2,$sigma,$sigma

        vpsrld  \$2,$a,$h                       # borrow $h
        vpxor   $t3,$sigma,$sigma               # Sigma1(e)
         `"prefetcht0   63(@ptr[2])"            if ($i==15)`
        vpslld  \$30,$a,$t2
         vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
         vpxor  $a,$b,$axb                      # a^b, b^c in next round
         `"prefetcht0   63(@ptr[3])"            if ($i==15)`
        vpxor   $t2,$h,$h
        vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)

        vpsrld  \$13,$a,$t2
         `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
        vpslld  \$19,$a,$t3
         vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
         vpand  $axb,$bxc,$bxc
         `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
        vpxor   $t2,$h,$sigma

        vpsrld  \$22,$a,$t2
        vpxor   $t3,$sigma,$sigma
         `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
        vpslld  \$10,$a,$t3
         vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
         vpaddd $Xi,$d,$d                       # d+=Xi
         `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma               # Sigma0(a)

        vpaddd  $Xi,$h,$h                       # h+=Xi
        vpaddd  $sigma,$h,$h                    # h+=Sigma0(a)
___
$code.=<<___ if (($i%8)==7);
        add     \$`32*8`,$Tbl
___
        ($axb,$bxc)=($bxc,$axb);
}

sub ROUND_16_XX_avx {
my $i=shift;

$code.=<<___;
        vmovdqu `&Xi_off($i+1)`,$Xn
        vpaddd  `&Xi_off($i+9)`,$Xi,$Xi         # Xi+=X[i+9]

        vpsrld  \$3,$Xn,$sigma
        vpsrld  \$7,$Xn,$t2
        vpslld  \$25,$Xn,$t3
        vpxor   $t2,$sigma,$sigma
        vpsrld  \$18,$Xn,$t2
        vpxor   $t3,$sigma,$sigma
        vpslld  \$14,$Xn,$t3
        vmovdqu `&Xi_off($i+14)`,$t1
        vpsrld  \$10,$t1,$axb                   # borrow $axb

        vpxor   $t2,$sigma,$sigma
        vpsrld  \$17,$t1,$t2
        vpxor   $t3,$sigma,$sigma               # sigma0(X[i+1])
        vpslld  \$15,$t1,$t3
         vpaddd $sigma,$Xi,$Xi                  # Xi+=sigma0(X[i+1])
        vpxor   $t2,$axb,$sigma
        vpsrld  \$19,$t1,$t2
        vpxor   $t3,$sigma,$sigma
        vpslld  \$13,$t1,$t3
        vpxor   $t2,$sigma,$sigma
        vpxor   $t3,$sigma,$sigma               # sigma1(X[i+14])
        vpaddd  $sigma,$Xi,$Xi                  # Xi+=sigma1(X[i+14])
___
        &ROUND_00_15_avx($i,@_);
        ($Xi,$Xn)=($Xn,$Xi);
}

$code.=<<___;
.type   sha256_multi_block_avx,\@function,3
.align  32
sha256_multi_block_avx:
_avx_shortcut:
___
$code.=<<___ if ($avx>1);
        shr     \$32,%rcx
        cmp     \$2,$num
        jb      .Lavx
        test    \$`1<<5`,%ecx
        jnz     _avx2_shortcut
        jmp     .Lavx
.align  32
.Lavx:
___
$code.=<<___;
        mov     %rsp,%rax
        push    %rbx
        push    %rbp
___
$code.=<<___ if ($win64);
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,-0x78(%rax)
        movaps  %xmm11,-0x68(%rax)
        movaps  %xmm12,-0x58(%rax)
        movaps  %xmm13,-0x48(%rax)
        movaps  %xmm14,-0x38(%rax)
        movaps  %xmm15,-0x28(%rax)
___
$code.=<<___;
        sub     \$`$REG_SZ*18`, %rsp
        and     \$-256,%rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
.Lbody_avx:
        lea     K256+128(%rip),$Tbl
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     0x80($ctx),$ctx                 # size optimization

.Loop_grande_avx:
        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
        xor     $num,$num
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmp     $num,%ecx
        cmovg   %ecx,$num                       # find maximum
        test    %ecx,%ecx
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        test    $num,$num
        jz      .Ldone_avx

        vmovdqu 0x00-0x80($ctx),$A              # load context
         lea    128(%rsp),%rax
        vmovdqu 0x20-0x80($ctx),$B
        vmovdqu 0x40-0x80($ctx),$C
        vmovdqu 0x60-0x80($ctx),$D
        vmovdqu 0x80-0x80($ctx),$E
        vmovdqu 0xa0-0x80($ctx),$F
        vmovdqu 0xc0-0x80($ctx),$G
        vmovdqu 0xe0-0x80($ctx),$H
        vmovdqu .Lpbswap(%rip),$Xn
        jmp     .Loop_avx

.align  32
.Loop_avx:
        vpxor   $B,$C,$bxc                      # magic seed
___
for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        vmovdqu `&Xi_off($i)`,$Xi
        mov     \$3,%ecx
        jmp     .Loop_16_xx_avx
.align  32
.Loop_16_xx_avx:
___
for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        dec     %ecx
        jnz     .Loop_16_xx_avx

        mov     \$1,%ecx
        lea     K256+128(%rip),$Tbl
___
for($i=0;$i<4;$i++) {
    $code.=<<___;
        cmp     `4*$i`(%rbx),%ecx               # examine counters
        cmovge  $Tbl,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        vmovdqa (%rbx),$sigma                   # pull counters
        vpxor   $t1,$t1,$t1
        vmovdqa $sigma,$Xn
        vpcmpgtd $t1,$Xn,$Xn                    # mask value
        vpaddd  $Xn,$sigma,$sigma               # counters--

        vmovdqu 0x00-0x80($ctx),$t1
        vpand   $Xn,$A,$A
        vmovdqu 0x20-0x80($ctx),$t2
        vpand   $Xn,$B,$B
        vmovdqu 0x40-0x80($ctx),$t3
        vpand   $Xn,$C,$C
        vmovdqu 0x60-0x80($ctx),$Xi
        vpand   $Xn,$D,$D
        vpaddd  $t1,$A,$A
        vmovdqu 0x80-0x80($ctx),$t1
        vpand   $Xn,$E,$E
        vpaddd  $t2,$B,$B
        vmovdqu 0xa0-0x80($ctx),$t2
        vpand   $Xn,$F,$F
        vpaddd  $t3,$C,$C
        vmovdqu 0xc0-0x80($ctx),$t3
        vpand   $Xn,$G,$G
        vpaddd  $Xi,$D,$D
        vmovdqu 0xe0-0x80($ctx),$Xi
        vpand   $Xn,$H,$H
        vpaddd  $t1,$E,$E
        vpaddd  $t2,$F,$F
        vmovdqu $A,0x00-0x80($ctx)
        vpaddd  $t3,$G,$G
        vmovdqu $B,0x20-0x80($ctx)
        vpaddd  $Xi,$H,$H
        vmovdqu $C,0x40-0x80($ctx)
        vmovdqu $D,0x60-0x80($ctx)
        vmovdqu $E,0x80-0x80($ctx)
        vmovdqu $F,0xa0-0x80($ctx)
        vmovdqu $G,0xc0-0x80($ctx)
        vmovdqu $H,0xe0-0x80($ctx)

        vmovdqu $sigma,(%rbx)                   # save counters
        vmovdqu .Lpbswap(%rip),$Xn
        dec     $num
        jnz     .Loop_avx

        mov     `$REG_SZ*17+8`(%rsp),$num
        lea     $REG_SZ($ctx),$ctx
        lea     `16*$REG_SZ/4`($inp),$inp
        dec     $num
        jnz     .Loop_grande_avx

.Ldone_avx:
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
        vzeroupper
___
$code.=<<___ if ($win64);
        movaps  -0xb8(%rax),%xmm6
        movaps  -0xa8(%rax),%xmm7
        movaps  -0x98(%rax),%xmm8
        movaps  -0x88(%rax),%xmm9
        movaps  -0x78(%rax),%xmm10
        movaps  -0x68(%rax),%xmm11
        movaps  -0x58(%rax),%xmm12
        movaps  -0x48(%rax),%xmm13
        movaps  -0x38(%rax),%xmm14
        movaps  -0x28(%rax),%xmm15
___
$code.=<<___;
        mov     -16(%rax),%rbp
        mov     -8(%rax),%rbx
        lea     (%rax),%rsp
.Lepilogue_avx:
        ret
.size   sha256_multi_block_avx,.-sha256_multi_block_avx
___
                                                if ($avx>1) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;

$REG_SZ=32;
@ptr=map("%r$_",(12..15,8..11));

@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("%ymm$_",(8..15));
($t1,$t2,$t3,$axb,$bxc,$Xi,$Xn,$sigma)=map("%ymm$_",(0..7));

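# The AVX2 path reuses ROUND_00_15_avx/ROUND_16_XX_avx verbatim, only
# with $REG_SZ=32 and ymm registers, so every instruction now works on
# 8 lanes, one per input buffer. Eight pointer registers are needed,
# which is why %r12-%r15 are saved in the prologue below.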
$code.=<<___;
.type   sha256_multi_block_avx2,\@function,3
.align  32
sha256_multi_block_avx2:
_avx2_shortcut:
        mov     %rsp,%rax
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
___
$code.=<<___ if ($win64);
        lea     -0xa8(%rsp),%rsp
        movaps  %xmm6,(%rsp)
        movaps  %xmm7,0x10(%rsp)
        movaps  %xmm8,0x20(%rsp)
        movaps  %xmm9,0x30(%rsp)
        movaps  %xmm10,0x40(%rsp)
        movaps  %xmm11,0x50(%rsp)
        movaps  %xmm12,-0x78(%rax)
        movaps  %xmm13,-0x68(%rax)
        movaps  %xmm14,-0x58(%rax)
        movaps  %xmm15,-0x48(%rax)
___
$code.=<<___;
        sub     \$`$REG_SZ*18`, %rsp
        and     \$-256,%rsp
        mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
.Lbody_avx2:
        lea     K256+128(%rip),$Tbl
        lea     0x80($ctx),$ctx                 # size optimization

.Loop_grande_avx2:
        mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
        xor     $num,$num
        lea     `$REG_SZ*16`(%rsp),%rbx
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
        mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
        mov     `16*$i+8`($inp),%ecx            # number of blocks
        cmp     $num,%ecx
        cmovg   %ecx,$num                       # find maximum
        test    %ecx,%ecx
        mov     %ecx,`4*$i`(%rbx)               # initialize counters
        cmovle  $Tbl,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        vmovdqu 0x00-0x80($ctx),$A              # load context
         lea    128(%rsp),%rax
        vmovdqu 0x20-0x80($ctx),$B
         lea    256+128(%rsp),%rbx
        vmovdqu 0x40-0x80($ctx),$C
        vmovdqu 0x60-0x80($ctx),$D
        vmovdqu 0x80-0x80($ctx),$E
        vmovdqu 0xa0-0x80($ctx),$F
        vmovdqu 0xc0-0x80($ctx),$G
        vmovdqu 0xe0-0x80($ctx),$H
        vmovdqu .Lpbswap(%rip),$Xn
        jmp     .Loop_avx2

.align  32
.Loop_avx2:
        vpxor   $B,$C,$bxc                      # magic seed
___
for($i=0;$i<16;$i++)    { &ROUND_00_15_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        vmovdqu `&Xi_off($i)`,$Xi
        mov     \$3,%ecx
        jmp     .Loop_16_xx_avx2
.align  32
.Loop_16_xx_avx2:
___
for(;$i<32;$i++)        { &ROUND_16_XX_avx($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        dec     %ecx
        jnz     .Loop_16_xx_avx2

        mov     \$1,%ecx
        lea     `$REG_SZ*16`(%rsp),%rbx
        lea     K256+128(%rip),$Tbl
___
for($i=0;$i<8;$i++) {
    $code.=<<___;
        cmp     `4*$i`(%rbx),%ecx               # examine counters
        cmovge  $Tbl,@ptr[$i]                   # cancel input
___
}
$code.=<<___;
        vmovdqa (%rbx),$sigma                   # pull counters
        vpxor   $t1,$t1,$t1
        vmovdqa $sigma,$Xn
        vpcmpgtd $t1,$Xn,$Xn                    # mask value
        vpaddd  $Xn,$sigma,$sigma               # counters--

        vmovdqu 0x00-0x80($ctx),$t1
        vpand   $Xn,$A,$A
        vmovdqu 0x20-0x80($ctx),$t2
        vpand   $Xn,$B,$B
        vmovdqu 0x40-0x80($ctx),$t3
        vpand   $Xn,$C,$C
        vmovdqu 0x60-0x80($ctx),$Xi
        vpand   $Xn,$D,$D
        vpaddd  $t1,$A,$A
        vmovdqu 0x80-0x80($ctx),$t1
        vpand   $Xn,$E,$E
        vpaddd  $t2,$B,$B
        vmovdqu 0xa0-0x80($ctx),$t2
        vpand   $Xn,$F,$F
        vpaddd  $t3,$C,$C
        vmovdqu 0xc0-0x80($ctx),$t3
        vpand   $Xn,$G,$G
        vpaddd  $Xi,$D,$D
        vmovdqu 0xe0-0x80($ctx),$Xi
        vpand   $Xn,$H,$H
        vpaddd  $t1,$E,$E
        vpaddd  $t2,$F,$F
        vmovdqu $A,0x00-0x80($ctx)
        vpaddd  $t3,$G,$G
        vmovdqu $B,0x20-0x80($ctx)
        vpaddd  $Xi,$H,$H
        vmovdqu $C,0x40-0x80($ctx)
        vmovdqu $D,0x60-0x80($ctx)
        vmovdqu $E,0x80-0x80($ctx)
        vmovdqu $F,0xa0-0x80($ctx)
        vmovdqu $G,0xc0-0x80($ctx)
        vmovdqu $H,0xe0-0x80($ctx)

        vmovdqu $sigma,(%rbx)                   # save counters
        lea     256+128(%rsp),%rbx
        vmovdqu .Lpbswap(%rip),$Xn
        dec     $num
        jnz     .Loop_avx2

        #mov    `$REG_SZ*17+8`(%rsp),$num
        #lea    $REG_SZ($ctx),$ctx
        #lea    `16*$REG_SZ/4`($inp),$inp
        #dec    $num
        #jnz    .Loop_grande_avx2

.Ldone_avx2:
        mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
        vzeroupper
___
$code.=<<___ if ($win64);
        movaps  -0xd8(%rax),%xmm6
        movaps  -0xc8(%rax),%xmm7
        movaps  -0xb8(%rax),%xmm8
        movaps  -0xa8(%rax),%xmm9
        movaps  -0x98(%rax),%xmm10
        movaps  -0x88(%rax),%xmm11
        movaps  -0x78(%rax),%xmm12
        movaps  -0x68(%rax),%xmm13
        movaps  -0x58(%rax),%xmm14
        movaps  -0x48(%rax),%xmm15
___
$code.=<<___;
        mov     -48(%rax),%r15
        mov     -40(%rax),%r14
        mov     -32(%rax),%r13
        mov     -24(%rax),%r12
        mov     -16(%rax),%rbp
        mov     -8(%rax),%rbx
        lea     (%rax),%rsp
.Lepilogue_avx2:
        ret
.size   sha256_multi_block_avx2,.-sha256_multi_block_avx2
___
                                        }       }}}
$code.=<<___;
.align  256
K256:
___
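# Each round constant is replicated eight times (two .long quads), so a
# 16-byte load covers the 4 xmm lanes and a 32-byte load the 8 ymm
# lanes; this matches the 32-byte stride, 32*($i%8)-128($Tbl), used by
# the round subroutines above.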
sub TABLE {
    foreach (@_) {
        $code.=<<___;
        .long   $_,$_,$_,$_
        .long   $_,$_,$_,$_
___
    }
}
&TABLE( 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5,
        0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5,
        0xd807aa98,0x12835b01,0x243185be,0x550c7dc3,
        0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174,
        0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc,
        0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da,
        0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7,
        0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967,
        0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13,
        0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85,
        0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3,
        0xd192e819,0xd6990624,0xf40e3585,0x106aa070,
        0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5,
        0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3,
        0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208,
        0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 );
$code.=<<___;
.Lpbswap:
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
        .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
K256_shaext:
        .long   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
        .long   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
        .long   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
        .long   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
        .long   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
        .long   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
        .long   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
        .long   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
        .long   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
        .long   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
        .long   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
        .long   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
        .long   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
        .long   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
        .long   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
        .long   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
        .asciz  "SHA256 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
___

if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<.Lbody
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=.Lepilogue
        jae     .Lin_prologue

        mov     `16*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp

        lea     -24-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler
___
$code.=<<___ if ($avx>1);
.type   avx2_handler,\@abi-omnipotent
.align  16
avx2_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<body label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        mov     `32*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     -56-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

        jmp     .Lin_prologue
.size   avx2_handler,.-avx2_handler
___
$code.=<<___;
.section        .pdata
.align  4
        .rva    .LSEH_begin_sha256_multi_block
        .rva    .LSEH_end_sha256_multi_block
        .rva    .LSEH_info_sha256_multi_block
        .rva    .LSEH_begin_sha256_multi_block_shaext
        .rva    .LSEH_end_sha256_multi_block_shaext
        .rva    .LSEH_info_sha256_multi_block_shaext
___
$code.=<<___ if ($avx);
        .rva    .LSEH_begin_sha256_multi_block_avx
        .rva    .LSEH_end_sha256_multi_block_avx
        .rva    .LSEH_info_sha256_multi_block_avx
___
$code.=<<___ if ($avx>1);
        .rva    .LSEH_begin_sha256_multi_block_avx2
        .rva    .LSEH_end_sha256_multi_block_avx2
        .rva    .LSEH_info_sha256_multi_block_avx2
___
$code.=<<___;
.section        .xdata
.align  8
.LSEH_info_sha256_multi_block:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody,.Lepilogue                       # HandlerData[]
.LSEH_info_sha256_multi_block_shaext:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody_shaext,.Lepilogue_shaext         # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha256_multi_block_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha256_multi_block_avx2:
        .byte   9,0,0,0
        .rva    avx2_handler
        .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
___
}
####################################################################

sub rex {
  local *opcode=shift;
  my ($dst,$src)=@_;
  my $rex=0;

    $rex|=0x04                  if ($dst>=8);
    $rex|=0x01                  if ($src>=8);
    unshift @opcode,$rex|0x40   if ($rex);
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
                "sha256rnds2" => 0xcb,
                "sha256msg1"  => 0xcc,
                "sha256msg2"  => 0xcd   );

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
        rex(\@opcode,$2,$1);
        push @opcode,$opcodelet{$instr};
        push @opcode,0xc0|($1&7)|(($2&7)<<3);           # ModR/M
        return ".byte\t".join(',',@opcode);
    } else {
        return $instr."\t".$_[0];
    }
}

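# A worked example (illustrative, not emitted by the script itself):
# for "sha256rnds2 %xmm4,%xmm8" the destination index 8 sets REX.R, so
# rex() prepends 0x44, and the ModR/M byte is 0xc0|4|(0<<3) = 0xc4,
# yielding
#
#	.byte	0x44,0x0f,0x38,0xcb,0xc4
#
# The output filter below evaluates the remaining `...` expressions,
# hand-encodes the SHA-NI mnemonics for the benefit of assemblers that
# do not know them, and rewrites ymm operands back to xmm for the
# instruction forms (vmovd, vpinsrd, vinserti128, ...) that only accept
# xmm registers even in the AVX2 code path.
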
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;

        s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo         or

        s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
        s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
        s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
        s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
        s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
        s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

        print $_,"\n";
}

close STDOUT;