2f6b35f3559ca4edadc75bfbfd594c904065894b
[openssl.git] / crypto / sha / asm / sha1-mb-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Multi-buffer SHA1 procedure processes n buffers in parallel by
18 # placing buffer data to designated lane of SIMD register. n is
19 # naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
21 #
22 #               this    +aesni(i)       sha1    aesni-sha1      gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii)  10.7/n  +1.28=3.96(n=4) 5.30    6.66            +68%
25 # Atom(ii)      18.1/n  +3.93=8.46(n=4) 9.37    12.8            +51%
26 # Sandy Bridge  (8.16   +5.15=13.3)/n   4.99    5.98            +80%
27 # Ivy Bridge    (8.08   +5.14=13.2)/n   4.60    5.54            +68%
28 # Haswell(iii)  (8.96   +5.00=14.0)/n   3.57    4.55            +160%
29 # Skylake       (8.70   +5.00=13.7)/n   3.64    4.20            +145%
30 # Bulldozer     (9.76   +5.76=15.5)/n   5.95    6.37            +64%
31 #
32 # (i)   multi-block CBC encrypt with 128-bit key;
33 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 #       because of lower AES-NI instruction throughput;
35 # (iii) "this" is for n=8, when we gather twice as much data, result
36 #       for n=4 is 8.00+4.44=12.4;
37 # (iv)  presented improvement coefficients are asymptotic limits and
38 #       in real-life application are somewhat lower, e.g. for 2KB
39 #       fragments they range from 30% to 100% (on Haswell);
40
# Command line: [flavour] output.  A lone argument that contains a dot is
# taken to be the output file name and the flavour is left undefined.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 prologue/epilogue code is generated for nasm/masm/mingw64 flavours
# or when the output file has an .asm suffix.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the perlasm translator: next to this script, or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# $avx ends up 0, 1 or 2 depending on the assembler version found by the
# probes below; any non-zero value enables the AVX code path further down.
# Each probe only runs if the previous ones left $avx at 0.
$avx=0;

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
                =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
           `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
        $avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
           `ml64 2>&1` =~ /Version ([0-9]+)\./) {
        $avx = ($1>=10) + ($1>=11);
}

# The version pattern accepts multi-digit major numbers (clang 10 and later)
# and the "clang version" banner is matched anywhere in the output, so that
# vendor-prefixed banners are recognised as well.
if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
        $avx = ($2>=3.0) + ($2>3.0);
}

# Everything printed to STDOUT from here on is piped through the translator;
# fail loudly if the translator cannot be started.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
# Register allocation for the generated routine, whose C prototype is
#
#   void sha1_multi_block(
#       struct { unsigned int A[8], B[8], C[8], D[8], E[8]; } *ctx,
#       struct { void *ptr; int blocks; } inp[8],
#       int num);                       /* 1 or 2 */
#
($ctx,$inp,$num) = ("%rdi","%rsi","%edx");      # 1st arg, 2nd arg, num
@ptr = qw(%r8 %r9 %r10 %r11);                   # input pointers, one per lane
$Tbl = "%rbp";                                  # address of K_XX_XX

# Baseline xmm assignment: working variables low, message words high.
@V  = ($A,$B,$C,$D,$E) = map { "%xmm$_" } 0..4;
($t0,$t1,$t2,$t3,$tx)  = map { "%xmm$_" } 5..9;
@Xi = map { "%xmm$_" } 10..14;
$K  = "%xmm15";

if (1) {
    # Atom-specific optimization aiming to eliminate pshufb with high
    # registers [and thus get rid of 48 cycles accumulated penalty]:
    # the message words take the low registers instead.
    @Xi = map { "%xmm$_" } 0..4;
    ($tx,$t0,$t1,$t2,$t3) = map { "%xmm$_" } 5..9;
    @V  = ($A,$B,$C,$D,$E) = map { "%xmm$_" } 10..14;
}

# Bytes per SIMD register; scales the X[] stack slots and the frame size.
$REG_SZ = 16;
105
# Xi_off($index) returns the memory-operand text for message-schedule slot
# $index modulo 16.  Each slot is $REG_SZ bytes wide.  Slots whose byte
# offset is below 256 are addressed relative to %rax, the rest relative to
# %rbx; with $REG_SZ==16 the largest offset is 240, so only the %rax form
# is produced.
sub Xi_off {
    my $slot  = $_[0] % 16;
    my $bytes = $slot * $REG_SZ;

    return $bytes < 256
        ? "$bytes-128(%rax)"
        : "$bytes-256-128(%rbx)";
}
112
# BODY_00_19($i,$a,$b,$c,$d,$e) appends to $code the xmm instruction text
# for one SHA-1 round $i (the caller uses it for rounds 0..19).  $a..$e are
# the register names currently holding the working variables; the caller
# rotates them between rounds.  The round itself is
#     e += K_00_19 + X[i] + Ch(b,c,d) + rol(a,5);  b = rol(b,30)
# and is interleaved with message handling that depends on $i:
#   $i==0    first word of each of the four inputs is loaded, every input
#            pointer is advanced by 64 bytes, and the words are merged
#            into @Xi[0] and byte-swapped with the mask held in $tx;
#   $i<14    word $i+1 is gathered into @Xi[1] and word $i+2 is started;
#   $i==14   as above, plus a prefetch for each input pointer;
#   $i>=15   the message-schedule update is applied to @Xi[1] (here $tx is
#            reused as scratch for the rotate).
# @Xi[0] is also stored to the stack slot named by Xi_off($i).  On return
# the @Xi register list has been rotated by one position.
113 sub BODY_00_19 {
114 my ($i,$a,$b,$c,$d,$e)=@_;
115 my $j=$i+1;
116 my $k=$i+2;
117
118 # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
119 # of 4 words you would expect to be loaded per given iteration one is
120 # spilled to next iteration. In other words indices in four input
121 # streams are distributed as following:
122 #
123 # $i==0:        0,0,0,0,1,1,1,1,2,2,2,
124 # $i==1:        2,3,3,3,
125 # $i==2:        3,4,4,4,
126 # ...
127 # $i==13:       14,15,15,15,
128 # $i==14:       15
129 #
130 # Then at $i==15 Xupdate is applied one iteration in advance...
131 $code.=<<___ if ($i==0);
132         movd            (@ptr[0]),@Xi[0]
133          lea            `16*4`(@ptr[0]),@ptr[0]
134         movd            (@ptr[1]),@Xi[2]        # borrow @Xi[2]
135          lea            `16*4`(@ptr[1]),@ptr[1]
136         movd            (@ptr[2]),@Xi[3]        # borrow @Xi[3]
137          lea            `16*4`(@ptr[2]),@ptr[2]
138         movd            (@ptr[3]),@Xi[4]        # borrow @Xi[4]
139          lea            `16*4`(@ptr[3]),@ptr[3]
140         punpckldq       @Xi[3],@Xi[0]
141          movd           `4*$j-16*4`(@ptr[0]),@Xi[1]
142         punpckldq       @Xi[4],@Xi[2]
143          movd           `4*$j-16*4`(@ptr[1]),$t3
144         punpckldq       @Xi[2],@Xi[0]
145          movd           `4*$j-16*4`(@ptr[2]),$t2
146         pshufb          $tx,@Xi[0]
147 ___
# The input pointers were already advanced by 64 at $i==0, hence the
# -16*4 bias in the load displacements below.
148 $code.=<<___ if ($i<14);                        # just load input
149          movd           `4*$j-16*4`(@ptr[3]),$t1
150          punpckldq      $t2,@Xi[1]
151         movdqa  $a,$t2
152         paddd   $K,$e                           # e+=K_00_19
153          punpckldq      $t1,$t3
154         movdqa  $b,$t1
155         movdqa  $b,$t0
156         pslld   \$5,$t2
157         pandn   $d,$t1
158         pand    $c,$t0
159          punpckldq      $t3,@Xi[1]
160         movdqa  $a,$t3
161
162         movdqa  @Xi[0],`&Xi_off($i)`
163         paddd   @Xi[0],$e                       # e+=X[i]
164          movd           `4*$k-16*4`(@ptr[0]),@Xi[2]
165         psrld   \$27,$t3
166         pxor    $t1,$t0                         # Ch(b,c,d)
167         movdqa  $b,$t1
168
169         por     $t3,$t2                         # rol(a,5)
170          movd           `4*$k-16*4`(@ptr[1]),$t3
171         pslld   \$30,$t1
172         paddd   $t0,$e                          # e+=Ch(b,c,d)
173
174         psrld   \$2,$b
175         paddd   $t2,$e                          # e+=rol(a,5)
176          pshufb $tx,@Xi[1]
177          movd           `4*$k-16*4`(@ptr[2]),$t2
178         por     $t1,$b                          # b=rol(b,30)
179 ___
# Same round as above, but word $i+2 is not started; prefetches of the
# four input pointers are issued instead.
180 $code.=<<___ if ($i==14);                       # just load input
181          movd           `4*$j-16*4`(@ptr[3]),$t1
182          punpckldq      $t2,@Xi[1]
183         movdqa  $a,$t2
184         paddd   $K,$e                           # e+=K_00_19
185          punpckldq      $t1,$t3
186         movdqa  $b,$t1
187         movdqa  $b,$t0
188         pslld   \$5,$t2
189          prefetcht0     63(@ptr[0])
190         pandn   $d,$t1
191         pand    $c,$t0
192          punpckldq      $t3,@Xi[1]
193         movdqa  $a,$t3
194
195         movdqa  @Xi[0],`&Xi_off($i)`
196         paddd   @Xi[0],$e                       # e+=X[i]
197         psrld   \$27,$t3
198         pxor    $t1,$t0                         # Ch(b,c,d)
199         movdqa  $b,$t1
200          prefetcht0     63(@ptr[1])
201
202         por     $t3,$t2                         # rol(a,5)
203         pslld   \$30,$t1
204         paddd   $t0,$e                          # e+=Ch(b,c,d)
205          prefetcht0     63(@ptr[2])
206
207         psrld   \$2,$b
208         paddd   $t2,$e                          # e+=rol(a,5)
209          pshufb $tx,@Xi[1]
210          prefetcht0     63(@ptr[3])
211         por     $t1,$b                          # b=rol(b,30)
212 ___
# Rounds 13 and 14 reload an earlier schedule word from the stack into
# @Xi[3] ahead of the first Xupdate.
213 $code.=<<___ if ($i>=13 && $i<15);
214         movdqa  `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
215 ___
216 $code.=<<___ if ($i>=15);                       # apply Xupdate
217         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
218         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
219
220         movdqa  $a,$t2
221          pxor   `&Xi_off($j+8)`,@Xi[1]
222         paddd   $K,$e                           # e+=K_00_19
223         movdqa  $b,$t1
224         pslld   \$5,$t2
225          pxor   @Xi[3],@Xi[1]
226         movdqa  $b,$t0
227         pandn   $d,$t1
228          movdqa @Xi[1],$tx
229         pand    $c,$t0
230         movdqa  $a,$t3
231          psrld  \$31,$tx
232          paddd  @Xi[1],@Xi[1]
233
234         movdqa  @Xi[0],`&Xi_off($i)`
235         paddd   @Xi[0],$e                       # e+=X[i]
236         psrld   \$27,$t3
237         pxor    $t1,$t0                         # Ch(b,c,d)
238
239         movdqa  $b,$t1
240         por     $t3,$t2                         # rol(a,5)
241         pslld   \$30,$t1
242         paddd   $t0,$e                          # e+=Ch(b,c,d)
243
244         psrld   \$2,$b
245         paddd   $t2,$e                          # e+=rol(a,5)
246          por    $tx,@Xi[1]                      # rol   \$1,@Xi[1]
247         por     $t1,$b                          # b=rol(b,30)
248 ___
# Rotate the schedule registers so that the next round sees the word just
# prepared as @Xi[0].
249 push(@Xi,shift(@Xi));
250 }
251
# BODY_20_39($i,$a,$b,$c,$d,$e) appends to $code the xmm instruction text
# for one SHA-1 round that uses the Parity function; the caller uses it for
# rounds 20..39 and again for 60..79, loading the matching constant into $K
# beforehand.  $a..$e name the registers holding the working variables.
# The round is
#     e += K + X[i] + Parity(b,c,d) + rol(a,5);  b = rol(b,30)
# For $i<79 it is interleaved with the message-schedule update of @Xi[1];
# for $i==79 only the round itself is emitted.  On return the @Xi register
# list has been rotated by one position.
252 sub BODY_20_39 {
253 my ($i,$a,$b,$c,$d,$e)=@_;
254 my $j=$i+1;
255
256 $code.=<<___ if ($i<79);
257         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
258         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
259
260         movdqa  $a,$t2
261         movdqa  $d,$t0
262          pxor   `&Xi_off($j+8)`,@Xi[1]
263         paddd   $K,$e                           # e+=K_20_39
264         pslld   \$5,$t2
265         pxor    $b,$t0
266
267         movdqa  $a,$t3
268 ___
# @Xi[0] is written back to its stack slot only while $i<72 -- presumably
# because no later round reloads the slots written after that; confirm
# before changing the bound.
269 $code.=<<___ if ($i<72);
270         movdqa  @Xi[0],`&Xi_off($i)`
271 ___
272 $code.=<<___ if ($i<79);
273         paddd   @Xi[0],$e                       # e+=X[i]
274          pxor   @Xi[3],@Xi[1]
275         psrld   \$27,$t3
276         pxor    $c,$t0                          # Parity(b,c,d)
277         movdqa  $b,$t1
278
279         pslld   \$30,$t1
280          movdqa @Xi[1],$tx
281         por     $t3,$t2                         # rol(a,5)
282          psrld  \$31,$tx
283         paddd   $t0,$e                          # e+=Parity(b,c,d)
284          paddd  @Xi[1],@Xi[1]
285
286         psrld   \$2,$b
287         paddd   $t2,$e                          # e+=rol(a,5)
288          por    $tx,@Xi[1]                      # rol(@Xi[1],1)
289         por     $t1,$b                          # b=rol(b,30)
290 ___
# Last round: no further schedule word is needed, so only the round
# arithmetic is emitted.
291 $code.=<<___ if ($i==79);
292         movdqa  $a,$t2
293         paddd   $K,$e                           # e+=K_20_39
294         movdqa  $d,$t0
295         pslld   \$5,$t2
296         pxor    $b,$t0
297
298         movdqa  $a,$t3
299         paddd   @Xi[0],$e                       # e+=X[i]
300         psrld   \$27,$t3
301         movdqa  $b,$t1
302         pxor    $c,$t0                          # Parity(b,c,d)
303
304         pslld   \$30,$t1
305         por     $t3,$t2                         # rol(a,5)
306         paddd   $t0,$e                          # e+=Parity(b,c,d)
307
308         psrld   \$2,$b
309         paddd   $t2,$e                          # e+=rol(a,5)
310         por     $t1,$b                          # b=rol(b,30)
311 ___
312 push(@Xi,shift(@Xi));
313 }
314
# BODY_40_59($i,$a,$b,$c,$d,$e) appends to $code the xmm instruction text
# for one SHA-1 round that uses the Maj function (the caller uses it for
# rounds 40..59).  $a..$e name the registers holding the working variables.
# The round is
#     e += K_40_59 + X[i] + Maj(b,c,d) + rol(a,5);  b = rol(b,30)
# with Maj accumulated in two parts, (c&d) and ((c^d)&b), and is
# interleaved with the message-schedule update of @Xi[1].  @Xi[0] is stored
# to the stack slot named by Xi_off($i).  On return the @Xi register list
# has been rotated by one position.
sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;

# The rotate comment on the second-to-last instruction names @Xi[1]; the
# array @X does not exist in this file and would interpolate as nothing.
$code.=<<___;
        pxor    @Xi[-2],@Xi[1]                  # "X[13]"
        movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"

        movdqa  $a,$t2
        movdqa  $d,$t1
         pxor   `&Xi_off($j+8)`,@Xi[1]
        pxor    @Xi[3],@Xi[1]
        paddd   $K,$e                           # e+=K_40_59
        pslld   \$5,$t2
        movdqa  $a,$t3
        pand    $c,$t1

        movdqa  $d,$t0
         movdqa @Xi[1],$tx
        psrld   \$27,$t3
        paddd   $t1,$e
        pxor    $c,$t0

        movdqa  @Xi[0],`&Xi_off($i)`
        paddd   @Xi[0],$e                       # e+=X[i]
        por     $t3,$t2                         # rol(a,5)
         psrld  \$31,$tx
        pand    $b,$t0
        movdqa  $b,$t1

        pslld   \$30,$t1
         paddd  @Xi[1],@Xi[1]
        paddd   $t0,$e                          # e+=Maj(b,d,c)

        psrld   \$2,$b
        paddd   $t2,$e                          # e+=rol(a,5)
         por    $tx,@Xi[1]                      # rol(@Xi[1],1)
        por     $t1,$b                          # b=rol(b,30)
___
push(@Xi,shift(@Xi));
}
356
# Emit the xmm-register implementation of sha1_multi_block.  The entry
# code first looks at OPENSSL_ia32cap_P and branches to _shaext_shortcut
# when the bit commented as "SHA bit" is set.
357 $code.=<<___;
358 .text
359
360 .extern OPENSSL_ia32cap_P
361
362 .globl  sha1_multi_block
363 .type   sha1_multi_block,\@function,3
364 .align  32
365 sha1_multi_block:
366         mov     OPENSSL_ia32cap_P+4(%rip),%rcx
367         bt      \$61,%rcx                       # check SHA bit
368         jc      _shaext_shortcut
369 ___
# Only when the assembler can handle AVX: branch to the AVX code if bit 28
# of the low dword just loaded is set (presumably the AVX feature flag --
# confirm against the OPENSSL_ia32cap_P layout).
370 $code.=<<___ if ($avx);
371         test    \$`1<<28`,%ecx
372         jnz     _avx_shortcut
373 ___
# %rax keeps the entry value of %rsp; it is stored in the frame below and
# used by the epilogue to find the saved registers.
374 $code.=<<___;
375         mov     %rsp,%rax
376         push    %rbx
377         push    %rbp
378 ___
# Win64 only: save %xmm6-%xmm15 below the two pushed registers.  The
# epilogue restores them from the same addresses expressed relative to
# %rax.
379 $code.=<<___ if ($win64);
380         lea     -0xa8(%rsp),%rsp
381         movaps  %xmm6,(%rsp)
382         movaps  %xmm7,0x10(%rsp)
383         movaps  %xmm8,0x20(%rsp)
384         movaps  %xmm9,0x30(%rsp)
385         movaps  %xmm10,-0x78(%rax)
386         movaps  %xmm11,-0x68(%rax)
387         movaps  %xmm12,-0x58(%rax)
388         movaps  %xmm13,-0x48(%rax)
389         movaps  %xmm14,-0x38(%rax)
390         movaps  %xmm15,-0x28(%rax)
391 ___
# Frame: 18 slots of $REG_SZ bytes, aligned down to 256.  Slots 0..15 are
# the X[] area addressed through Xi_off, slot 16 (at %rbx) holds the
# per-lane block counters, slot 17 holds the original %rsp and, 8 bytes
# further, the original $num.
392 $code.=<<___;
393         sub     \$`$REG_SZ*18`,%rsp
394         and     \$-256,%rsp
395         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
396 .Lbody:
397         lea     K_XX_XX(%rip),$Tbl
398         lea     `$REG_SZ*16`(%rsp),%rbx
399
400 .Loop_grande:
401         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
402         xor     $num,$num
403 ___
# For each of the four lanes: fetch the input pointer and block count from
# inp[$i] (16 bytes per entry), track the largest count in $num, record the
# count in the counter slot, and point lanes with a count <= 0 at the
# constant table so their loads stay harmless.
404 for($i=0;$i<4;$i++) {
405     $code.=<<___;
406         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
407         mov     `16*$i+8`($inp),%ecx            # number of blocks
408         cmp     $num,%ecx
409         cmovg   %ecx,$num                       # find maximum
410         test    %ecx,%ecx
411         mov     %ecx,`4*$i`(%rbx)               # initialize counters
412         cmovle  $Tbl,@ptr[$i]                   # cancel input
413 ___
414 }
# Nothing to do when every lane has no blocks; otherwise load the five
# context rows (0x20 bytes apart), the byte-swap mask and the first round
# constant, and enter the per-block loop.
415 $code.=<<___;
416         test    $num,$num
417         jz      .Ldone
418
419         movdqu  0x00($ctx),$A                   # load context
420          lea    128(%rsp),%rax
421         movdqu  0x20($ctx),$B
422         movdqu  0x40($ctx),$C
423         movdqu  0x60($ctx),$D
424         movdqu  0x80($ctx),$E
425         movdqa  0x60($Tbl),$tx                  # pbswap_mask
426         movdqa  -0x20($Tbl),$K                  # K_00_19
427         jmp     .Loop
428
429 .align  32
430 .Loop:
431 ___
# Unroll all 80 rounds.  After every round the working-variable list @V is
# rotated so that the next round sees the roles shifted by one; $K is
# reloaded with the next constant at each 20-round boundary.
432 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
433 $code.="        movdqa  0x00($Tbl),$K\n";       # K_20_39
434 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
435 $code.="        movdqa  0x20($Tbl),$K\n";       # K_40_59
436 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
437 $code.="        movdqa  0x40($Tbl),$K\n";       # K_60_79
438 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# End of one block per lane.  Lanes whose counter is <= 1 have their input
# pointer redirected to the constant table.  A per-lane mask (all ones
# where the counter is > 0) is added to the counters to decrement the live
# ones, and is ANDed into the freshly computed state so that only live
# lanes contribute to the context update that follows.  Then the mask and
# K registers are reloaded and the loop repeats $num times.  After the
# loop, $ctx and $inp are advanced to the next group of four lanes and the
# whole thing repeats for the original $num.
439 $code.=<<___;
440         movdqa  (%rbx),@Xi[0]                   # pull counters
441         mov     \$1,%ecx
442         cmp     4*0(%rbx),%ecx                  # examinte counters
443         pxor    $t2,$t2
444         cmovge  $Tbl,@ptr[0]                    # cancel input
445         cmp     4*1(%rbx),%ecx
446         movdqa  @Xi[0],@Xi[1]
447         cmovge  $Tbl,@ptr[1]
448         cmp     4*2(%rbx),%ecx
449         pcmpgtd $t2,@Xi[1]                      # mask value
450         cmovge  $Tbl,@ptr[2]
451         cmp     4*3(%rbx),%ecx
452         paddd   @Xi[1],@Xi[0]                   # counters--
453         cmovge  $Tbl,@ptr[3]
454
455         movdqu  0x00($ctx),$t0
456         pand    @Xi[1],$A
457         movdqu  0x20($ctx),$t1
458         pand    @Xi[1],$B
459         paddd   $t0,$A
460         movdqu  0x40($ctx),$t2
461         pand    @Xi[1],$C
462         paddd   $t1,$B
463         movdqu  0x60($ctx),$t3
464         pand    @Xi[1],$D
465         paddd   $t2,$C
466         movdqu  0x80($ctx),$tx
467         pand    @Xi[1],$E
468         movdqu  $A,0x00($ctx)
469         paddd   $t3,$D
470         movdqu  $B,0x20($ctx)
471         paddd   $tx,$E
472         movdqu  $C,0x40($ctx)
473         movdqu  $D,0x60($ctx)
474         movdqu  $E,0x80($ctx)
475
476         movdqa  @Xi[0],(%rbx)                   # save counters
477         movdqa  0x60($Tbl),$tx                  # pbswap_mask
478         movdqa  -0x20($Tbl),$K                  # K_00_19
479         dec     $num
480         jnz     .Loop
481
482         mov     `$REG_SZ*17+8`(%rsp),$num
483         lea     $REG_SZ($ctx),$ctx
484         lea     `16*$REG_SZ/4`($inp),$inp
485         dec     $num
486         jnz     .Loop_grande
487
488 .Ldone:
489         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
490 ___
# Win64 only: restore %xmm6-%xmm15 from the addresses used by the prologue.
491 $code.=<<___ if ($win64);
492         movaps  -0xb8(%rax),%xmm6
493         movaps  -0xa8(%rax),%xmm7
494         movaps  -0x98(%rax),%xmm8
495         movaps  -0x88(%rax),%xmm9
496         movaps  -0x78(%rax),%xmm10
497         movaps  -0x68(%rax),%xmm11
498         movaps  -0x58(%rax),%xmm12
499         movaps  -0x48(%rax),%xmm13
500         movaps  -0x38(%rax),%xmm14
501         movaps  -0x28(%rax),%xmm15
502 ___
# Restore the two pushed registers and the entry stack pointer.
503 $code.=<<___;
504         mov     -16(%rax),%rbp
505         mov     -8(%rax),%rbx
506         lea     (%rax),%rsp
507 .Lepilogue:
508         ret
509 .size   sha1_multi_block,.-sha1_multi_block
510 ___
# Emit sha1_multi_block_shaext, the variant built on the sha1rnds4 /
# sha1nexte / sha1msg1 / sha1msg2 instructions.  It is entered through the
# _shaext_shortcut label and works on two lanes at a time: lane 0 uses
# $ABCD0/$E0/@MSG0 and lane 1 uses $ABCD1/$E1/@MSG1.  The register names
# below are lexical to this brace-delimited section.
511                                                 {{{
512 my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
513 my @MSG0=map("%xmm$_",(4..7));
514 my @MSG1=map("%xmm$_",(11..14));
515
# Prologue: same register saving as in sha1_multi_block.
516 $code.=<<___;
517 .type   sha1_multi_block_shaext,\@function,3
518 .align  32
519 sha1_multi_block_shaext:
520 _shaext_shortcut:
521         mov     %rsp,%rax
522         push    %rbx
523         push    %rbp
524 ___
525 $code.=<<___ if ($win64);
526         lea     -0xa8(%rsp),%rsp
527         movaps  %xmm6,(%rsp)
528         movaps  %xmm7,0x10(%rsp)
529         movaps  %xmm8,0x20(%rsp)
530         movaps  %xmm9,0x30(%rsp)
531         movaps  %xmm10,-0x78(%rax)
532         movaps  %xmm11,-0x68(%rax)
533         movaps  %xmm12,-0x58(%rax)
534         movaps  %xmm13,-0x48(%rax)
535         movaps  %xmm14,-0x38(%rax)
536         movaps  %xmm15,-0x28(%rax)
537 ___
# Same frame layout as sha1_multi_block.  $num is doubled because each
# outer iteration handles a pair of lanes, and $ctx is biased by 0x40 so
# that the context accesses below use the displacements 0x00-0x40 through
# 0x80-0x40.
538 $code.=<<___;
539         sub     \$`$REG_SZ*18`,%rsp
540         shl     \$1,$num                        # we process pair at a time
541         and     \$-256,%rsp
542         lea     0x40($ctx),$ctx                 # size optimization
543         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
544 .Lbody_shaext:
545         lea     `$REG_SZ*16`(%rsp),%rbx
546         movdqa  K_XX_XX+0x80(%rip),$BSWAP       # byte-n-word swap
547
548 .Loop_grande_shaext:
549         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
550         xor     $num,$num
551 ___
# For both lanes of the pair: fetch pointer and block count, track the
# maximum count in $num, record the count, and point a lane with a count
# <= 0 at the stack so that its loads stay harmless.
552 for($i=0;$i<2;$i++) {
553     $code.=<<___;
554         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
555         mov     `16*$i+8`($inp),%ecx            # number of blocks
556         cmp     $num,%ecx
557         cmovg   %ecx,$num                       # find maximum
558         test    %ecx,%ecx
559         mov     %ecx,`4*$i`(%rbx)               # initialize counters
560         cmovle  %rsp,@ptr[$i]                   # cancel input
561 ___
562 }
# Gather the two lanes' state out of the row-interleaved context into one
# register set per lane, then start the block loop: load and byte-swap 64
# bytes per lane, offload the incoming state to the stack (0x40..0x70) and
# run rounds 0-7.
563 $code.=<<___;
564         test    $num,$num
565         jz      .Ldone_shaext
566
567         movq            0x00-0x40($ctx),$ABCD0  # a1.a0
568         movq            0x20-0x40($ctx),@MSG0[0]# b1.b0
569         movq            0x40-0x40($ctx),@MSG0[1]# c1.c0
570         movq            0x60-0x40($ctx),@MSG0[2]# d1.d0
571         movq            0x80-0x40($ctx),@MSG0[3]# e1.e0
572
573         punpckldq       @MSG0[0],$ABCD0         # b1.a1.b0.a0
574         punpckldq       @MSG0[2],@MSG0[1]       # d1.c1.d0.c0
575
576         movdqa          $ABCD0,$ABCD1
577         punpcklqdq      @MSG0[1],$ABCD0         # d0.c0.b0.a0
578         punpckhqdq      @MSG0[1],$ABCD1         # d1.c1.b1.a1
579
580         pshufd          \$0b00111111,@MSG0[3],$E0
581         pshufd          \$0b01111111,@MSG0[3],$E1
582         pshufd          \$0b00011011,$ABCD0,$ABCD0
583         pshufd          \$0b00011011,$ABCD1,$ABCD1
584         jmp             .Loop_shaext
585
586 .align  32
587 .Loop_shaext:
588         movdqu          0x00(@ptr[0]),@MSG0[0]
589          movdqu         0x00(@ptr[1]),@MSG1[0]
590         movdqu          0x10(@ptr[0]),@MSG0[1]
591          movdqu         0x10(@ptr[1]),@MSG1[1]
592         movdqu          0x20(@ptr[0]),@MSG0[2]
593         pshufb          $BSWAP,@MSG0[0]
594          movdqu         0x20(@ptr[1]),@MSG1[2]
595          pshufb         $BSWAP,@MSG1[0]
596         movdqu          0x30(@ptr[0]),@MSG0[3]
597         lea             0x40(@ptr[0]),@ptr[0]
598         pshufb          $BSWAP,@MSG0[1]
599          movdqu         0x30(@ptr[1]),@MSG1[3]
600          lea            0x40(@ptr[1]),@ptr[1]
601          pshufb         $BSWAP,@MSG1[1]
602
603         movdqa          $E0,0x50(%rsp)          # offload
604         paddd           @MSG0[0],$E0
605          movdqa         $E1,0x70(%rsp)
606          paddd          @MSG1[0],$E1
607         movdqa          $ABCD0,0x40(%rsp)       # offload
608         movdqa          $ABCD0,$E0_
609          movdqa         $ABCD1,0x60(%rsp)
610          movdqa         $ABCD1,$E1_
611         sha1rnds4       \$0,$E0,$ABCD0          # 0-3
612         sha1nexte       @MSG0[1],$E0_
613          sha1rnds4      \$0,$E1,$ABCD1          # 0-3
614          sha1nexte      @MSG1[1],$E1_
615         pshufb          $BSWAP,@MSG0[2]
616         prefetcht0      127(@ptr[0])
617         sha1msg1        @MSG0[1],@MSG0[0]
618          pshufb         $BSWAP,@MSG1[2]
619          prefetcht0     127(@ptr[1])
620          sha1msg1       @MSG1[1],@MSG1[0]
621
622         pshufb          $BSWAP,@MSG0[3]
623         movdqa          $ABCD0,$E0
624          pshufb         $BSWAP,@MSG1[3]
625          movdqa         $ABCD1,$E1
626         sha1rnds4       \$0,$E0_,$ABCD0         # 4-7
627         sha1nexte       @MSG0[2],$E0
628          sha1rnds4      \$0,$E1_,$ABCD1         # 4-7
629          sha1nexte      @MSG1[2],$E1
630         pxor            @MSG0[2],@MSG0[0]
631         sha1msg1        @MSG0[2],@MSG0[1]
632          pxor           @MSG1[2],@MSG1[0]
633          sha1msg1       @MSG1[2],@MSG1[1]
634 ___
# Fourteen identical four-round groups, i.e. rounds 8..63.  int($i/5) is
# the immediate passed to sha1rnds4 for the group.  After each group the
# two E registers of a lane swap roles and the message registers rotate.
# Note that the "# 8-11" label in the emitted text is a fixed string and
# is only accurate for the first group.
635 for($i=2;$i<20-4;$i++) {
636 $code.=<<___;
637         movdqa          $ABCD0,$E0_
638          movdqa         $ABCD1,$E1_
639         sha1rnds4       \$`int($i/5)`,$E0,$ABCD0        # 8-11
640         sha1nexte       @MSG0[3],$E0_
641          sha1rnds4      \$`int($i/5)`,$E1,$ABCD1        # 8-11
642          sha1nexte      @MSG1[3],$E1_
643         sha1msg2        @MSG0[3],@MSG0[0]
644          sha1msg2       @MSG1[3],@MSG1[0]
645         pxor            @MSG0[3],@MSG0[1]
646         sha1msg1        @MSG0[3],@MSG0[2]
647          pxor           @MSG1[3],@MSG1[1]
648          sha1msg1       @MSG1[3],@MSG1[2]
649 ___
650         ($E0,$E0_)=($E0_,$E0);          ($E1,$E1_)=($E1_,$E1);
651         push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
652 }
# Rounds 64-79, interleaved with end-of-block bookkeeping: lanes whose
# counter is <= 1 get their pointer redirected to the stack, a per-lane
# mask is built from the counters and applied to the new state, the
# offloaded state is added back, and the counters are decremented and
# saved.  After the block loop the two lanes' state is scattered back into
# the context rows and $ctx/$inp move on to the next pair of lanes.
#
# At .Ldone_shaext the reload of %rax is commented out in the emitted
# text: none of the text emitted in this section writes %rax after the
# prologue, so it still holds the entry value of %rsp.
653 $code.=<<___;
654         movdqa          $ABCD0,$E0_
655          movdqa         $ABCD1,$E1_
656         sha1rnds4       \$3,$E0,$ABCD0          # 64-67
657         sha1nexte       @MSG0[3],$E0_
658          sha1rnds4      \$3,$E1,$ABCD1          # 64-67
659          sha1nexte      @MSG1[3],$E1_
660         sha1msg2        @MSG0[3],@MSG0[0]
661          sha1msg2       @MSG1[3],@MSG1[0]
662         pxor            @MSG0[3],@MSG0[1]
663          pxor           @MSG1[3],@MSG1[1]
664
665         mov             \$1,%ecx
666         pxor            @MSG0[2],@MSG0[2]       # zero
667         cmp             4*0(%rbx),%ecx          # examine counters
668         cmovge          %rsp,@ptr[0]            # cancel input
669
670         movdqa          $ABCD0,$E0
671          movdqa         $ABCD1,$E1
672         sha1rnds4       \$3,$E0_,$ABCD0         # 68-71
673         sha1nexte       @MSG0[0],$E0
674          sha1rnds4      \$3,$E1_,$ABCD1         # 68-71
675          sha1nexte      @MSG1[0],$E1
676         sha1msg2        @MSG0[0],@MSG0[1]
677          sha1msg2       @MSG1[0],@MSG1[1]
678
679         cmp             4*1(%rbx),%ecx
680         cmovge          %rsp,@ptr[1]
681         movq            (%rbx),@MSG0[0]         # pull counters
682
683         movdqa          $ABCD0,$E0_
684          movdqa         $ABCD1,$E1_
685         sha1rnds4       \$3,$E0,$ABCD0          # 72-75
686         sha1nexte       @MSG0[1],$E0_
687          sha1rnds4      \$3,$E1,$ABCD1          # 72-75
688          sha1nexte      @MSG1[1],$E1_
689
690         pshufd          \$0x00,@MSG0[0],@MSG1[2]
691         pshufd          \$0x55,@MSG0[0],@MSG1[3]
692         movdqa          @MSG0[0],@MSG0[1]
693         pcmpgtd         @MSG0[2],@MSG1[2]
694         pcmpgtd         @MSG0[2],@MSG1[3]
695
696         movdqa          $ABCD0,$E0
697          movdqa         $ABCD1,$E1
698         sha1rnds4       \$3,$E0_,$ABCD0         # 76-79
699         sha1nexte       $MSG0[2],$E0
700          sha1rnds4      \$3,$E1_,$ABCD1         # 76-79
701          sha1nexte      $MSG0[2],$E1
702
703         pcmpgtd         @MSG0[2],@MSG0[1]       # counter mask
704         pand            @MSG1[2],$ABCD0
705         pand            @MSG1[2],$E0
706          pand           @MSG1[3],$ABCD1
707          pand           @MSG1[3],$E1
708         paddd           @MSG0[1],@MSG0[0]       # counters--
709
710         paddd           0x40(%rsp),$ABCD0
711         paddd           0x50(%rsp),$E0
712          paddd          0x60(%rsp),$ABCD1
713          paddd          0x70(%rsp),$E1
714
715         movq            @MSG0[0],(%rbx)         # save counters
716         dec             $num
717         jnz             .Loop_shaext
718
719         mov             `$REG_SZ*17+8`(%rsp),$num
720
721         pshufd          \$0b00011011,$ABCD0,$ABCD0
722         pshufd          \$0b00011011,$ABCD1,$ABCD1
723
724         movdqa          $ABCD0,@MSG0[0]
725         punpckldq       $ABCD1,$ABCD0           # b1.b0.a1.a0
726         punpckhdq       $ABCD1,@MSG0[0]         # d1.d0.c1.c0
727         punpckhdq       $E1,$E0                 # e1.e0.xx.xx
728         movq            $ABCD0,0x00-0x40($ctx)  # a1.a0
729         psrldq          \$8,$ABCD0
730         movq            @MSG0[0],0x40-0x40($ctx)# c1.c0
731         psrldq          \$8,@MSG0[0]
732         movq            $ABCD0,0x20-0x40($ctx)  # b1.b0
733         psrldq          \$8,$E0
734         movq            @MSG0[0],0x60-0x40($ctx)# d1.d0
735         movq            $E0,0x80-0x40($ctx)     # e1.e0
736
737         lea     `$REG_SZ/2`($ctx),$ctx
738         lea     `16*2`($inp),$inp
739         dec     $num
740         jnz     .Loop_grande_shaext
741
742 .Ldone_shaext:
743         #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
744 ___
# Win64 only: restore %xmm6-%xmm15 from the addresses used by the prologue.
745 $code.=<<___ if ($win64);
746         movaps  -0xb8(%rax),%xmm6
747         movaps  -0xa8(%rax),%xmm7
748         movaps  -0x98(%rax),%xmm8
749         movaps  -0x88(%rax),%xmm9
750         movaps  -0x78(%rax),%xmm10
751         movaps  -0x68(%rax),%xmm11
752         movaps  -0x58(%rax),%xmm12
753         movaps  -0x48(%rax),%xmm13
754         movaps  -0x38(%rax),%xmm14
755         movaps  -0x28(%rax),%xmm15
756 ___
# Restore the two pushed registers and the entry stack pointer.
757 $code.=<<___;
758         mov     -16(%rax),%rbp
759         mov     -8(%rax),%rbx
760         lea     (%rax),%rsp
761 .Lepilogue_shaext:
762         ret
763 .size   sha1_multi_block_shaext,.-sha1_multi_block_shaext
764 ___
765                                                 }}}
766
767                                                 if ($avx) {{{
768 sub BODY_00_19_avx {
769 my ($i,$a,$b,$c,$d,$e)=@_;
770 my $j=$i+1;
771 my $k=$i+2;
772 my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
773 my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
774
775 $code.=<<___ if ($i==0 && $REG_SZ==16);
776         vmovd           (@ptr[0]),@Xi[0]
777          lea            `16*4`(@ptr[0]),@ptr[0]
778         vmovd           (@ptr[1]),@Xi[2]        # borrow Xi[2]
779          lea            `16*4`(@ptr[1]),@ptr[1]
780         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
781          lea            `16*4`(@ptr[2]),@ptr[2]
782         vpinsrd         \$1,(@ptr[3]),@Xi[2],@Xi[2]
783          lea            `16*4`(@ptr[3]),@ptr[3]
784          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
785         vpunpckldq      @Xi[2],@Xi[0],@Xi[0]
786          vmovd          `4*$j-16*4`($ptr_n),$t3
787         vpshufb         $tx,@Xi[0],@Xi[0]
788 ___
789 $code.=<<___ if ($i<15 && $REG_SZ==16);         # just load input
790          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
791          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
792 ___
793 $code.=<<___ if ($i==0 && $REG_SZ==32);
794         vmovd           (@ptr[0]),@Xi[0]
795          lea            `16*4`(@ptr[0]),@ptr[0]
796         vmovd           (@ptr[4]),@Xi[2]        # borrow Xi[2]
797          lea            `16*4`(@ptr[4]),@ptr[4]
798         vmovd           (@ptr[1]),$t2
799          lea            `16*4`(@ptr[1]),@ptr[1]
800         vmovd           (@ptr[5]),$t1
801          lea            `16*4`(@ptr[5]),@ptr[5]
802         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
803          lea            `16*4`(@ptr[2]),@ptr[2]
804         vpinsrd         \$1,(@ptr[6]),@Xi[2],@Xi[2]
805          lea            `16*4`(@ptr[6]),@ptr[6]
806         vpinsrd         \$1,(@ptr[3]),$t2,$t2
807          lea            `16*4`(@ptr[3]),@ptr[3]
808         vpunpckldq      $t2,@Xi[0],@Xi[0]
809         vpinsrd         \$1,(@ptr[7]),$t1,$t1
810          lea            `16*4`(@ptr[7]),@ptr[7]
811         vpunpckldq      $t1,@Xi[2],@Xi[2]
812          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
813         vinserti128     @Xi[2],@Xi[0],@Xi[0]
814          vmovd          `4*$j-16*4`($ptr_n),$t3
815         vpshufb         $tx,@Xi[0],@Xi[0]
816 ___
817 $code.=<<___ if ($i<15 && $REG_SZ==32);         # just load input
818          vmovd          `4*$j-16*4`(@ptr[1]),$t2
819          vmovd          `4*$j-16*4`(@ptr[5]),$t1
820          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
821          vpinsrd        \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
822          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
823          vpunpckldq     $t2,@Xi[1],@Xi[1]
824          vpinsrd        \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
825          vpunpckldq     $t1,$t3,$t3
826 ___
827 $code.=<<___ if ($i<14);
828         vpaddd  $K,$e,$e                        # e+=K_00_19
829         vpslld  \$5,$a,$t2
830         vpandn  $d,$b,$t1
831         vpand   $c,$b,$t0
832
833         vmovdqa @Xi[0],`&Xi_off($i)`
834         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
835          $vpack         $t3,@Xi[1],@Xi[1]
836         vpsrld  \$27,$a,$t3
837         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
838          vmovd          `4*$k-16*4`(@ptr[0]),@Xi[2]
839
840         vpslld  \$30,$b,$t1
841         vpor    $t3,$t2,$t2                     # rol(a,5)
842          vmovd          `4*$k-16*4`($ptr_n),$t3
843         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
844
845         vpsrld  \$2,$b,$b
846         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
847          vpshufb        $tx,@Xi[1],@Xi[1]
848         vpor    $t1,$b,$b                       # b=rol(b,30)
849 ___
850 $code.=<<___ if ($i==14);
851         vpaddd  $K,$e,$e                        # e+=K_00_19
852          prefetcht0     63(@ptr[0])
853         vpslld  \$5,$a,$t2
854         vpandn  $d,$b,$t1
855         vpand   $c,$b,$t0
856
857         vmovdqa @Xi[0],`&Xi_off($i)`
858         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
859          $vpack         $t3,@Xi[1],@Xi[1]
860         vpsrld  \$27,$a,$t3
861          prefetcht0     63(@ptr[1])
862         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
863
864         vpslld  \$30,$b,$t1
865         vpor    $t3,$t2,$t2                     # rol(a,5)
866          prefetcht0     63(@ptr[2])
867         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
868
869         vpsrld  \$2,$b,$b
870         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
871          prefetcht0     63(@ptr[3])
872          vpshufb        $tx,@Xi[1],@Xi[1]
873         vpor    $t1,$b,$b                       # b=rol(b,30)
874 ___
875 $code.=<<___ if ($i>=13 && $i<15);
876         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
877 ___
878 $code.=<<___ if ($i>=15);                       # apply Xupdate
879         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
880         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
881
882         vpaddd  $K,$e,$e                        # e+=K_00_19
883         vpslld  \$5,$a,$t2
884         vpandn  $d,$b,$t1
885          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
886         vpand   $c,$b,$t0
887
888         vmovdqa @Xi[0],`&Xi_off($i)`
889         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
890          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
891         vpsrld  \$27,$a,$t3
892         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
893          vpxor  @Xi[3],@Xi[1],@Xi[1]
894          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
895
896         vpslld  \$30,$b,$t1
897         vpor    $t3,$t2,$t2                     # rol(a,5)
898         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
899          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
900          vpsrld \$31,@Xi[1],$tx
901          vpaddd @Xi[1],@Xi[1],@Xi[1]
902
903         vpsrld  \$2,$b,$b
904          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
905         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
906          vpor   $tx,@Xi[1],@Xi[1]               # rol   \$1,@Xi[1]
907         vpor    $t1,$b,$b                       # b=rol(b,30)
908 ___
909 push(@Xi,shift(@Xi));
910 }
911
# BODY_20_39_avx($i,$a,$b,$c,$d,$e) appends the AVX instructions for one
# round that uses the three-way XOR round function (annotated "Parity" in
# the emitted code) to $code.  The driver loops call it for rounds 20..39
# and again for rounds 60..79; the round constant is whatever the caller
# has loaded into $K beforehand.
#   $i              - round number
#   $a,$b,$c,$d,$e  - registers holding the working variables for this round
# Each round adds K, X[i], b^c^d and rol(a,5) to e and replaces b with
# rol(b,30).  Lines indented by one extra space in the heredocs belong to
# the interleaved computation of the next message-schedule word in @Xi[1].
912 sub BODY_20_39_avx {
913 my ($i,$a,$b,$c,$d,$e)=@_;
914 my $j=$i+1;
915
# Every round but the last: start the schedule update for the next word
# (XOR with @Xi[-2], load a stored word into @Xi[3]) and begin the round.
916 $code.=<<___ if ($i<79);
917         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
918         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
919
920         vpslld  \$5,$a,$t2
921         vpaddd  $K,$e,$e                        # e+=K_20_39
922         vpxor   $b,$d,$t0
923 ___
# The current word is written back to its slot only while $i<72.
# NOTE(review): presumably words stored later would never be reloaded by
# the remaining rounds -- confirm against Xi_off(), defined outside this
# section.
924 $code.=<<___ if ($i<72);
925         vmovdqa @Xi[0],`&Xi_off($i)`
926 ___
# Remainder of a regular round; the next schedule word is finished with a
# rotate-left-by-one built from vpsrld 31 / vpaddd (doubling) / vpor.
927 $code.=<<___ if ($i<79);
928         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
929          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
930         vpsrld  \$27,$a,$t3
931         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
932          vpxor  @Xi[3],@Xi[1],@Xi[1]
933
934         vpslld  \$30,$b,$t1
935         vpor    $t3,$t2,$t2                     # rol(a,5)
936         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
937          vpsrld \$31,@Xi[1],$tx
938          vpaddd @Xi[1],@Xi[1],@Xi[1]
939
940         vpsrld  \$2,$b,$b
941         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
942          vpor   $tx,@Xi[1],@Xi[1]               # rol(@Xi[1],1)
943         vpor    $t1,$b,$b                       # b=rol(b,30)
944 ___
# Final round (79): same round computation, but no schedule update and no
# store of the current word.
945 $code.=<<___ if ($i==79);
946         vpslld  \$5,$a,$t2
947         vpaddd  $K,$e,$e                        # e+=K_20_39
948         vpxor   $b,$d,$t0
949
950         vpsrld  \$27,$a,$t3
951         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
952         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
953
954         vpslld  \$30,$b,$t1
955         vpor    $t3,$t2,$t2                     # rol(a,5)
956         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
957
958         vpsrld  \$2,$b,$b
959         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
960         vpor    $t1,$b,$b                       # b=rol(b,30)
961 ___
# Rotate the register window: the register that was @Xi[1] (the word just
# computed) becomes @Xi[0] for the next round.
962 push(@Xi,shift(@Xi));
963 }
964
# BODY_40_59_avx($i,$a,$b,$c,$d,$e) appends the AVX instructions for one
# round that uses the majority round function to $code; the driver loops
# call it for rounds 40..59 with the matching constant preloaded in $K.
#   $i              - round number
#   $a,$b,$c,$d,$e  - registers holding the working variables for this round
# The majority value is accumulated into e in two parts: (c AND d) first,
# then ((c XOR d) AND b).  Lines indented by one extra space compute the
# next message-schedule word in @Xi[1], finishing with a rotate-left-by-one
# (vpsrld 31 / vpaddd / vpor).  The current word @Xi[0] is stored to its
# Xi_off($i) slot unconditionally.
965 sub BODY_40_59_avx {
966 my ($i,$a,$b,$c,$d,$e)=@_;
967 my $j=$i+1;
968
969 $code.=<<___;
970         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
971         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
972
973         vpaddd  $K,$e,$e                        # e+=K_40_59
974         vpslld  \$5,$a,$t2
975         vpand   $c,$d,$t1
976          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
977
978         vpaddd  $t1,$e,$e
979         vpsrld  \$27,$a,$t3
980         vpxor   $c,$d,$t0
981          vpxor  @Xi[3],@Xi[1],@Xi[1]
982
983         vmovdqu @Xi[0],`&Xi_off($i)`
984         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
985         vpor    $t3,$t2,$t2                     # rol(a,5)
986          vpsrld \$31,@Xi[1],$tx
987         vpand   $b,$t0,$t0
988          vpaddd @Xi[1],@Xi[1],@Xi[1]
989
990         vpslld  \$30,$b,$t1
991         vpaddd  $t0,$e,$e                       # e+=Maj(b,d,c)
992
993         vpsrld  \$2,$b,$b
994         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
995          vpor   $tx,@Xi[1],@Xi[1]               # rol(@X[1],1)
996         vpor    $t1,$b,$b                       # b=rol(b,30)
997 ___
# Rotate the register window so the word just computed in @Xi[1] becomes
# @Xi[0] for the next round.
998 push(@Xi,shift(@Xi));
999 }
1000
# ---------------------------------------------------------------------
# Emit sha1_multi_block_avx: the AVX code path, which handles four input
# lanes per pass (see the 4-iteration loops below).
# ---------------------------------------------------------------------
# Function header and the _avx_shortcut entry label.
1001 $code.=<<___;
1002 .type   sha1_multi_block_avx,\@function,3
1003 .align  32
1004 sha1_multi_block_avx:
1005 _avx_shortcut:
1006 ___
# When the AVX2 path is built as well ($avx>1), prepend a dispatch: calls
# with $num below 2 stay on this path; otherwise bit 5 of the upper half of
# %rcx selects a jump to _avx2_shortcut.
# NOTE(review): %rcx is loaded by code outside this section -- presumably a
# CPU capability word; confirm against the dispatcher earlier in the file.
1007 $code.=<<___ if ($avx>1);
1008         shr     \$32,%rcx
1009         cmp     \$2,$num
1010         jb      .Lavx
1011         test    \$`1<<5`,%ecx
1012         jnz     _avx2_shortcut
1013         jmp     .Lavx
1014 .align  32
1015 .Lavx:
1016 ___
# Prologue: remember the entry %rsp in %rax and save %rbx/%rbp.
1017 $code.=<<___;
1018         mov     %rsp,%rax
1019         push    %rbx
1020         push    %rbp
1021 ___
# Win64 only: reserve 0xa8 bytes and save %xmm6..%xmm15.  The first four
# are addressed from the new %rsp, the rest from %rax; both forms refer to
# the same area (it starts at -0xb8 relative to %rax after the two pushes).
1022 $code.=<<___ if ($win64);
1023         lea     -0xa8(%rsp),%rsp
1024         movaps  %xmm6,(%rsp)
1025         movaps  %xmm7,0x10(%rsp)
1026         movaps  %xmm8,0x20(%rsp)
1027         movaps  %xmm9,0x30(%rsp)
1028         movaps  %xmm10,-0x78(%rax)
1029         movaps  %xmm11,-0x68(%rax)
1030         movaps  %xmm12,-0x58(%rax)
1031         movaps  %xmm13,-0x48(%rax)
1032         movaps  %xmm14,-0x38(%rax)
1033         movaps  %xmm15,-0x28(%rax)
1034 ___
# Allocate $REG_SZ*18 bytes, align the frame to 256 and record the entry
# %rsp in the slot at $REG_SZ*17.  .Lbody_avx is the end-of-prologue label
# referenced by the SEH data.  %rbx points at the per-lane block counters
# ($REG_SZ*16 into the frame).  Each pass of the outer loop stashes the
# remaining outer count next to the saved %rsp and clears $num so it can
# collect the largest per-lane block count.
1035 $code.=<<___;
1036         sub     \$`$REG_SZ*18`, %rsp
1037         and     \$-256,%rsp
1038         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1039 .Lbody_avx:
1040         lea     K_XX_XX(%rip),$Tbl
1041         lea     `$REG_SZ*16`(%rsp),%rbx
1042
1043         vzeroupper
1044 .Loop_grande_avx:
1045         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1046         xor     $num,$num
1047 ___
# For each of the four lanes: load the input pointer and block count from
# the 16-byte descriptor at $inp, track the maximum count in $num, store
# the count as the lane's counter, and redirect lanes whose count is not
# positive to $Tbl so that their loads stay harmless.
1048 for($i=0;$i<4;$i++) {
1049     $code.=<<___;
1050         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
1051         mov     `16*$i+8`($inp),%ecx            # number of blocks
1052         cmp     $num,%ecx
1053         cmovg   %ecx,$num                       # find maximum
1054         test    %ecx,%ecx
1055         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1056         cmovle  $Tbl,@ptr[$i]                   # cancel input
1057 ___
1058 }
# Nothing to do if no lane has data.  Otherwise load the five state vectors
# from $ctx (0x20 apart), point %rax 128 bytes into the frame and load the
# byte-swap mask from 0x60($Tbl).
1059 $code.=<<___;
1060         test    $num,$num
1061         jz      .Ldone_avx
1062
1063         vmovdqu 0x00($ctx),$A                   # load context
1064          lea    128(%rsp),%rax
1065         vmovdqu 0x20($ctx),$B
1066         vmovdqu 0x40($ctx),$C
1067         vmovdqu 0x60($ctx),$D
1068         vmovdqu 0x80($ctx),$E
1069         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1070         jmp     .Loop_avx
1071
1072 .align  32
1073 .Loop_avx:
1074 ___
# Unroll all 80 rounds.  Before each group of 20 the matching constant row
# is loaded into $K; after every round @V is rotated by one position
# (the last register moves to the front), so the role of each register
# shifts instead of the data being moved.
1075 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1076 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1077 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1078 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1079 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1080 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1081 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1082 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1083 $code.=<<___;
1084         mov     \$1,%ecx
1085 ___
# Lanes whose counter is 1 or less have no further block: point them at
# $Tbl for the next iteration.
1086 for($i=0;$i<4;$i++) {
1087     $code.=<<___;
1088         cmp     `4*$i`(%rbx),%ecx               # examine counters
1089         cmovge  $Tbl,@ptr[$i]                   # cancel input
1090 ___
1091 }
# Build a per-lane mask that is all-ones where the counter is positive;
# adding it decrements exactly those counters.  The freshly computed state
# is masked (lanes without data contribute zero), added to the saved
# context and written back.  The inner loop runs $num (maximum count)
# times; the outer loop then restores the original $num, advances $ctx by
# $REG_SZ and $inp by 16*$REG_SZ/4 bytes and repeats until it reaches zero.
1092 $code.=<<___;
1093         vmovdqu (%rbx),$t0                      # pull counters
1094         vpxor   $t2,$t2,$t2
1095         vmovdqa $t0,$t1
1096         vpcmpgtd $t2,$t1,$t1                    # mask value
1097         vpaddd  $t1,$t0,$t0                     # counters--
1098
1099         vpand   $t1,$A,$A
1100         vpand   $t1,$B,$B
1101         vpaddd  0x00($ctx),$A,$A
1102         vpand   $t1,$C,$C
1103         vpaddd  0x20($ctx),$B,$B
1104         vpand   $t1,$D,$D
1105         vpaddd  0x40($ctx),$C,$C
1106         vpand   $t1,$E,$E
1107         vpaddd  0x60($ctx),$D,$D
1108         vpaddd  0x80($ctx),$E,$E
1109         vmovdqu $A,0x00($ctx)
1110         vmovdqu $B,0x20($ctx)
1111         vmovdqu $C,0x40($ctx)
1112         vmovdqu $D,0x60($ctx)
1113         vmovdqu $E,0x80($ctx)
1114
1115         vmovdqu $t0,(%rbx)                      # save counters
1116         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1117         dec     $num
1118         jnz     .Loop_avx
1119
1120         mov     `$REG_SZ*17+8`(%rsp),$num
1121         lea     $REG_SZ($ctx),$ctx
1122         lea     `16*$REG_SZ/4`($inp),$inp
1123         dec     $num
1124         jnz     .Loop_grande_avx
1125
1126 .Ldone_avx:
1127         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1128         vzeroupper
1129 ___
# Win64 only: restore %xmm6..%xmm15 from the area filled by the prologue.
1130 $code.=<<___ if ($win64);
1131         movaps  -0xb8(%rax),%xmm6
1132         movaps  -0xa8(%rax),%xmm7
1133         movaps  -0x98(%rax),%xmm8
1134         movaps  -0x88(%rax),%xmm9
1135         movaps  -0x78(%rax),%xmm10
1136         movaps  -0x68(%rax),%xmm11
1137         movaps  -0x58(%rax),%xmm12
1138         movaps  -0x48(%rax),%xmm13
1139         movaps  -0x38(%rax),%xmm14
1140         movaps  -0x28(%rax),%xmm15
1141 ___
# Epilogue: restore %rbp/%rbx relative to the entry %rsp, then %rsp itself.
# .Lepilogue_avx is the epilogue label referenced by the SEH data.
1142 $code.=<<___;
1143         mov     -16(%rax),%rbp
1144         mov     -8(%rax),%rbx
1145         lea     (%rax),%rsp
1146 .Lepilogue_avx:
1147         ret
1148 .size   sha1_multi_block_avx,.-sha1_multi_block_avx
1149 ___
1150
# ---------------------------------------------------------------------
# Emit sha1_multi_block_avx2 (only when $avx>1): the same algorithm on
# %ymm registers with eight input lanes.  The round generators
# BODY_*_avx are reused with $REG_SZ set to 32.
# ---------------------------------------------------------------------
1151                                                 if ($avx>1) {
# Expand every backtick expression accumulated in $code so far *before*
# $REG_SZ is changed below, so already-generated code keeps the value it
# was written for.
1152 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1153
1154 $REG_SZ=32;
1155
# Eight lane pointers (%r12..%r15, %r8..%r11) and the %ymm register
# assignment: state in %ymm0-4, temporaries in %ymm5-9, schedule window in
# %ymm10-14, round constant in %ymm15.
1156 @ptr=map("%r$_",(12..15,8..11));
1157
1158 @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1159 ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1160 @Xi=map("%ymm$_",(10..14));
1161 $K="%ymm15";
1162
# Header and prologue: remember the entry %rsp in %rax and save the six
# registers %rbx, %rbp, %r12..%r15.
1163 $code.=<<___;
1164 .type   sha1_multi_block_avx2,\@function,3
1165 .align  32
1166 sha1_multi_block_avx2:
1167 _avx2_shortcut:
1168         mov     %rsp,%rax
1169         push    %rbx
1170         push    %rbp
1171         push    %r12
1172         push    %r13
1173         push    %r14
1174         push    %r15
1175 ___
# Win64 only: reserve 0xa8 bytes and save %xmm6..%xmm15.  With six pushes
# the area starts at -0xd8 relative to %rax; the %rsp- and %rax-relative
# stores address the same contiguous area.
1176 $code.=<<___ if ($win64);
1177         lea     -0xa8(%rsp),%rsp
1178         movaps  %xmm6,(%rsp)
1179         movaps  %xmm7,0x10(%rsp)
1180         movaps  %xmm8,0x20(%rsp)
1181         movaps  %xmm9,0x30(%rsp)
1182         movaps  %xmm10,0x40(%rsp)
1183         movaps  %xmm11,0x50(%rsp)
1184         movaps  %xmm12,-0x78(%rax)
1185         movaps  %xmm13,-0x68(%rax)
1186         movaps  %xmm14,-0x58(%rax)
1187         movaps  %xmm15,-0x48(%rax)
1188 ___
# Allocate and align the frame, record the entry %rsp at $REG_SZ*17, and
# halve $num (shr 1).  .Lbody_avx2 is the end-of-prologue label referenced
# by the SEH data.  %rbx is pointed at the counters at $REG_SZ*16.
1189 $code.=<<___;
1190         sub     \$`$REG_SZ*18`, %rsp
1191         and     \$-256,%rsp
1192         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1193 .Lbody_avx2:
1194         lea     K_XX_XX(%rip),$Tbl
1195         shr     \$1,$num
1196
1197         vzeroupper
1198 .Loop_grande_avx2:
1199         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1200         xor     $num,$num
1201         lea     `$REG_SZ*16`(%rsp),%rbx
1202 ___
# For each of the eight lanes: load pointer and block count, track the
# maximum count in $num, store the lane counter, and redirect lanes with a
# non-positive count to $Tbl.
1203 for($i=0;$i<8;$i++) {
1204     $code.=<<___;
1205         mov     `16*$i+0`($inp),@ptr[$i]        # input pointer
1206         mov     `16*$i+8`($inp),%ecx            # number of blocks
1207         cmp     $num,%ecx
1208         cmovg   %ecx,$num                       # find maximum
1209         test    %ecx,%ecx
1210         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1211         cmovle  $Tbl,@ptr[$i]                   # cancel input
1212 ___
1213 }
# Load the state vectors and the byte-swap mask.  During the rounds %rax
# and %rbx are set to 128(%rsp) and 256+128(%rsp); %rbx is re-pointed at
# the counters after the rounds.  Unlike the AVX path, no zero-count check
# is emitted here before entering the loop.
1214 $code.=<<___;
1215         vmovdqu 0x00($ctx),$A                   # load context
1216          lea    128(%rsp),%rax
1217         vmovdqu 0x20($ctx),$B
1218          lea    256+128(%rsp),%rbx
1219         vmovdqu 0x40($ctx),$C
1220         vmovdqu 0x60($ctx),$D
1221         vmovdqu 0x80($ctx),$E
1222         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1223         jmp     .Loop_avx2
1224
1225 .align  32
1226 .Loop_avx2:
1227 ___
# Unroll all 80 rounds: load the constant row for each group of 20 into $K
# and rotate @V by one position after every round.
1228 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1229 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1230 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1231 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1232 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1233 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1234 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1235 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1236 $code.=<<___;
1237         mov     \$1,%ecx
1238         lea     `$REG_SZ*16`(%rsp),%rbx
1239 ___
# Lanes whose counter is 1 or less have no further block: point them at
# $Tbl for the next iteration.
1240 for($i=0;$i<8;$i++) {
1241     $code.=<<___;
1242         cmp     `4*$i`(%rbx),%ecx               # examine counters
1243         cmovge  $Tbl,@ptr[$i]                   # cancel input
1244 ___
1245 }
# Decrement the counters of active lanes via an all-ones mask, merge the
# masked new state into the saved context and loop $num times.  The
# outer-loop tail is emitted as assembler comments ("#mov" ... "#jnz"), so
# the generated code proceeds to .Ldone_avx2 after the inner loop.
1246 $code.=<<___;
1247         vmovdqu (%rbx),$t0              # pull counters
1248         vpxor   $t2,$t2,$t2
1249         vmovdqa $t0,$t1
1250         vpcmpgtd $t2,$t1,$t1                    # mask value
1251         vpaddd  $t1,$t0,$t0                     # counters--
1252
1253         vpand   $t1,$A,$A
1254         vpand   $t1,$B,$B
1255         vpaddd  0x00($ctx),$A,$A
1256         vpand   $t1,$C,$C
1257         vpaddd  0x20($ctx),$B,$B
1258         vpand   $t1,$D,$D
1259         vpaddd  0x40($ctx),$C,$C
1260         vpand   $t1,$E,$E
1261         vpaddd  0x60($ctx),$D,$D
1262         vpaddd  0x80($ctx),$E,$E
1263         vmovdqu $A,0x00($ctx)
1264         vmovdqu $B,0x20($ctx)
1265         vmovdqu $C,0x40($ctx)
1266         vmovdqu $D,0x60($ctx)
1267         vmovdqu $E,0x80($ctx)
1268
1269         vmovdqu $t0,(%rbx)                      # save counters
1270         lea     256+128(%rsp),%rbx
1271         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1272         dec     $num
1273         jnz     .Loop_avx2
1274
1275         #mov    `$REG_SZ*17+8`(%rsp),$num
1276         #lea    $REG_SZ($ctx),$ctx
1277         #lea    `16*$REG_SZ/4`($inp),$inp
1278         #dec    $num
1279         #jnz    .Loop_grande_avx2
1280
1281 .Ldone_avx2:
1282         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1283         vzeroupper
1284 ___
# Win64 only: restore %xmm6..%xmm15 from the area filled by the prologue.
1285 $code.=<<___ if ($win64);
1286         movaps  -0xd8(%rax),%xmm6
1287         movaps  -0xc8(%rax),%xmm7
1288         movaps  -0xb8(%rax),%xmm8
1289         movaps  -0xa8(%rax),%xmm9
1290         movaps  -0x98(%rax),%xmm10
1291         movaps  -0x88(%rax),%xmm11
1292         movaps  -0x78(%rax),%xmm12
1293         movaps  -0x68(%rax),%xmm13
1294         movaps  -0x58(%rax),%xmm14
1295         movaps  -0x48(%rax),%xmm15
1296 ___
# Epilogue: restore the six saved registers relative to the entry %rsp,
# then %rsp itself.  .Lepilogue_avx2 is referenced by the SEH data.
1297 $code.=<<___;
1298         mov     -48(%rax),%r15
1299         mov     -40(%rax),%r14
1300         mov     -32(%rax),%r13
1301         mov     -24(%rax),%r12
1302         mov     -16(%rax),%rbp
1303         mov     -8(%rax),%rbx
1304         lea     (%rax),%rsp
1305 .Lepilogue_avx2:
1306         ret
1307 .size   sha1_multi_block_avx2,.-sha1_multi_block_avx2
1308 ___
# Closes the `if ($avx>1)` block above together with enclosing scopes that
# were opened earlier in the file (outside this section).
1309                                                 }       }}}
# Constant table, aligned to 256 bytes.  Every round constant is replicated
# across eight 32-bit lanes (two 16-byte rows).  The K_XX_XX label sits
# *after* the K_00_19 rows, so relative to the label the layout is:
#   -0x20  K_00_19      0x00  K_20_39      0x20  K_40_59
#    0x40  K_60_79      0x60  byte-swap mask ("pbswap", two rows)
#    0x80  a 16-byte row of descending byte indices 0xf..0x0
# followed by the identification string.  These offsets match the
# `vmovdqa -0x20($Tbl)`, `0x00`, `0x20`, `0x40` and `vmovdqu 0x60($Tbl)`
# loads emitted by the AVX/AVX2 code above.
1310 $code.=<<___;
1311
1312 .align  256
1313         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1314         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1315 K_XX_XX:
1316         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1317         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1318         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1319         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1320         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1321         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1322         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1323         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1324         .byte   0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1325         .asciz  "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1326 ___
1327
# ---------------------------------------------------------------------
# Win64 structured-exception-handling support: unwind handlers plus the
# .pdata/.xdata records that attach them to the functions in this module.
# ---------------------------------------------------------------------
1328 if ($win64) {
1329 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1330 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
# Registers carrying the four handler arguments.
1331 $rec="%rcx";
1332 $frame="%rdx";
1333 $context="%r8";
1334 $disp="%r9";
1335
# se_handler: handler for the functions that save only %rbx and %rbp.
# HandlerData[0]/[1] (from the .xdata record) are the end-of-prologue and
# epilogue labels of the faulting function:
#   * Rip before the body label: the frame is not set up yet; context->Rax
#     (which the prologues load with the entry %rsp) is used as the frame
#     base.
#   * Rip at or past the epilogue label: context->Rsp is used as is.
#   * otherwise: the entry %rsp is pulled from the frame slot at 16*17
#     relative to context->Rsp, %rbx/%rbp are recovered from below it and
#     the 20 quadwords holding %xmm6..%xmm15 are copied into the context
#     starting at context.Xmm6.
# At .Lin_prologue, Rsp/Rsi/Rdi are written back to the context (Rdi/Rsi
# come from 8(%rax)/16(%rax) -- presumably the slots where the Win64
# function prologue stored them; confirm), the context is copied to
# disp->ContextRecord, RtlVirtualUnwind is called, and the handler returns
# ExceptionContinueSearch.  avx2_handler shares the .Lin_prologue tail.
1336 $code.=<<___;
1337 .extern __imp_RtlVirtualUnwind
1338 .type   se_handler,\@abi-omnipotent
1339 .align  16
1340 se_handler:
1341         push    %rsi
1342         push    %rdi
1343         push    %rbx
1344         push    %rbp
1345         push    %r12
1346         push    %r13
1347         push    %r14
1348         push    %r15
1349         pushfq
1350         sub     \$64,%rsp
1351
1352         mov     120($context),%rax      # pull context->Rax
1353         mov     248($context),%rbx      # pull context->Rip
1354
1355         mov     8($disp),%rsi           # disp->ImageBase
1356         mov     56($disp),%r11          # disp->HandlerData
1357
1358         mov     0(%r11),%r10d           # HandlerData[0]
1359         lea     (%rsi,%r10),%r10        # end of prologue label
1360         cmp     %r10,%rbx               # context->Rip<.Lbody
1361         jb      .Lin_prologue
1362
1363         mov     152($context),%rax      # pull context->Rsp
1364
1365         mov     4(%r11),%r10d           # HandlerData[1]
1366         lea     (%rsi,%r10),%r10        # epilogue label
1367         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
1368         jae     .Lin_prologue
1369
1370         mov     `16*17`(%rax),%rax      # pull saved stack pointer
1371
1372         mov     -8(%rax),%rbx
1373         mov     -16(%rax),%rbp
1374         mov     %rbx,144($context)      # restore context->Rbx
1375         mov     %rbp,160($context)      # restore context->Rbp
1376
1377         lea     -24-10*16(%rax),%rsi
1378         lea     512($context),%rdi      # &context.Xmm6
1379         mov     \$20,%ecx
1380         .long   0xa548f3fc              # cld; rep movsq
1381
1382 .Lin_prologue:
1383         mov     8(%rax),%rdi
1384         mov     16(%rax),%rsi
1385         mov     %rax,152($context)      # restore context->Rsp
1386         mov     %rsi,168($context)      # restore context->Rsi
1387         mov     %rdi,176($context)      # restore context->Rdi
1388
1389         mov     40($disp),%rdi          # disp->ContextRecord
1390         mov     $context,%rsi           # context
1391         mov     \$154,%ecx              # sizeof(CONTEXT)
1392         .long   0xa548f3fc              # cld; rep movsq
1393
1394         mov     $disp,%rsi
1395         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1396         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1397         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1398         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1399         mov     40(%rsi),%r10           # disp->ContextRecord
1400         lea     56(%rsi),%r11           # &disp->HandlerData
1401         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1402         mov     %r10,32(%rsp)           # arg5
1403         mov     %r11,40(%rsp)           # arg6
1404         mov     %r12,48(%rsp)           # arg7
1405         mov     %rcx,56(%rsp)           # arg8, (NULL)
1406         call    *__imp_RtlVirtualUnwind(%rip)
1407
1408         mov     \$1,%eax                # ExceptionContinueSearch
1409         add     \$64,%rsp
1410         popfq
1411         pop     %r15
1412         pop     %r14
1413         pop     %r13
1414         pop     %r12
1415         pop     %rbp
1416         pop     %rbx
1417         pop     %rdi
1418         pop     %rsi
1419         ret
1420 .size   se_handler,.-se_handler
1421 ___
# avx2_handler: Win64 exception handler for sha1_multi_block_avx2, emitted
# only when the AVX2 code path is built ($avx>1).  It mirrors se_handler
# but additionally recovers %r12..%r15, which the AVX2 prologue pushes, and
# then continues at .Lin_prologue inside se_handler for the common tail.
#
# The entry %rsp of the interrupted function lives in its stack frame: the
# AVX2 prologue stores it with `mov %rax,$REG_SZ*17(%rsp)` ($REG_SZ == 32).
# It must therefore be fetched relative to context->Rsp, which is in %rax
# at that point -- exactly as se_handler does with its 16*17 slot.
# Indexing the CONTEXT record itself at that offset would instead read
# bytes from its XMM save area (context.Xmm6 starts at offset 512).
#
# The XMM copy source is -56-10*16 relative to the entry %rsp, i.e. the
# -0xd8 offset where the AVX2 prologue stores %xmm6.
$code.=<<___ if ($avx>1);
.type   avx2_handler,\@abi-omnipotent
.align  16
avx2_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<body label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        mov     `32*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore cotnext->R12
        mov     %r13,224($context)      # restore cotnext->R13
        mov     %r14,232($context)      # restore cotnext->R14
        mov     %r15,240($context)      # restore cotnext->R15

        lea     -56-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

        jmp     .Lin_prologue
.size   avx2_handler,.-avx2_handler
___
# .pdata: one (begin, end, unwind-info) triple of image-relative addresses
# per function.  The .LSEH_begin_*/.LSEH_end_* labels are not defined in
# this section of the file.
# NOTE(review): presumably they are produced for each function by the
# perlasm translator -- confirm.
1478 $code.=<<___;
1479 .section        .pdata
1480 .align  4
1481         .rva    .LSEH_begin_sha1_multi_block
1482         .rva    .LSEH_end_sha1_multi_block
1483         .rva    .LSEH_info_sha1_multi_block
1484         .rva    .LSEH_begin_sha1_multi_block_shaext
1485         .rva    .LSEH_end_sha1_multi_block_shaext
1486         .rva    .LSEH_info_sha1_multi_block_shaext
1487 ___
# Entries for the AVX and AVX2 functions exist only when those code paths
# are generated.
1488 $code.=<<___ if ($avx);
1489         .rva    .LSEH_begin_sha1_multi_block_avx
1490         .rva    .LSEH_end_sha1_multi_block_avx
1491         .rva    .LSEH_info_sha1_multi_block_avx
1492 ___
1493 $code.=<<___ if ($avx>1);
1494         .rva    .LSEH_begin_sha1_multi_block_avx2
1495         .rva    .LSEH_end_sha1_multi_block_avx2
1496         .rva    .LSEH_info_sha1_multi_block_avx2
1497 ___
# .xdata: per-function unwind info.  Each record names the handler and is
# followed by two image-relative labels that the handler reads as
# HandlerData[0] (end of prologue) and HandlerData[1] (epilogue).
# NOTE(review): the leading `.byte 9,0,0,0` presumably encodes unwind-info
# version 1 with the exception-handler flag and no unwind codes -- confirm
# against the Win64 UNWIND_INFO layout.
1498 $code.=<<___;
1499 .section        .xdata
1500 .align  8
1501 .LSEH_info_sha1_multi_block:
1502         .byte   9,0,0,0
1503         .rva    se_handler
1504         .rva    .Lbody,.Lepilogue                       # HandlerData[]
1505 .LSEH_info_sha1_multi_block_shaext:
1506         .byte   9,0,0,0
1507         .rva    se_handler
1508         .rva    .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
1509 ___
1510 $code.=<<___ if ($avx);
1511 .LSEH_info_sha1_multi_block_avx:
1512         .byte   9,0,0,0
1513         .rva    se_handler
1514         .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
1515 ___
# The AVX2 function saves more registers, so it gets its own handler.
1516 $code.=<<___ if ($avx>1);
1517 .LSEH_info_sha1_multi_block_avx2:
1518         .byte   9,0,0,0
1519         .rva    avx2_handler
1520         .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
1521 ___
# End of the `if ($win64)` section.
1522 }
1523 ####################################################################
1524
# rex(\@opcode, $dst, $src)
#
# Prepend a REX prefix byte to the opcode byte list when either register
# number needs one: bit 0x04 is set for $dst >= 8 and bit 0x01 for
# $src >= 8.  The list referenced by the first argument is modified in
# place; nothing is prepended when both registers are below 8.
sub rex {
    my ($opcode_ref, $dst, $src) = @_;
    my $prefix_bits = 0;

    $prefix_bits |= 0x04 if ($dst >= 8);
    $prefix_bits |= 0x01 if ($src >= 8);
    unshift @{$opcode_ref}, $prefix_bits | 0x40 if ($prefix_bits);
}
1534
# sha1rnds4($operands)
#
# Translate the operand string of a `sha1rnds4 $imm,%xmmS,%xmmD`
# instruction into an explicit `.byte` sequence: optional REX prefix,
# opcode 0x0f,0x3a,0xcc, a register-direct ModR/M byte and the immediate
# (passed through oct() when it starts with "0", so hex/octal spellings
# are converted to a number).  Operand strings that do not have the
# immediate/xmm/xmm shape are returned unchanged behind the mnemonic.
sub sha1rnds4 {
    my ($operands) = @_;
    my ($imm, $src, $dst) =
        $operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/;

    # Not the register form: leave the instruction to the assembler.
    return "sha1rnds4\t".$operands unless (defined($dst));

    my @opcode = (0x0f, 0x3a, 0xcc);
    rex(\@opcode, $dst, $src);
    push @opcode, 0xc0|($src&7)|(($dst&7)<<3);          # ModR/M
    push @opcode, ($imm =~ /^0/ ? oct($imm) : $imm);
    return ".byte\t".join(',', @opcode);
}
1547
# sha1op38($instr, $operands)
#
# Translate `sha1nexte`, `sha1msg1` or `sha1msg2` with two %xmm register
# operands into an explicit `.byte` sequence: optional REX prefix, the
# 0x0f,0x38 escape, the instruction's opcode byte and a register-direct
# ModR/M byte.  Any other mnemonic, or operands that are not two %xmm
# registers, are returned unchanged as "mnemonic<TAB>operands".
sub sha1op38 {
    my ($mnemonic, $operands) = @_;
    my %opcodelet = (
                "sha1nexte" => 0xc8,
                "sha1msg1"  => 0xc9,
                "sha1msg2"  => 0xca     );
    my $opcode_byte = $opcodelet{$mnemonic};

    # Only look at the operands for a known mnemonic.
    my ($src, $dst) = defined($opcode_byte)
                    ? $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/
                    : ();

    return $mnemonic."\t".$operands unless (defined($dst));

    my @opcode = (0x0f, 0x38);
    rex(\@opcode, $dst, $src);
    push @opcode, $opcode_byte;
    push @opcode, 0xc0|($src&7)|(($dst&7)<<3);          # ModR/M
    return ".byte\t".join(',', @opcode);
}
1565
# Final pass: post-process the accumulated assembly line by line and write
# it to STDOUT.
#
# 1. Backtick expressions that are still present are evaluated as Perl.
# 2. Exactly one of the following rewrites is applied per line (they are
#    chained with `or`, so the first one that matches wins):
#    - sha1rnds4 / other sha1* mnemonics are turned into `.byte` sequences
#      by sha1rnds4() and sha1op38();
#    - instructions that only accept 128-bit registers (vmovd/vmovq,
#      vpinsr[qd], vpextr[qd], vpbroadcast[qd] source) get their %ymmN
#      operands rewritten to %xmmN;
#    - `vmovdqu` with an operand spelled `%x%ymmN` is narrowed to %xmmN;
#    - `vinserti128` gets an explicit `$1` immediate and its first operand
#      narrowed to %xmm.
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;

        s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo                or
        s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo             or

        s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go          or
        s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go         or
        s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go    or
        s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go        or
        s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go            or
        s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;

        print $_,"\n";
}

# Buffered write errors (full disk, broken pipe to the translator) only
# surface when the handle is closed; fail loudly instead of silently
# leaving truncated assembly behind.
close STDOUT or die "error closing STDOUT: $!";