Update copyright year
[openssl.git] / crypto / sha / asm / sha1-mb-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # The multi-buffer SHA1 procedure processes n buffers in parallel by
18 # placing each buffer's data in a designated lane of a SIMD register.
19 # n is naturally limited to 4 on pre-AVX2 processors and to 8 on
20 # AVX2-capable processors such as Haswell.
21 #
22 #               this    +aesni(i)       sha1    aesni-sha1      gain(iv)
23 # -------------------------------------------------------------------
24 # Westmere(ii)  10.7/n  +1.28=3.96(n=4) 5.30    6.66            +68%
25 # Atom(ii)      18.1/n  +3.93=8.46(n=4) 9.37    12.8            +51%
26 # Sandy Bridge  (8.16   +5.15=13.3)/n   4.99    5.98            +80%
27 # Ivy Bridge    (8.08   +5.14=13.2)/n   4.60    5.54            +68%
28 # Haswell(iii)  (8.96   +5.00=14.0)/n   3.57    4.55            +160%
29 # Skylake       (8.70   +5.00=13.7)/n   3.64    4.20            +145%
30 # Bulldozer     (9.76   +5.76=15.5)/n   5.95    6.37            +64%
31 #
32 # (i)   multi-block CBC encrypt with 128-bit key;
33 # (ii)  (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
34 #       because of lower AES-NI instruction throughput;
35 # (iii) "this" is for n=8, when we gather twice as much data, result
36 #       for n=4 is 8.00+4.44=12.4;
37 # (iv)  presented improvement coefficients are asymptotic limits and
38 #       in real-life application are somewhat lower, e.g. for 2KB
39 #       fragments they range from 30% to 100% (on Haswell);
40
41 # $output is the last argument if it looks like a file (it has an extension)
42 # $flavour is the first argument if it doesn't look like a file
43 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
44 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
45
# Win64 code generation is selected by a nasm/masm/mingw64 flavour or by an
# output file name ending in .asm.
46 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
47
# Look for the translator script next to this file first, then in
# ../../perlasm/ relative to it.
48 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
49 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
50 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
51 die "can't locate x86_64-xlate.pl";
52
# NOTE(review): x86_64-support.pl is presumably where pointer_size() and
# pointer_register() (used further down) are defined - confirm.
53 push(@INC,"${dir}","${dir}../../perlasm");
54 require "x86_64-support.pl";
55
56 $ptr_size=&pointer_size($flavour);
57
# $avx stays 0 unless one of the tool probes below recognises a new enough
# assembler/compiler; each probe yields 0, 1 or 2 from two version thresholds.
# Non-zero enables the "if ($avx)" parts of this script.
# NOTE(review): 1 vs 2 presumably distinguishes AVX from AVX2 capability of
# the toolchain - confirm against the code guarded by $avx>1.
58 $avx=0;
59
# Probe 1: GNU assembler, queried through the C compiler named in $ENV{CC}.
60 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
61                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
62         $avx = ($1>=2.19) + ($1>=2.22);
63 }
64
# Probe 2: NASM, only consulted for Win64 builds that ask for nasm.
65 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
66            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
67         $avx = ($1>=2.09) + ($1>=2.10);
68 }
69
# Probe 3: Microsoft ml64, only consulted for Win64 builds that ask for masm.
70 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
71            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
72         $avx = ($1>=10) + ($1>=11);
73 }
74
# Probe 4: clang/LLVM, identified from the "$CC -v" banner.
75 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
76         $avx = ($2>=3.0) + ($2>3.0);
77 }
78
# Everything written to STDOUT from here on is piped through the translator
# script, which is handed $flavour and the output file name.
79 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
80     or die "can't call $xlate: $!";
81 *STDOUT=*OUT;
83 # void sha1_multi_block (
84 #     struct {  unsigned int A[8];
85 #               unsigned int B[8];
86 #               unsigned int C[8];
87 #               unsigned int D[8];
88 #               unsigned int E[8];      } *ctx,
89 #     struct {  void *ptr; int blocks;  } inp[8],
90 #     int num);         /* 1 or 2 */
91 #
# Names of the general-purpose registers used by the generated code.
92 $ctx="%rdi";    # 1st arg
93 $inp="%rsi";    # 2nd arg
94 $num="%edx";    # 3rd arg
95 @ptr=map("%r$_",(8..11));       # %r8..%r11, one input pointer per lane
96 $Tbl="%rbp";    # loaded with the address of K_XX_XX in the function body
97 $inp_elm_size=2*$ptr_size;      # stride of one inp[] element: two pointer-sized slots
98
# Initial SIMD register assignment: working variables in %xmm0-4, temporaries
# in %xmm5-9, message schedule window in %xmm10-14, round constant in %xmm15.
99 @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
100 ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
101 @Xi=map("%xmm$_",(10..14));
102 $K="%xmm15";
103
# This branch is always taken, so the assignment above is overridden: the
# schedule window moves to %xmm0-4 and the working variables to %xmm10-14.
# Note that the order of the temporaries changes as well ($tx comes first).
104 if (1) {
105     # Atom-specific optimization aiming to eliminate pshufb with high
106     # registers [and thus get rid of 48 cycles accumulated penalty]
107     @Xi=map("%xmm$_",(0..4));
108     ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
109     @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
110 }
111
# Width in bytes of one SIMD register in this code path; Xi_off() and the
# stack-frame layout are expressed in multiples of it.
112 $REG_SZ=16;
113
# Xi_off($idx): return the memory operand text for slot ($idx mod 16) of the
# 16-entry schedule ring. Each slot is $REG_SZ bytes wide. Displacements below
# 256 are addressed relative to %rax, the rest relative to %rbx; both forms
# carry a constant -128 bias.
114 sub Xi_off {
115 my ($idx) = @_;
116 my $disp = ($idx % 16) * $REG_SZ;
117
118     return "$disp-256-128(%rbx)" if ($disp >= 256);
119     return "$disp-128(%rax)";
120 }
120
# BODY_00_19($i,$a,$b,$c,$d,$e): append to $code the SSE instructions for
# round $i of the first 20 rounds, i.e. the ones using Ch(b,c,d) with the
# round constant held in $K. $a..$e are the names of the registers currently
# playing the roles of the five working variables.
#
# The round arithmetic is interleaved with other work that depends on $i:
#   $i==0    additionally gathers and byte-swaps the first word of each of
#            the four input streams in @ptr and advances the pointers by one
#            64-byte block;
#   $i<14    gathers input words for the following rounds;
#   $i==14   finishes the last gather and issues prefetches instead;
#   $i>=15   computes the next message schedule word (Xupdate) instead.
# Every variant stores @Xi[0] into the ring slot given by Xi_off($i).
# @Xi is rotated by one position before returning.
121 sub BODY_00_19 {
122 my ($i,$a,$b,$c,$d,$e)=@_;
123 my $j=$i+1;
124 my $k=$i+2;
125
126 # Loads are performed 2+3/4 iterations in advance. 3/4 means that out
127 # of 4 words you would expect to be loaded per given iteration one is
128 # spilled to next iteration. In other words indices in four input
129 # streams are distributed as following:
130 #
131 # $i==0:        0,0,0,0,1,1,1,1,2,2,2,
132 # $i==1:        2,3,3,3,
133 # $i==2:        3,4,4,4,
134 # ...
135 # $i==13:       14,15,15,15,
136 # $i==14:       15
137 #
138 # Then at $i==15 Xupdate is applied one iteration in advance...
139 $code.=<<___ if ($i==0);
140         movd            (@ptr[0]),@Xi[0]
141          lea            `16*4`(@ptr[0]),@ptr[0]
142         movd            (@ptr[1]),@Xi[2]        # borrow @Xi[2]
143          lea            `16*4`(@ptr[1]),@ptr[1]
144         movd            (@ptr[2]),@Xi[3]        # borrow @Xi[3]
145          lea            `16*4`(@ptr[2]),@ptr[2]
146         movd            (@ptr[3]),@Xi[4]        # borrow @Xi[4]
147          lea            `16*4`(@ptr[3]),@ptr[3]
148         punpckldq       @Xi[3],@Xi[0]
149          movd           `4*$j-16*4`(@ptr[0]),@Xi[1]
150         punpckldq       @Xi[4],@Xi[2]
151          movd           `4*$j-16*4`(@ptr[1]),$t3
152         punpckldq       @Xi[2],@Xi[0]
153          movd           `4*$j-16*4`(@ptr[2]),$t2
154         pshufb          $tx,@Xi[0]
155 ___
# Rounds 0..13: finish gathering word $j, start gathering word $k, and do the
# round itself. The one-space extra indent marks the load/shuffle stream.
156 $code.=<<___ if ($i<14);                        # just load input
157          movd           `4*$j-16*4`(@ptr[3]),$t1
158          punpckldq      $t2,@Xi[1]
159         movdqa  $a,$t2
160         paddd   $K,$e                           # e+=K_00_19
161          punpckldq      $t1,$t3
162         movdqa  $b,$t1
163         movdqa  $b,$t0
164         pslld   \$5,$t2
165         pandn   $d,$t1
166         pand    $c,$t0
167          punpckldq      $t3,@Xi[1]
168         movdqa  $a,$t3
169
170         movdqa  @Xi[0],`&Xi_off($i)`
171         paddd   @Xi[0],$e                       # e+=X[i]
172          movd           `4*$k-16*4`(@ptr[0]),@Xi[2]
173         psrld   \$27,$t3
174         pxor    $t1,$t0                         # Ch(b,c,d)
175         movdqa  $b,$t1
176
177         por     $t3,$t2                         # rol(a,5)
178          movd           `4*$k-16*4`(@ptr[1]),$t3
179         pslld   \$30,$t1
180         paddd   $t0,$e                          # e+=Ch(b,c,d)
181
182         psrld   \$2,$b
183         paddd   $t2,$e                          # e+=rol(a,5)
184          pshufb $tx,@Xi[1]
185          movd           `4*$k-16*4`(@ptr[2]),$t2
186         por     $t1,$b                          # b=rol(b,30)
187 ___
# Round 14: same round arithmetic, but there is no word $k left to gather.
# The freed slots carry prefetches at 63(@ptr[n]); the pointers were already
# advanced by 64 at $i==0, so these addresses lie in the following block.
188 $code.=<<___ if ($i==14);                       # just load input
189          movd           `4*$j-16*4`(@ptr[3]),$t1
190          punpckldq      $t2,@Xi[1]
191         movdqa  $a,$t2
192         paddd   $K,$e                           # e+=K_00_19
193          punpckldq      $t1,$t3
194         movdqa  $b,$t1
195         movdqa  $b,$t0
196         pslld   \$5,$t2
197          prefetcht0     63(@ptr[0])
198         pandn   $d,$t1
199         pand    $c,$t0
200          punpckldq      $t3,@Xi[1]
201         movdqa  $a,$t3
202
203         movdqa  @Xi[0],`&Xi_off($i)`
204         paddd   @Xi[0],$e                       # e+=X[i]
205         psrld   \$27,$t3
206         pxor    $t1,$t0                         # Ch(b,c,d)
207         movdqa  $b,$t1
208          prefetcht0     63(@ptr[1])
209
210         por     $t3,$t2                         # rol(a,5)
211         pslld   \$30,$t1
212         paddd   $t0,$e                          # e+=Ch(b,c,d)
213          prefetcht0     63(@ptr[2])
214
215         psrld   \$2,$b
216         paddd   $t2,$e                          # e+=rol(a,5)
217          pshufb $tx,@Xi[1]
218          prefetcht0     63(@ptr[3])
219         por     $t1,$b                          # b=rol(b,30)
220 ___
# Rounds 13 and 14 also reload the ring slot Xi_off($j+2) into @Xi[3]. After
# two rotations of @Xi that register is the @Xi[1] which the Xupdate code
# below accumulates into.
221 $code.=<<___ if ($i>=13 && $i<15);
222         movdqa  `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
223 ___
# Rounds 15..19: no more input to load; the next schedule word is formed in
# @Xi[1] by XORing in @Xi[-2] and the ring slots at $j+2 and $j+8, then
# rotating left by one (psrld 31 / paddd / por).
224 $code.=<<___ if ($i>=15);                       # apply Xupdate
225         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
226         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
227
228         movdqa  $a,$t2
229          pxor   `&Xi_off($j+8)`,@Xi[1]
230         paddd   $K,$e                           # e+=K_00_19
231         movdqa  $b,$t1
232         pslld   \$5,$t2
233          pxor   @Xi[3],@Xi[1]
234         movdqa  $b,$t0
235         pandn   $d,$t1
236          movdqa @Xi[1],$tx
237         pand    $c,$t0
238         movdqa  $a,$t3
239          psrld  \$31,$tx
240          paddd  @Xi[1],@Xi[1]
241
242         movdqa  @Xi[0],`&Xi_off($i)`
243         paddd   @Xi[0],$e                       # e+=X[i]
244         psrld   \$27,$t3
245         pxor    $t1,$t0                         # Ch(b,c,d)
246
247         movdqa  $b,$t1
248         por     $t3,$t2                         # rol(a,5)
249         pslld   \$30,$t1
250         paddd   $t0,$e                          # e+=Ch(b,c,d)
251
252         psrld   \$2,$b
253         paddd   $t2,$e                          # e+=rol(a,5)
254          por    $tx,@Xi[1]                      # rol   \$1,@Xi[1]
255         por     $t1,$b                          # b=rol(b,30)
256 ___
# Rotate the schedule window so that the next round finds its word in @Xi[0].
257 push(@Xi,shift(@Xi));
258 }
259
# BODY_20_39($i,$a,$b,$c,$d,$e): append to $code the SSE instructions for one
# round that uses Parity(b,c,d) (b^c^d) with the round constant held in $K.
# The main driver calls it for rounds 20..39 and again for rounds 60..79.
# $a..$e are the names of the registers currently playing the roles of the
# five working variables.
#
# For $i<79 the round is interleaved with the computation of the next message
# schedule word in @Xi[1]; round 79 is emitted without it. @Xi is rotated by
# one position before returning.
260 sub BODY_20_39 {
261 my ($i,$a,$b,$c,$d,$e)=@_;
262 my $j=$i+1;
263
# First part of a regular round: start Xupdate and the Parity computation.
264 $code.=<<___ if ($i<79);
265         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
266         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
267
268         movdqa  $a,$t2
269         movdqa  $d,$t0
270          pxor   `&Xi_off($j+8)`,@Xi[1]
271         paddd   $K,$e                           # e+=K_20_39
272         pslld   \$5,$t2
273         pxor    $b,$t0
274
275         movdqa  $a,$t3
276 ___
# The current word is written back to the ring only up to round 71.
# NOTE(review): presumably later words are never read back from memory by the
# remaining rounds - confirm before changing this bound.
277 $code.=<<___ if ($i<72);
278         movdqa  @Xi[0],`&Xi_off($i)`
279 ___
# Second part of a regular round: finish Xupdate (rotate left by one via
# psrld 31 / paddd / por) and accumulate into $e.
280 $code.=<<___ if ($i<79);
281         paddd   @Xi[0],$e                       # e+=X[i]
282          pxor   @Xi[3],@Xi[1]
283         psrld   \$27,$t3
284         pxor    $c,$t0                          # Parity(b,c,d)
285         movdqa  $b,$t1
286
287         pslld   \$30,$t1
288          movdqa @Xi[1],$tx
289         por     $t3,$t2                         # rol(a,5)
290          psrld  \$31,$tx
291         paddd   $t0,$e                          # e+=Parity(b,c,d)
292          paddd  @Xi[1],@Xi[1]
293
294         psrld   \$2,$b
295         paddd   $t2,$e                          # e+=rol(a,5)
296          por    $tx,@Xi[1]                      # rol(@Xi[1],1)
297         por     $t1,$b                          # b=rol(b,30)
298 ___
# Final round: round arithmetic only, no schedule update and no store.
299 $code.=<<___ if ($i==79);
300         movdqa  $a,$t2
301         paddd   $K,$e                           # e+=K_20_39
302         movdqa  $d,$t0
303         pslld   \$5,$t2
304         pxor    $b,$t0
305
306         movdqa  $a,$t3
307         paddd   @Xi[0],$e                       # e+=X[i]
308         psrld   \$27,$t3
309         movdqa  $b,$t1
310         pxor    $c,$t0                          # Parity(b,c,d)
311
312         pslld   \$30,$t1
313         por     $t3,$t2                         # rol(a,5)
314         paddd   $t0,$e                          # e+=Parity(b,c,d)
315
316         psrld   \$2,$b
317         paddd   $t2,$e                          # e+=rol(a,5)
318         por     $t1,$b                          # b=rol(b,30)
319 ___
# Rotate the schedule window so that the next round finds its word in @Xi[0].
320 push(@Xi,shift(@Xi));
321 }
322
# BODY_40_59($i,$a,$b,$c,$d,$e): append to $code the SSE instructions for one
# round that uses the majority function with the round constant held in $K.
# $a..$e are the names of the registers currently playing the roles of the
# five working variables.
#
# Maj is accumulated into $e in two steps: first (c & d), then ((c ^ d) & b).
# The round is interleaved with the computation of the next message schedule
# word in @Xi[1] (XOR of @Xi[-2] and ring slots $j+2 and $j+8, rotated left
# by one). @Xi[0] is stored to ring slot Xi_off($i) and @Xi is rotated by one
# position before returning.
#
# The trailing remark on the rotate used to read "rol(@X[1],1)"; there is no
# array @X, so the operand vanished from the generated comment. It now names
# @Xi[1], the same as BODY_20_39 does.
323 sub BODY_40_59 {
324 my ($i,$a,$b,$c,$d,$e)=@_;
325 my $j=$i+1;
326
327 $code.=<<___;
328         pxor    @Xi[-2],@Xi[1]                  # "X[13]"
329         movdqa  `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
330
331         movdqa  $a,$t2
332         movdqa  $d,$t1
333          pxor   `&Xi_off($j+8)`,@Xi[1]
334         pxor    @Xi[3],@Xi[1]
335         paddd   $K,$e                           # e+=K_40_59
336         pslld   \$5,$t2
337         movdqa  $a,$t3
338         pand    $c,$t1
339
340         movdqa  $d,$t0
341          movdqa @Xi[1],$tx
342         psrld   \$27,$t3
343         paddd   $t1,$e
344         pxor    $c,$t0
345
346         movdqa  @Xi[0],`&Xi_off($i)`
347         paddd   @Xi[0],$e                       # e+=X[i]
348         por     $t3,$t2                         # rol(a,5)
349          psrld  \$31,$tx
350         pand    $b,$t0
351         movdqa  $b,$t1
352
353         pslld   \$30,$t1
354          paddd  @Xi[1],@Xi[1]
355         paddd   $t0,$e                          # e+=Maj(b,d,c)
356
357         psrld   \$2,$b
358         paddd   $t2,$e                          # e+=rol(a,5)
359          por    $tx,@Xi[1]                      # rol(@Xi[1],1)
360         por     $t1,$b                          # b=rol(b,30)
361 ___
# Rotate the schedule window so that the next round finds its word in @Xi[0].
362 push(@Xi,shift(@Xi));
363 }
364
# Entry point of sha1_multi_block (declared to the translator as a function
# taking 3 arguments). Capability bits are read from OPENSSL_ia32cap_P+4;
# if bit 61 is set, control transfers to _shaext_shortcut.
365 $code.=<<___;
366 .text
367
368 .extern OPENSSL_ia32cap_P
369
370 .globl  sha1_multi_block
371 .type   sha1_multi_block,\@function,3
372 .align  32
373 sha1_multi_block:
374 .cfi_startproc
375         mov     OPENSSL_ia32cap_P+4(%rip),%rcx
376         bt      \$61,%rcx                       # check SHA bit
377         jc      _shaext_shortcut
378 ___
# Emitted only when the toolchain probe set $avx: if bit 28 of the same
# capability word is set, control transfers to _avx_shortcut.
379 $code.=<<___ if ($avx);
380         test    \$`1<<28`,%ecx
381         jnz     _avx_shortcut
382 ___
# Frame setup for the SSE path: remember the entry %rsp in %rax (which also
# becomes the CFA register) and save the two callee-saved GPRs this function
# uses, %rbx and %rbp.
#
# Each push must be annotated with the register it actually saves. The %rbp
# push was previously annotated as a second %rbx push, which left the unwinder
# with no record of where %rbp lives and contradicted the ".cfi_restore %rbp"
# in the epilogue.
383 $code.=<<___;
384         mov     %rsp,%rax
385 .cfi_def_cfa_register   %rax
386         push    %rbx
387 .cfi_push       %rbx
388         push    %rbp
389 .cfi_push       %rbp
390 ___
# Win64 only: reserve 0xa8 bytes below the two saved GPRs and save
# %xmm6-%xmm15 there. The first four are addressed from the new %rsp, the
# rest from %rax (the entry %rsp); the epilogue restores all ten via %rax.
391 $code.=<<___ if ($win64);
392         lea     -0xa8(%rsp),%rsp
393         movaps  %xmm6,(%rsp)
394         movaps  %xmm7,0x10(%rsp)
395         movaps  %xmm8,0x20(%rsp)
396         movaps  %xmm9,0x30(%rsp)
397         movaps  %xmm10,-0x78(%rax)
398         movaps  %xmm11,-0x68(%rax)
399         movaps  %xmm12,-0x58(%rax)
400         movaps  %xmm13,-0x48(%rax)
401         movaps  %xmm14,-0x38(%rax)
402         movaps  %xmm15,-0x28(%rax)
403 ___
# Scratch frame: $REG_SZ*18 bytes, aligned down to 256. Layout as used below:
#   0 .. $REG_SZ*16-1      ring of 16 schedule slots (see Xi_off)
#   $REG_SZ*16             per-lane block counters, addressed through %rbx
#   $REG_SZ*17             entry %rsp
#   $REG_SZ*17+8           $num as it was at the top of the outer loop
# $Tbl is pointed at K_XX_XX. Each pass of .Loop_grande saves $num and then
# clears it so it can collect the maximum block count of the lanes.
404 $code.=<<___;
405         sub     \$`$REG_SZ*18`,%rsp
406         and     \$-256,%rsp
407         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
408 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
409 .Lbody:
410         lea     K_XX_XX(%rip),$Tbl
411         lea     `$REG_SZ*16`(%rsp),%rbx
412
413 .Loop_grande:
414         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
415         xor     $num,$num
416 ___
# Per-lane setup, unrolled for the four lanes: load the input pointer and the
# block count from inp[$i], keep the largest count in $num, store the count in
# the counter slot 4*$i(%rbx), and replace the pointer with $Tbl when the
# count is <= 0.
417 for($i=0;$i<4;$i++) {
418     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
419     $code.=<<___;
420         # input pointer
421         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
422         # number of blocks
423         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
424         cmp     $num,%ecx
425         cmovg   %ecx,$num                       # find maximum
426         test    %ecx,%ecx
427         mov     %ecx,`4*$i`(%rbx)               # initialize counters
428         cmovle  $Tbl,@ptr[$i]                   # cancel input
429 ___
430 }
# Skip everything if no lane has work. Otherwise load the five state vectors
# from $ctx (0x20 bytes apart), point %rax 128 bytes into the frame (Xi_off
# operands carry a matching -128 bias), and load the byte-swap mask and the
# first round constant from the table.
431 $code.=<<___;
432         test    $num,$num
433         jz      .Ldone
434
435         movdqu  0x00($ctx),$A                   # load context
436          lea    128(%rsp),%rax
437         movdqu  0x20($ctx),$B
438         movdqu  0x40($ctx),$C
439         movdqu  0x60($ctx),$D
440         movdqu  0x80($ctx),$E
441         movdqa  0x60($Tbl),$tx                  # pbswap_mask
442         movdqa  -0x20($Tbl),$K                  # K_00_19
443         jmp     .Loop
444
445 .align  32
446 .Loop:
447 ___
# The 80 rounds, fully unrolled at generation time. @V is rotated after every
# round so that the register roles shift instead of the data, and $K is
# reloaded from the table at each 20-round boundary.
448 for($i=0;$i<20;$i++)    { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
449 $code.="        movdqa  0x00($Tbl),$K\n";       # K_20_39
450 for(;$i<40;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
451 $code.="        movdqa  0x20($Tbl),$K\n";       # K_40_59
452 for(;$i<60;$i++)        { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
453 $code.="        movdqa  0x40($Tbl),$K\n";       # K_60_79
454 for(;$i<80;$i++)        { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
# Loop tail. A per-lane mask (all ones where the counter is > 0) is built from
# the counters. Adding it to the counters decrements the active ones, and
# ANDing it into the working variables zeroes the lanes that were already
# finished, so adding the old context leaves those lanes unchanged. Lanes
# whose counter is <= 1 get their input pointer replaced with $Tbl. After $num
# inner iterations, $ctx advances by $REG_SZ bytes and $inp by $REG_SZ/4
# elements, and the outer loop repeats while the saved $num, decremented,
# is non-zero.
455 $code.=<<___;
456         movdqa  (%rbx),@Xi[0]                   # pull counters
457         mov     \$1,%ecx
458         cmp     4*0(%rbx),%ecx                  # examine counters
459         pxor    $t2,$t2
460         cmovge  $Tbl,@ptr[0]                    # cancel input
461         cmp     4*1(%rbx),%ecx
462         movdqa  @Xi[0],@Xi[1]
463         cmovge  $Tbl,@ptr[1]
464         cmp     4*2(%rbx),%ecx
465         pcmpgtd $t2,@Xi[1]                      # mask value
466         cmovge  $Tbl,@ptr[2]
467         cmp     4*3(%rbx),%ecx
468         paddd   @Xi[1],@Xi[0]                   # counters--
469         cmovge  $Tbl,@ptr[3]
470
471         movdqu  0x00($ctx),$t0
472         pand    @Xi[1],$A
473         movdqu  0x20($ctx),$t1
474         pand    @Xi[1],$B
475         paddd   $t0,$A
476         movdqu  0x40($ctx),$t2
477         pand    @Xi[1],$C
478         paddd   $t1,$B
479         movdqu  0x60($ctx),$t3
480         pand    @Xi[1],$D
481         paddd   $t2,$C
482         movdqu  0x80($ctx),$tx
483         pand    @Xi[1],$E
484         movdqu  $A,0x00($ctx)
485         paddd   $t3,$D
486         movdqu  $B,0x20($ctx)
487         paddd   $tx,$E
488         movdqu  $C,0x40($ctx)
489         movdqu  $D,0x60($ctx)
490         movdqu  $E,0x80($ctx)
491
492         movdqa  @Xi[0],(%rbx)                   # save counters
493         movdqa  0x60($Tbl),$tx                  # pbswap_mask
494         movdqa  -0x20($Tbl),$K                  # K_00_19
495         dec     $num
496         jnz     .Loop
497
498         mov     `$REG_SZ*17+8`(%rsp),$num
499         lea     $REG_SZ($ctx),$ctx
500         lea     `$inp_elm_size*$REG_SZ/4`($inp),$inp
501         dec     $num
502         jnz     .Loop_grande
503
504 .Ldone:
505         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
506 .cfi_def_cfa    %rax,8
507 ___
# Win64 only: restore %xmm6-%xmm15 from the slots filled in the prologue,
# all addressed relative to the entry %rsp now back in %rax.
508 $code.=<<___ if ($win64);
509         movaps  -0xb8(%rax),%xmm6
510         movaps  -0xa8(%rax),%xmm7
511         movaps  -0x98(%rax),%xmm8
512         movaps  -0x88(%rax),%xmm9
513         movaps  -0x78(%rax),%xmm10
514         movaps  -0x68(%rax),%xmm11
515         movaps  -0x58(%rax),%xmm12
516         movaps  -0x48(%rax),%xmm13
517         movaps  -0x38(%rax),%xmm14
518         movaps  -0x28(%rax),%xmm15
519 ___
# Common epilogue: reload %rbp and %rbx from just below the entry %rsp,
# restore %rsp and return.
520 $code.=<<___;
521         mov     -16(%rax),%rbp
522 .cfi_restore    %rbp
523         mov     -8(%rax),%rbx
524 .cfi_restore    %rbx
525         lea     (%rax),%rsp
526 .cfi_def_cfa_register   %rsp
527 .Lepilogue:
528         ret
529 .cfi_endproc
530 .size   sha1_multi_block,.-sha1_multi_block
531 ___
# Generator for sha1_multi_block_shaext, the code path reached through
# _shaext_shortcut. It hashes two input streams at a time using the
# sha1rnds4 / sha1nexte / sha1msg1 / sha1msg2 instructions. The braces open a
# lexical scope so that the register names below stay private to this section.
532                                                 {{{
# Register assignment: state and E values for stream 0 in low registers, for
# stream 1 in %xmm8-10; four message registers per stream.
533 my ($ABCD0,$E0,$E0_,$BSWAP,$ABCD1,$E1,$E1_)=map("%xmm$_",(0..3,8..10));
534 my @MSG0=map("%xmm$_",(4..7));
535 my @MSG1=map("%xmm$_",(11..14));
536
# Prologue: same GPR saves as the SSE path, with %rax holding the entry %rsp.
537 $code.=<<___;
538 .type   sha1_multi_block_shaext,\@function,3
539 .align  32
540 sha1_multi_block_shaext:
541 .cfi_startproc
542 _shaext_shortcut:
543         mov     %rsp,%rax
544 .cfi_def_cfa_register   %rax
545         push    %rbx
546 .cfi_push       %rbx
547         push    %rbp
548 .cfi_push       %rbp
549 ___
# Win64 only: save %xmm6-%xmm15 (restored from the same slots in the epilogue).
550 $code.=<<___ if ($win64);
551         lea     -0xa8(%rsp),%rsp
552         movaps  %xmm6,(%rsp)
553         movaps  %xmm7,0x10(%rsp)
554         movaps  %xmm8,0x20(%rsp)
555         movaps  %xmm9,0x30(%rsp)
556         movaps  %xmm10,-0x78(%rax)
557         movaps  %xmm11,-0x68(%rax)
558         movaps  %xmm12,-0x58(%rax)
559         movaps  %xmm13,-0x48(%rax)
560         movaps  %xmm14,-0x38(%rax)
561         movaps  %xmm15,-0x28(%rax)
562 ___
# Scratch frame as in the SSE path. $num is doubled because each outer pass
# handles a pair of lanes, and $ctx is biased by 0x40, which is why every
# context access below is written as "offset-0x40". %rbx points at the two
# lane counters; $BSWAP is loaded from K_XX_XX+0x80.
563 $code.=<<___;
564         sub     \$`$REG_SZ*18`,%rsp
565         shl     \$1,$num                        # we process pair at a time
566         and     \$-256,%rsp
567         lea     0x40($ctx),$ctx                 # size optimization
568         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
569 .Lbody_shaext:
570         lea     `$REG_SZ*16`(%rsp),%rbx
571         movdqa  K_XX_XX+0x80(%rip),$BSWAP       # byte-n-word swap
572
573 .Loop_grande_shaext:
574         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
575         xor     $num,$num
576 ___
# Per-lane setup for the two lanes of this pass: load pointer and block count,
# keep the larger count in $num, store the count at 4*$i(%rbx), and replace
# the pointer with %rsp when the count is <= 0.
577 for($i=0;$i<2;$i++) {
578     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
579     $code.=<<___;
580         # input pointer
581         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
582         # number of blocks
583         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
584         cmp     $num,%ecx
585         cmovg   %ecx,$num                       # find maximum
586         test    %ecx,%ecx
587         mov     %ecx,`4*$i`(%rbx)               # initialize counters
588         cmovle  %rsp,@ptr[$i]                   # cancel input
589 ___
590 }
# Load the two lanes' state from the interleaved context and transpose it into
# one ABCD register and one E register per lane, then process the first eight
# rounds of a block for both lanes. The incoming state is parked at
# 0x40..0x7f(%rsp) so it can be added back after round 79.
591 $code.=<<___;
592         test    $num,$num
593         jz      .Ldone_shaext
594
595         movq            0x00-0x40($ctx),$ABCD0  # a1.a0
596         movq            0x20-0x40($ctx),@MSG0[0]# b1.b0
597         movq            0x40-0x40($ctx),@MSG0[1]# c1.c0
598         movq            0x60-0x40($ctx),@MSG0[2]# d1.d0
599         movq            0x80-0x40($ctx),@MSG0[3]# e1.e0
600
601         punpckldq       @MSG0[0],$ABCD0         # b1.a1.b0.a0
602         punpckldq       @MSG0[2],@MSG0[1]       # d1.c1.d0.c0
603
604         movdqa          $ABCD0,$ABCD1
605         punpcklqdq      @MSG0[1],$ABCD0         # d0.c0.b0.a0
606         punpckhqdq      @MSG0[1],$ABCD1         # d1.c1.b1.a1
607
608         pshufd          \$0b00111111,@MSG0[3],$E0
609         pshufd          \$0b01111111,@MSG0[3],$E1
610         pshufd          \$0b00011011,$ABCD0,$ABCD0
611         pshufd          \$0b00011011,$ABCD1,$ABCD1
612         jmp             .Loop_shaext
613
614 .align  32
615 .Loop_shaext:
616         movdqu          0x00(@ptr[0]),@MSG0[0]
617          movdqu         0x00(@ptr[1]),@MSG1[0]
618         movdqu          0x10(@ptr[0]),@MSG0[1]
619          movdqu         0x10(@ptr[1]),@MSG1[1]
620         movdqu          0x20(@ptr[0]),@MSG0[2]
621         pshufb          $BSWAP,@MSG0[0]
622          movdqu         0x20(@ptr[1]),@MSG1[2]
623          pshufb         $BSWAP,@MSG1[0]
624         movdqu          0x30(@ptr[0]),@MSG0[3]
625         lea             0x40(@ptr[0]),@ptr[0]
626         pshufb          $BSWAP,@MSG0[1]
627          movdqu         0x30(@ptr[1]),@MSG1[3]
628          lea            0x40(@ptr[1]),@ptr[1]
629          pshufb         $BSWAP,@MSG1[1]
630
631         movdqa          $E0,0x50(%rsp)          # offload
632         paddd           @MSG0[0],$E0
633          movdqa         $E1,0x70(%rsp)
634          paddd          @MSG1[0],$E1
635         movdqa          $ABCD0,0x40(%rsp)       # offload
636         movdqa          $ABCD0,$E0_
637          movdqa         $ABCD1,0x60(%rsp)
638          movdqa         $ABCD1,$E1_
639         sha1rnds4       \$0,$E0,$ABCD0          # 0-3
640         sha1nexte       @MSG0[1],$E0_
641          sha1rnds4      \$0,$E1,$ABCD1          # 0-3
642          sha1nexte      @MSG1[1],$E1_
643         pshufb          $BSWAP,@MSG0[2]
644         prefetcht0      127(@ptr[0])
645         sha1msg1        @MSG0[1],@MSG0[0]
646          pshufb         $BSWAP,@MSG1[2]
647          prefetcht0     127(@ptr[1])
648          sha1msg1       @MSG1[1],@MSG1[0]
649
650         pshufb          $BSWAP,@MSG0[3]
651         movdqa          $ABCD0,$E0
652          pshufb         $BSWAP,@MSG1[3]
653          movdqa         $ABCD1,$E1
654         sha1rnds4       \$0,$E0_,$ABCD0         # 4-7
655         sha1nexte       @MSG0[2],$E0
656          sha1rnds4      \$0,$E1_,$ABCD1         # 4-7
657          sha1nexte      @MSG1[2],$E1
658         pxor            @MSG0[2],@MSG0[0]
659         sha1msg1        @MSG0[2],@MSG0[1]
660          pxor           @MSG1[2],@MSG1[0]
661          sha1msg1       @MSG1[2],@MSG1[1]
662 ___
# Four-round groups 2..15 (rounds 8..63), unrolled at generation time.
# int($i/5) is the immediate handed to sha1rnds4, so it steps 0,1,2,3 every
# five groups. Note that the "# 8-11" remark is literal text and is emitted
# unchanged for every group. After each group the E/E_ names are swapped and
# the message register lists rotated, so the same template serves all groups.
663 for($i=2;$i<20-4;$i++) {
664 $code.=<<___;
665         movdqa          $ABCD0,$E0_
666          movdqa         $ABCD1,$E1_
667         sha1rnds4       \$`int($i/5)`,$E0,$ABCD0        # 8-11
668         sha1nexte       @MSG0[3],$E0_
669          sha1rnds4      \$`int($i/5)`,$E1,$ABCD1        # 8-11
670          sha1nexte      @MSG1[3],$E1_
671         sha1msg2        @MSG0[3],@MSG0[0]
672          sha1msg2       @MSG1[3],@MSG1[0]
673         pxor            @MSG0[3],@MSG0[1]
674         sha1msg1        @MSG0[3],@MSG0[2]
675          pxor           @MSG1[3],@MSG1[1]
676          sha1msg1       @MSG1[3],@MSG1[2]
677 ___
678         ($E0,$E0_)=($E0_,$E0);          ($E1,$E1_)=($E1_,$E1);
679         push(@MSG0,shift(@MSG0));       push(@MSG1,shift(@MSG1));
680 }
# Rounds 64..79, interleaved with the end-of-block bookkeeping: zero a
# register, replace the pointer of a lane whose counter is <= 1 with %rsp,
# build per-lane masks from the counters (> 0), mask the new state so finished
# lanes contribute nothing, add the parked state back, and decrement the
# active counters. After $num inner iterations the state is transposed back
# into the interleaved context layout, and $ctx/$inp advance by one lane pair.
# The last sha1nexte of each lane takes @MSG0[2], which was zeroed above,
# as its message operand.
681 $code.=<<___;
682         movdqa          $ABCD0,$E0_
683          movdqa         $ABCD1,$E1_
684         sha1rnds4       \$3,$E0,$ABCD0          # 64-67
685         sha1nexte       @MSG0[3],$E0_
686          sha1rnds4      \$3,$E1,$ABCD1          # 64-67
687          sha1nexte      @MSG1[3],$E1_
688         sha1msg2        @MSG0[3],@MSG0[0]
689          sha1msg2       @MSG1[3],@MSG1[0]
690         pxor            @MSG0[3],@MSG0[1]
691          pxor           @MSG1[3],@MSG1[1]
692
693         mov             \$1,%ecx
694         pxor            @MSG0[2],@MSG0[2]       # zero
695         cmp             4*0(%rbx),%ecx          # examine counters
696         cmovge          %rsp,@ptr[0]            # cancel input
697
698         movdqa          $ABCD0,$E0
699          movdqa         $ABCD1,$E1
700         sha1rnds4       \$3,$E0_,$ABCD0         # 68-71
701         sha1nexte       @MSG0[0],$E0
702          sha1rnds4      \$3,$E1_,$ABCD1         # 68-71
703          sha1nexte      @MSG1[0],$E1
704         sha1msg2        @MSG0[0],@MSG0[1]
705          sha1msg2       @MSG1[0],@MSG1[1]
706
707         cmp             4*1(%rbx),%ecx
708         cmovge          %rsp,@ptr[1]
709         movq            (%rbx),@MSG0[0]         # pull counters
710
711         movdqa          $ABCD0,$E0_
712          movdqa         $ABCD1,$E1_
713         sha1rnds4       \$3,$E0,$ABCD0          # 72-75
714         sha1nexte       @MSG0[1],$E0_
715          sha1rnds4      \$3,$E1,$ABCD1          # 72-75
716          sha1nexte      @MSG1[1],$E1_
717
718         pshufd          \$0x00,@MSG0[0],@MSG1[2]
719         pshufd          \$0x55,@MSG0[0],@MSG1[3]
720         movdqa          @MSG0[0],@MSG0[1]
721         pcmpgtd         @MSG0[2],@MSG1[2]
722         pcmpgtd         @MSG0[2],@MSG1[3]
723
724         movdqa          $ABCD0,$E0
725          movdqa         $ABCD1,$E1
726         sha1rnds4       \$3,$E0_,$ABCD0         # 76-79
727         sha1nexte       $MSG0[2],$E0
728          sha1rnds4      \$3,$E1_,$ABCD1         # 76-79
729          sha1nexte      $MSG0[2],$E1
730
731         pcmpgtd         @MSG0[2],@MSG0[1]       # counter mask
732         pand            @MSG1[2],$ABCD0
733         pand            @MSG1[2],$E0
734          pand           @MSG1[3],$ABCD1
735          pand           @MSG1[3],$E1
736         paddd           @MSG0[1],@MSG0[0]       # counters--
737
738         paddd           0x40(%rsp),$ABCD0
739         paddd           0x50(%rsp),$E0
740          paddd          0x60(%rsp),$ABCD1
741          paddd          0x70(%rsp),$E1
742
743         movq            @MSG0[0],(%rbx)         # save counters
744         dec             $num
745         jnz             .Loop_shaext
746
747         mov             `$REG_SZ*17+8`(%rsp),$num
748
749         pshufd          \$0b00011011,$ABCD0,$ABCD0
750         pshufd          \$0b00011011,$ABCD1,$ABCD1
751
752         movdqa          $ABCD0,@MSG0[0]
753         punpckldq       $ABCD1,$ABCD0           # b1.b0.a1.a0
754         punpckhdq       $ABCD1,@MSG0[0]         # d1.d0.c1.c0
755         punpckhdq       $E1,$E0                 # e1.e0.xx.xx
756         movq            $ABCD0,0x00-0x40($ctx)  # a1.a0
757         psrldq          \$8,$ABCD0
758         movq            @MSG0[0],0x40-0x40($ctx)# c1.c0
759         psrldq          \$8,@MSG0[0]
760         movq            $ABCD0,0x20-0x40($ctx)  # b1.b0
761         psrldq          \$8,$E0
762         movq            @MSG0[0],0x60-0x40($ctx)# d1.d0
763         movq            $E0,0x80-0x40($ctx)     # e1.e0
764
765         lea     `$REG_SZ/2`($ctx),$ctx
766         lea     `$inp_elm_size*2`($inp),$inp
767         dec     $num
768         jnz     .Loop_grande_shaext
769
770 .Ldone_shaext:
771         #mov    `$REG_SZ*17`(%rsp),%rax         # original %rsp
772 ___
# Win64 only: restore %xmm6-%xmm15. The reload of %rax above is emitted as an
# assembler comment only; no instruction generated in this section writes %rax
# after the prologue, so it still holds the entry %rsp here.
773 $code.=<<___ if ($win64);
774         movaps  -0xb8(%rax),%xmm6
775         movaps  -0xa8(%rax),%xmm7
776         movaps  -0x98(%rax),%xmm8
777         movaps  -0x88(%rax),%xmm9
778         movaps  -0x78(%rax),%xmm10
779         movaps  -0x68(%rax),%xmm11
780         movaps  -0x58(%rax),%xmm12
781         movaps  -0x48(%rax),%xmm13
782         movaps  -0x38(%rax),%xmm14
783         movaps  -0x28(%rax),%xmm15
784 ___
# Common epilogue: reload %rbp and %rbx, restore %rsp and return.
785 $code.=<<___;
786         mov     -16(%rax),%rbp
787 .cfi_restore    %rbp
788         mov     -8(%rax),%rbx
789 .cfi_restore    %rbx
790         lea     (%rax),%rsp
791 .cfi_def_cfa_register   %rsp
792 .Lepilogue_shaext:
793         ret
794 .cfi_endproc
795 .size   sha1_multi_block_shaext,.-sha1_multi_block_shaext
796 ___
797                                                 }}}
798
799                                                 if ($avx) {{{
800 sub BODY_00_19_avx {
801 my ($i,$a,$b,$c,$d,$e)=@_;
802 my $j=$i+1;
803 my $k=$i+2;
804 my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
805 my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
806
807 $code.=<<___ if ($i==0 && $REG_SZ==16);
808         vmovd           (@ptr[0]),@Xi[0]
809          lea            `16*4`(@ptr[0]),@ptr[0]
810         vmovd           (@ptr[1]),@Xi[2]        # borrow Xi[2]
811          lea            `16*4`(@ptr[1]),@ptr[1]
812         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
813          lea            `16*4`(@ptr[2]),@ptr[2]
814         vpinsrd         \$1,(@ptr[3]),@Xi[2],@Xi[2]
815          lea            `16*4`(@ptr[3]),@ptr[3]
816          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
817         vpunpckldq      @Xi[2],@Xi[0],@Xi[0]
818          vmovd          `4*$j-16*4`($ptr_n),$t3
819         vpshufb         $tx,@Xi[0],@Xi[0]
820 ___
821 $code.=<<___ if ($i<15 && $REG_SZ==16);         # just load input
822          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
823          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
824 ___
825 $code.=<<___ if ($i==0 && $REG_SZ==32);
826         vmovd           (@ptr[0]),@Xi[0]
827          lea            `16*4`(@ptr[0]),@ptr[0]
828         vmovd           (@ptr[4]),@Xi[2]        # borrow Xi[2]
829          lea            `16*4`(@ptr[4]),@ptr[4]
830         vmovd           (@ptr[1]),$t2
831          lea            `16*4`(@ptr[1]),@ptr[1]
832         vmovd           (@ptr[5]),$t1
833          lea            `16*4`(@ptr[5]),@ptr[5]
834         vpinsrd         \$1,(@ptr[2]),@Xi[0],@Xi[0]
835          lea            `16*4`(@ptr[2]),@ptr[2]
836         vpinsrd         \$1,(@ptr[6]),@Xi[2],@Xi[2]
837          lea            `16*4`(@ptr[6]),@ptr[6]
838         vpinsrd         \$1,(@ptr[3]),$t2,$t2
839          lea            `16*4`(@ptr[3]),@ptr[3]
840         vpunpckldq      $t2,@Xi[0],@Xi[0]
841         vpinsrd         \$1,(@ptr[7]),$t1,$t1
842          lea            `16*4`(@ptr[7]),@ptr[7]
843         vpunpckldq      $t1,@Xi[2],@Xi[2]
844          vmovd          `4*$j-16*4`(@ptr[0]),@Xi[1]
845         vinserti128     @Xi[2],@Xi[0],@Xi[0]
846          vmovd          `4*$j-16*4`($ptr_n),$t3
847         vpshufb         $tx,@Xi[0],@Xi[0]
848 ___
849 $code.=<<___ if ($i<15 && $REG_SZ==32);         # just load input
850          vmovd          `4*$j-16*4`(@ptr[1]),$t2
851          vmovd          `4*$j-16*4`(@ptr[5]),$t1
852          vpinsrd        \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
853          vpinsrd        \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
854          vpinsrd        \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
855          vpunpckldq     $t2,@Xi[1],@Xi[1]
856          vpinsrd        \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
857          vpunpckldq     $t1,$t3,$t3
858 ___
859 $code.=<<___ if ($i<14);
860         vpaddd  $K,$e,$e                        # e+=K_00_19
861         vpslld  \$5,$a,$t2
862         vpandn  $d,$b,$t1
863         vpand   $c,$b,$t0
864
865         vmovdqa @Xi[0],`&Xi_off($i)`
866         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
867          $vpack         $t3,@Xi[1],@Xi[1]
868         vpsrld  \$27,$a,$t3
869         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
870          vmovd          `4*$k-16*4`(@ptr[0]),@Xi[2]
871
872         vpslld  \$30,$b,$t1
873         vpor    $t3,$t2,$t2                     # rol(a,5)
874          vmovd          `4*$k-16*4`($ptr_n),$t3
875         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
876
877         vpsrld  \$2,$b,$b
878         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
879          vpshufb        $tx,@Xi[1],@Xi[1]
880         vpor    $t1,$b,$b                       # b=rol(b,30)
881 ___
882 $code.=<<___ if ($i==14);
883         vpaddd  $K,$e,$e                        # e+=K_00_19
884          prefetcht0     63(@ptr[0])
885         vpslld  \$5,$a,$t2
886         vpandn  $d,$b,$t1
887         vpand   $c,$b,$t0
888
889         vmovdqa @Xi[0],`&Xi_off($i)`
890         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
891          $vpack         $t3,@Xi[1],@Xi[1]
892         vpsrld  \$27,$a,$t3
893          prefetcht0     63(@ptr[1])
894         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
895
896         vpslld  \$30,$b,$t1
897         vpor    $t3,$t2,$t2                     # rol(a,5)
898          prefetcht0     63(@ptr[2])
899         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
900
901         vpsrld  \$2,$b,$b
902         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
903          prefetcht0     63(@ptr[3])
904          vpshufb        $tx,@Xi[1],@Xi[1]
905         vpor    $t1,$b,$b                       # b=rol(b,30)
906 ___
907 $code.=<<___ if ($i>=13 && $i<15);
908         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # preload "X[2]"
909 ___
910 $code.=<<___ if ($i>=15);                       # apply Xupdate
911         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
912         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
913
914         vpaddd  $K,$e,$e                        # e+=K_00_19
915         vpslld  \$5,$a,$t2
916         vpandn  $d,$b,$t1
917          `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
918         vpand   $c,$b,$t0
919
920         vmovdqa @Xi[0],`&Xi_off($i)`
921         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
922          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
923         vpsrld  \$27,$a,$t3
924         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
925          vpxor  @Xi[3],@Xi[1],@Xi[1]
926          `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
927
928         vpslld  \$30,$b,$t1
929         vpor    $t3,$t2,$t2                     # rol(a,5)
930         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
931          `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
932          vpsrld \$31,@Xi[1],$tx
933          vpaddd @Xi[1],@Xi[1],@Xi[1]
934
935         vpsrld  \$2,$b,$b
936          `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
937         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
938          vpor   $tx,@Xi[1],@Xi[1]               # rol   \$1,@Xi[1]
939         vpor    $t1,$b,$b                       # b=rol(b,30)
940 ___
941 push(@Xi,shift(@Xi));
942 }
943
# BODY_20_39_avx($i,$a,$b,$c,$d,$e) appends the text of one SHA-1 round that
# uses the Parity function (b^d^c) to $code.
#   $i       - round number; it selects which of the sections below are emitted
#   $a..$e   - register names holding the working variables for this round
# The round generators further down call it for rounds 20..39 and again for
# rounds 60..79 after loading a different constant into $K.
# Instructions indented by one extra space operate on @Xi[1] (the "X[13]",
# "X[2]" and rol-by-1 steps) and are interleaved with the round arithmetic.
# The `...` expressions are left in the text and evaluated later.
# On return @Xi has been rotated left by one element.
944 sub BODY_20_39_avx {
945 my ($i,$a,$b,$c,$d,$e)=@_;
946 my $j=$i+1;
947
# Every round except the last opens with the first steps of the @Xi[1] update.
948 $code.=<<___ if ($i<79);
949         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
950         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
951
952         vpslld  \$5,$a,$t2
953         vpaddd  $K,$e,$e                        # e+=K_20_39
954         vpxor   $b,$d,$t0
955 ___
# Only rounds below 72 write @Xi[0] back to its Xi_off($i) slot.
956 $code.=<<___ if ($i<72);
957         vmovdqa @Xi[0],`&Xi_off($i)`
958 ___
959 $code.=<<___ if ($i<79);
960         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
961          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
962         vpsrld  \$27,$a,$t3
963         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
964          vpxor  @Xi[3],@Xi[1],@Xi[1]
965
966         vpslld  \$30,$b,$t1
967         vpor    $t3,$t2,$t2                     # rol(a,5)
968         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
969          vpsrld \$31,@Xi[1],$tx
970          vpaddd @Xi[1],@Xi[1],@Xi[1]
971
972         vpsrld  \$2,$b,$b
973         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
974          vpor   $tx,@Xi[1],@Xi[1]               # rol(@Xi[1],1)
975         vpor    $t1,$b,$b                       # b=rol(b,30)
976 ___
# Round 79 emits the round arithmetic only: no @Xi[1] update and no store.
977 $code.=<<___ if ($i==79);
978         vpslld  \$5,$a,$t2
979         vpaddd  $K,$e,$e                        # e+=K_20_39
980         vpxor   $b,$d,$t0
981
982         vpsrld  \$27,$a,$t3
983         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
984         vpxor   $c,$t0,$t0                      # Parity(b,c,d)
985
986         vpslld  \$30,$b,$t1
987         vpor    $t3,$t2,$t2                     # rol(a,5)
988         vpaddd  $t0,$e,$e                       # e+=Parity(b,c,d)
989
990         vpsrld  \$2,$b,$b
991         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
992         vpor    $t1,$b,$b                       # b=rol(b,30)
993 ___
# Rotate the register list so the next round finds its word in @Xi[0].
994 push(@Xi,shift(@Xi));
995 }
996
# BODY_40_59_avx($i,$a,$b,$c,$d,$e) appends the text of one SHA-1 round that
# uses the Maj function to $code; Maj is added to e in two parts, (c&d) and
# ((c^d)&b).
#   $i       - round number, used only for the Xi_off() slot expressions
#   $a..$e   - register names holding the working variables for this round
# Unlike the other round bodies there is a single unconditional section: every
# round stores @Xi[0] and performs the @Xi[1] update (the instructions indented
# by one extra space).  The store uses vmovdqu where BODY_20_39_avx uses
# vmovdqa.
# NOTE(review): the last emitted comment interpolates @X[1], not @Xi[1]; this
# looks like a typo and only affects comment text in the output -- confirm
# that no @X array is meant here.
# On return @Xi has been rotated left by one element.
997 sub BODY_40_59_avx {
998 my ($i,$a,$b,$c,$d,$e)=@_;
999 my $j=$i+1;
1000
1001 $code.=<<___;
1002         vpxor   @Xi[-2],@Xi[1],@Xi[1]           # "X[13]"
1003         vmovdqa `&Xi_off($j+2)`,@Xi[3]          # "X[2]"
1004
1005         vpaddd  $K,$e,$e                        # e+=K_40_59
1006         vpslld  \$5,$a,$t2
1007         vpand   $c,$d,$t1
1008          vpxor  `&Xi_off($j+8)`,@Xi[1],@Xi[1]
1009
1010         vpaddd  $t1,$e,$e
1011         vpsrld  \$27,$a,$t3
1012         vpxor   $c,$d,$t0
1013          vpxor  @Xi[3],@Xi[1],@Xi[1]
1014
1015         vmovdqu @Xi[0],`&Xi_off($i)`
1016         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
1017         vpor    $t3,$t2,$t2                     # rol(a,5)
1018          vpsrld \$31,@Xi[1],$tx
1019         vpand   $b,$t0,$t0
1020          vpaddd @Xi[1],@Xi[1],@Xi[1]
1021
1022         vpslld  \$30,$b,$t1
1023         vpaddd  $t0,$e,$e                       # e+=Maj(b,d,c)
1024
1025         vpsrld  \$2,$b,$b
1026         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
1027          vpor   $tx,@Xi[1],@Xi[1]               # rol(@X[1],1)
1028         vpor    $t1,$b,$b                       # b=rol(b,30)
1029 ___
# Rotate the register list so the next round finds its word in @Xi[0].
1030 push(@Xi,shift(@Xi));
1031 }
1032
# Emit sha1_multi_block_avx (AVX code path).  The generated function
#   - saves %rbx/%rbp (plus %xmm6-%xmm15 on Win64), reserves $REG_SZ*18 bytes
#     of stack aligned down to 256 and keeps the entry %rsp at
#     $REG_SZ*17(%rsp);
#   - on each .Loop_grande_avx pass loads four input descriptors, runs
#     .Loop_avx once per block up to the largest block count, then advances
#     $ctx and $inp and repeats while the saved outer count is non-zero;
#   - restores the saved registers relative to the entry %rsp and returns.
1033 $code.=<<___;
1034 .type   sha1_multi_block_avx,\@function,3
1035 .align  32
1036 sha1_multi_block_avx:
1037 .cfi_startproc
1038 _avx_shortcut:
1039 ___
# When AVX2 code is generated as well, the entry shifts %rcx right by 32 and
# jumps to _avx2_shortcut if $num is at least 2 and bit 5 of %ecx is set;
# otherwise it continues at .Lavx.
# NOTE(review): %rcx presumably holds capability bits loaded by whoever
# branches to _avx_shortcut -- that code is not in this part of the file.
1040 $code.=<<___ if ($avx>1);
1041         shr     \$32,%rcx
1042         cmp     \$2,$num
1043         jb      .Lavx
1044         test    \$`1<<5`,%ecx
1045         jnz     _avx2_shortcut
1046         jmp     .Lavx
1047 .align  32
1048 .Lavx:
1049 ___
1050 $code.=<<___;
1051         mov     %rsp,%rax
1052 .cfi_def_cfa_register   %rax
1053         push    %rbx
1054 .cfi_push       %rbx
1055         push    %rbp
1056 .cfi_push       %rbp
1057 ___
# Win64 only: spill %xmm6-%xmm15 into 0xa8 bytes reserved directly below the
# two pushed registers.  The first four are addressed from the new %rsp, the
# remaining six from %rax (the entry %rsp); the slots are contiguous.
1058 $code.=<<___ if ($win64);
1059         lea     -0xa8(%rsp),%rsp
1060         movaps  %xmm6,(%rsp)
1061         movaps  %xmm7,0x10(%rsp)
1062         movaps  %xmm8,0x20(%rsp)
1063         movaps  %xmm9,0x30(%rsp)
1064         movaps  %xmm10,-0x78(%rax)
1065         movaps  %xmm11,-0x68(%rax)
1066         movaps  %xmm12,-0x58(%rax)
1067         movaps  %xmm13,-0x48(%rax)
1068         movaps  %xmm14,-0x38(%rax)
1069         movaps  %xmm15,-0x28(%rax)
1070 ___
# Frame set-up.  %rbx is pointed at $REG_SZ*16(%rsp), where the per-lane block
# counters are kept; the outer count is parked at $REG_SZ*17+8(%rsp) while
# $num is reused for the largest per-lane block count.
1071 $code.=<<___;
1072         sub     \$`$REG_SZ*18`, %rsp
1073         and     \$-256,%rsp
1074         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1075 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1076 .Lbody_avx:
1077         lea     K_XX_XX(%rip),$Tbl
1078         lea     `$REG_SZ*16`(%rsp),%rbx
1079
1080         vzeroupper
1081 .Loop_grande_avx:
1082         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1083         xor     $num,$num
1084 ___
# For each of the four inputs: load the data pointer and the block count,
# keep the largest count in $num, store the count at 4*$i(%rbx), and point
# the lane at $Tbl instead when its count is not positive.
1085 for($i=0;$i<4;$i++) {
1086     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1087     $code.=<<___;
1088         # input pointer
1089         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
1090         # number of blocks
1091         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1092         cmp     $num,%ecx
1093         cmovg   %ecx,$num                       # find maximum
1094         test    %ecx,%ecx
1095         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1096         cmovle  $Tbl,@ptr[$i]                   # cancel input
1097 ___
1098 }
# Skip straight to the epilogue when no lane has any block; otherwise load
# the five context vectors and the byte-swap mask kept at 0x60($Tbl).
1099 $code.=<<___;
1100         test    $num,$num
1101         jz      .Ldone_avx
1102
1103         vmovdqu 0x00($ctx),$A                   # load context
1104          lea    128(%rsp),%rax
1105         vmovdqu 0x20($ctx),$B
1106         vmovdqu 0x40($ctx),$C
1107         vmovdqu 0x60($ctx),$D
1108         vmovdqu 0x80($ctx),$E
1109         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1110         jmp     .Loop_avx
1111
1112 .align  32
1113 .Loop_avx:
1114 ___
# Generate the 80 rounds.  $K is reloaded from $Tbl before each group of 20
# and @V is rotated after every round, so the register that plays "a" in one
# round plays "b" in the next.  Rounds 60..79 reuse BODY_20_39_avx.
1115 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1116 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1117 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1118 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1119 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1120 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1121 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1122 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1123 $code.=<<___;
1124         mov     \$1,%ecx
1125 ___
# Lanes whose counter is 1 or less have no further block: redirect their
# pointer to $Tbl.
1126 for($i=0;$i<4;$i++) {
1127     $code.=<<___;
1128         cmp     `4*$i`(%rbx),%ecx               # examine counters
1129         cmovge  $Tbl,@ptr[$i]                   # cancel input
1130 ___
1131 }
# Build a per-lane mask from (counter > 0), decrement the positive counters by
# adding the mask, and add the masked working variables to the stored context,
# so a lane that had no block in this iteration leaves its context unchanged.
1132 $code.=<<___;
1133         vmovdqu (%rbx),$t0                      # pull counters
1134         vpxor   $t2,$t2,$t2
1135         vmovdqa $t0,$t1
1136         vpcmpgtd $t2,$t1,$t1                    # mask value
1137         vpaddd  $t1,$t0,$t0                     # counters--
1138
1139         vpand   $t1,$A,$A
1140         vpand   $t1,$B,$B
1141         vpaddd  0x00($ctx),$A,$A
1142         vpand   $t1,$C,$C
1143         vpaddd  0x20($ctx),$B,$B
1144         vpand   $t1,$D,$D
1145         vpaddd  0x40($ctx),$C,$C
1146         vpand   $t1,$E,$E
1147         vpaddd  0x60($ctx),$D,$D
1148         vpaddd  0x80($ctx),$E,$E
1149         vmovdqu $A,0x00($ctx)
1150         vmovdqu $B,0x20($ctx)
1151         vmovdqu $C,0x40($ctx)
1152         vmovdqu $D,0x60($ctx)
1153         vmovdqu $E,0x80($ctx)
1154
1155         vmovdqu $t0,(%rbx)                      # save counters
1156         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1157         dec     $num
1158         jnz     .Loop_avx
1159
1160         mov     `$REG_SZ*17+8`(%rsp),$num
1161         lea     $REG_SZ($ctx),$ctx
1162         lea     `$inp_elm_size*$REG_SZ/4`($inp),$inp
1163         dec     $num
1164         jnz     .Loop_grande_avx
1165
1166 .Ldone_avx:
1167         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1168 .cfi_def_cfa    %rax,8
1169         vzeroupper
1170 ___
# Win64 only: reload %xmm6-%xmm15 from the slots written in the prologue, now
# all addressed relative to the entry %rsp in %rax.
1171 $code.=<<___ if ($win64);
1172         movaps  -0xb8(%rax),%xmm6
1173         movaps  -0xa8(%rax),%xmm7
1174         movaps  -0x98(%rax),%xmm8
1175         movaps  -0x88(%rax),%xmm9
1176         movaps  -0x78(%rax),%xmm10
1177         movaps  -0x68(%rax),%xmm11
1178         movaps  -0x58(%rax),%xmm12
1179         movaps  -0x48(%rax),%xmm13
1180         movaps  -0x38(%rax),%xmm14
1181         movaps  -0x28(%rax),%xmm15
1182 ___
1183 $code.=<<___;
1184         mov     -16(%rax),%rbp
1185 .cfi_restore    %rbp
1186         mov     -8(%rax),%rbx
1187 .cfi_restore    %rbx
1188         lea     (%rax),%rsp
1189 .cfi_def_cfa_register   %rsp
1190 .Lepilogue_avx:
1191         ret
1192 .cfi_endproc
1193 .size   sha1_multi_block_avx,.-sha1_multi_block_avx
1194 ___
1195
# AVX2 code path, generated only when $avx>1: emits sha1_multi_block_avx2,
# which handles eight inputs at a time using %ymm registers and the same
# round generators as the AVX path.
1196                                                 if ($avx>1) {
# Evaluate every `...` expression emitted so far while $REG_SZ still has the
# value the preceding code was written for; it is changed to 32 just below.
1197 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1198
1199 $REG_SZ=32;
1200
# Eight input-pointer registers, and %ymm0-%ymm15 for the working variables,
# temporaries, message words and round constant.
1201 @ptr=map("%r$_",(12..15,8..11));
1202
1203 @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
1204 ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
1205 @Xi=map("%ymm$_",(10..14));
1206 $K="%ymm15";
1207
# Prologue: besides %rbx/%rbp this path also saves %r12-%r15, which @ptr uses.
1208 $code.=<<___;
1209 .type   sha1_multi_block_avx2,\@function,3
1210 .align  32
1211 sha1_multi_block_avx2:
1212 .cfi_startproc
1213 _avx2_shortcut:
1214         mov     %rsp,%rax
1215 .cfi_def_cfa_register   %rax
1216         push    %rbx
1217 .cfi_push       %rbx
1218         push    %rbp
1219 .cfi_push       %rbp
1220         push    %r12
1221 .cfi_push       %r12
1222         push    %r13
1223 .cfi_push       %r13
1224         push    %r14
1225 .cfi_push       %r14
1226         push    %r15
1227 .cfi_push       %r15
1228 ___
# Win64 only: spill %xmm6-%xmm15 into 0xa8 bytes reserved directly below the
# six pushed registers; the first six are addressed from the new %rsp, the
# last four from %rax (the entry %rsp).
1229 $code.=<<___ if ($win64);
1230         lea     -0xa8(%rsp),%rsp
1231         movaps  %xmm6,(%rsp)
1232         movaps  %xmm7,0x10(%rsp)
1233         movaps  %xmm8,0x20(%rsp)
1234         movaps  %xmm9,0x30(%rsp)
1235         movaps  %xmm10,0x40(%rsp)
1236         movaps  %xmm11,0x50(%rsp)
1237         movaps  %xmm12,-0x78(%rax)
1238         movaps  %xmm13,-0x68(%rax)
1239         movaps  %xmm14,-0x58(%rax)
1240         movaps  %xmm15,-0x48(%rax)
1241 ___
# Frame set-up mirrors the AVX path with $REG_SZ=32; $num is halved on entry.
# %rbx points at the counters ($REG_SZ*16(%rsp)) while the inputs are loaded.
1242 $code.=<<___;
1243         sub     \$`$REG_SZ*18`, %rsp
1244         and     \$-256,%rsp
1245         mov     %rax,`$REG_SZ*17`(%rsp)         # original %rsp
1246 .cfi_cfa_expression     %rsp+`$REG_SZ*17`,deref,+8
1247 .Lbody_avx2:
1248         lea     K_XX_XX(%rip),$Tbl
1249         shr     \$1,$num
1250
1251         vzeroupper
1252 .Loop_grande_avx2:
1253         mov     $num,`$REG_SZ*17+8`(%rsp)       # original $num
1254         xor     $num,$num
1255         lea     `$REG_SZ*16`(%rsp),%rbx
1256 ___
# For each of the eight inputs: load the data pointer and the block count,
# keep the largest count in $num, store the count at 4*$i(%rbx), and point
# the lane at $Tbl instead when its count is not positive.
1257 for($i=0;$i<8;$i++) {
1258     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1259     $code.=<<___;
1260         # input pointer
1261         mov     `$inp_elm_size*$i+0`($inp),$ptr_reg
1262         # number of blocks
1263         mov     `$inp_elm_size*$i+$ptr_size`($inp),%ecx
1264         cmp     $num,%ecx
1265         cmovg   %ecx,$num                       # find maximum
1266         test    %ecx,%ecx
1267         mov     %ecx,`4*$i`(%rbx)               # initialize counters
1268         cmovle  $Tbl,@ptr[$i]                   # cancel input
1269 ___
1270 }
# Load the context and the byte-swap mask.  %rbx is repointed to
# 256+128(%rsp) for the duration of the rounds.
# NOTE(review): unlike the AVX emitter there is no "test $num,$num; jz" guard
# before the loop, so a call where all eight counts are zero would enter
# .Loop_avx2 with $num==0 -- confirm callers never do that.
1271 $code.=<<___;
1272         vmovdqu 0x00($ctx),$A                   # load context
1273          lea    128(%rsp),%rax
1274         vmovdqu 0x20($ctx),$B
1275          lea    256+128(%rsp),%rbx
1276         vmovdqu 0x40($ctx),$C
1277         vmovdqu 0x60($ctx),$D
1278         vmovdqu 0x80($ctx),$E
1279         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1280         jmp     .Loop_avx2
1281
1282 .align  32
1283 .Loop_avx2:
1284 ___
# Generate the 80 rounds with the same helpers as the AVX path; they now see
# $REG_SZ==32 and the %ymm register names assigned above.
1285 $code.="        vmovdqa -0x20($Tbl),$K\n";      # K_00_19
1286 for($i=0;$i<20;$i++)    { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
1287 $code.="        vmovdqa 0x00($Tbl),$K\n";       # K_20_39
1288 for(;$i<40;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
1289 $code.="        vmovdqa 0x20($Tbl),$K\n";       # K_40_59
1290 for(;$i<60;$i++)        { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
1291 $code.="        vmovdqa 0x40($Tbl),$K\n";       # K_60_79
1292 for(;$i<80;$i++)        { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
# %rbx goes back to the counter area for the end-of-block bookkeeping.
1293 $code.=<<___;
1294         mov     \$1,%ecx
1295         lea     `$REG_SZ*16`(%rsp),%rbx
1296 ___
# Lanes whose counter is 1 or less have no further block: redirect their
# pointer to $Tbl.
1297 for($i=0;$i<8;$i++) {
1298     $code.=<<___;
1299         cmp     `4*$i`(%rbx),%ecx               # examine counters
1300         cmovge  $Tbl,@ptr[$i]                   # cancel input
1301 ___
1302 }
# Same masked context update as in the AVX path, followed by the restore of
# %rbx to 256+128(%rsp) for the next iteration.  The outer-loop tail is
# commented out inside the emitted text and nothing jumps back to
# .Loop_grande_avx2, so one call processes a single group of eight inputs.
1303 $code.=<<___;
1304         vmovdqu (%rbx),$t0              # pull counters
1305         vpxor   $t2,$t2,$t2
1306         vmovdqa $t0,$t1
1307         vpcmpgtd $t2,$t1,$t1                    # mask value
1308         vpaddd  $t1,$t0,$t0                     # counters--
1309
1310         vpand   $t1,$A,$A
1311         vpand   $t1,$B,$B
1312         vpaddd  0x00($ctx),$A,$A
1313         vpand   $t1,$C,$C
1314         vpaddd  0x20($ctx),$B,$B
1315         vpand   $t1,$D,$D
1316         vpaddd  0x40($ctx),$C,$C
1317         vpand   $t1,$E,$E
1318         vpaddd  0x60($ctx),$D,$D
1319         vpaddd  0x80($ctx),$E,$E
1320         vmovdqu $A,0x00($ctx)
1321         vmovdqu $B,0x20($ctx)
1322         vmovdqu $C,0x40($ctx)
1323         vmovdqu $D,0x60($ctx)
1324         vmovdqu $E,0x80($ctx)
1325
1326         vmovdqu $t0,(%rbx)                      # save counters
1327         lea     256+128(%rsp),%rbx
1328         vmovdqu 0x60($Tbl),$tx                  # pbswap_mask
1329         dec     $num
1330         jnz     .Loop_avx2
1331
1332         #mov    `$REG_SZ*17+8`(%rsp),$num
1333         #lea    $REG_SZ($ctx),$ctx
1334         #lea    `$inp_elm_size*$REG_SZ/4`($inp),$inp
1335         #dec    $num
1336         #jnz    .Loop_grande_avx2
1337
1338 .Ldone_avx2:
1339         mov     `$REG_SZ*17`(%rsp),%rax         # original %rsp
1340 .cfi_def_cfa    %rax,8
1341         vzeroupper
1342 ___
# Win64 only: reload %xmm6-%xmm15 from the prologue's slots, addressed
# relative to the entry %rsp in %rax.
1343 $code.=<<___ if ($win64);
1344         movaps  -0xd8(%rax),%xmm6
1345         movaps  -0xc8(%rax),%xmm7
1346         movaps  -0xb8(%rax),%xmm8
1347         movaps  -0xa8(%rax),%xmm9
1348         movaps  -0x98(%rax),%xmm10
1349         movaps  -0x88(%rax),%xmm11
1350         movaps  -0x78(%rax),%xmm12
1351         movaps  -0x68(%rax),%xmm13
1352         movaps  -0x58(%rax),%xmm14
1353         movaps  -0x48(%rax),%xmm15
1354 ___
1355 $code.=<<___;
1356         mov     -48(%rax),%r15
1357 .cfi_restore    %r15
1358         mov     -40(%rax),%r14
1359 .cfi_restore    %r14
1360         mov     -32(%rax),%r13
1361 .cfi_restore    %r13
1362         mov     -24(%rax),%r12
1363 .cfi_restore    %r12
1364         mov     -16(%rax),%rbp
1365 .cfi_restore    %rbp
1366         mov     -8(%rax),%rbx
1367 .cfi_restore    %rbx
1368         lea     (%rax),%rsp
1369 .cfi_def_cfa_register   %rsp
1370 .Lepilogue_avx2:
1371         ret
1372 .cfi_endproc
1373 .size   sha1_multi_block_avx2,.-sha1_multi_block_avx2
1374 ___
# Closes the $avx>1 block and the enclosing blocks opened earlier in the file.
1375                                                 }       }}}
# Constant table.  Each round constant and the pbswap mask is written as two
# identical rows of four dwords, i.e. 32 bytes per entry.  The K_XX_XX label
# sits after the K_00_19 entry, which gives the offsets the code above uses:
# -0x20 K_00_19, 0x00 K_20_39, 0x20 K_40_59, 0x40 K_60_79, 0x60 pbswap mask.
1376 $code.=<<___;
1377
1378 .align  256
1379         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1380         .long   0x5a827999,0x5a827999,0x5a827999,0x5a827999     # K_00_19
1381 K_XX_XX:
1382         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1383         .long   0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1     # K_20_39
1384         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1385         .long   0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc     # K_40_59
1386         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1387         .long   0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6     # K_60_79
1388         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1389         .long   0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f     # pbswap
1390         .byte   0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1391         .asciz  "SHA1 multi-block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1392 ___
1393
# Win64 structured-exception-handling support: two language-specific unwind
# handlers plus the .pdata/.xdata records that attach them to the generated
# functions.  Each .xdata record carries two RVAs as HandlerData[]: the label
# that ends the prologue and the label that starts the epilogue.
if ($win64) {
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#               CONTEXT *context,DISPATCHER_CONTEXT *disp)
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

# se_handler serves the functions that push only %rbx/%rbp and keep the
# entry %rsp at 16*17 bytes above the aligned frame base.
#   Rip before HandlerData[0]  -> nothing saved yet, context->Rax still holds
#                                 the entry %rsp;
#   Rip at/after HandlerData[1]-> frame already torn down, use context->Rsp;
#   otherwise                  -> fetch the entry %rsp from the frame, restore
#                                 Rbx/Rbp and copy the ten saved XMM registers
#                                 (20 quadwords) into context.Xmm6 onwards.
# The shared tail at .Lin_prologue restores Rsp/Rsi/Rdi, copies the CONTEXT
# into disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type   se_handler,\@abi-omnipotent
.align  16
se_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<.Lbody
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=.Lepilogue
        jae     .Lin_prologue

        mov     `16*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp

        lea     -24-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

.Lin_prologue:
        mov     8(%rax),%rdi
        mov     16(%rax),%rsi
        mov     %rax,152($context)      # restore context->Rsp
        mov     %rsi,168($context)      # restore context->Rsi
        mov     %rdi,176($context)      # restore context->Rdi

        mov     40($disp),%rdi          # disp->ContextRecord
        mov     $context,%rsi           # context
        mov     \$154,%ecx              # sizeof(CONTEXT)
        .long   0xa548f3fc              # cld; rep movsq

        mov     $disp,%rsi
        xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
        mov     8(%rsi),%rdx            # arg2, disp->ImageBase
        mov     0(%rsi),%r8             # arg3, disp->ControlPc
        mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
        mov     40(%rsi),%r10           # disp->ContextRecord
        lea     56(%rsi),%r11           # &disp->HandlerData
        lea     24(%rsi),%r12           # &disp->EstablisherFrame
        mov     %r10,32(%rsp)           # arg5
        mov     %r11,40(%rsp)           # arg6
        mov     %r12,48(%rsp)           # arg7
        mov     %rcx,56(%rsp)           # arg8, (NULL)
        call    *__imp_RtlVirtualUnwind(%rip)

        mov     \$1,%eax                # ExceptionContinueSearch
        add     \$64,%rsp
        popfq
        pop     %r15
        pop     %r14
        pop     %r13
        pop     %r12
        pop     %rbp
        pop     %rbx
        pop     %rdi
        pop     %rsi
        ret
.size   se_handler,.-se_handler
___
# avx2_handler is the variant for sha1_multi_block_avx2, which additionally
# pushes %r12-%r15 and keeps the entry %rsp at 32*17 above its frame base.
# The saved stack pointer must be read from that frame, i.e. relative to
# context->Rsp, which is in %rax at that point (se_handler does the same with
# 16*17(%rax)).  Reading it at 32*17 off the CONTEXT pointer instead would
# fetch bytes from the XMM save area, which begins with Xmm6 at offset 512.
# After restoring the registers it joins se_handler's tail at .Lin_prologue.
$code.=<<___ if ($avx>1);
.type   avx2_handler,\@abi-omnipotent
.align  16
avx2_handler:
        push    %rsi
        push    %rdi
        push    %rbx
        push    %rbp
        push    %r12
        push    %r13
        push    %r14
        push    %r15
        pushfq
        sub     \$64,%rsp

        mov     120($context),%rax      # pull context->Rax
        mov     248($context),%rbx      # pull context->Rip

        mov     8($disp),%rsi           # disp->ImageBase
        mov     56($disp),%r11          # disp->HandlerData

        mov     0(%r11),%r10d           # HandlerData[0]
        lea     (%rsi,%r10),%r10        # end of prologue label
        cmp     %r10,%rbx               # context->Rip<body label
        jb      .Lin_prologue

        mov     152($context),%rax      # pull context->Rsp

        mov     4(%r11),%r10d           # HandlerData[1]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lin_prologue

        mov     `32*17`(%rax),%rax      # pull saved stack pointer

        mov     -8(%rax),%rbx
        mov     -16(%rax),%rbp
        mov     -24(%rax),%r12
        mov     -32(%rax),%r13
        mov     -40(%rax),%r14
        mov     -48(%rax),%r15
        mov     %rbx,144($context)      # restore context->Rbx
        mov     %rbp,160($context)      # restore context->Rbp
        mov     %r12,216($context)      # restore context->R12
        mov     %r13,224($context)      # restore context->R13
        mov     %r14,232($context)      # restore context->R14
        mov     %r15,240($context)      # restore context->R15

        lea     -56-10*16(%rax),%rsi
        lea     512($context),%rdi      # &context.Xmm6
        mov     \$20,%ecx
        .long   0xa548f3fc              # cld; rep movsq

        jmp     .Lin_prologue
.size   avx2_handler,.-avx2_handler
___
# .pdata: one begin/end/info triple per generated function; the AVX and AVX2
# entries are present only when those code paths were generated.
$code.=<<___;
.section        .pdata
.align  4
        .rva    .LSEH_begin_sha1_multi_block
        .rva    .LSEH_end_sha1_multi_block
        .rva    .LSEH_info_sha1_multi_block
        .rva    .LSEH_begin_sha1_multi_block_shaext
        .rva    .LSEH_end_sha1_multi_block_shaext
        .rva    .LSEH_info_sha1_multi_block_shaext
___
$code.=<<___ if ($avx);
        .rva    .LSEH_begin_sha1_multi_block_avx
        .rva    .LSEH_end_sha1_multi_block_avx
        .rva    .LSEH_info_sha1_multi_block_avx
___
$code.=<<___ if ($avx>1);
        .rva    .LSEH_begin_sha1_multi_block_avx2
        .rva    .LSEH_end_sha1_multi_block_avx2
        .rva    .LSEH_info_sha1_multi_block_avx2
___
# .xdata: handler selection and HandlerData[] for each function.
$code.=<<___;
.section        .xdata
.align  8
.LSEH_info_sha1_multi_block:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody,.Lepilogue                       # HandlerData[]
.LSEH_info_sha1_multi_block_shaext:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody_shaext,.Lepilogue_shaext # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_multi_block_avx:
        .byte   9,0,0,0
        .rva    se_handler
        .rva    .Lbody_avx,.Lepilogue_avx               # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_multi_block_avx2:
        .byte   9,0,0,0
        .rva    avx2_handler
        .rva    .Lbody_avx2,.Lepilogue_avx2             # HandlerData[]
___
}
1589 ####################################################################
1590
# rex(\@opcode,$dst,$src): put a REX prefix byte in front of the opcode bytes
# when one of the register numbers needs it.
#   \@opcode - reference to the list of opcode bytes, modified in place
#   $dst     - register number that sets bit 0x04 of the prefix when >= 8
#   $src     - register number that sets bit 0x01 of the prefix when >= 8
# The list is left untouched when both numbers are below 8.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = ($dst >= 8 ? 0x04 : 0) | ($src >= 8 ? 0x01 : 0);

    unshift @$bytes, 0x40 | $prefix     if ($prefix);
}
1600
# sha1rnds4($operands): encode "sha1rnds4 $imm,%xmmS,%xmmD" as a .byte
# directive, for assemblers that lack the mnemonic.
#   $operands - operand text following the mnemonic
# Returns ".byte\t<list>" holding an optional REX prefix, 0x0f 0x3a 0xcc, the
# ModR/M byte and the immediate (converted with oct() when it starts with 0).
# Operand text that is not an immediate followed by two %xmm registers is
# returned as "sha1rnds4\t<operands>", left for the assembler to handle.
sub sha1rnds4 {
    my $operands = $_[0];

    return "sha1rnds4\t".$operands
        unless ($operands =~ /\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/);

    my ($imm, $src, $dst) = ($1, $2, $3);
    my @bytes = (0x0f, 0x3a, 0xcc);

    rex(\@bytes, $dst, $src);
    push @bytes, 0xc0 | ($src & 7) | (($dst & 7) << 3);  # ModR/M
    push @bytes, ($imm =~ /^0/ ? oct($imm) : $imm);

    return ".byte\t".join(',', @bytes);
}
1613
# sha1op38($instr,$operands): encode the two-register forms of sha1nexte,
# sha1msg1 and sha1msg2 as a .byte directive.
#   $instr    - mnemonic as it appears in the generated text
#   $operands - operand text following the mnemonic
# Returns ".byte\t<list>" holding an optional REX prefix, 0x0f 0x38, the
# per-instruction opcode byte and the ModR/M byte.  An unknown mnemonic, or
# operands that are not two %xmm registers, come back as
# "<instr>\t<operands>" unchanged.
sub sha1op38 {
    my ($mnemonic, $operands) = @_;
    my %opcodelet = (
                "sha1nexte" => 0xc8,
                "sha1msg1"  => 0xc9,
                "sha1msg2"  => 0xca     );
    my $third = $opcodelet{$mnemonic};

    if (defined($third) && $operands =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($src, $dst) = ($1, $2);
        my @bytes = (0x0f, 0x38);

        rex(\@bytes, $dst, $src);
        push @bytes, $third, 0xc0 | ($src & 7) | (($dst & 7) << 3);  # opcode, ModR/M
        return ".byte\t".join(',', @bytes);
    }

    return $mnemonic."\t".$operands;
}
1631
# Post-process the generated text one line at a time and print it to STDOUT.
for my $line (split(/\n/,$code)) {
        # Expand any `...` expression that has not been evaluated yet.
        $line =~ s/\`([^\`]*)\`/eval($1)/ge;

        # At most one of the rewrites below is applied; the first that changes
        # the line wins.  SHA instructions go through the encoder subs; the
        # remaining rules turn %ymm operand names into %xmm for the listed
        # instructions and add the \$1 immediate to vinserti128.
        $line =~ s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/ge
        || $line =~ s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/ge
        || $line =~ s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
        || $line =~ s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/g
        || $line =~ s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/g
        || $line =~ s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
        || $line =~ s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/g
        || $line =~ s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/g;

        print $line,"\n";
}

# A failed close means buffered output was lost, so treat it as fatal.
close(STDOUT) or die "error closing STDOUT: $!";