3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer SHA1 procedure processes n buffers in parallel by
11 # placing buffer data in a designated lane of a SIMD register. n is
12 # naturally limited to 4 on pre-AVX2 processors and to 8 on
13 # AVX2-capable processors such as Haswell.
15 # this +aesni(i) sha1 aesni-sha1 gain(iv)
16 # -------------------------------------------------------------------
17 # Westmere(ii) 10.4/n +1.28=3.88(n=4) 5.44 6.58 +70%
18 # Atom(ii) 18.9/n +3.93=8.66(n=4) 10.0 14.0 +62%
19 # Sandy Bridge (8.16 +5.15=13.3)/n 4.99 5.98 +80%
20 # Ivy Bridge (8.03 +5.14=13.2)/n 4.60 5.54 +68%
21 # Haswell(iii) (8.96 +5.00=14.0)/n 3.57 4.55 +160%
22 # Bulldozer (9.75 +5.76=15.5)/n 5.95 6.37 +64%
24 # (i) multi-block CBC encrypt with 128-bit key;
25 # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
26 # because of lower AES-NI instruction throughput;
27 # (iii) "this" is for n=8, when we gather twice as much data, result
28 # for n=4 is 7.98+4.44=12.4;
29 # (iv) improvement coefficients in real-life application are somewhat
30 # lower and range from 30% to 100% (on Haswell);
## NOTE(review): this chunk is an excerpt of an OpenSSL perlasm generator;
## lines are missing (the closing braces of the if-blocks below are not
## visible here), so code is kept byte-identical and only comments added.

## $flavour selects the assembler dialect understood by x86_64-xlate.pl
## (elf/macosx/mingw64/nasm/masm); a flavour containing '.' is actually
## the output file name.
34 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
36 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
## Locate the x86_64-xlate.pl translator relative to this script's directory,
## falling back to the shared perlasm directory.
38 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
39 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
40 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
41 die "can't locate x86_64-xlate.pl";
## Probe the available assembler to decide how much AVX may be emitted:
## $avx ends up 0 (SSE only), 1 (AVX) or 2 (AVX2), gated on the versions
## of GNU as, nasm or ml64 respectively.
45 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
46 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
47 $avx = ($1>=2.19) + ($1>=2.22);
50 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
51 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
52 $avx = ($1>=2.09) + ($1>=2.10);
55 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
56 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
57 $avx = ($1>=10) + ($1>=11);
## Pipe everything this script prints through the translator into $output.
60 open OUT,"| \"$^X\" $xlate $flavour $output";
## C prototype of the generated entry point (comment only; the struct holds
## one SHA-1 state word per lane, 8 lanes wide to cover the AVX2 case).
63 # void sha1_multi_block (
64 # struct { unsigned int A[8];
68 # unsigned int E[8]; } *ctx,
69 # struct { void *ptr; int blocks; } inp[8],
70 # int num); /* 1 or 2 */
## Register assignments for the SSE code path: per-lane input pointers in
## r8-r11, SHA-1 working state A..E in xmm0-4, scratch in xmm5-9, and the
## message-schedule window X[i] in xmm10-14.
72 $ctx="%rdi"; # 1st arg
73 $inp="%rsi"; # 2nd arg
75 @ptr=map("%r$_",(8..11));
78 @V=($A,$B,$C,$D,$E)=map("%xmm$_",(0..4));
79 ($t0,$t1,$t2,$t3,$tx)=map("%xmm$_",(5..9));
80 @Xi=map("%xmm$_",(10..14));
## NOTE(review): the two lines below look like the body of an Xi_off()
## helper whose `sub` header is missing from this excerpt — it maps a
## schedule index to a stack slot addressed off %rax or %rbx (biased by
## -128 so that disp8 addressing covers the whole ring buffer).
88 $off %= 16; $off *= $REG_SZ;
89 $off<256 ? "$off-128(%rax)" : "$off-256-128(%rbx)";
## Fragment of the SSE round generator for SHA-1 rounds 0..19 (F = Ch).
## NOTE(review): the enclosing `sub BODY_00_19 {` line and the heredoc
## terminators are missing from this excerpt; code kept byte-identical.
93 my ($i,$a,$b,$c,$d,$e)=@_;
## Round 0 only: gather the first dword from each of the 4 lane pointers,
## advance the pointers by one 64-byte block, and interleave into @Xi[0].
97 $code.=<<___ if ($i==0);
99 lea `16*4`(@ptr[0]),@ptr[0]
100 movd (@ptr[1]),@Xi[2] # borrow @Xi[2]
101 lea `16*4`(@ptr[1]),@ptr[1]
102 movd (@ptr[2]),@Xi[3] # borrow @Xi[3]
103 lea `16*4`(@ptr[2]),@ptr[2]
104 movd (@ptr[3]),@Xi[4] # borrow @Xi[4]
105 lea `16*4`(@ptr[3]),@ptr[3]
106 punpckldq @Xi[3],@Xi[0]
107 movd `4*$j-16*4`(@ptr[0]),@Xi[1]
108 punpckldq @Xi[4],@Xi[2]
109 movd `4*$j-16*4`(@ptr[1]),$t3
110 punpckldq @Xi[2],@Xi[0]
111 movd `4*$j-16*4`(@ptr[2]),$t2
## Rounds 0..13: plain round plus transposed load of the next X[j] word.
114 $code.=<<___ if ($i<14); # just load input
115 movd `4*$j-16*4`(@ptr[3]),$t1
118 paddd $K,$e # e+=K_00_19
128 movdqa @Xi[0],`&Xi_off($i)`
129 paddd @Xi[0],$e # e+=X[i]
130 movd `4*$k-16*4`(@ptr[0]),@Xi[2]
132 pxor $t1,$t0 # Ch(b,c,d)
135 por $t3,$t2 # rol(a,5)
136 movd `4*$k-16*4`(@ptr[1]),$t3
138 paddd $t0,$e # e+=Ch(b,c,d)
141 paddd $t2,$e # e+=rol(a,5)
142 movd `4*$j-16*4`(@ptr[2]),$t2
144 por $t1,$b # b=rol(b,30)
## Round 14: last round that still loads raw input (no prefetch of X[k]).
146 $code.=<<___ if ($i==14); # just load input
147 movd `4*$j-16*4`(@ptr[3]),$t1
150 paddd $K,$e # e+=K_00_19
160 movdqa @Xi[0],`&Xi_off($i)`
161 paddd @Xi[0],$e # e+=X[i]
163 pxor $t1,$t0 # Ch(b,c,d)
166 por $t3,$t2 # rol(a,5)
168 paddd $t0,$e # e+=Ch(b,c,d)
171 paddd $t2,$e # e+=rol(a,5)
173 por $t1,$b # b=rol(b,30)
## Rounds 13..14: start preloading the schedule window for Xupdate below.
175 $code.=<<___ if ($i>=13 && $i<15);
176 movdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
## Rounds 15..19: full round with the X[i] = rol(X[i-3]^X[i-8]^X[i-14]^
## X[i-16], 1) message-schedule update interleaved.
178 $code.=<<___ if ($i>=15); # apply Xupdate
179 pxor @Xi[-2],@Xi[1] # "X[13]"
180 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
183 pxor `&Xi_off($j+8)`,@Xi[1]
184 paddd $K,$e # e+=K_00_19
196 movdqa @Xi[0],`&Xi_off($i)`
197 paddd @Xi[0],$e # e+=X[i]
199 pxor $t1,$t0 # Ch(b,c,d)
202 por $t3,$t2 # rol(a,5)
204 paddd $t0,$e # e+=Ch(b,c,d)
207 paddd $t2,$e # e+=rol(a,5)
208 por $tx,@Xi[1] # rol \$1,@Xi[1]
209 por $t1,$b # b=rol(b,30)
## Rotate the schedule window so @Xi[0] is always the current X[i].
211 push(@Xi,shift(@Xi));
## Fragment of the SSE round generator for rounds 20..39 and 60..79
## (F = Parity, i.e. b^c^d).  NOTE(review): `sub BODY_20_39 {` header and
## heredoc terminators are missing from this excerpt.
215 my ($i,$a,$b,$c,$d,$e)=@_;
## All rounds but the last: Xupdate plus the Parity round.
218 $code.=<<___ if ($i<79);
219 pxor @Xi[-2],@Xi[1] # "X[13]"
220 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
224 pxor `&Xi_off($j+8)`,@Xi[1]
225 paddd $K,$e # e+=K_20_39
## The schedule ring only needs to be stored while a future round will
## still read it back; past round 71 the store is dead and skipped.
231 $code.=<<___ if ($i<72);
232 movdqa @Xi[0],`&Xi_off($i)`
234 $code.=<<___ if ($i<79);
235 paddd @Xi[0],$e # e+=X[i]
238 pxor $c,$t0 # Parity(b,c,d)
243 por $t3,$t2 # rol(a,5)
245 paddd $t0,$e # e+=Parity(b,c,d)
249 paddd $t2,$e # e+=rol(a,5)
250 por $tx,@Xi[1] # rol(@Xi[1],1)
251 por $t1,$b # b=rol(b,30)
## Final round 79: no Xupdate/store, just fold in the last X[i].
253 $code.=<<___ if ($i==79);
255 paddd $K,$e # e+=K_20_39
261 paddd @Xi[0],$e # e+=X[i]
264 pxor $c,$t0 # Parity(b,c,d)
267 por $t3,$t2 # rol(a,5)
268 paddd $t0,$e # e+=Parity(b,c,d)
271 paddd $t2,$e # e+=rol(a,5)
272 por $t1,$b # b=rol(b,30)
## Rotate the schedule window for the next round.
274 push(@Xi,shift(@Xi));
## Fragment of the SSE round generator for rounds 40..59 (F = Maj).
## NOTE(review): `sub BODY_40_59 {` header and heredoc terminator are
## missing from this excerpt; code kept byte-identical.
278 my ($i,$a,$b,$c,$d,$e)=@_;
282 pxor @Xi[-2],@Xi[1] # "X[13]"
283 movdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
287 pxor `&Xi_off($j+8)`,@Xi[1]
289 paddd $K,$e # e+=K_40_59
300 movdqa @Xi[0],`&Xi_off($i)`
301 paddd @Xi[0],$e # e+=X[i]
302 por $t3,$t2 # rol(a,5)
309 paddd $t0,$e # e+=Maj(b,d,c)
312 paddd $t2,$e # e+=rol(a,5)
313 por $tx,@Xi[1] # rol(@X[1],1)
314 por $t1,$b # b=rol(b,30)
## Rotate the schedule window for the next round.
316 push(@Xi,shift(@Xi));
## Generator for the SSE entry point sha1_multi_block(ctx, inp, num).
## NOTE(review): prologue, dispatch jumps, loop labels and heredoc
## terminators are missing from this excerpt; code kept byte-identical.
322 .extern OPENSSL_ia32cap_P
324 .globl sha1_multi_block
325 .type sha1_multi_block,\@function,3
## If AVX was detected at build time, read the capability vector so the
## entry point can (presumably) dispatch to the _avx variant at run time.
329 $code.=<<___ if ($avx);
330 mov OPENSSL_ia32cap_P+4(%rip),%rcx
## Win64 ABI: xmm6-15 are callee-saved and must be spilled.
339 $code.=<<___ if ($win64);
342 movaps %xmm7,0x10(%rsp)
343 movaps %xmm8,0x20(%rsp)
344 movaps %xmm9,0x30(%rsp)
345 movaps %xmm10,-0x78(%rax)
346 movaps %xmm11,-0x68(%rax)
347 movaps %xmm12,-0x58(%rax)
348 movaps %xmm13,-0x48(%rax)
349 movaps %xmm14,-0x38(%rax)
350 movaps %xmm15,-0x28(%rax)
## Carve out the stack frame: 16 schedule slots plus saved %rsp and $num.
353 sub \$`$REG_SZ*18`,%rsp
355 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
356 lea K_XX_XX(%rip),$Tbl
357 lea `$REG_SZ*16`(%rsp),%rbx
360 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
## Load the 4 lane descriptors; exhausted lanes get their pointer aimed
## at the constant table so their loads are harmless.
363 for($i=0;$i<4;$i++) {
365 mov `16*$i+0`($inp),@ptr[$i] # input pointer
366 mov `16*$i+8`($inp),%ecx # number of blocks
368 cmovg %ecx,$num # find maximum
370 mov %ecx,`4*$i`(%rbx) # initialize counters
371 cmovle $Tbl,@ptr[$i] # cancel input
378 movdqu 0x00($ctx),$A # load context
384 movdqa 0x60($Tbl),$tx # pbswap_mask
## Unroll all 80 rounds, switching the round constant every 20 rounds.
390 $code.=" movdqa -0x20($Tbl),$K\n"; # K_00_19
391 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
392 $code.=" movdqa 0x00($Tbl),$K\n"; # K_20_39
393 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
394 $code.=" movdqa 0x20($Tbl),$K\n"; # K_40_59
395 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
396 $code.=" movdqa 0x40($Tbl),$K\n"; # K_60_79
397 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
## End of block: decrement per-lane counters via a compare mask, then
## fold the round results back into the context (lanes whose counter hit
## zero are masked off so their state is not corrupted).
399 movdqa (%rbx),@Xi[0] # pull counters
401 cmp 4*0(%rbx),%ecx # examinte counters
403 cmovge $Tbl,@ptr[0] # cancel input
408 pcmpgtd $t2,@Xi[1] # mask value
411 paddd @Xi[1],@Xi[0] # counters--
414 movdqu 0x00($ctx),$t0
416 movdqu 0x20($ctx),$t1
419 movdqu 0x40($ctx),$t2
422 movdqu 0x60($ctx),$t3
425 movdqu 0x80($ctx),$tx
435 movdqa @Xi[0],(%rbx) # save counters
436 movdqa 0x60($Tbl),$tx # pbswap_mask
440 mov `$REG_SZ*17+8`(%rsp),$num
441 lea $REG_SZ($ctx),$ctx
442 lea `16*$REG_SZ/4`($inp),$inp
## Epilogue: recover the caller's %rsp and restore Win64 xmm registers.
447 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
449 $code.=<<___ if ($win64);
450 movaps -0xb8(%rax),%xmm6
451 movaps -0xa8(%rax),%xmm7
452 movaps -0x98(%rax),%xmm8
453 movaps -0x88(%rax),%xmm9
454 movaps -0x78(%rax),%xmm10
455 movaps -0x68(%rax),%xmm11
456 movaps -0x58(%rax),%xmm12
457 movaps -0x48(%rax),%xmm13
458 movaps -0x38(%rax),%xmm14
459 movaps -0x28(%rax),%xmm15
466 .size sha1_multi_block,.-sha1_multi_block
## Fragment of the AVX/AVX2 round generator for rounds 0..19.  The same
## routine serves both widths: $REG_SZ==16 is the 4-lane xmm path,
## $REG_SZ==32 the 8-lane ymm path (lanes 4..7 gathered into the high
## 128-bit half via vinserti128).  NOTE(review): `sub BODY_00_19_avx {`
## and the heredoc terminators are missing from this excerpt.
471 my ($i,$a,$b,$c,$d,$e)=@_;
474 my $vpack = $REG_SZ==16 ? "vpunpckldq" : "vinserti128";
475 my $ptr_n = $REG_SZ==16 ? @ptr[1] : @ptr[4];
## Round 0, xmm width: transposed gather of word 0 from 4 lanes.
477 $code.=<<___ if ($i==0 && $REG_SZ==16);
478 vmovd (@ptr[0]),@Xi[0]
479 lea `16*4`(@ptr[0]),@ptr[0]
480 vmovd (@ptr[1]),@Xi[2] # borrow Xi[2]
481 lea `16*4`(@ptr[1]),@ptr[1]
482 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
483 lea `16*4`(@ptr[2]),@ptr[2]
484 vpinsrd \$1,(@ptr[3]),@Xi[2],@Xi[2]
485 lea `16*4`(@ptr[3]),@ptr[3]
486 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
487 vpunpckldq @Xi[2],@Xi[0],@Xi[0]
488 vmovd `4*$j-16*4`($ptr_n),$t3
489 vpshufb $tx,@Xi[0],@Xi[0]
## Rounds 0..14, xmm width: load next input word from lanes 2 and 3.
491 $code.=<<___ if ($i<15 && $REG_SZ==16); # just load input
492 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
493 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t3,$t3
## Round 0, ymm width: transposed gather of word 0 from all 8 lanes.
495 $code.=<<___ if ($i==0 && $REG_SZ==32);
496 vmovd (@ptr[0]),@Xi[0]
497 lea `16*4`(@ptr[0]),@ptr[0]
498 vmovd (@ptr[4]),@Xi[2] # borrow Xi[2]
499 lea `16*4`(@ptr[4]),@ptr[4]
501 lea `16*4`(@ptr[1]),@ptr[1]
503 lea `16*4`(@ptr[5]),@ptr[5]
504 vpinsrd \$1,(@ptr[2]),@Xi[0],@Xi[0]
505 lea `16*4`(@ptr[2]),@ptr[2]
506 vpinsrd \$1,(@ptr[6]),@Xi[2],@Xi[2]
507 lea `16*4`(@ptr[6]),@ptr[6]
508 vpinsrd \$1,(@ptr[3]),$t2,$t2
509 lea `16*4`(@ptr[3]),@ptr[3]
510 vpunpckldq $t2,@Xi[0],@Xi[0]
511 vpinsrd \$1,(@ptr[7]),$t1,$t1
512 lea `16*4`(@ptr[7]),@ptr[7]
513 vpunpckldq $t1,@Xi[2],@Xi[2]
514 vmovd `4*$j-16*4`(@ptr[0]),@Xi[1]
515 vinserti128 @Xi[2],@Xi[0],@Xi[0]
516 vmovd `4*$j-16*4`($ptr_n),$t3
517 vpshufb $tx,@Xi[0],@Xi[0]
## Rounds 0..14, ymm width: gather the next input word from 8 lanes.
519 $code.=<<___ if ($i<15 && $REG_SZ==32); # just load input
520 vmovd `4*$j-16*4`(@ptr[1]),$t2
521 vmovd `4*$j-16*4`(@ptr[5]),$t1
522 vpinsrd \$1,`4*$j-16*4`(@ptr[2]),@Xi[1],@Xi[1]
523 vpinsrd \$1,`4*$j-16*4`(@ptr[6]),$t3,$t3
524 vpinsrd \$1,`4*$j-16*4`(@ptr[3]),$t2,$t2
525 vpunpckldq $t2,@Xi[1],@Xi[1]
526 vpinsrd \$1,`4*$j-16*4`(@ptr[7]),$t1,$t1
527 vpunpckldq $t1,$t3,$t3
## Rounds 0..13: Ch round (3-operand AVX forms of the SSE sequence).
529 $code.=<<___ if ($i<14);
530 vpaddd $K,$e,$e # e+=K_00_19
535 vmovdqa @Xi[0],`&Xi_off($i)`
536 vpaddd @Xi[0],$e,$e # e+=X[i]
537 $vpack $t3,@Xi[1],@Xi[1]
539 vpxor $t1,$t0,$t0 # Ch(b,c,d)
540 vmovd `4*$k-16*4`(@ptr[0]),@Xi[2]
543 vpor $t3,$t2,$t2 # rol(a,5)
544 vmovd `4*$k-16*4`($ptr_n),$t3
545 vpaddd $t0,$e,$e # e+=Ch(b,c,d)
548 vpaddd $t2,$e,$e # e+=rol(a,5)
549 vpshufb $tx,@Xi[1],@Xi[1]
550 vpor $t1,$b,$b # b=rol(b,30)
## Round 14: last raw-input round.
552 $code.=<<___ if ($i==14);
553 vpaddd $K,$e,$e # e+=K_00_19
558 vmovdqa @Xi[0],`&Xi_off($i)`
559 vpaddd @Xi[0],$e,$e # e+=X[i]
560 $vpack $t3,@Xi[1],@Xi[1]
562 vpxor $t1,$t0,$t0 # Ch(b,c,d)
565 vpor $t3,$t2,$t2 # rol(a,5)
566 vpaddd $t0,$e,$e # e+=Ch(b,c,d)
569 vpaddd $t2,$e,$e # e+=rol(a,5)
570 vpshufb $tx,@Xi[1],@Xi[1]
571 vpor $t1,$b,$b # b=rol(b,30)
## Rounds 13..14: preload the schedule window for Xupdate.
573 $code.=<<___ if ($i>=13 && $i<15);
574 vmovdqa `&Xi_off($j+2)`,@Xi[3] # preload "X[2]"
## Rounds 15..19: Ch round with the interleaved Xupdate recurrence.
576 $code.=<<___ if ($i>=15); # apply Xupdate
577 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
578 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
580 vpaddd $K,$e,$e # e+=K_00_19
585 vmovdqa @Xi[0],`&Xi_off($i)`
586 vpaddd @Xi[0],$e,$e # e+=X[i]
587 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
589 vpxor $t1,$t0,$t0 # Ch(b,c,d)
590 vpxor @Xi[3],@Xi[1],@Xi[1]
593 vpor $t3,$t2,$t2 # rol(a,5)
594 vpaddd $t0,$e,$e # e+=Ch(b,c,d)
595 vpsrld \$31,@Xi[1],$tx
596 vpaddd @Xi[1],@Xi[1],@Xi[1]
599 vpaddd $t2,$e,$e # e+=rol(a,5)
600 vpor $tx,@Xi[1],@Xi[1] # rol \$1,@Xi[1]
601 vpor $t1,$b,$b # b=rol(b,30)
## Rotate the schedule window for the next round.
603 push(@Xi,shift(@Xi));
## Fragment of the AVX round generator for rounds 20..39 and 60..79
## (F = Parity).  NOTE(review): `sub BODY_20_39_avx {` header and heredoc
## terminators are missing from this excerpt; code kept byte-identical.
607 my ($i,$a,$b,$c,$d,$e)=@_;
## All rounds but the last: Xupdate plus the Parity round.
610 $code.=<<___ if ($i<79);
611 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
612 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
615 vpaddd $K,$e,$e # e+=K_20_39
## Stores past round 71 would never be read back, so they are elided.
618 $code.=<<___ if ($i<72);
619 vmovdqa @Xi[0],`&Xi_off($i)`
621 $code.=<<___ if ($i<79);
622 vpaddd @Xi[0],$e,$e # e+=X[i]
623 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
625 vpxor $c,$t0,$t0 # Parity(b,c,d)
626 vpxor @Xi[3],@Xi[1],@Xi[1]
629 vpor $t3,$t2,$t2 # rol(a,5)
630 vpaddd $t0,$e,$e # e+=Parity(b,c,d)
631 vpsrld \$31,@Xi[1],$tx
632 vpaddd @Xi[1],@Xi[1],@Xi[1]
635 vpaddd $t2,$e,$e # e+=rol(a,5)
636 vpor $tx,@Xi[1],@Xi[1] # rol(@Xi[1],1)
637 vpor $t1,$b,$b # b=rol(b,30)
## Final round 79: no Xupdate or schedule store.
639 $code.=<<___ if ($i==79);
641 vpaddd $K,$e,$e # e+=K_20_39
645 vpaddd @Xi[0],$e,$e # e+=X[i]
646 vpxor $c,$t0,$t0 # Parity(b,c,d)
649 vpor $t3,$t2,$t2 # rol(a,5)
650 vpaddd $t0,$e,$e # e+=Parity(b,c,d)
653 vpaddd $t2,$e,$e # e+=rol(a,5)
654 vpor $t1,$b,$b # b=rol(b,30)
## Rotate the schedule window for the next round.
656 push(@Xi,shift(@Xi));
## Fragment of the AVX round generator for rounds 40..59 (F = Maj).
## NOTE(review): `sub BODY_40_59_avx {` header and heredoc terminator are
## missing from this excerpt; code kept byte-identical.
660 my ($i,$a,$b,$c,$d,$e)=@_;
664 vpxor @Xi[-2],@Xi[1],@Xi[1] # "X[13]"
665 vmovdqa `&Xi_off($j+2)`,@Xi[3] # "X[2]"
667 vpaddd $K,$e,$e # e+=K_40_59
670 vpxor `&Xi_off($j+8)`,@Xi[1],@Xi[1]
675 vpxor @Xi[3],@Xi[1],@Xi[1]
677 vmovdqu @Xi[0],`&Xi_off($i)`
678 vpaddd @Xi[0],$e,$e # e+=X[i]
679 vpor $t3,$t2,$t2 # rol(a,5)
680 vpsrld \$31,@Xi[1],$tx
682 vpaddd @Xi[1],@Xi[1],@Xi[1]
685 vpaddd $t0,$e,$e # e+=Maj(b,d,c)
688 vpaddd $t2,$e,$e # e+=rol(a,5)
689 vpor $tx,@Xi[1],@Xi[1] # rol(@X[1],1)
690 vpor $t1,$b,$b # b=rol(b,30)
## Rotate the schedule window for the next round.
692 push(@Xi,shift(@Xi));
## Generator for the AVX (4-lane xmm) variant sha1_multi_block_avx.
## Mirrors the SSE version above; NOTE(review): prologue, labels, vzeroupper
## and heredoc terminators are missing from this excerpt.
696 .type sha1_multi_block_avx,\@function,3
698 sha1_multi_block_avx:
## With $avx>1 an AVX2 build also emits (presumably) a run-time dispatch
## to the _avx2 variant — the emitted text is not visible in this excerpt.
701 $code.=<<___ if ($avx>1);
## Win64 ABI: spill callee-saved xmm6-15.
716 $code.=<<___ if ($win64);
719 movaps %xmm7,0x10(%rsp)
720 movaps %xmm8,0x20(%rsp)
721 movaps %xmm9,0x30(%rsp)
722 movaps %xmm10,-0x78(%rax)
723 movaps %xmm11,-0x68(%rax)
724 movaps %xmm12,-0x58(%rax)
725 movaps %xmm13,-0x48(%rax)
726 movaps %xmm14,-0x38(%rax)
727 movaps %xmm15,-0x28(%rax)
## Stack frame: 16 schedule slots plus saved %rsp and $num.
730 sub \$`$REG_SZ*18`, %rsp
732 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
733 lea K_XX_XX(%rip),$Tbl
734 lea `$REG_SZ*16`(%rsp),%rbx
738 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
## Load 4 lane descriptors; exhausted lanes are pointed at the K table.
741 for($i=0;$i<4;$i++) {
743 mov `16*$i+0`($inp),@ptr[$i] # input pointer
744 mov `16*$i+8`($inp),%ecx # number of blocks
746 cmovg %ecx,$num # find maximum
748 mov %ecx,`4*$i`(%rbx) # initialize counters
749 cmovle $Tbl,@ptr[$i] # cancel input
756 vmovdqu 0x00($ctx),$A # load context
758 vmovdqu 0x20($ctx),$B
759 vmovdqu 0x40($ctx),$C
760 vmovdqu 0x60($ctx),$D
761 vmovdqu 0x80($ctx),$E
762 vmovdqu 0x60($Tbl),$tx # pbswap_mask
## Unroll all 80 rounds with the constant switched every 20 rounds.
768 $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
769 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
770 $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
771 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
772 $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
773 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
774 $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
775 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
## End of block: cancel exhausted lanes, decrement counters via mask,
## then fold round results into the context (masked per lane).
779 for($i=0;$i<4;$i++) {
781 cmp `4*$i`(%rbx),%ecx # examine counters
782 cmovge $Tbl,@ptr[$i] # cancel input
786 vmovdqu (%rbx),$t0 # pull counters
789 vpcmpgtd $t2,$t1,$t1 # mask value
790 vpaddd $t1,$t0,$t0 # counters--
794 vpaddd 0x00($ctx),$A,$A
796 vpaddd 0x20($ctx),$B,$B
798 vpaddd 0x40($ctx),$C,$C
800 vpaddd 0x60($ctx),$D,$D
801 vpaddd 0x80($ctx),$E,$E
802 vmovdqu $A,0x00($ctx)
803 vmovdqu $B,0x20($ctx)
804 vmovdqu $C,0x40($ctx)
805 vmovdqu $D,0x60($ctx)
806 vmovdqu $E,0x80($ctx)
808 vmovdqu $t0,(%rbx) # save counters
809 vmovdqu 0x60($Tbl),$tx # pbswap_mask
813 mov `$REG_SZ*17+8`(%rsp),$num
814 lea $REG_SZ($ctx),$ctx
815 lea `16*$REG_SZ/4`($inp),$inp
## Epilogue: recover caller's %rsp, restore Win64 xmm registers.
820 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
823 $code.=<<___ if ($win64);
824 movaps -0xb8(%rax),%xmm6
825 movaps -0xa8(%rax),%xmm7
826 movaps -0x98(%rax),%xmm8
827 movaps -0x88(%rax),%xmm9
828 movaps -0x78(%rax),%xmm10
829 movaps -0x68(%rax),%xmm11
830 movaps -0x58(%rax),%xmm12
831 movaps -0x48(%rax),%xmm13
832 movaps -0x38(%rax),%xmm14
833 movaps -0x28(%rax),%xmm15
840 .size sha1_multi_block_avx,.-sha1_multi_block_avx
## Expand the `...` arithmetic placeholders accumulated so far, then remap
## all symbolic registers to ymm and widen the pointer array to 8 lanes
## (r12-r15 plus r8-r11) for the AVX2 code generated below.
844 $code =~ s/\`([^\`]*)\`/eval $1/gem;
848 @ptr=map("%r$_",(12..15,8..11));
850 @V=($A,$B,$C,$D,$E)=map("%ymm$_",(0..4));
851 ($t0,$t1,$t2,$t3,$tx)=map("%ymm$_",(5..9));
852 @Xi=map("%ymm$_",(10..14));
## Generator for the AVX2 (8-lane ymm) variant sha1_multi_block_avx2,
## reusing the *_avx round bodies with $REG_SZ==32.  NOTE(review):
## prologue, labels and heredoc terminators are missing from this excerpt.
856 .type sha1_multi_block_avx2,\@function,3
858 sha1_multi_block_avx2:
## Win64 ABI: spill callee-saved xmm6-15.
868 $code.=<<___ if ($win64);
871 movaps %xmm7,0x10(%rsp)
872 movaps %xmm8,0x20(%rsp)
873 movaps %xmm9,0x30(%rsp)
874 movaps %xmm10,0x40(%rsp)
875 movaps %xmm11,0x50(%rsp)
876 movaps %xmm12,-0x78(%rax)
877 movaps %xmm13,-0x68(%rax)
878 movaps %xmm14,-0x58(%rax)
879 movaps %xmm15,-0x48(%rax)
## Stack frame: 16 ymm-wide schedule slots plus saved %rsp and $num.
882 sub \$`$REG_SZ*18`, %rsp
884 mov %rax,`$REG_SZ*17`(%rsp) # original %rsp
885 lea K_XX_XX(%rip),$Tbl
890 mov $num,`$REG_SZ*17+8`(%rsp) # original $num
892 lea `$REG_SZ*16`(%rsp),%rbx
## Load all 8 lane descriptors; exhausted lanes aimed at the K table.
894 for($i=0;$i<8;$i++) {
896 mov `16*$i+0`($inp),@ptr[$i] # input pointer
897 mov `16*$i+8`($inp),%ecx # number of blocks
899 cmovg %ecx,$num # find maximum
901 mov %ecx,`4*$i`(%rbx) # initialize counters
902 cmovle $Tbl,@ptr[$i] # cancel input
906 vmovdqu 0x00($ctx),$A # load context
908 vmovdqu 0x20($ctx),$B
## %rbx doubles as the Xi_off() spill base while rounds run; it is
## repointed at the counters again after the 80 rounds.
909 lea 256+128(%rsp),%rbx
910 vmovdqu 0x40($ctx),$C
911 vmovdqu 0x60($ctx),$D
912 vmovdqu 0x80($ctx),$E
913 vmovdqu 0x60($Tbl),$tx # pbswap_mask
## Unroll all 80 rounds with the constant switched every 20 rounds.
919 $code.=" vmovdqa -0x20($Tbl),$K\n"; # K_00_19
920 for($i=0;$i<20;$i++) { &BODY_00_19_avx($i,@V); unshift(@V,pop(@V)); }
921 $code.=" vmovdqa 0x00($Tbl),$K\n"; # K_20_39
922 for(;$i<40;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
923 $code.=" vmovdqa 0x20($Tbl),$K\n"; # K_40_59
924 for(;$i<60;$i++) { &BODY_40_59_avx($i,@V); unshift(@V,pop(@V)); }
925 $code.=" vmovdqa 0x40($Tbl),$K\n"; # K_60_79
926 for(;$i<80;$i++) { &BODY_20_39_avx($i,@V); unshift(@V,pop(@V)); }
## End of block: cancel exhausted lanes, decrement counters via mask,
## fold results back into the 8-lane context.
929 lea `$REG_SZ*16`(%rsp),%rbx
931 for($i=0;$i<8;$i++) {
933 cmp `4*$i`(%rbx),%ecx # examine counters
934 cmovge $Tbl,@ptr[$i] # cancel input
938 vmovdqu (%rbx),$t0 # pull counters
941 vpcmpgtd $t2,$t1,$t1 # mask value
942 vpaddd $t1,$t0,$t0 # counters--
946 vpaddd 0x00($ctx),$A,$A
948 vpaddd 0x20($ctx),$B,$B
950 vpaddd 0x40($ctx),$C,$C
952 vpaddd 0x60($ctx),$D,$D
953 vpaddd 0x80($ctx),$E,$E
954 vmovdqu $A,0x00($ctx)
955 vmovdqu $B,0x20($ctx)
956 vmovdqu $C,0x40($ctx)
957 vmovdqu $D,0x60($ctx)
958 vmovdqu $E,0x80($ctx)
960 vmovdqu $t0,(%rbx) # save counters
961 lea 256+128(%rsp),%rbx
962 vmovdqu 0x60($Tbl),$tx # pbswap_mask
## The multi-pass ("grande") loop is deliberately disabled here — the
## AVX2 path covers all 8 lanes in one pass, so the re-entry is dead.
966 #mov `$REG_SZ*17+8`(%rsp),$num
967 #lea $REG_SZ($ctx),$ctx
968 #lea `16*$REG_SZ/4`($inp),$inp
970 #jnz .Loop_grande_avx2
## Epilogue: recover caller's %rsp, restore Win64 xmm registers.
973 mov `$REG_SZ*17`(%rsp),%rax # orignal %rsp
976 $code.=<<___ if ($win64);
977 movaps -0xd8(%rax),%xmm6
978 movaps -0xc8(%rax),%xmm7
979 movaps -0xb8(%rax),%xmm8
980 movaps -0xa8(%rax),%xmm9
981 movaps -0x98(%rax),%xmm10
982 movaps -0x88(%rax),%xmm11
983 movaps -0x78(%rax),%xmm12
984 movaps -0x68(%rax),%xmm13
985 movaps -0x58(%rax),%xmm14
986 movaps -0x48(%rax),%xmm15
997 .size sha1_multi_block_avx2,.-sha1_multi_block_avx2
## Constant table K_XX_XX: each SHA-1 round constant is broadcast across
## a full 256-bit vector (two 128-bit rows), followed by the byte-swap
## shuffle mask used with pshufb/vpshufb.
1003 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1004 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1006 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1007 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1008 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1009 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1010 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1011 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1012 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
1013 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap
## Final output pass: expand remaining `...` arithmetic, then narrow
## AVX2 mnemonics that must operate on xmm registers (vmovd, vpinsrd,
## vpextrd, vinserti128, vpbroadcastd, ...) from the symbolic %ymm form
## back down to the %xmm encoding the instruction actually requires.
1016 foreach (split("\n",$code)) {
1017 s/\`([^\`]*)\`/eval($1)/ge;
1019 s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1020 s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
1021 s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+),%ymm([0-9]+)/$1$2%xmm$3,%xmm$4/go or
1022 s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
1023 s/\b(vinserti128)\b(\s+)%ymm/$1$2\$1,%xmm/go or
1024 s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;