2 # Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the OpenSSL license (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # sha1_block procedure for x86_64.
19 # It was brought to my attention that on EM64T compiler-generated code
20 # was far behind 32-bit assembler implementation. This is unlike on
21 # Opteron where compiler-generated code was only 15% behind 32-bit
22 # assembler, which originally made it hard to motivate the effort.
23 # There was suggestion to mechanically translate 32-bit code, but I
24 # dismissed it, reasoning that x86_64 offers enough register bank
25 # capacity to fully utilize SHA-1 parallelism. Therefore this fresh
26 # implementation:-) However! While 64-bit code does perform better
27 # on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
28 # x86_64 does offer larger *addressable* bank, but out-of-order core
29 # reaches for even more registers through dynamic aliasing, and EM64T
30 # core must have managed to run-time optimize even 32-bit code just as
31 # well as 64-bit one. Performance improvement is summarized in the
34 # gcc 3.4 32-bit asm cycles/byte
35 # Opteron +45% +20% 6.8
36 # Xeon P4 +65% +0% 9.9
41 # The code was revised to minimize code size and to maximize
42 # "distance" between instructions producing input to 'lea'
43 # instruction and the 'lea' instruction itself, which is essential
44 # for Intel Atom core.
48 # Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
49 # is to offload message schedule denoted by Wt in NIST specification,
50 # or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
51 # for background and implementation details. The only difference from
52 # 32-bit code is that 64-bit code doesn't have to spill @X[] elements
53 # to free temporary registers.
57 # Add AVX code path. See sha1-586.pl for further information.
61 # Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
62 # and loading pair of consecutive blocks to 256-bit %ymm registers)
63 # did not provide impressive performance improvement till a crucial
64 # hint regarding the number of Xupdate iterations to pre-compute in
65 # advance was provided by Ilya Albrekht of Intel Corp.
69 # Add support for Intel SHA Extensions.
71 ######################################################################
72 # Current performance is summarized in following table. Numbers are
73 # CPU clock cycles spent to process single byte (less is better).
78 # Core2 6.55 6.05/+8% -
79 # Westmere 6.73 5.30/+27% -
80 # Sandy Bridge 7.70 6.10/+26% 4.99/+54%
81 # Ivy Bridge 6.06 4.67/+30% 4.60/+32%
82 # Haswell 5.45 4.15/+31% 3.57/+53%
83 # Skylake 5.18 4.06/+28% 3.54/+46%
84 # Bulldozer 9.11 5.95/+53%
85 # Ryzen 4.75 ? 1.93/+150%(**)
86 # VIA Nano 9.32 7.15/+30%
88 # Silvermont 13.1(*) 9.37/+40%
89 # Goldmont 8.13 6.42/+27% 1.70/+380%(**)
91 # (*) obviously suboptimal result, nothing was done about it,
92 # because SSSE3 code is compiled unconditionally;
# Command-line handling: a lone argument containing a dot is the output
# file rather than a perlasm flavour.
97 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 ABI conventions (xmm6-15 preservation, SEH) apply to nasm/masm/
# mingw64 targets.
99 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator relative to this script's directory,
# falling back to the shared perlasm directory.
101 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
102 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
103 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
104 die "can't locate x86_64-xlate.pl";
# Probe the available assembler to pick the code path: $avx ends up 0, 1
# or 2 (2 enables the AVX2 path, cf. the ($avx>1) guards below).
# GNU as: 2.19 can encode AVX, 2.22 AVX2.
106 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
107 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
108 $avx = ($1>=2.19) + ($1>=2.22);
# nasm: 2.09 for AVX, 2.10 for AVX2.
111 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
112 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
113 $avx = ($1>=2.09) + ($1>=2.10);
# MSVC ml64: major version 10 for AVX, 11 for AVX2.
116 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
117 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
118 $avx = ($1>=10) + ($1>=11);
# clang/LLVM: 3.0 enables AVX, strictly newer than 3.0 enables AVX2.
121 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([2-9]\.[0-9]+)/) {
122 $avx = ($2>=3.0) + ($2>3.0);
125 $shaext=1; ### set to zero if compiling for 1.0.1
126 $avx=1 if (!$shaext && $avx);
# All generated code is piped through the xlate translator into the
# requested flavour/output file.
128 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
131 $ctx="%rdi"; # 1st arg
132 $inp="%rsi"; # 2nd arg
133 $num="%rdx"; # 3rd arg
135 # reassign arguments in order to produce more compact code
# @xi is a rotating ring of registers holding message words X[i] in flight.
143 @xi=("%edx","%ebp","%r14d");
# BODY_00_19 fragment: one scalar round for rounds 0..19, K=0x5a827999.
# NOTE(review): the sub header and several interleaved lines are elided
# in this sampled view.
153 my ($i,$a,$b,$c,$d,$e)=@_;
155 $code.=<<___ if ($i==0);
156 mov `4*$i`($inp),$xi[0] # load first message word
159 $code.=<<___ if ($i<15);
160 mov `4*$j`($inp),$xi[1] # prefetch next message word
162 mov $xi[0],`4*$i`(%rsp) # spill X[i] for later Xupdate
168 lea 0x5a827999($xi[0],$e),$e # e+=X[i]+K_00_19
174 $code.=<<___ if ($i>=15);
175 xor `4*($j%16)`(%rsp),$xi[1] # Xupdate: mix in X[j%16]
177 mov $xi[0],`4*($i%16)`(%rsp) # store updated X[i%16]
179 xor `4*(($j+2)%16)`(%rsp),$xi[1] # ^X[(j+2)%16]
182 xor `4*(($j+8)%16)`(%rsp),$xi[1] # ^X[(j+8)%16]
184 lea 0x5a827999($xi[0],$e),$e # e+=X[i]+K_00_19
# Rotate the @xi register ring for the next round.
191 push(@xi,shift(@xi));
# BODY_20_39 fragment: rounds 20..39 and 60..79 (Parity function), so it
# serves two K constants. NOTE(review): sub header elided in this view.
195 my ($i,$a,$b,$c,$d,$e)=@_;
197 my $K=($i<40)?0x6ed9eba1:0xca62c1d6; # K_20_39 : K_60_79
198 $code.=<<___ if ($i<79);
199 xor `4*($j%16)`(%rsp),$xi[1] # Xupdate: mix in X[j%16]
201 `"mov $xi[0],".4*($i%16)."(%rsp)" if ($i<72)` # no spill needed after 71
203 xor `4*(($j+2)%16)`(%rsp),$xi[1] # ^X[(j+2)%16]
206 xor `4*(($j+8)%16)`(%rsp),$xi[1] # ^X[(j+8)%16]
214 $code.=<<___ if ($i==79);
# Rotate the @xi register ring for the next round.
225 push(@xi,shift(@xi));
# BODY_40_59 fragment: rounds 40..59 (Majority function), K=0x8f1bbcdc.
# NOTE(review): sub header elided in this sampled view.
229 my ($i,$a,$b,$c,$d,$e)=@_;
232 xor `4*($j%16)`(%rsp),$xi[1] # Xupdate: mix in X[j%16]
234 mov $xi[0],`4*($i%16)`(%rsp) # store updated X[i%16]
236 xor `4*(($j+2)%16)`(%rsp),$xi[1] # ^X[(j+2)%16]
239 xor `4*(($j+8)%16)`(%rsp),$xi[1] # ^X[(j+8)%16]
240 lea 0x8f1bbcdc($xi[0],$e),$e # e+=X[i]+K_40_59
# Rotate the @xi register ring for the next round.
250 push(@xi,shift(@xi));
255 .extern OPENSSL_ia32cap_P
257 .globl sha1_block_data_order
258 .type sha1_block_data_order,\@function,3
260 sha1_block_data_order:
262 mov OPENSSL_ia32cap_P+0(%rip),%r9d
263 mov OPENSSL_ia32cap_P+4(%rip),%r8d
264 mov OPENSSL_ia32cap_P+8(%rip),%r10d
265 test \$`1<<9`,%r8d # check SSSE3 bit
268 $code.=<<___ if ($shaext);
269 test \$`1<<29`,%r10d # check SHA bit
272 $code.=<<___ if ($avx>1);
273 and \$`1<<3|1<<5|1<<8`,%r10d # check AVX2+BMI1+BMI2
274 cmp \$`1<<3|1<<5|1<<8`,%r10d
277 $code.=<<___ if ($avx);
278 and \$`1<<28`,%r8d # mask AVX bit
279 and \$`1<<30`,%r9d # mask "Intel CPU" bit
281 cmp \$`1<<28|1<<30`,%r8d
290 .cfi_def_cfa_register %rax
301 mov %rdi,$ctx # reassigned argument
303 mov %rsi,$inp # reassigned argument
305 mov %rdx,$num # reassigned argument
306 mov %rax,`16*4`(%rsp)
307 .cfi_cfa_expression %rsp+64,deref,+8
320 for($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
321 for(;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
322 for(;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
323 for(;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
337 lea `16*4`($inp),$inp
340 mov `16*4`(%rsp),%rsi
353 .cfi_def_cfa_register %rsp
357 .size sha1_block_data_order,.-sha1_block_data_order
360 ######################################################################
361 # Intel SHA Extensions implementation of SHA1 update function.
363 my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
364 my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
365 my @MSG=map("%xmm$_",(4..7));
368 .type sha1_block_data_order_shaext,\@function,3
370 sha1_block_data_order_shaext:
374 $code.=<<___ if ($win64);
375 lea `-8-4*16`(%rsp),%rsp
376 movaps %xmm6,-8-4*16(%rax)
377 movaps %xmm7,-8-3*16(%rax)
378 movaps %xmm8,-8-2*16(%rax)
379 movaps %xmm9,-8-1*16(%rax)
385 movdqa K_XX_XX+0xa0(%rip),$BSWAP # byte-n-word swap
387 movdqu ($inp),@MSG[0]
388 pshufd \$0b00011011,$ABCD,$ABCD # flip word order
389 movdqu 0x10($inp),@MSG[1]
390 pshufd \$0b00011011,$E,$E # flip word order
391 movdqu 0x20($inp),@MSG[2]
392 pshufb $BSWAP,@MSG[0]
393 movdqu 0x30($inp),@MSG[3]
394 pshufb $BSWAP,@MSG[1]
395 pshufb $BSWAP,@MSG[2]
396 movdqa $E,$E_SAVE # offload $E
397 pshufb $BSWAP,@MSG[3]
403 lea 0x40($inp),%r8 # next input block
406 movdqa $ABCD,$ABCD_SAVE # offload $ABCD
408 for($i=0;$i<20-4;$i+=2) {
410 sha1msg1 @MSG[1],@MSG[0]
412 sha1rnds4 \$`int($i/5)`,$E,$ABCD # 0-3...
413 sha1nexte @MSG[1],$E_
415 sha1msg1 @MSG[2],@MSG[1]
416 sha1msg2 @MSG[3],@MSG[0]
419 sha1rnds4 \$`int(($i+1)/5)`,$E_,$ABCD
422 sha1msg2 @MSG[0],@MSG[1]
424 push(@MSG,shift(@MSG)); push(@MSG,shift(@MSG));
427 movdqu ($inp),@MSG[0]
429 sha1rnds4 \$3,$E,$ABCD # 64-67
430 sha1nexte @MSG[1],$E_
431 movdqu 0x10($inp),@MSG[1]
432 pshufb $BSWAP,@MSG[0]
435 sha1rnds4 \$3,$E_,$ABCD # 68-71
437 movdqu 0x20($inp),@MSG[2]
438 pshufb $BSWAP,@MSG[1]
441 sha1rnds4 \$3,$E,$ABCD # 72-75
442 sha1nexte @MSG[3],$E_
443 movdqu 0x30($inp),@MSG[3]
444 pshufb $BSWAP,@MSG[2]
447 sha1rnds4 \$3,$E_,$ABCD # 76-79
449 pshufb $BSWAP,@MSG[3]
451 paddd $ABCD_SAVE,$ABCD
452 movdqa $E,$E_SAVE # offload $E
456 pshufd \$0b00011011,$ABCD,$ABCD
457 pshufd \$0b00011011,$E,$E
461 $code.=<<___ if ($win64);
462 movaps -8-4*16(%rax),%xmm6
463 movaps -8-3*16(%rax),%xmm7
464 movaps -8-2*16(%rax),%xmm8
465 movaps -8-1*16(%rax),%xmm9
472 .size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
477 my @X=map("%xmm$_",(4..7,0..3));
478 my @Tx=map("%xmm$_",(8..10));
480 my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp"); # size optimization
481 my @T=("%esi","%edi");
487 my $_rol=sub { &rol(@_) };
488 my $_ror=sub { &ror(@_) };
494 jmp .Lalign32_$sn # see "Decoded ICache" in manual
502 .type sha1_block_data_order_ssse3,\@function,3
504 sha1_block_data_order_ssse3:
507 mov %rsp,$fp # frame pointer
508 .cfi_def_cfa_register $fp
515 push %r13 # redundant, done to share Win64 SE handler
519 lea `-64-($win64?6*16:0)`(%rsp),%rsp
521 $code.=<<___ if ($win64);
522 movaps %xmm6,-40-6*16($fp)
523 movaps %xmm7,-40-5*16($fp)
524 movaps %xmm8,-40-4*16($fp)
525 movaps %xmm9,-40-3*16($fp)
526 movaps %xmm10,-40-2*16($fp)
527 movaps %xmm11,-40-1*16($fp)
532 mov %rdi,$ctx # reassigned argument
533 mov %rsi,$inp # reassigned argument
534 mov %rdx,$num # reassigned argument
538 lea K_XX_XX+64(%rip),$K_XX_XX
540 mov 0($ctx),$A # load context
544 mov $B,@T[0] # magic seed
550 movdqa 64($K_XX_XX),@X[2] # pbswap mask
551 movdqa -64($K_XX_XX),@Tx[1] # K_00_19
552 movdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
553 movdqu 16($inp),@X[-3&7]
554 movdqu 32($inp),@X[-2&7]
555 movdqu 48($inp),@X[-1&7]
556 pshufb @X[2],@X[-4&7] # byte swap
557 pshufb @X[2],@X[-3&7]
558 pshufb @X[2],@X[-2&7]
560 paddd @Tx[1],@X[-4&7] # add K_00_19
561 pshufb @X[2],@X[-1&7]
562 paddd @Tx[1],@X[-3&7]
563 paddd @Tx[1],@X[-2&7]
564 movdqa @X[-4&7],0(%rsp) # X[]+K xfer to IALU
565 psubd @Tx[1],@X[-4&7] # restore X[]
566 movdqa @X[-3&7],16(%rsp)
567 psubd @Tx[1],@X[-3&7]
568 movdqa @X[-2&7],32(%rsp)
569 psubd @Tx[1],@X[-2&7]
# Catch-all for undefined sub calls (&pshufd, &movdqa, ...): the called
# name becomes the mnemonic and the arguments are emitted as one line of
# assembly text appended to $code.
573 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
574 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
# Prefix purely numeric operands with '$' (immediate); the numeric-eq
# self-comparison distinguishes numbers from register/memory strings.
576 $arg = "\$$arg" if ($arg*1 eq $arg);
# Operands are printed in AT&T order: remaining args reversed, $arg last.
577 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
580 sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4
583 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
586 eval(shift(@insns)); # ror
587 &pshufd (@X[0],@X[-4&7],0xee); # was &movdqa (@X[0],@X[-3&7]);
589 &movdqa (@Tx[0],@X[-1&7]);
590 &paddd (@Tx[1],@X[-1&7]);
594 &punpcklqdq(@X[0],@X[-3&7]); # compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
596 eval(shift(@insns)); # rol
598 &psrldq (@Tx[0],4); # "X[-3]", 3 dwords
602 &pxor (@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
604 eval(shift(@insns)); # ror
605 &pxor (@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
610 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
612 eval(shift(@insns)); # rol
613 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
617 &movdqa (@Tx[2],@X[0]);
620 eval(shift(@insns)); # ror
621 &movdqa (@Tx[0],@X[0]);
624 &pslldq (@Tx[2],12); # "X[0]"<<96, extract one dword
625 &paddd (@X[0],@X[0]);
631 eval(shift(@insns)); # rol
633 &movdqa (@Tx[1],@Tx[2]);
639 eval(shift(@insns)); # ror
640 &por (@X[0],@Tx[0]); # "X[0]"<<<=1
646 &pxor (@X[0],@Tx[2]);
648 &movdqa (@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)"); # K_XX_XX
649 eval(shift(@insns)); # rol
653 &pxor (@X[0],@Tx[1]); # "X[0]"^=("X[0]">>96)<<<2
654 &pshufd (@Tx[1],@X[-1&7],0xee) if ($Xi==7); # was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79
656 foreach (@insns) { eval; } # remaining instructions [if any]
658 $Xi++; push(@X,shift(@X)); # "rotate" X[]
659 push(@Tx,shift(@Tx));
662 sub Xupdate_ssse3_32_79()
665 my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
668 eval(shift(@insns)) if ($Xi==8);
669 &pxor (@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
670 eval(shift(@insns)) if ($Xi==8);
671 eval(shift(@insns)); # body_20_39
673 eval(shift(@insns)) if (@insns[1] =~ /_ror/);
674 eval(shift(@insns)) if (@insns[0] =~ /_ror/);
675 &punpcklqdq(@Tx[0],@X[-1&7]); # compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
677 eval(shift(@insns)); # rol
679 &pxor (@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
683 &movdqa (@Tx[2],@Tx[1]);# "perpetuate" K_XX_XX...
684 } else { # ... or load next one
685 &movdqa (@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
687 eval(shift(@insns)); # ror
688 &paddd (@Tx[1],@X[-1&7]);
691 &pxor (@X[0],@Tx[0]); # "X[0]"^="X[-6]"
692 eval(shift(@insns)); # body_20_39
695 eval(shift(@insns)); # rol
696 eval(shift(@insns)) if (@insns[0] =~ /_ror/);
698 &movdqa (@Tx[0],@X[0]);
701 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
702 eval(shift(@insns)); # ror
704 eval(shift(@insns)); # body_20_39
710 eval(shift(@insns)) if (@insns[0] =~ /_rol/);# rol
713 eval(shift(@insns)); # ror
715 &por (@X[0],@Tx[0]); # "X[0]"<<<=2
717 eval(shift(@insns)); # body_20_39
718 eval(shift(@insns)) if (@insns[1] =~ /_rol/);
719 eval(shift(@insns)) if (@insns[0] =~ /_rol/);
720 &pshufd(@Tx[1],@X[-1&7],0xee) if ($Xi<19); # was &movdqa (@Tx[1],@X[0])
722 eval(shift(@insns)); # rol
725 eval(shift(@insns)); # rol
728 foreach (@insns) { eval; } # remaining instructions
730 $Xi++; push(@X,shift(@X)); # "rotate" X[]
731 push(@Tx,shift(@Tx));
734 sub Xuplast_ssse3_80()
737 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
744 &paddd (@Tx[1],@X[-1&7]);
748 &movdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
750 foreach (@insns) { eval; } # remaining instructions
753 &je (".Ldone_ssse3");
755 unshift(@Tx,pop(@Tx));
757 &movdqa (@X[2],"64($K_XX_XX)"); # pbswap mask
758 &movdqa (@Tx[1],"-64($K_XX_XX)"); # K_00_19
759 &movdqu (@X[-4&7],"0($inp)"); # load input
760 &movdqu (@X[-3&7],"16($inp)");
761 &movdqu (@X[-2&7],"32($inp)");
762 &movdqu (@X[-1&7],"48($inp)");
763 &pshufb (@X[-4&7],@X[2]); # byte swap
772 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
778 &pshufb (@X[($Xi-3)&7],@X[2]);
783 &paddd (@X[($Xi-4)&7],@Tx[1]);
788 &movdqa (eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]); # X[]+K xfer to IALU
793 &psubd (@X[($Xi-4)&7],@Tx[1]);
795 foreach (@insns) { eval; }
802 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
805 foreach (@insns) { eval; }
# Scalar round body for rounds 0..19 (SSSE3/AVX paths).  Returns a list of
# instruction strings (closures over $j, @V, @T) that the Xupdate_* subs
# interleave one-at-a-time with SIMD message-schedule code.
# F(b,c,d)=Ch is computed as ((c^d)&b)^d to save an instruction.
808 sub body_00_19 () { # ((c^d)&b)^d
809 # on start @T[0]=(c^d)&b
# Hand off to body_20_39 once 20 rounds have been emitted ($rx counts
# rounds across calls).
810 return &body_20_39() if ($rx==19); $rx++;
812 '($a,$b,$c,$d,$e)=@V;'.
# Rotate count 7 (rather than 2) presumably compensates rotation applied
# while this register served as $a in the previous round -- net >>>2;
# the very first round ($j==0) needs only the plain >>>2.
813 '&$_ror ($b,$j?7:2)', # $b>>>2
815 '&mov (@T[1],$a)', # $b for next round
817 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
818 '&xor ($b,$c)', # $c^$d for next round
822 '&and (@T[1],$b)', # ($b&($c^$d)) for next round
824 '&xor ($b,$c)', # restore $b
# Advance the round counter and rotate the working-variable rings.
825 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
# Scalar round body for rounds 20..39 and 60..79 (Parity: F=b^c^d) on the
# SSSE3/AVX paths.  Like body_00_19 it returns a list of instruction
# strings to be interleaved with SIMD code.
829 sub body_20_39 () { # b^d^c
# Hand off to body_40_59 after round 39 ($rx counts rounds across calls).
831 return &body_40_59() if ($rx==39); $rx++;
833 '($a,$b,$c,$d,$e)=@V;'.
834 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
# Round 19 hands over @T[0]=(c^d)&b, hence the different completion of F
# on the transition round.
835 '&xor (@T[0],$d) if($j==19);'.
836 '&xor (@T[0],$c) if($j> 19)', # ($b^$d^$c)
837 '&mov (@T[1],$a)', # $b for next round
841 '&xor (@T[1],$c) if ($j< 79)', # $b^$d for next round
843 '&$_ror ($b,7)', # $b>>>2
# Advance the round counter and rotate the working-variable rings.
844 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
# Scalar round body for rounds 40..59 (Majority) on the SSSE3/AVX paths.
# F=Maj(b,c,d) is computed as ((b^c)&(c^d))^c, which needs fewer
# temporaries than the textbook (b&c)|(b&d)|(c&d).
848 sub body_40_59 () { # ((b^c)&(c^d))^c
849 # on entry @T[0]=(b^c), (c^=d)
852 '($a,$b,$c,$d,$e)=@V;'.
853 '&add ($e,eval(4*($j&15))."(%rsp)")', # X[]+K xfer
854 '&and (@T[0],$c) if ($j>=40)', # (b^c)&(c^d)
855 '&xor ($c,$d) if ($j>=40)', # restore $c
857 '&$_ror ($b,7)', # $b>>>2
858 '&mov (@T[1],$a)', # $b for next round
# Round 59 prepares Parity-style f=b^c^d for the upcoming 60..79 rounds;
# earlier rounds keep preparing the Maj operands.
863 '&xor (@T[1],$c) if ($j==59);'.
864 '&xor (@T[1],$b) if ($j< 59)', # b^c for next round
866 '&xor ($b,$c) if ($j< 59)', # c^d for next round
# Advance the round counter and rotate the working-variable rings.
867 '&add ($e,$a);' .'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
874 &Xupdate_ssse3_16_31(\&body_00_19);
875 &Xupdate_ssse3_16_31(\&body_00_19);
876 &Xupdate_ssse3_16_31(\&body_00_19);
877 &Xupdate_ssse3_16_31(\&body_00_19);
878 &Xupdate_ssse3_32_79(\&body_00_19);
879 &Xupdate_ssse3_32_79(\&body_20_39);
880 &Xupdate_ssse3_32_79(\&body_20_39);
881 &Xupdate_ssse3_32_79(\&body_20_39);
882 &Xupdate_ssse3_32_79(\&body_20_39);
883 &Xupdate_ssse3_32_79(\&body_20_39);
884 &Xupdate_ssse3_32_79(\&body_40_59);
885 &Xupdate_ssse3_32_79(\&body_40_59);
886 &Xupdate_ssse3_32_79(\&body_40_59);
887 &Xupdate_ssse3_32_79(\&body_40_59);
888 &Xupdate_ssse3_32_79(\&body_40_59);
889 &Xupdate_ssse3_32_79(\&body_20_39);
890 &Xuplast_ssse3_80(\&body_20_39); # can jump to "done"
892 $saved_j=$j; @saved_V=@V;
894 &Xloop_ssse3(\&body_20_39);
895 &Xloop_ssse3(\&body_20_39);
896 &Xloop_ssse3(\&body_20_39);
899 add 0($ctx),$A # update context
906 mov @T[0],$B # magic seed
918 $j=$saved_j; @V=@saved_V;
920 &Xtail_ssse3(\&body_20_39);
921 &Xtail_ssse3(\&body_20_39);
922 &Xtail_ssse3(\&body_20_39);
925 add 0($ctx),$A # update context
936 $code.=<<___ if ($win64);
937 movaps -40-6*16($fp),%xmm6
938 movaps -40-5*16($fp),%xmm7
939 movaps -40-4*16($fp),%xmm8
940 movaps -40-3*16($fp),%xmm9
941 movaps -40-2*16($fp),%xmm10
942 movaps -40-1*16($fp),%xmm11
956 .cfi_def_cfa_register %rsp
960 .size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
964 $Xi=4; # reset variables
965 @X=map("%xmm$_",(4..7,0..3));
966 @Tx=map("%xmm$_",(8..10));
970 my $done_avx_label=".Ldone_avx";
972 my $_rol=sub { &shld(@_[0],@_) };
973 my $_ror=sub { &shrd(@_[0],@_) };
976 .type sha1_block_data_order_avx,\@function,3
978 sha1_block_data_order_avx:
982 .cfi_def_cfa_register $fp
989 push %r13 # redundant, done to share Win64 SE handler
993 lea `-64-($win64?6*16:0)`(%rsp),%rsp
996 $code.=<<___ if ($win64);
997 vmovaps %xmm6,-40-6*16($fp)
998 vmovaps %xmm7,-40-5*16($fp)
999 vmovaps %xmm8,-40-4*16($fp)
1000 vmovaps %xmm9,-40-3*16($fp)
1001 vmovaps %xmm10,-40-2*16($fp)
1002 vmovaps %xmm11,-40-1*16($fp)
1007 mov %rdi,$ctx # reassigned argument
1008 mov %rsi,$inp # reassigned argument
1009 mov %rdx,$num # reassigned argument
1013 lea K_XX_XX+64(%rip),$K_XX_XX
1015 mov 0($ctx),$A # load context
1019 mov $B,@T[0] # magic seed
1025 vmovdqa 64($K_XX_XX),@X[2] # pbswap mask
1026 vmovdqa -64($K_XX_XX),$Kx # K_00_19
1027 vmovdqu 0($inp),@X[-4&7] # load input to %xmm[0-3]
1028 vmovdqu 16($inp),@X[-3&7]
1029 vmovdqu 32($inp),@X[-2&7]
1030 vmovdqu 48($inp),@X[-1&7]
1031 vpshufb @X[2],@X[-4&7],@X[-4&7] # byte swap
1033 vpshufb @X[2],@X[-3&7],@X[-3&7]
1034 vpshufb @X[2],@X[-2&7],@X[-2&7]
1035 vpshufb @X[2],@X[-1&7],@X[-1&7]
1036 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
1037 vpaddd $Kx,@X[-3&7],@X[1]
1038 vpaddd $Kx,@X[-2&7],@X[2]
1039 vmovdqa @X[0],0(%rsp) # X[]+K xfer to IALU
1040 vmovdqa @X[1],16(%rsp)
1041 vmovdqa @X[2],32(%rsp)
1045 sub Xupdate_avx_16_31() # recall that $Xi starts with 4
1048 my @insns = (&$body,&$body,&$body,&$body); # 40 instructions
1049 my ($a,$b,$c,$d,$e);
1051 eval(shift(@insns));
1052 eval(shift(@insns));
1053 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
1054 eval(shift(@insns));
1055 eval(shift(@insns));
1057 &vpaddd (@Tx[1],$Kx,@X[-1&7]);
1058 eval(shift(@insns));
1059 eval(shift(@insns));
1060 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
1061 eval(shift(@insns));
1062 eval(shift(@insns));
1063 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
1064 eval(shift(@insns));
1065 eval(shift(@insns));
1067 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
1068 eval(shift(@insns));
1069 eval(shift(@insns));
1070 eval(shift(@insns));
1071 eval(shift(@insns));
1073 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
1074 eval(shift(@insns));
1075 eval(shift(@insns));
1076 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1077 eval(shift(@insns));
1078 eval(shift(@insns));
1080 &vpsrld (@Tx[0],@X[0],31);
1081 eval(shift(@insns));
1082 eval(shift(@insns));
1083 eval(shift(@insns));
1084 eval(shift(@insns));
1086 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
1087 &vpaddd (@X[0],@X[0],@X[0]);
1088 eval(shift(@insns));
1089 eval(shift(@insns));
1090 eval(shift(@insns));
1091 eval(shift(@insns));
1093 &vpsrld (@Tx[1],@Tx[2],30);
1094 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
1095 eval(shift(@insns));
1096 eval(shift(@insns));
1097 eval(shift(@insns));
1098 eval(shift(@insns));
1100 &vpslld (@Tx[2],@Tx[2],2);
1101 &vpxor (@X[0],@X[0],@Tx[1]);
1102 eval(shift(@insns));
1103 eval(shift(@insns));
1104 eval(shift(@insns));
1105 eval(shift(@insns));
1107 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
1108 eval(shift(@insns));
1109 eval(shift(@insns));
1110 &vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
1111 eval(shift(@insns));
1112 eval(shift(@insns));
1115 foreach (@insns) { eval; } # remaining instructions [if any]
1117 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1120 sub Xupdate_avx_32_79()
1123 my @insns = (&$body,&$body,&$body,&$body); # 32 to 44 instructions
1124 my ($a,$b,$c,$d,$e);
1126 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1127 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1128 eval(shift(@insns)); # body_20_39
1129 eval(shift(@insns));
1130 eval(shift(@insns));
1131 eval(shift(@insns)); # rol
1133 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1134 eval(shift(@insns));
1135 eval(shift(@insns)) if (@insns[0] !~ /&ro[rl]/);
1136 &vpaddd (@Tx[1],$Kx,@X[-1&7]);
1137 &vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
1138 eval(shift(@insns)); # ror
1139 eval(shift(@insns));
1141 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
1142 eval(shift(@insns)); # body_20_39
1143 eval(shift(@insns));
1144 eval(shift(@insns));
1145 eval(shift(@insns)); # rol
1147 &vpsrld (@Tx[0],@X[0],30);
1148 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1149 eval(shift(@insns));
1150 eval(shift(@insns));
1151 eval(shift(@insns)); # ror
1152 eval(shift(@insns));
1154 &vpslld (@X[0],@X[0],2);
1155 eval(shift(@insns)); # body_20_39
1156 eval(shift(@insns));
1157 eval(shift(@insns));
1158 eval(shift(@insns)); # rol
1159 eval(shift(@insns));
1160 eval(shift(@insns));
1161 eval(shift(@insns)); # ror
1162 eval(shift(@insns));
1164 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
1165 eval(shift(@insns)); # body_20_39
1166 eval(shift(@insns));
1167 eval(shift(@insns));
1168 eval(shift(@insns)); # rol
1169 eval(shift(@insns));
1170 eval(shift(@insns));
1171 eval(shift(@insns)); # rol
1172 eval(shift(@insns));
1174 foreach (@insns) { eval; } # remaining instructions
1176 $Xi++; push(@X,shift(@X)); # "rotate" X[]
1179 sub Xuplast_avx_80()
1182 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1183 my ($a,$b,$c,$d,$e);
1185 eval(shift(@insns));
1186 &vpaddd (@Tx[1],$Kx,@X[-1&7]);
1187 eval(shift(@insns));
1188 eval(shift(@insns));
1189 eval(shift(@insns));
1190 eval(shift(@insns));
1192 &vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]); # X[]+K xfer IALU
1194 foreach (@insns) { eval; } # remaining instructions
1197 &je ($done_avx_label);
1199 &vmovdqa(@X[2],"64($K_XX_XX)"); # pbswap mask
1200 &vmovdqa($Kx,"-64($K_XX_XX)"); # K_00_19
1201 &vmovdqu(@X[-4&7],"0($inp)"); # load input
1202 &vmovdqu(@X[-3&7],"16($inp)");
1203 &vmovdqu(@X[-2&7],"32($inp)");
1204 &vmovdqu(@X[-1&7],"48($inp)");
1205 &vpshufb(@X[-4&7],@X[-4&7],@X[2]); # byte swap
1214 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1215 my ($a,$b,$c,$d,$e);
1217 eval(shift(@insns));
1218 eval(shift(@insns));
1219 &vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
1220 eval(shift(@insns));
1221 eval(shift(@insns));
1222 &vpaddd (@X[$Xi&7],@X[($Xi-4)&7],$Kx);
1223 eval(shift(@insns));
1224 eval(shift(@insns));
1225 eval(shift(@insns));
1226 eval(shift(@insns));
1227 &vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]); # X[]+K xfer to IALU
1228 eval(shift(@insns));
1229 eval(shift(@insns));
1231 foreach (@insns) { eval; }
1238 my @insns = (&$body,&$body,&$body,&$body); # 32 instructions
1239 my ($a,$b,$c,$d,$e);
1241 foreach (@insns) { eval; }
1248 &Xupdate_avx_16_31(\&body_00_19);
1249 &Xupdate_avx_16_31(\&body_00_19);
1250 &Xupdate_avx_16_31(\&body_00_19);
1251 &Xupdate_avx_16_31(\&body_00_19);
1252 &Xupdate_avx_32_79(\&body_00_19);
1253 &Xupdate_avx_32_79(\&body_20_39);
1254 &Xupdate_avx_32_79(\&body_20_39);
1255 &Xupdate_avx_32_79(\&body_20_39);
1256 &Xupdate_avx_32_79(\&body_20_39);
1257 &Xupdate_avx_32_79(\&body_20_39);
1258 &Xupdate_avx_32_79(\&body_40_59);
1259 &Xupdate_avx_32_79(\&body_40_59);
1260 &Xupdate_avx_32_79(\&body_40_59);
1261 &Xupdate_avx_32_79(\&body_40_59);
1262 &Xupdate_avx_32_79(\&body_40_59);
1263 &Xupdate_avx_32_79(\&body_20_39);
1264 &Xuplast_avx_80(\&body_20_39); # can jump to "done"
1266 $saved_j=$j; @saved_V=@V;
1268 &Xloop_avx(\&body_20_39);
1269 &Xloop_avx(\&body_20_39);
1270 &Xloop_avx(\&body_20_39);
1273 add 0($ctx),$A # update context
1280 mov @T[0],$B # magic seed
1292 $j=$saved_j; @V=@saved_V;
1294 &Xtail_avx(\&body_20_39);
1295 &Xtail_avx(\&body_20_39);
1296 &Xtail_avx(\&body_20_39);
1301 add 0($ctx),$A # update context
1312 $code.=<<___ if ($win64);
1313 movaps -40-6*16($fp),%xmm6
1314 movaps -40-5*16($fp),%xmm7
1315 movaps -40-4*16($fp),%xmm8
1316 movaps -40-3*16($fp),%xmm9
1317 movaps -40-2*16($fp),%xmm10
1318 movaps -40-1*16($fp),%xmm11
1332 .cfi_def_cfa_register %rsp
1336 .size sha1_block_data_order_avx,.-sha1_block_data_order_avx
1341 $Xi=4; # reset variables
1342 @X=map("%ymm$_",(4..7,0..3));
1343 @Tx=map("%ymm$_",(8..10));
1347 my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
1348 my ($a5,$t0)=("%r12d","%edi");
1350 my ($A,$F,$B,$C,$D,$E)=@ROTX;
1355 .type sha1_block_data_order_avx2,\@function,3
1357 sha1_block_data_order_avx2:
1361 .cfi_def_cfa_register $fp
1374 $code.=<<___ if ($win64);
1375 lea -6*16(%rsp),%rsp
1376 vmovaps %xmm6,-40-6*16($fp)
1377 vmovaps %xmm7,-40-5*16($fp)
1378 vmovaps %xmm8,-40-4*16($fp)
1379 vmovaps %xmm9,-40-3*16($fp)
1380 vmovaps %xmm10,-40-2*16($fp)
1381 vmovaps %xmm11,-40-1*16($fp)
1385 mov %rdi,$ctx # reassigned argument
1386 mov %rsi,$inp # reassigned argument
1387 mov %rdx,$num # reassigned argument
1394 lea K_XX_XX+64(%rip),$K_XX_XX
1396 mov 0($ctx),$A # load context
1398 cmovae $inp,$frame # next or same block
1403 vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
1405 vmovdqu ($inp),%xmm0
1406 vmovdqu 16($inp),%xmm1
1407 vmovdqu 32($inp),%xmm2
1408 vmovdqu 48($inp),%xmm3
1410 vinserti128 \$1,($frame),@X[-4&7],@X[-4&7]
1411 vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
1412 vpshufb @X[2],@X[-4&7],@X[-4&7]
1413 vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
1414 vpshufb @X[2],@X[-3&7],@X[-3&7]
1415 vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
1416 vpshufb @X[2],@X[-2&7],@X[-2&7]
1417 vmovdqu -64($K_XX_XX),$Kx # K_00_19
1418 vpshufb @X[2],@X[-1&7],@X[-1&7]
1420 vpaddd $Kx,@X[-4&7],@X[0] # add K_00_19
1421 vpaddd $Kx,@X[-3&7],@X[1]
1422 vmovdqu @X[0],0(%rsp) # X[]+K xfer to IALU
1423 vpaddd $Kx,@X[-2&7],@X[2]
1424 vmovdqu @X[1],32(%rsp)
1425 vpaddd $Kx,@X[-1&7],@X[3]
1426 vmovdqu @X[2],64(%rsp)
1427 vmovdqu @X[3],96(%rsp)
1429 for (;$Xi<8;$Xi++) { # Xupdate_avx2_16_31
1432 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
1433 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
1434 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
1435 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
1436 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
1437 &vpsrld (@Tx[0],@X[0],31);
1438 &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
1439 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
1440 &vpaddd (@X[0],@X[0],@X[0]);
1441 &vpsrld (@Tx[1],@Tx[2],30);
1442 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
1443 &vpslld (@Tx[2],@Tx[2],2);
1444 &vpxor (@X[0],@X[0],@Tx[1]);
1445 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
1446 &vpaddd (@Tx[1],@X[0],$Kx);
1447 &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1449 push(@X,shift(@X)); # "rotate" X[]
1452 lea 128(%rsp),$frame
# AVX2/BMI round body, rounds 0..19.  Uses BMI's non-destructive
# andn/rorx so $b never needs an explicit rotate of its own: the rotated
# copy is produced while the register still serves as $a, and the
# six-entry @ROTX ring ($a,$f,$b,$c,$d,$e) carries it forward.
1461 sub bodyx_00_19 () { # 8 instructions, 3 cycles critical path
1462 # at start $f=(b&c)^(~b&d), $b>>>=2
1463 return &bodyx_20_39() if ($rx==19); $rx++;
1465 '($a,$f,$b,$c,$d,$e)=@ROTX;'.
# X[]+K values live in a 256-byte window at $frame; step the window
# forward every 32 rounds.
1467 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
1468 '&lea ($frame,"256($frame)") if ($j%32==31);',
1469 '&andn ($t0,$a,$c)', # ~b&d for next round
1471 '&add ($e,$f)', # e+=(b&c)^(~b&d)
1472 '&rorx ($a5,$a,27)', # a<<<5
1473 '&rorx ($f,$a,2)', # b>>>2 for next round
1474 '&and ($a,$b)', # b&c for next round
1476 '&add ($e,$a5)', # e+=a<<<5
1477 '&xor ($a,$t0);'. # f=(b&c)^(~b&d) for next round
# Advance round counter and rotate the six-register ring.
1479 'unshift(@ROTX,pop(@ROTX)); $j++;'
# AVX2/BMI round body, rounds 20..39 and 60..79 (Parity: f=b^c^d).
# Shortest body of the three: e accumulates via lea, f for the next round
# is built with two xors while this register still holds $a.
1483 sub bodyx_20_39 () { # 7 instructions, 2 cycles critical path
1484 # on entry $f=b^c^d, $b>>>=2
1485 return &bodyx_40_59() if ($rx==39); $rx++;
1487 '($a,$f,$b,$c,$d,$e)=@ROTX;'.
# X[]+K window at $frame, stepped every 32 rounds (same as bodyx_00_19).
1489 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
1490 '&lea ($frame,"256($frame)") if ($j%32==31);',
1492 '&lea ($e,"($e,$f)")', # e+=b^c^d
1493 '&rorx ($a5,$a,27)', # a<<<5
# Round 79 is the last: no next-round state to prepare.
1494 '&rorx ($f,$a,2) if ($j<79)', # b>>>2 in next round
1495 '&xor ($a,$b) if ($j<79)', # b^c for next round
1497 '&add ($e,$a5)', # e+=a<<<5
1498 '&xor ($a,$c) if ($j<79);'. # f=b^c^d for next round
# Advance round counter and rotate the six-register ring.
1500 'unshift(@ROTX,pop(@ROTX)); $j++;'
# AVX2/BMI round body, rounds 40..59 (Majority).  F=Maj(b,c,d) arrives
# pre-combined as (b^c)&(c^d) in $f and is completed here with ^c.
1504 sub bodyx_40_59 () { # 10 instructions, 3 cycles critical path
1505 # on entry $f=((b^c)&(c^d)), $b>>>=2
1508 '($a,$f,$b,$c,$d,$e)=@ROTX;'.
# X[]+K window at $frame, stepped every 32 rounds (same as bodyx_00_19).
1510 '&add ($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'. # e+=X[i]+K
1511 '&lea ($frame,"256($frame)") if ($j%32==31);',
1512 '&xor ($f,$c) if ($j>39)', # (b^c)&(c^d)^c
1513 '&mov ($t0,$b) if ($j<59)', # count on zero latency
1514 '&xor ($t0,$c) if ($j<59)', # c^d for next round
1516 '&lea ($e,"($e,$f)")', # e+=(b^c)&(c^d)^c
1517 '&rorx ($a5,$a,27)', # a<<<5
1518 '&rorx ($f,$a,2)', # b>>>2 in next round
1519 '&xor ($a,$b)', # b^c for next round
1521 '&add ($e,$a5)', # e+=a<<<5
# Round 59 switches to Parity-style f=b^c^d for rounds 60..79.
1522 '&and ($a,$t0) if ($j< 59);'. # f=(b^c)&(c^d) for next round
1523 '&xor ($a,$c) if ($j==59);'. # f=b^c^d for next round
# Advance round counter and rotate the six-register ring.
1525 'unshift(@ROTX,pop(@ROTX)); $j++;'
1529 sub Xupdate_avx2_16_31() # recall that $Xi starts with 4
1532 my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 instructions
1533 my ($a,$b,$c,$d,$e);
1535 &vpalignr(@X[0],@X[-3&7],@X[-4&7],8); # compose "X[-14]" in "X[0]"
1536 eval(shift(@insns));
1537 eval(shift(@insns));
1538 eval(shift(@insns));
1539 eval(shift(@insns));
1541 &vpsrldq(@Tx[0],@X[-1&7],4); # "X[-3]", 3 dwords
1542 eval(shift(@insns));
1543 eval(shift(@insns));
1544 eval(shift(@insns));
1546 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"^="X[-16]"
1547 &vpxor (@Tx[0],@Tx[0],@X[-2&7]); # "X[-3]"^"X[-8]"
1548 eval(shift(@insns));
1549 eval(shift(@insns));
1551 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-3]"^"X[-8]"
1552 eval(shift(@insns));
1553 eval(shift(@insns));
1554 eval(shift(@insns));
1555 eval(shift(@insns));
1557 &vpsrld (@Tx[0],@X[0],31);
1558 &vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)") if ($Xi%5==0); # K_XX_XX
1559 eval(shift(@insns));
1560 eval(shift(@insns));
1561 eval(shift(@insns));
1563 &vpslldq(@Tx[2],@X[0],12); # "X[0]"<<96, extract one dword
1564 &vpaddd (@X[0],@X[0],@X[0]);
1565 eval(shift(@insns));
1566 eval(shift(@insns));
1568 &vpsrld (@Tx[1],@Tx[2],30);
1569 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=1
1570 eval(shift(@insns));
1571 eval(shift(@insns));
1573 &vpslld (@Tx[2],@Tx[2],2);
1574 &vpxor (@X[0],@X[0],@Tx[1]);
1575 eval(shift(@insns));
1576 eval(shift(@insns));
1578 &vpxor (@X[0],@X[0],@Tx[2]); # "X[0]"^=("X[0]">>96)<<<2
1579 eval(shift(@insns));
1580 eval(shift(@insns));
1581 eval(shift(@insns));
1583 &vpaddd (@Tx[1],@X[0],$Kx);
1584 eval(shift(@insns));
1585 eval(shift(@insns));
1586 eval(shift(@insns));
1587 &vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1589 foreach (@insns) { eval; } # remaining instructions [if any]
1592 push(@X,shift(@X)); # "rotate" X[]
1595 sub Xupdate_avx2_32_79()
1598 my @insns = (&$body,&$body,&$body,&$body,&$body); # 35 to 50 instructions
1599 my ($a,$b,$c,$d,$e);
1601 &vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8); # compose "X[-6]"
1602 &vpxor (@X[0],@X[0],@X[-4&7]); # "X[0]"="X[-32]"^"X[-16]"
1603 eval(shift(@insns));
1604 eval(shift(@insns));
1606 &vpxor (@X[0],@X[0],@X[-7&7]); # "X[0]"^="X[-28]"
1607 &vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)") if ($Xi%5==0);
1608 eval(shift(@insns));
1609 eval(shift(@insns));
1610 eval(shift(@insns));
1612 &vpxor (@X[0],@X[0],@Tx[0]); # "X[0]"^="X[-6]"
1613 eval(shift(@insns));
1614 eval(shift(@insns));
1615 eval(shift(@insns));
1617 &vpsrld (@Tx[0],@X[0],30);
1618 &vpslld (@X[0],@X[0],2);
1619 eval(shift(@insns));
1620 eval(shift(@insns));
1621 eval(shift(@insns));
1623 #&vpslld (@X[0],@X[0],2);
1624 eval(shift(@insns));
1625 eval(shift(@insns));
1626 eval(shift(@insns));
1628 &vpor (@X[0],@X[0],@Tx[0]); # "X[0]"<<<=2
1629 eval(shift(@insns));
1630 eval(shift(@insns));
1631 eval(shift(@insns));
1632 eval(shift(@insns));
1634 &vpaddd (@Tx[1],@X[0],$Kx);
1635 eval(shift(@insns));
1636 eval(shift(@insns));
1637 eval(shift(@insns));
1638 eval(shift(@insns));
1640 &vmovdqu("32*$Xi(%rsp)",@Tx[1]); # X[]+K xfer to IALU
1642 foreach (@insns) { eval; } # remaining instructions
1645 push(@X,shift(@X)); # "rotate" X[]
1651 my @insns = (&$body,&$body,&$body,&$body,&$body); # 32 instructions
1652 my ($a,$b,$c,$d,$e);
1654 foreach (@insns) { eval; }
1658 &Xupdate_avx2_32_79(\&bodyx_00_19);
1659 &Xupdate_avx2_32_79(\&bodyx_00_19);
1660 &Xupdate_avx2_32_79(\&bodyx_00_19);
1661 &Xupdate_avx2_32_79(\&bodyx_00_19);
1663 &Xupdate_avx2_32_79(\&bodyx_20_39);
1664 &Xupdate_avx2_32_79(\&bodyx_20_39);
1665 &Xupdate_avx2_32_79(\&bodyx_20_39);
1666 &Xupdate_avx2_32_79(\&bodyx_20_39);
1669 &Xupdate_avx2_32_79(\&bodyx_40_59);
1670 &Xupdate_avx2_32_79(\&bodyx_40_59);
1671 &Xupdate_avx2_32_79(\&bodyx_40_59);
1672 &Xupdate_avx2_32_79(\&bodyx_40_59);
1674 &Xloop_avx2(\&bodyx_20_39);
1675 &Xloop_avx2(\&bodyx_20_39);
1676 &Xloop_avx2(\&bodyx_20_39);
1677 &Xloop_avx2(\&bodyx_20_39);
1680 lea 128($inp),$frame
1681 lea 128($inp),%rdi # borrow $t0
1683 cmovae $inp,$frame # next or previous block
1685 # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1686 add 0($ctx),@ROTX[0] # update context
1687 add 4($ctx),@ROTX[1]
1688 add 8($ctx),@ROTX[3]
1689 mov @ROTX[0],0($ctx)
1690 add 12($ctx),@ROTX[4]
1691 mov @ROTX[1],4($ctx)
1692 mov @ROTX[0],$A # A=d
1693 add 16($ctx),@ROTX[5]
1695 mov @ROTX[3],8($ctx)
1696 mov @ROTX[4],$D # D=b
1697 #xchg @ROTX[5],$F # F=c, C=f
1698 mov @ROTX[4],12($ctx)
1699 mov @ROTX[1],$F # F=e
1700 mov @ROTX[5],16($ctx)
1702 mov @ROTX[5],$E # E=c
1704 #xchg $F,$E # E=c, F=e
1710 $Xi=4; # reset variables
1711 @X=map("%ymm$_",(4..7,0..3));
1714 vmovdqu 64($K_XX_XX),@X[2] # pbswap mask
1715 cmp $num,%rdi # borrowed $t0
1718 vmovdqu -64(%rdi),%xmm0 # low part of @X[-4&7]
1719 vmovdqu -48(%rdi),%xmm1
1720 vmovdqu -32(%rdi),%xmm2
1721 vmovdqu -16(%rdi),%xmm3
1722 vinserti128 \$1,0($frame),@X[-4&7],@X[-4&7]
1723 vinserti128 \$1,16($frame),@X[-3&7],@X[-3&7]
1724 vinserti128 \$1,32($frame),@X[-2&7],@X[-2&7]
1725 vinserti128 \$1,48($frame),@X[-1&7],@X[-1&7]
1730 lea 128+16(%rsp),$frame
1737 $rx=$j=0; @ROTX=($A,$F,$B,$C,$D,$E);
1739 &Xloop_avx2 (\&bodyx_00_19);
1740 &Xloop_avx2 (\&bodyx_00_19);
1741 &Xloop_avx2 (\&bodyx_00_19);
1742 &Xloop_avx2 (\&bodyx_00_19);
1744 &Xloop_avx2 (\&bodyx_20_39);
1745 &vmovdqu ($Kx,"-64($K_XX_XX)"); # K_00_19
1746 &vpshufb (@X[-4&7],@X[-4&7],@X[2]); # byte swap
1747 &Xloop_avx2 (\&bodyx_20_39);
1748 &vpshufb (@X[-3&7],@X[-3&7],@X[2]);
1749 &vpaddd (@Tx[0],@X[-4&7],$Kx); # add K_00_19
1750 &Xloop_avx2 (\&bodyx_20_39);
1751 &vmovdqu ("0(%rsp)",@Tx[0]);
1752 &vpshufb (@X[-2&7],@X[-2&7],@X[2]);
1753 &vpaddd (@Tx[1],@X[-3&7],$Kx);
1754 &Xloop_avx2 (\&bodyx_20_39);
1755 &vmovdqu ("32(%rsp)",@Tx[1]);
1756 &vpshufb (@X[-1&7],@X[-1&7],@X[2]);
1757 &vpaddd (@X[2],@X[-2&7],$Kx);
1759 &Xloop_avx2 (\&bodyx_40_59);
1761 &vmovdqu ("64(%rsp)",@X[2]);
1762 &vpaddd (@X[3],@X[-1&7],$Kx);
1763 &Xloop_avx2 (\&bodyx_40_59);
1764 &vmovdqu ("96(%rsp)",@X[3]);
1765 &Xloop_avx2 (\&bodyx_40_59);
1766 &Xupdate_avx2_16_31(\&bodyx_40_59);
1768 &Xupdate_avx2_16_31(\&bodyx_20_39);
1769 &Xupdate_avx2_16_31(\&bodyx_20_39);
1770 &Xupdate_avx2_16_31(\&bodyx_20_39);
1771 &Xloop_avx2 (\&bodyx_20_39);
1774 lea 128(%rsp),$frame
1776 # output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
1777 add 0($ctx),@ROTX[0] # update context
1778 add 4($ctx),@ROTX[1]
1779 add 8($ctx),@ROTX[3]
1780 mov @ROTX[0],0($ctx)
1781 add 12($ctx),@ROTX[4]
1782 mov @ROTX[1],4($ctx)
1783 mov @ROTX[0],$A # A=d
1784 add 16($ctx),@ROTX[5]
1786 mov @ROTX[3],8($ctx)
1787 mov @ROTX[4],$D # D=b
1788 #xchg @ROTX[5],$F # F=c, C=f
1789 mov @ROTX[4],12($ctx)
1790 mov @ROTX[1],$F # F=e
1791 mov @ROTX[5],16($ctx)
1793 mov @ROTX[5],$E # E=c
1795 #xchg $F,$E # E=c, F=e
1803 $code.=<<___ if ($win64);
1804 movaps -40-6*16($fp),%xmm6
1805 movaps -40-5*16($fp),%xmm7
1806 movaps -40-4*16($fp),%xmm8
1807 movaps -40-3*16($fp),%xmm9
1808 movaps -40-2*16($fp),%xmm10
1809 movaps -40-1*16($fp),%xmm11
1823 .cfi_def_cfa_register %rsp
1827 .size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
1834 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1835 .long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 # K_00_19
1836 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1837 .long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 # K_20_39
1838 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1839 .long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc # K_40_59
1840 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1841 .long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 # K_60_79
1842 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1843 .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f # pbswap mask
1844 .byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
1848 .asciz "SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1852 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1853 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1861 .extern __imp_RtlVirtualUnwind
1862 .type se_handler,\@abi-omnipotent
1876 mov 120($context),%rax # pull context->Rax
1877 mov 248($context),%rbx # pull context->Rip
1879 lea .Lprologue(%rip),%r10
1880 cmp %r10,%rbx # context->Rip<.Lprologue
1881 jb .Lcommon_seh_tail
1883 mov 152($context),%rax # pull context->Rsp
1885 lea .Lepilogue(%rip),%r10
1886 cmp %r10,%rbx # context->Rip>=.Lepilogue
1887 jae .Lcommon_seh_tail
1889 mov `16*4`(%rax),%rax # pull saved stack pointer
1896 mov %rbx,144($context) # restore context->Rbx
1897 mov %rbp,160($context) # restore context->Rbp
1898 mov %r12,216($context) # restore context->R12
1899 mov %r13,224($context) # restore context->R13
1900 mov %r14,232($context) # restore context->R14
1902 jmp .Lcommon_seh_tail
1903 .size se_handler,.-se_handler
1906 $code.=<<___ if ($shaext);
1907 .type shaext_handler,\@abi-omnipotent
1921 mov 120($context),%rax # pull context->Rax
1922 mov 248($context),%rbx # pull context->Rip
1924 lea .Lprologue_shaext(%rip),%r10
1925 cmp %r10,%rbx # context->Rip<.Lprologue_shaext
1926 jb .Lcommon_seh_tail
1928 lea .Lepilogue_shaext(%rip),%r10
1929 cmp %r10,%rbx # context->Rip>=.Lepilogue_shaext
1930 jae .Lcommon_seh_tail
1932 lea -8-4*16(%rax),%rsi
1933 lea 512($context),%rdi # &context.Xmm6
1935 .long 0xa548f3fc # cld; rep movsq
1937 jmp .Lcommon_seh_tail
1938 .size shaext_handler,.-shaext_handler
1942 .type ssse3_handler,\@abi-omnipotent
1956 mov 120($context),%rax # pull context->Rax
1957 mov 248($context),%rbx # pull context->Rip
1959 mov 8($disp),%rsi # disp->ImageBase
1960 mov 56($disp),%r11 # disp->HandlerData
1962 mov 0(%r11),%r10d # HandlerData[0]
1963 lea (%rsi,%r10),%r10 # prologue label
1964 cmp %r10,%rbx # context->Rip<prologue label
1965 jb .Lcommon_seh_tail
1967 mov 208($context),%rax # pull context->R11
1969 mov 4(%r11),%r10d # HandlerData[1]
1970 lea (%rsi,%r10),%r10 # epilogue label
1971 cmp %r10,%rbx # context->Rip>=epilogue label
1972 jae .Lcommon_seh_tail
1974 lea -40-6*16(%rax),%rsi
1975 lea 512($context),%rdi # &context.Xmm6
1977 .long 0xa548f3fc # cld; rep movsq
1984 mov %rbx,144($context) # restore context->Rbx
1985 mov %rbp,160($context) # restore context->Rbp
1986 mov %r12,216($context) # restore context->R12
1987 mov %r13,224($context) # restore context->R13
1988 mov %r14,232($context) # restore context->R14
1993 mov %rax,152($context) # restore context->Rsp
1994 mov %rsi,168($context) # restore context->Rsi
1995 mov %rdi,176($context) # restore context->Rdi
1997 mov 40($disp),%rdi # disp->ContextRecord
1998 mov $context,%rsi # context
1999 mov \$154,%ecx # sizeof(CONTEXT) in qwords, for rep movsq
2000 .long 0xa548f3fc # cld; rep movsq
2003 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
2004 mov 8(%rsi),%rdx # arg2, disp->ImageBase
2005 mov 0(%rsi),%r8 # arg3, disp->ControlPc
2006 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
2007 mov 40(%rsi),%r10 # disp->ContextRecord
2008 lea 56(%rsi),%r11 # &disp->HandlerData
2009 lea 24(%rsi),%r12 # &disp->EstablisherFrame
2010 mov %r10,32(%rsp) # arg5
2011 mov %r11,40(%rsp) # arg6
2012 mov %r12,48(%rsp) # arg7
2013 mov %rcx,56(%rsp) # arg8, (NULL)
2014 call *__imp_RtlVirtualUnwind(%rip)
2016 mov \$1,%eax # ExceptionContinueSearch
2028 .size ssse3_handler,.-ssse3_handler
2032 .rva .LSEH_begin_sha1_block_data_order
2033 .rva .LSEH_end_sha1_block_data_order
2034 .rva .LSEH_info_sha1_block_data_order
2036 $code.=<<___ if ($shaext);
2037 .rva .LSEH_begin_sha1_block_data_order_shaext
2038 .rva .LSEH_end_sha1_block_data_order_shaext
2039 .rva .LSEH_info_sha1_block_data_order_shaext
2042 .rva .LSEH_begin_sha1_block_data_order_ssse3
2043 .rva .LSEH_end_sha1_block_data_order_ssse3
2044 .rva .LSEH_info_sha1_block_data_order_ssse3
2046 $code.=<<___ if ($avx);
2047 .rva .LSEH_begin_sha1_block_data_order_avx
2048 .rva .LSEH_end_sha1_block_data_order_avx
2049 .rva .LSEH_info_sha1_block_data_order_avx
2051 $code.=<<___ if ($avx>1);
2052 .rva .LSEH_begin_sha1_block_data_order_avx2
2053 .rva .LSEH_end_sha1_block_data_order_avx2
2054 .rva .LSEH_info_sha1_block_data_order_avx2
2059 .LSEH_info_sha1_block_data_order:
2063 $code.=<<___ if ($shaext);
2064 .LSEH_info_sha1_block_data_order_shaext:
2069 .LSEH_info_sha1_block_data_order_ssse3:
2072 .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
2074 $code.=<<___ if ($avx);
2075 .LSEH_info_sha1_block_data_order_avx:
2078 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
2080 $code.=<<___ if ($avx>1);
2081 .LSEH_info_sha1_block_data_order_avx2:
2084 .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
2088 ####################################################################
2091 if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
2092 my @opcode=(0x0f,0x3a,0xcc);
2093 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
2095 push @opcode,$c=~/^0/?oct($c):$c;
2096 return ".byte\t".join(',',@opcode);
2098 return "sha1rnds4\t".@_[0];
2105 "sha1nexte" => 0xc8,
2107 "sha1msg2" => 0xca );
2109 if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
2110 my @opcode=(0x0f,0x38);
2112 $rex|=0x04 if ($2>=8);
2113 $rex|=0x01 if ($1>=8);
2114 unshift @opcode,0x40|$rex if ($rex);
2115 push @opcode,$opcodelet{$instr};
2116 push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
2117 return ".byte\t".join(',',@opcode);
2119 return $instr."\t".@_[0];
2123 foreach (split("\n",$code)) {
2124 s/\`([^\`]*)\`/eval $1/geo;
2126 s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo or
2127 s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;