X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fsha%2Fasm%2Fsha512-x86_64.pl;h=d1d20914a59068961f79c0e8cf76c47f1f2c1d46;hp=ab9a0b472de4909357943eb1d6be13585e42e1fc;hb=c7f690c243f5ad3dccaa6dbb918c92eb39622711;hpb=6251989eb6168c1baada70c5b16c7742861d3d03 diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl index ab9a0b472d..d1d20914a5 100755 --- a/crypto/sha/asm/sha512-x86_64.pl +++ b/crypto/sha/asm/sha512-x86_64.pl @@ -44,7 +44,7 @@ # # Optimization including one of Pavel Semjanov's ideas, alternative # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and -# unfortunately -10% SHA512 on P4 [which nobody should care about +# unfortunately -2% SHA512 on P4 [which nobody should care about # that much]. # # June 2012. @@ -59,23 +59,33 @@ # higher coefficients are observed on VIA Nano and Bulldozer has more # to do with specifics of their architecture [which is topic for # separate discussion]. +# +# November 2012. +# +# Add AVX2 code path. Two consecutive input blocks are loaded to +# 256-bit %ymm registers, with data from first block to least +# significant 128-bit halves and data from second to most significant. +# The data is then processed with same SIMD instruction sequence as +# for AVX, but with %ymm as operands. Side effect is increased stack +# frame, 448 additional bytes in SHA256 and 1152 in SHA512. ###################################################################### # Current performance in cycles per processed byte (less is better): # # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) # -# AMD K8 15.1 - - 9.70 - -# P4 17.5 - - 33.4 - -# Core 2 15.5 13.9(+11%) - 10.3 - -# Westmere 15.1 12.5(+21%) - 9.72 - -# Atom 23.0 21.6(+6%) - 14.7 - -# VIA Nano 23.0 16.3(+41%) - 14.7 - -# Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) -# Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%) -# Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%) +# AMD K8 14.9 - - 9.57 - +# P4 17.3 - - 30.8 - +# Core 2 15.6 13.8(+13%) - 9.97 - +# Westmere 14.8 12.3(+19%) - 9.58 - +# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) +# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) +# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# VIA Nano 23.0 16.5(+39%) - 14.7 - +# Atom 23.0 18.9(+22%) - 14.7 - # -# (*) whichever applicable; +# (*) whichever best applicable; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions @@ -93,17 +103,23 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/ && - $1>=2.19); -$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && - $1>=2.09); -$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./ && - $1>=10); +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} -open STDOUT,"| \"$^X\" $xlate $flavour $output"; +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +open OUT,"| \"$^X\" $xlate $flavour $output"; +*STDOUT=*OUT; if ($output =~ /512/) { $func="sha512_block_data_order"; @@ -144,13 +160,15 @@ $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 - ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) @@ -169,25 +187,22 @@ $code.=<<___; add ($Tbl),$T1 # T1+=K[round] xor $a,$a1 - ror \$$Sigma1[0],$a0 # Sigma1(e) xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h - ror \$$Sigma0[0],$a1 # Sigma0(a) and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 + + lea $STRIDE($Tbl),$Tbl # round++ ___ -$code.=<<___ if ($i>=15); - mov `$SZ*(($i+2)&0xf)`(%rsp),$a0 -___ -$code.=<<___; - lea $SZ($Tbl),$Tbl # round++ +$code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) - ___ ($a2,$a3) = ($a3,$a2); } @@ -196,28 +211,29 @@ sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - #mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 - mov $a1,$a2 - ror \$`$sigma1[1]-$sigma1[0]`,$a1 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 - xor $a2,$a1 - shr \$$sigma1[2],$a2 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) - ror \$$sigma1[0],$a1 + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 - xor $a2,$a1 # sigma1(X[(i+14)&0xf]) add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 - add $a1,$T1 + add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); @@ -228,28 +244,34 @@ $code=<<___; .extern OPENSSL_ia32cap_P .globl $func -.type $func,\@function,4 +.type $func,\@function,3 .align 16 $func: ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 - mov 0(%r11),%r10d - mov 4(%r11),%r11d + mov 0(%r11),%r9d + mov 4(%r11),%r10d + mov 8(%r11),%r11d ___ $code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r11d # check for XOP + test \$`1<<11`,%r10d # check for XOP jnz .Lxop_shortcut ___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je .Lavx2_shortcut +___ $code.=<<___ if ($avx); - and \$`1<<30`,%r10d # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r11d # mask AVX and SSSE3 bits - or %r10d,%r11d - cmp \$`1<<28|1<<9|1<<30`,%r11d + and \$`1<<30`,%r9d # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits + or %r9d,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d je .Lavx_shortcut ___ $code.=<<___ if ($SZ==4); - test \$`1<<9`,%r11d + test \$`1<<9`,%r10d jnz .Lssse3_shortcut ___ $code.=<<___; @@ -309,6 +331,7 @@ $code.=<<___; jnz .Lrounds_16_xx mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A @@ -350,25 +373,44 @@ $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " ___ @@ -378,48 +420,89 @@ $code.=<<___; .type $TABLE,\@object $TABLE: .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0xd807aa98a3030242,0x12835b0145706fbe .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f - .asciz "SHA512 block transfort for x86_64, CRYPTOGAMS by " + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " ___ } @@ -446,8 +529,8 @@ sub body_00_15 () { '&mov ($a,$a1)', '&mov ($a4,$f)', - '&xor ($a0,$e)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', @@ -458,20 +541,20 @@ sub body_00_15 () { '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', - '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a2,$b)', # a^b, b^c in next round - '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - '&add ($d,$h)', # d+=h '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', @@ -488,7 +571,7 @@ my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; -.type ${func}_ssse3,\@function,4 +.type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: .Lssse3_shortcut: @@ -528,12 +611,12 @@ $code.=<<___; ___ $code.=<<___; - movdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - movdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 + #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: - movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] @@ -543,11 +626,11 @@ $code.=<<___; pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 pshufb $t3,@X[2] - movdqa 0x10($Tbl),$t1 + movdqa 0x20($Tbl),$t1 paddd @X[0],$t0 - movdqa 0x20($Tbl),$t2 + movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] - movdqa 0x30($Tbl),$t3 + movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 @@ -563,7 +646,7 @@ $code.=<<___; .align 16 .Lssse3_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( @@ -600,7 +683,7 @@ sub Xupdate_256_SSSE3 () { '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', - '&movdqa ($t2,16*$j."($Tbl)")', + '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); @@ -619,95 +702,98 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); } - } else { # squeeze extra 3% on Westmere and Atom + } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t0,@X[1]); eval(shift(@insns)); + eval(shift(@insns)); &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] - eval(shift(@insns)); #@ eval(shift(@insns)); - &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t1,$t0); eval(shift(@insns)); + eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); - eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); + eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); + eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); - &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); + eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); - eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); - &pshufb ($t3,$t4); # sigma1(X[14..15]) + eval(shift(@insns)); + eval(shift(@insns)); + &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); @@ -715,45 +801,48 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); + eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); - &movdqa ($t2,16*$j."($Tbl)"); eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); - &pshufb ($t3,$t5); eval(shift(@insns)); + &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); + &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } @@ -766,7 +855,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { @@ -826,7 +915,7 @@ if ($avx) {{ # if ($SZ==8) { # SHA512 only $code.=<<___; -.type ${func}_xop,\@function,4 +.type ${func}_xop,\@function,3 .align 64 ${func}_xop: .Lxop_shortcut: @@ -877,7 +966,7 @@ ___ $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -888,9 +977,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -903,7 +992,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub XOP_256_00_47 () { my $j = shift; @@ -1000,7 +1089,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1009,7 +1098,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1023,9 +1112,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1039,20 +1128,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1065,7 +1154,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + add \$16*2*$SZ,$Tbl ___ sub XOP_512_00_47 () { my $j = shift; @@ -1128,7 +1217,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1137,7 +1226,7 @@ my @insns = (&$body,&$body); # 52 instructions &XOP_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1202,7 +1291,7 @@ ___ local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; -.type ${func}_avx,\@function,4 +.type ${func}_avx,\@function,3 .align 64 ${func}_avx: .Lavx_shortcut: @@ -1250,12 +1339,12 @@ ___ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; - vmovdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - vmovdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -1266,9 +1355,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -1281,7 +1370,7 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + sub \$-16*2*$SZ,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( @@ -1329,7 +1418,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1338,7 +1427,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1353,9 +1442,9 @@ $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1369,20 +1458,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1395,14 +1484,14 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + add \$16*2*$SZ,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] - '&vpsrlq ($t2,$t0,$sigma0[0]);', - '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', @@ -1412,7 +1501,7 @@ sub Xupdate_512_AVX () { '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) - '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1])', + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', @@ -1436,7 +1525,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1445,7 +1534,7 @@ my @insns = (&$body,&$body); # 52 instructions &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1503,6 +1592,381 @@ $code.=<<___; ret .size ${func}_avx,.-${func}_avx ___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. + + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,3 +.align 64 +${func}_avx2: +.Lavx2_shortcut: + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%r11 # copy %rsp + sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp + shl \$4,%rdx # num*16 + and \$-256*$SZ,%rsp # align stack frame + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + add \$`2*$SZ*($rounds-8)`,%rsp + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %r11,$_rsp # save copy of %rsp +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + + vzeroall + sub \$-16*$SZ,$inp # inp++, size optimization + mov $SZ*0($ctx),$A + mov $inp,%r12 # borrow $T1 + mov $SZ*1($ctx),$B + cmp %rdx,$inp # $_end + mov $SZ*2($ctx),$C + cmove %rsp,%r12 # next block or random data + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu -16*$SZ+0($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu -16*$SZ+32($inp),%xmm2 + vmovdqu -16*$SZ+48($inp),%xmm3 + #mov $inp,$_inp # offload $inp + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t3,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t3,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + lea -$PUSH8(%rsp),%rsp + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0); + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } else { # SHA512 + my @X = map("%ymm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -16*$SZ($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu -16*$SZ+32($inp),%xmm2 + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu -16*$SZ+48($inp),%xmm3 + vmovdqu -16*$SZ+64($inp),%xmm4 + vmovdqu -16*$SZ+80($inp),%xmm5 + vmovdqu -16*$SZ+96($inp),%xmm6 + vmovdqu -16*$SZ+112($inp),%xmm7 + #mov $inp,$_inp # offload $inp + vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t2,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t2,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + vpshufb $t2,@X[2],@X[2] + vinserti128 \$1,64(%r12),@X[4],@X[4] + vpshufb $t2,@X[3],@X[3] + vinserti128 \$1,80(%r12),@X[5],@X[5] + vpshufb $t2,@X[4],@X[4] + vinserti128 \$1,96(%r12),@X[6],@X[6] + vpshufb $t2,@X[5],@X[5] + vinserti128 \$1,112(%r12),@X[7],@X[7] + + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t2,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t2,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x20(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x40(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x60(%rsp) + lea -$PUSH8(%rsp),%rsp + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + mov $B,$a3 + vmovdqa $t2,0x40(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x60(%rsp) + mov $F,$a4 + add \$16*2*$SZ,$Tbl + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 48 instructions +my $base = "+2*$PUSH8(%rsp)"; + + &lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0); + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + if ($_ !~ /\;$/) { + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX2_512_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1-0x80)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } +} +$code.=<<___; + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + cmp `$PUSH8+2*8`($Tbl),$inp # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + xor $C,$a3 # magic + mov $F,$a4 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: +___ + for ($i=0; $i<8; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + } +$code.=<<___; + lea -$PUSH8($Tbl),$Tbl + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),%rsp + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + lea `2*16*$SZ`($inp),$inp # inp+=2 + add $SZ*6($ctx),$G + mov $inp,%r12 + add $SZ*7($ctx),$H + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + cmove %rsp,%r12 # next block or stale data + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + jbe .Loop_avx2 + lea (%rsp),$Tbl + +.Ldone_avx2: + lea ($Tbl),%rsp + mov $_rsp,%rsi + vzeroall +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32(%rsp),%xmm6 + movaps 16*$SZ+48(%rsp),%xmm7 + movaps 16*$SZ+64(%rsp),%xmm8 + movaps 16*$SZ+80(%rsp),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96(%rsp),%xmm10 + movaps 16*$SZ+112(%rsp),%xmm11 +___ +$code.=<<___; + mov (%rsi),%r15 + mov 8(%rsi),%r14 + mov 16(%rsi),%r13 + mov 24(%rsi),%r12 + mov 32(%rsi),%rbp + mov 40(%rsi),%rbx + lea 48(%rsi),%rsp +.Lepilogue_avx2: + ret +.size ${func}_avx2,.-${func}_avx2 +___ +}} }}}}} # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame, @@ -1546,7 +2010,17 @@ se_handler: lea (%rsi,%r10),%r10 # epilogue label cmp %r10,%rbx # context->Rip>=epilogue label jae .Lin_prologue - +___ +$code.=<<___ if ($avx>1); + lea .Lavx2_shortcut(%rip),%r10 + cmp %r10,%rbx # context->Rip1); + .rva .LSEH_begin_${func}_avx2 + .rva .LSEH_end_${func}_avx2 + .rva .LSEH_info_${func}_avx2 +___ $code.=<<___; .section .xdata .align 8 @@ -1660,6 +2139,12 @@ $code.=<<___ if ($avx); .rva se_handler .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[] ___ +$code.=<<___ if ($avx>1); +.LSEH_info_${func}_avx2: + .byte 9,0,0,0 + .rva se_handler + .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[] +___ } $code =~ s/\`([^\`]*)\`/eval $1/gem;