diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl
index 8f10f973e9..e7140defb6 100755
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -1,9 +1,16 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. Rights for redistribution and usage in source and binary
-# forms are granted according to the OpenSSL license.
+# forms are granted according to the License.
 # ====================================================================
 #
 # sha256/512_block procedure for x86_64.
@@ -34,7 +41,7 @@
 # level parallelism, on a given CPU implementation in this case.
 #
 # Special note on Intel EM64T. While Opteron CPU exhibits perfect
-# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
+# performance ratio of 1.5 between 64- and 32-bit flavors [see above],
 # [currently available] EM64T CPUs apparently are far from it. On the
 # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
 # sha256_block:-( This is presumably because 64-bit shifts/rotates
@@ -44,7 +51,7 @@
 #
 # Optimization including one of Pavel Semjanov's ideas, alternative
 # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
-# unfortunately -10% SHA512 on P4 [which nobody should care about
+# unfortunately -2% SHA512 on P4 [which nobody should care about
 # that much].
 #
 # June 2012.
@@ -59,32 +66,53 @@
 # higher coefficients are observed on VIA Nano and Bulldozer has more
 # to do with specifics of their architecture [which is topic for
 # separate discussion].
+#
+# November 2012.
+#
+# Add AVX2 code path. Two consecutive input blocks are loaded to
+# 256-bit %ymm registers, with data from first block to least
+# significant 128-bit halves and data from second to most significant.
+# The data is then processed with same SIMD instruction sequence as
+# for AVX, but with %ymm as operands. Side effect is increased stack
+# frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
+# code size increase.
+#
+# March 2014.
+#
+# Add support for Intel SHA Extensions.
###################################################################### # Current performance in cycles per processed byte (less is better): # # SHA256 SSSE3 AVX/XOP(*) SHA512 AVX/XOP(*) # -# AMD K8 15.1 - - 9.70 - -# P4 17.5 - - 33.4 - -# Core 2 15.5 13.9(+11%) - 10.3 - -# Westmere 15.1 12.5(+21%) - 9.72 - -# Atom 23.0 21.6(+6%) - 14.7 - -# VIA Nano 23.0 16.3(+41%) - 14.7 - -# Sandy Bridge 17.4 14.0(+24%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) -# Ivy Bridge 12.6 10.3(+22%) 10.3(+22%) 8.17 7.22(+13%) -# Bulldozer 21.5 13.7(+57%) 13.7(+57%(***)) 13.5 8.58(+57%) +# AMD K8 14.9 - - 9.57 - +# P4 17.3 - - 30.8 - +# Core 2 15.6 13.8(+13%) - 9.97 - +# Westmere 14.8 12.3(+19%) - 9.58 - +# Sandy Bridge 17.4 14.2(+23%) 11.6(+50%(**)) 11.2 8.10(+38%(**)) +# Ivy Bridge 12.6 10.5(+20%) 10.3(+22%) 8.17 7.22(+13%) +# Haswell 12.2 9.28(+31%) 7.80(+56%) 7.66 5.40(+42%) +# Skylake 11.4 9.03(+26%) 7.70(+48%) 7.25 5.20(+40%) +# Bulldozer 21.1 13.6(+54%) 13.6(+54%(***)) 13.5 8.58(+57%) +# Ryzen 11.0 9.02(+22%) 2.05(+440%) 7.05 5.67(+20%) +# VIA Nano 23.0 16.5(+39%) - 14.7 - +# Atom 23.0 18.9(+22%) - 14.7 - +# Silvermont 27.4 20.6(+33%) - 17.5 - +# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) +# Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # -# (*) whichever applicable; +# (*) whichever best applicable, including SHAEXT; # (**) switch from ror to shrd stands for fair share of improvement; # (***) execution time is fully determined by remaining integer-only # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; -$flavour = shift; -$output = shift; -if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } +# $output is the last argument if it looks like a file (it has an extension) +# $flavour is the first argument if it doesn't look like a file +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); @@ -93,17 +121,30 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or die "can't locate x86_64-xlate.pl"; -$avx=1 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` - =~ /GNU assembler version ([2-9]\.[0-9]+)/ && - $1>=2.19); -$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && - `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && - $1>=2.09); -$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && - `ml64 2>&1` =~ /Version ([0-9]+)\./ && - $1>=10); - -open OUT,"| \"$^X\" $xlate $flavour $output"; +if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` + =~ /GNU assembler version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.19) + ($1>=2.22); +} + +if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) && + `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) { + $avx = ($1>=2.09) + ($1>=2.10); +} + +if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) && + `ml64 2>&1` =~ /Version ([0-9]+)\./) { + $avx = ($1>=10) + ($1>=11); +} + +if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) { + $avx = ($2>=3.0) + ($2>3.0); +} + +$shaext=1; ### set to zero if compiling for 1.0.1 +$avx=1 if (!$shaext && $avx); + +open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" + or die "can't call $xlate: $!"; *STDOUT=*OUT; if ($output =~ /512/) { @@ -139,19 +180,21 @@ $Tbl="%rbp"; $_ctx="16*$SZ+0*8(%rsp)"; $_inp="16*$SZ+1*8(%rsp)"; $_end="16*$SZ+2*8(%rsp)"; -$_rsp="16*$SZ+3*8(%rsp)"; +$_rsp="`16*$SZ+3*8`(%rsp)"; $framesz="16*$SZ+4*8"; sub ROUND_00_15() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; + my $STRIDE=$SZ; + $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1)); $code.=<<___; ror \$`$Sigma1[2]-$Sigma1[1]`,$a0 mov $f,$a2 - ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $e,$a0 + ror \$`$Sigma0[2]-$Sigma0[1]`,$a1 xor $g,$a2 # f^g mov $T1,`$SZ*($i&0xf)`(%rsp) @@ -170,25 +213,22 @@ $code.=<<___; add ($Tbl),$T1 # T1+=K[round] xor $a,$a1 - ror \$$Sigma1[0],$a0 # Sigma1(e) xor $b,$a2 # a^b, b^c in next round + ror \$$Sigma1[0],$a0 # Sigma1(e) mov $b,$h - ror \$$Sigma0[0],$a1 # Sigma0(a) and $a2,$a3 + ror \$$Sigma0[0],$a1 # Sigma0(a) add $a0,$T1 # T1+=Sigma1(e) xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b) add $T1,$d # d+=T1 add $T1,$h # h+=T1 + + lea $STRIDE($Tbl),$Tbl # round++ ___ -$code.=<<___ if ($i>=15); - mov `$SZ*(($i+2)&0xf)`(%rsp),$a0 -___ -$code.=<<___; - lea $SZ($Tbl),$Tbl # round++ +$code.=<<___ if ($i<15); add $a1,$h # h+=Sigma0(a) - ___ ($a2,$a3) = ($a3,$a2); } @@ -197,28 +237,29 @@ sub ROUND_16_XX() { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_; $code.=<<___; - #mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 - mov `$SZ*(($i+14)&0xf)`(%rsp),$a1 + mov `$SZ*(($i+1)&0xf)`(%rsp),$a0 + mov `$SZ*(($i+14)&0xf)`(%rsp),$a2 mov $a0,$T1 ror \$`$sigma0[1]-$sigma0[0]`,$a0 - mov $a1,$a2 - ror \$`$sigma1[1]-$sigma1[0]`,$a1 + add $a1,$a # modulo-scheduled h+=Sigma0(a) + mov $a2,$a1 + ror \$`$sigma1[1]-$sigma1[0]`,$a2 xor $T1,$a0 shr \$$sigma0[2],$T1 ror \$$sigma0[0],$a0 - xor $a2,$a1 - shr \$$sigma1[2],$a2 + xor $a1,$a2 + shr \$$sigma1[2],$a1 + ror \$$sigma1[0],$a2 xor $a0,$T1 # sigma0(X[(i+1)&0xf]) - ror \$$sigma1[0],$a1 + xor $a1,$a2 # sigma1(X[(i+14)&0xf]) add `$SZ*(($i+9)&0xf)`(%rsp),$T1 - xor $a2,$a1 # sigma1(X[(i+14)&0xf]) add `$SZ*($i&0xf)`(%rsp),$T1 mov $e,$a0 - add $a1,$T1 + add $a2,$T1 mov $a,$a1 ___ &ROUND_00_15(@_); @@ -229,38 +270,56 @@ $code=<<___; .extern OPENSSL_ia32cap_P 
.globl $func -.type $func,\@function,4 +.type $func,\@function,3 .align 16 $func: +.cfi_startproc ___ $code.=<<___ if ($SZ==4 || $avx); lea OPENSSL_ia32cap_P(%rip),%r11 - mov 0(%r11),%r10d - mov 4(%r11),%r11d + mov 0(%r11),%r9d + mov 4(%r11),%r10d + mov 8(%r11),%r11d +___ +$code.=<<___ if ($SZ==4 && $shaext); + test \$`1<<29`,%r11d # check for SHA + jnz _shaext_shortcut ___ $code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r11d # check for XOP + test \$`1<<11`,%r10d # check for XOP jnz .Lxop_shortcut ___ +$code.=<<___ if ($avx>1); + and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 + cmp \$`1<<8|1<<5|1<<3`,%r11d + je .Lavx2_shortcut +___ $code.=<<___ if ($avx); - and \$`1<<30`,%r10d # mask "Intel CPU" bit - and \$`1<<28|1<<9`,%r11d # mask AVX and SSSE3 bits - or %r10d,%r11d - cmp \$`1<<28|1<<9|1<<30`,%r11d + and \$`1<<30`,%r9d # mask "Intel CPU" bit + and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits + or %r9d,%r10d + cmp \$`1<<28|1<<9|1<<30`,%r10d je .Lavx_shortcut ___ $code.=<<___ if ($SZ==4); - test \$`1<<9`,%r11d + test \$`1<<9`,%r10d jnz .Lssse3_shortcut ___ $code.=<<___; + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$$framesz,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -268,7 +327,8 @@ $code.=<<___; mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 .Lprologue: mov $SZ*0($ctx),$A @@ -310,6 +370,7 @@ $code.=<<___; jnz .Lrounds_16_xx mov $_ctx,$ctx + add $a1,$A # modulo-scheduled h+=Sigma0(a) lea 16*$SZ($inp),$inp add $SZ*0($ctx),$A @@ -334,15 +395,24 @@ $code.=<<___; jb .Lloop mov $_rsp,%rsi - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp +.cfi_def_cfa %rsi,8 + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue: ret +.cfi_endproc .size $func,.-$func ___ @@ -352,24 +422,43 @@ $code.=<<___; .type $TABLE,\@object $TABLE: .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 .long 
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by " ___ @@ -378,55 +467,258 @@ $code.=<<___; .align 64 .type $TABLE,\@object $TABLE: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0x428a2f98d728ae22,0x7137449123ef65cd .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x3956c25bf348b538,0x59f111f1b605d019 .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0x9bdc06a725c71235,0xc19bf174cf692694 .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0x983e5152ee66dfab,0xa831c66d2db43210 .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x06ca6351e003826f,0x142929670a0e6e70 .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0x81c2c92e47edaee6,0x92722c851482353b .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xd192e819d6ef5218,0xd69906245565a910 .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0x90befffa23631e28,0xa4506cebde82bde9 .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xca273eceea26619c,0xd186b8c721c0c207 .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x113f9804bef90dae,0x1b710b35131c471b .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 .quad 0x0001020304050607,0x08090a0b0c0d0e0f - .asciz "SHA512 block transfort for x86_64, CRYPTOGAMS by " + .quad 0x0001020304050607,0x08090a0b0c0d0e0f + .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by " ___ } ###################################################################### # SIMD code paths # +if ($SZ==4 && $shaext) {{{ +###################################################################### +# Intel SHA Extensions implementation of SHA256 update function. +# +my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx"); + +my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10)); +my @MSG=map("%xmm$_",(3..6)); + +$code.=<<___; +.type sha256_block_data_order_shaext,\@function,3 +.align 64 +sha256_block_data_order_shaext: +_shaext_shortcut: +.cfi_startproc +___ +$code.=<<___ if ($win64); + lea `-8-5*16`(%rsp),%rsp + movaps %xmm6,-8-5*16(%rax) + movaps %xmm7,-8-4*16(%rax) + movaps %xmm8,-8-3*16(%rax) + movaps %xmm9,-8-2*16(%rax) + movaps %xmm10,-8-1*16(%rax) +.Lprologue_shaext: +___ +$code.=<<___; + lea K256+0x80(%rip),$Tbl + movdqu ($ctx),$ABEF # DCBA + movdqu 16($ctx),$CDGH # HGFE + movdqa 0x200-0x80($Tbl),$TMP # byte swap mask + + pshufd \$0x1b,$ABEF,$Wi # ABCD + pshufd \$0xb1,$ABEF,$ABEF # CDAB + pshufd \$0x1b,$CDGH,$CDGH # EFGH + movdqa $TMP,$BSWAP # offload + palignr \$8,$CDGH,$ABEF # ABEF + punpcklqdq $Wi,$CDGH # CDGH + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu ($inp),@MSG[0] + movdqu 0x10($inp),@MSG[1] + movdqu 0x20($inp),@MSG[2] + pshufb $TMP,@MSG[0] + movdqu 0x30($inp),@MSG[3] + + movdqa 0*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + pshufb $TMP,@MSG[1] + movdqa $CDGH,$CDGH_SAVE # offload + sha256rnds2 $ABEF,$CDGH # 0-3 + pshufd \$0x0e,$Wi,$Wi + nop + movdqa $ABEF,$ABEF_SAVE # offload + sha256rnds2 $CDGH,$ABEF + + movdqa 1*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + pshufb $TMP,@MSG[2] + sha256rnds2 $ABEF,$CDGH # 4-7 + pshufd \$0x0e,$Wi,$Wi + lea 0x40($inp),$inp + sha256msg1 @MSG[1],@MSG[0] + sha256rnds2 $CDGH,$ABEF + + movdqa 2*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + pshufb $TMP,@MSG[3] + sha256rnds2 $ABEF,$CDGH # 8-11 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[3],$TMP + palignr \$4,@MSG[2],$TMP + nop + paddd $TMP,@MSG[0] + sha256msg1 @MSG[2],@MSG[1] + sha256rnds2 $CDGH,$ABEF + + movdqa 3*32-0x80($Tbl),$Wi + paddd @MSG[3],$Wi + sha256msg2 @MSG[3],@MSG[0] + sha256rnds2 $ABEF,$CDGH # 12-15 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[0],$TMP + palignr 
\$4,@MSG[3],$TMP + nop + paddd $TMP,@MSG[1] + sha256msg1 @MSG[3],@MSG[2] + sha256rnds2 $CDGH,$ABEF +___ +for($i=4;$i<16-3;$i++) { +$code.=<<___; + movdqa $i*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 16-19... + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + nop + paddd $TMP,@MSG[2] + sha256msg1 @MSG[0],@MSG[3] + sha256rnds2 $CDGH,$ABEF +___ + push(@MSG,shift(@MSG)); +} +$code.=<<___; + movdqa 13*32-0x80($Tbl),$Wi + paddd @MSG[0],$Wi + sha256msg2 @MSG[0],@MSG[1] + sha256rnds2 $ABEF,$CDGH # 52-55 + pshufd \$0x0e,$Wi,$Wi + movdqa @MSG[1],$TMP + palignr \$4,@MSG[0],$TMP + sha256rnds2 $CDGH,$ABEF + paddd $TMP,@MSG[2] + + movdqa 14*32-0x80($Tbl),$Wi + paddd @MSG[1],$Wi + sha256rnds2 $ABEF,$CDGH # 56-59 + pshufd \$0x0e,$Wi,$Wi + sha256msg2 @MSG[1],@MSG[2] + movdqa $BSWAP,$TMP + sha256rnds2 $CDGH,$ABEF + + movdqa 15*32-0x80($Tbl),$Wi + paddd @MSG[2],$Wi + nop + sha256rnds2 $ABEF,$CDGH # 60-63 + pshufd \$0x0e,$Wi,$Wi + dec $num + nop + sha256rnds2 $CDGH,$ABEF + + paddd $CDGH_SAVE,$CDGH + paddd $ABEF_SAVE,$ABEF + jnz .Loop_shaext + + pshufd \$0xb1,$CDGH,$CDGH # DCHG + pshufd \$0x1b,$ABEF,$TMP # FEBA + pshufd \$0xb1,$ABEF,$ABEF # BAFE + punpckhqdq $CDGH,$ABEF # DCBA + palignr \$8,$TMP,$CDGH # HGFE + + movdqu $ABEF,($ctx) + movdqu $CDGH,16($ctx) +___ +$code.=<<___ if ($win64); + movaps -8-5*16(%rax),%xmm6 + movaps -8-4*16(%rax),%xmm7 + movaps -8-3*16(%rax),%xmm8 + movaps -8-2*16(%rax),%xmm9 + movaps -8-1*16(%rax),%xmm10 + mov %rax,%rsp +.Lepilogue_shaext: +___ +$code.=<<___; + ret +.cfi_endproc +.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext +___ +}}} {{{ my $a4=$T1; @@ -447,8 +739,8 @@ sub body_00_15 () { '&mov ($a,$a1)', '&mov ($a4,$f)', - '&xor ($a0,$e)', '&ror ($a1,$Sigma0[2]-$Sigma0[1])', + '&xor ($a0,$e)', '&xor ($a4,$g)', # f^g '&ror ($a0,$Sigma1[1]-$Sigma1[0])', @@ -459,20 +751,20 @@ sub body_00_15 () { '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i] '&mov ($a2,$a)', - '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g + '&ror ($a1,$Sigma0[1]-$Sigma0[0])', '&xor ($a2,$b)', # a^b, b^c in next round - '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&add ($h,$a4)', # h+=Ch(e,f,g) + '&ror ($a0,$Sigma1[0])', # Sigma1(e) '&and ($a3,$a2)', # (b^c)&(a^b) '&xor ($a1,$a)', '&add ($h,$a0)', # h+=Sigma1(e) '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) - '&add ($d,$h)', # d+=h '&ror ($a1,$Sigma0[0])', # Sigma0(a) + '&add ($d,$h)', # d+=h '&add ($h,$a3)', # h+=Maj(a,b,c) '&mov ($a0,$d)', @@ -489,17 +781,25 @@ my @X = map("%xmm$_",(0..3)); my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; -.type ${func}_ssse3,\@function,4 +.type ${func}_ssse3,\@function,3 .align 64 ${func}_ssse3: +.cfi_startproc .Lssse3_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*4`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -507,7 +807,8 @@ ${func}_ssse3: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -529,26 +830,26 @@ $code.=<<___; ___ $code.=<<___; - movdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - movdqa 
$TABLE+`$SZ*$rounds`+32(%rip),$t5 + #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_ssse3 .align 16 .Lloop_ssse3: - movdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 movdqu 0x00($inp),@X[0] movdqu 0x10($inp),@X[1] movdqu 0x20($inp),@X[2] - movdqu 0x30($inp),@X[3] pshufb $t3,@X[0] + movdqu 0x30($inp),@X[3] lea $TABLE(%rip),$Tbl pshufb $t3,@X[1] movdqa 0x00($Tbl),$t0 + movdqa 0x20($Tbl),$t1 pshufb $t3,@X[2] - movdqa 0x10($Tbl),$t1 paddd @X[0],$t0 - movdqa 0x20($Tbl),$t2 + movdqa 0x40($Tbl),$t2 pshufb $t3,@X[3] - movdqa 0x30($Tbl),$t3 + movdqa 0x60($Tbl),$t3 paddd @X[1],$t1 paddd @X[2],$t2 paddd @X[3],$t3 @@ -564,7 +865,7 @@ $code.=<<___; .align 16 .Lssse3_00_47: - add \$16*$SZ,$Tbl + sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_SSSE3 () { ( @@ -601,7 +902,7 @@ sub Xupdate_256_SSSE3 () { '&pxor ($t3,$t2);', '&psrlq ($t2,$sigma1[1]-$sigma1[0])', '&pxor ($t3,$t2);', - '&movdqa ($t2,16*$j."($Tbl)")', + '&movdqa ($t2,16*2*$j."($Tbl)")', '&pshufb ($t3,$t5)', '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17]) ); @@ -620,95 +921,98 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); } - } else { # squeeze extra 3% on Westmere and Atom + } else { # squeeze extra 4% on Westmere and 19% on Atom eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t0,@X[1]); eval(shift(@insns)); + eval(shift(@insns)); &movdqa ($t3,@X[3]); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &palignr ($t0,@X[0],$SZ); # X[1..4] - eval(shift(@insns)); #@ eval(shift(@insns)); - &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); + &palignr ($t3,@X[2],$SZ); # X[9..12] eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t1,$t0); eval(shift(@insns)); + eval(shift(@insns)); &movdqa ($t2,$t0); eval(shift(@insns)); #@ eval(shift(@insns)); - eval(shift(@insns)); &psrld ($t0,$sigma0[2]); eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t2,$sigma0[0]); eval(shift(@insns)); eval(shift(@insns)); - eval(shift(@insns)); #@ - eval(shift(@insns)); &pshufd ($t3,@X[3],0b11111010); # X[4..15] eval(shift(@insns)); + eval(shift(@insns)); #@ &pslld ($t1,8*$SZ-$sigma0[1]); eval(shift(@insns)); + eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); #@ eval(shift(@insns)); - &psrld ($t2,$sigma0[1]-$sigma0[0]); + eval(shift(@insns)); eval(shift(@insns)); #@ + &psrld ($t2,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); &pxor ($t0,$t1); eval(shift(@insns)); eval(shift(@insns)); &pslld ($t1,$sigma0[1]-$sigma0[0]); eval(shift(@insns)); + eval(shift(@insns)); &pxor ($t0,$t2); eval(shift(@insns)); eval(shift(@insns)); #@ - eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &pxor ($t0,$t1); # sigma0(X[1..4]) + eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); eval(shift(@insns)); &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); - eval(shift(@insns)); &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); 
eval(shift(@insns)); #@ &psrlq ($t2,$sigma1[1]-$sigma1[0]); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); + #&pshufb ($t3,$t4); # sigma1(X[14..15]) + &pshufd ($t3,$t3,0b10000000); eval(shift(@insns)); eval(shift(@insns)); - &pshufb ($t3,$t4); # sigma1(X[14..15]) + eval(shift(@insns)); + &psrldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); @@ -716,45 +1020,48 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); #@ &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15]) eval(shift(@insns)); - &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); eval(shift(@insns)); + &pshufd ($t3,@X[0],0b01010000); # X[16..17] eval(shift(@insns)); + eval(shift(@insns)); #@ eval(shift(@insns)); &movdqa ($t2,$t3); eval(shift(@insns)); - eval(shift(@insns)); #@ eval(shift(@insns)); &psrld ($t3,$sigma1[2]); eval(shift(@insns)); - &psrlq ($t2,$sigma1[0]); - eval(shift(@insns)); eval(shift(@insns)); #@ + &psrlq ($t2,$sigma1[0]); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); + eval(shift(@insns)); #@ + eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); #@ eval(shift(@insns)); &psrlq ($t2,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); &pxor ($t3,$t2); eval(shift(@insns)); eval(shift(@insns)); - &movdqa ($t2,16*$j."($Tbl)"); eval(shift(@insns)); #@ + #&pshufb ($t3,$t5); + &pshufd ($t3,$t3,0b00001000); eval(shift(@insns)); - &pshufb ($t3,$t5); eval(shift(@insns)); + &movdqa ($t2,16*2*$j."($Tbl)"); eval(shift(@insns)); #@ eval(shift(@insns)); + &pslldq ($t3,8); eval(shift(@insns)); eval(shift(@insns)); - &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) eval(shift(@insns)); + &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17]) + eval(shift(@insns)); #@ eval(shift(@insns)); eval(shift(@insns)); } @@ -767,7 +1074,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &SSSE3_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lssse3_00_47"); for ($i=0; $i<16; ) { @@ -800,6 +1107,7 @@ $code.=<<___; jb .Lloop_ssse3 mov $_rsp,%rsi +.cfi_def_cfa %rsi,8 ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 @@ -808,15 +1116,23 @@ $code.=<<___ if ($win64); movaps 16*$SZ+80(%rsp),%xmm9 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_ssse3: ret +.cfi_endproc .size ${func}_ssse3,.-${func}_ssse3 ___ } @@ -827,17 +1143,25 @@ if ($avx) {{ # if ($SZ==8) { # SHA512 only $code.=<<___; -.type ${func}_xop,\@function,4 +.type ${func}_xop,\@function,3 .align 64 ${func}_xop: +.cfi_startproc .Lxop_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -845,7 +1169,8 @@ 
${func}_xop: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -860,7 +1185,7 @@ ___ $code.=<<___; .Lprologue_xop: - vzeroall + vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C @@ -878,7 +1203,7 @@ ___ $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -889,9 +1214,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -904,7 +1229,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub XOP_256_00_47 () { my $j = shift; @@ -1001,7 +1326,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1010,7 +1335,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &XOP_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1024,9 +1349,9 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions $code.=<<___; .align 16 .Lloop_xop: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1040,20 +1365,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1066,7 +1391,7 @@ $code.=<<___; .align 16 .Lxop_00_47: - add \$16*$SZ,$Tbl + add \$`16*2*$SZ`,$Tbl ___ sub XOP_512_00_47 () { my $j = shift; @@ -1129,7 +1454,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1138,7 +1463,7 @@ my @insns = (&$body,&$body); # 52 instructions &XOP_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb 
($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lxop_00_47"); for ($i=0; $i<16; ) { @@ -1172,7 +1497,8 @@ $code.=<<___; jb .Lloop_xop mov $_rsp,%rsi - vzeroall +.cfi_def_cfa %rsi,8 + vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 @@ -1185,15 +1511,23 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_xop: ret +.cfi_endproc .size ${func}_xop,.-${func}_xop ___ } @@ -1203,17 +1537,25 @@ ___ local *ror = sub { &shrd(@_[0],@_) }; $code.=<<___; -.type ${func}_avx,\@function,4 +.type ${func}_avx,\@function,3 .align 64 ${func}_avx: +.cfi_startproc .Lavx_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax push %rbx +.cfi_push %rbx push %rbp +.cfi_push %rbp push %r12 +.cfi_push %r12 push %r13 +.cfi_push %r13 push %r14 +.cfi_push %r14 push %r15 - mov %rsp,%r11 # copy %rsp +.cfi_push %r15 shl \$4,%rdx # num*16 sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ @@ -1221,7 +1563,8 @@ ${func}_avx: mov $ctx,$_ctx # save ctx, 1st arg mov $inp,$_inp # save inp, 2nd arh mov %rdx,$_end # save end pointer, "3rd" arg - mov %r11,$_rsp # save copy of %rsp + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 ___ $code.=<<___ if ($win64); movaps %xmm6,16*$SZ+32(%rsp) @@ -1236,7 +1579,7 @@ ___ $code.=<<___; .Lprologue_avx: - vzeroall + vzeroupper mov $SZ*0($ctx),$A mov $SZ*1($ctx),$B mov $SZ*2($ctx),$C @@ -1251,12 +1594,12 @@ ___ my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9)); $code.=<<___; - vmovdqa $TABLE+`$SZ*$rounds`+16(%rip),$t4 - vmovdqa $TABLE+`$SZ*$rounds`+32(%rip),$t5 + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 vmovdqu 0x00($inp),@X[0] vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] @@ -1267,9 +1610,9 @@ $code.=<<___; vpshufb $t3,@X[2],@X[2] vpaddd 0x00($Tbl),@X[0],$t0 vpshufb $t3,@X[3],@X[3] - vpaddd 0x10($Tbl),@X[1],$t1 - vpaddd 0x20($Tbl),@X[2],$t2 - vpaddd 0x30($Tbl),@X[3],$t3 + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) mov $A,$a1 vmovdqa $t1,0x10(%rsp) @@ -1282,7 +1625,7 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + sub \$`-16*2*$SZ`,$Tbl # size optimization ___ sub Xupdate_256_AVX () { ( @@ -1330,7 +1673,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddd ($t2,@X[0],16*$j."($Tbl)"); + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1339,7 +1682,7 @@ my @insns = (&$body,&$body,&$body,&$body); # 104 instructions &AVX_256_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1354,9 +1697,9 @@ $code.=<<___; jmp .Lloop_avx .align 16 .Lloop_avx: - vmovdqa $TABLE+`$SZ*$rounds`(%rip),$t3 + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 
vmovdqu 0x00($inp),@X[0] - lea $TABLE(%rip),$Tbl + lea $TABLE+0x80(%rip),$Tbl # size optimization vmovdqu 0x10($inp),@X[1] vmovdqu 0x20($inp),@X[2] vpshufb $t3,@X[0],@X[0] @@ -1370,20 +1713,20 @@ $code.=<<___; vpshufb $t3,@X[4],@X[4] vmovdqu 0x70($inp),@X[7] vpshufb $t3,@X[5],@X[5] - vpaddq 0x00($Tbl),@X[0],$t0 + vpaddq -0x80($Tbl),@X[0],$t0 vpshufb $t3,@X[6],@X[6] - vpaddq 0x10($Tbl),@X[1],$t1 + vpaddq -0x60($Tbl),@X[1],$t1 vpshufb $t3,@X[7],@X[7] - vpaddq 0x20($Tbl),@X[2],$t2 - vpaddq 0x30($Tbl),@X[3],$t3 + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 vmovdqa $t0,0x00(%rsp) - vpaddq 0x40($Tbl),@X[4],$t0 + vpaddq 0x00($Tbl),@X[4],$t0 vmovdqa $t1,0x10(%rsp) - vpaddq 0x50($Tbl),@X[5],$t1 + vpaddq 0x20($Tbl),@X[5],$t1 vmovdqa $t2,0x20(%rsp) - vpaddq 0x60($Tbl),@X[6],$t2 + vpaddq 0x40($Tbl),@X[6],$t2 vmovdqa $t3,0x30(%rsp) - vpaddq 0x70($Tbl),@X[7],$t3 + vpaddq 0x60($Tbl),@X[7],$t3 vmovdqa $t0,0x40(%rsp) mov $A,$a1 vmovdqa $t1,0x50(%rsp) @@ -1396,14 +1739,14 @@ $code.=<<___; .align 16 .Lavx_00_47: - add \$16*$SZ,$Tbl + add \$`16*2*$SZ`,$Tbl ___ sub Xupdate_512_AVX () { ( '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2] '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10] - '&vpsrlq ($t2,$t0,$sigma0[0]);', - '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += X[9..10] + '&vpsrlq ($t2,$t0,$sigma0[0])', + '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10] '&vpsrlq ($t3,$t0,$sigma0[2])', '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);', '&vpxor ($t0,$t3,$t2)', @@ -1413,7 +1756,7 @@ sub Xupdate_512_AVX () { '&vpxor ($t0,$t0,$t2)', '&vpsrlq ($t3,@X[7],$sigma1[2]);', '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2]) - '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1])', + '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);', '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2]) '&vpsrlq ($t1,@X[7],$sigma1[0]);', '&vpxor ($t3,$t3,$t2)', @@ -1437,7 +1780,7 @@ my @insns = (&$body,&$body); # 52 instructions eval(shift(@insns)); eval(shift(@insns)); } - &vpaddq ($t2,@X[0],16*$j."($Tbl)"); + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); foreach (@insns) { eval; } # remaining instructions &vmovdqa (16*$j."(%rsp)",$t2); } @@ -1446,7 +1789,7 @@ my @insns = (&$body,&$body); # 52 instructions &AVX_512_00_47($j,\&body_00_15,@X); push(@X,shift(@X)); # rotate(@X) } - &cmpb ($SZ-1+16*$SZ."($Tbl)",0); + &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); &jne (".Lavx_00_47"); for ($i=0; $i<16; ) { @@ -1480,7 +1823,8 @@ $code.=<<___; jb .Lloop_avx mov $_rsp,%rsi - vzeroall +.cfi_def_cfa %rsi,8 + vzeroupper ___ $code.=<<___ if ($win64); movaps 16*$SZ+32(%rsp),%xmm6 @@ -1493,17 +1837,476 @@ $code.=<<___ if ($win64 && $SZ>4); movaps 16*$SZ+112(%rsp),%xmm11 ___ $code.=<<___; - mov (%rsi),%r15 - mov 8(%rsi),%r14 - mov 16(%rsi),%r13 - mov 24(%rsi),%r12 - mov 32(%rsi),%rbp - mov 40(%rsi),%rbx - lea 48(%rsi),%rsp + mov -48(%rsi),%r15 +.cfi_restore %r15 + mov -40(%rsi),%r14 +.cfi_restore %r14 + mov -32(%rsi),%r13 +.cfi_restore %r13 + mov -24(%rsi),%r12 +.cfi_restore %r12 + mov -16(%rsi),%rbp +.cfi_restore %rbp + mov -8(%rsi),%rbx +.cfi_restore %rbx + lea (%rsi),%rsp +.cfi_def_cfa_register %rsp .Lepilogue_avx: ret +.cfi_endproc .size ${func}_avx,.-${func}_avx ___ + +if ($avx>1) {{ +###################################################################### +# AVX2+BMI code path +# +my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp +my $PUSH8=8*2*$SZ; +use integer; + +sub bodyx_00_15 () { + # at start $a1 should be zero, $a3 - $b^$c and $a4 copy of $f + ( + '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'. 
+ + '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i] + '&and ($a4,$e)', # f&e + '&rorx ($a0,$e,$Sigma1[2])', + '&rorx ($a2,$e,$Sigma1[1])', + + '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past + '&lea ($h,"($h,$a4)")', + '&andn ($a4,$e,$g)', # ~e&g + '&xor ($a0,$a2)', + + '&rorx ($a1,$e,$Sigma1[0])', + '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g) + '&xor ($a0,$a1)', # Sigma1(e) + '&mov ($a2,$a)', + + '&rorx ($a4,$a,$Sigma0[2])', + '&lea ($h,"($h,$a0)")', # h+=Sigma1(e) + '&xor ($a2,$b)', # a^b, b^c in next round + '&rorx ($a1,$a,$Sigma0[1])', + + '&rorx ($a0,$a,$Sigma0[0])', + '&lea ($d,"($d,$h)")', # d+=h + '&and ($a3,$a2)', # (b^c)&(a^b) + '&xor ($a1,$a4)', + + '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b) + '&xor ($a1,$a0)', # Sigma0(a) + '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c) + '&mov ($a4,$e)', # copy of f in future + + '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;' + ); + # and at the finish one has to $a+=$a1 +} + +$code.=<<___; +.type ${func}_avx2,\@function,3 +.align 64 +${func}_avx2: +.cfi_startproc +.Lavx2_shortcut: + mov %rsp,%rax # copy %rsp +.cfi_def_cfa_register %rax + push %rbx +.cfi_push %rbx + push %rbp +.cfi_push %rbp + push %r12 +.cfi_push %r12 + push %r13 +.cfi_push %r13 + push %r14 +.cfi_push %r14 + push %r15 +.cfi_push %r15 + sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp + shl \$4,%rdx # num*16 + and \$-256*$SZ,%rsp # align stack frame + lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ + add \$`2*$SZ*($rounds-8)`,%rsp + mov $ctx,$_ctx # save ctx, 1st arg + mov $inp,$_inp # save inp, 2nd arh + mov %rdx,$_end # save end pointer, "3rd" arg + mov %rax,$_rsp # save copy of %rsp +.cfi_cfa_expression $_rsp,deref,+8 +___ +$code.=<<___ if ($win64); + movaps %xmm6,16*$SZ+32(%rsp) + movaps %xmm7,16*$SZ+48(%rsp) + movaps %xmm8,16*$SZ+64(%rsp) + movaps %xmm9,16*$SZ+80(%rsp) +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps %xmm10,16*$SZ+96(%rsp) + movaps %xmm11,16*$SZ+112(%rsp) +___ +$code.=<<___; +.Lprologue_avx2: + + vzeroupper + sub \$-16*$SZ,$inp # inp++, size optimization + mov $SZ*0($ctx),$A + mov $inp,%r12 # borrow $T1 + mov $SZ*1($ctx),$B + cmp %rdx,$inp # $_end + mov $SZ*2($ctx),$C + cmove %rsp,%r12 # next block or random data + mov $SZ*3($ctx),$D + mov $SZ*4($ctx),$E + mov $SZ*5($ctx),$F + mov $SZ*6($ctx),$G + mov $SZ*7($ctx),$H +___ + if ($SZ==4) { # SHA256 + my @X = map("%ymm$_",(0..3)); + my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9)); + +$code.=<<___; + vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4 + vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5 + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 + vmovdqu -16*$SZ+0($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu -16*$SZ+32($inp),%xmm2 + vmovdqu -16*$SZ+48($inp),%xmm3 + #mov $inp,$_inp # offload $inp + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t3,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t3,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + + lea $TABLE(%rip),$Tbl + vpshufb $t3,@X[2],@X[2] + vpaddd 0x00($Tbl),@X[0],$t0 + vpshufb $t3,@X[3],@X[3] + vpaddd 0x20($Tbl),@X[1],$t1 + vpaddd 0x40($Tbl),@X[2],$t2 + vpaddd 0x60($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) +___ +$code.=<<___ if (!$win64); +# temporarily use %rdi as frame pointer + mov $_rsp,%rdi +.cfi_def_cfa %rdi,8 +___ +$code.=<<___; + lea -$PUSH8(%rsp),%rsp +___ +$code.=<<___ if (!$win64); +# the frame info is at $_rsp, but the stack is moving... 
+# so a second frame pointer is saved at -8(%rsp) +# that is in the red zone + mov %rdi,-8(%rsp) +.cfi_cfa_expression %rsp-8,deref,+8 +___ +$code.=<<___; + mov $B,$a3 + vmovdqa $t2,0x00(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x20(%rsp) + mov $F,$a4 + sub \$-16*2*$SZ,$Tbl # size optimization + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_256_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body,&$body,&$body); # 96 instructions +my $base = "+2*$PUSH8(%rsp)"; + + if (($j%2)==0) { + &lea ("%rsp","-$PUSH8(%rsp)"); +$code.=<<___ if (!$win64); +.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 +# copy secondary frame pointer to new location again at -8(%rsp) + pushq $PUSH8-8(%rsp) +.cfi_cfa_expression %rsp,deref,+8 + lea 8(%rsp),%rsp +.cfi_cfa_expression %rsp-8,deref,+8 +___ + } + + foreach (Xupdate_256_AVX()) { # 29 instructions + eval; + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<4; $j++) { + &AVX2_256_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } + } else { # SHA512 + my @X = map("%ymm$_",(0..7)); + my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11)); + +$code.=<<___; + jmp .Loop_avx2 +.align 16 +.Loop_avx2: + vmovdqu -16*$SZ($inp),%xmm0 + vmovdqu -16*$SZ+16($inp),%xmm1 + vmovdqu -16*$SZ+32($inp),%xmm2 + lea $TABLE+0x80(%rip),$Tbl # size optimization + vmovdqu -16*$SZ+48($inp),%xmm3 + vmovdqu -16*$SZ+64($inp),%xmm4 + vmovdqu -16*$SZ+80($inp),%xmm5 + vmovdqu -16*$SZ+96($inp),%xmm6 + vmovdqu -16*$SZ+112($inp),%xmm7 + #mov $inp,$_inp # offload $inp + vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2 + vinserti128 \$1,(%r12),@X[0],@X[0] + vinserti128 \$1,16(%r12),@X[1],@X[1] + vpshufb $t2,@X[0],@X[0] + vinserti128 \$1,32(%r12),@X[2],@X[2] + vpshufb $t2,@X[1],@X[1] + vinserti128 \$1,48(%r12),@X[3],@X[3] + vpshufb $t2,@X[2],@X[2] + vinserti128 \$1,64(%r12),@X[4],@X[4] + vpshufb $t2,@X[3],@X[3] + vinserti128 \$1,80(%r12),@X[5],@X[5] + vpshufb $t2,@X[4],@X[4] + vinserti128 \$1,96(%r12),@X[6],@X[6] + vpshufb $t2,@X[5],@X[5] + vinserti128 \$1,112(%r12),@X[7],@X[7] + + vpaddq -0x80($Tbl),@X[0],$t0 + vpshufb $t2,@X[6],@X[6] + vpaddq -0x60($Tbl),@X[1],$t1 + vpshufb $t2,@X[7],@X[7] + vpaddq -0x40($Tbl),@X[2],$t2 + vpaddq -0x20($Tbl),@X[3],$t3 + vmovdqa $t0,0x00(%rsp) + vpaddq 0x00($Tbl),@X[4],$t0 + vmovdqa $t1,0x20(%rsp) + vpaddq 0x20($Tbl),@X[5],$t1 + vmovdqa $t2,0x40(%rsp) + vpaddq 0x40($Tbl),@X[6],$t2 + vmovdqa $t3,0x60(%rsp) +___ +$code.=<<___ if (!$win64); +# temporarily use %rdi as frame pointer + mov $_rsp,%rdi +.cfi_def_cfa %rdi,8 +___ +$code.=<<___; + lea -$PUSH8(%rsp),%rsp +___ +$code.=<<___ if (!$win64); +# the frame info is at $_rsp, but the stack is moving... 
+# so a second frame pointer is saved at -8(%rsp) +# that is in the red zone + mov %rdi,-8(%rsp) +.cfi_cfa_expression %rsp-8,deref,+8 +___ +$code.=<<___; + vpaddq 0x60($Tbl),@X[7],$t3 + vmovdqa $t0,0x00(%rsp) + xor $a1,$a1 + vmovdqa $t1,0x20(%rsp) + mov $B,$a3 + vmovdqa $t2,0x40(%rsp) + xor $C,$a3 # magic + vmovdqa $t3,0x60(%rsp) + mov $F,$a4 + add \$16*2*$SZ,$Tbl + jmp .Lavx2_00_47 + +.align 16 +.Lavx2_00_47: +___ + +sub AVX2_512_00_47 () { +my $j = shift; +my $body = shift; +my @X = @_; +my @insns = (&$body,&$body); # 48 instructions +my $base = "+2*$PUSH8(%rsp)"; + + if (($j%4)==0) { + &lea ("%rsp","-$PUSH8(%rsp)"); +$code.=<<___ if (!$win64); +.cfi_cfa_expression %rsp+`$PUSH8-8`,deref,+8 +# copy secondary frame pointer to new location again at -8(%rsp) + pushq $PUSH8-8(%rsp) +.cfi_cfa_expression %rsp,deref,+8 + lea 8(%rsp),%rsp +.cfi_cfa_expression %rsp-8,deref,+8 +___ + } + + foreach (Xupdate_512_AVX()) { # 23 instructions + eval; + if ($_ !~ /\;$/) { + eval(shift(@insns)); + eval(shift(@insns)); + eval(shift(@insns)); + } + } + &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); + foreach (@insns) { eval; } # remaining instructions + &vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2); +} + + for ($i=0,$j=0; $j<8; $j++) { + &AVX2_512_00_47($j,\&bodyx_00_15,@X); + push(@X,shift(@X)); # rotate(@X) + } + &lea ($Tbl,16*2*$SZ."($Tbl)"); + &cmpb (($SZ-1-0x80)."($Tbl)",0); + &jne (".Lavx2_00_47"); + + for ($i=0; $i<16; ) { + my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)"; + foreach(bodyx_00_15()) { eval; } + } +} +$code.=<<___; + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),$Tbl + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + add $SZ*6($ctx),$G + add $SZ*7($ctx),$H + + mov $A,$SZ*0($ctx) + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + cmp `$PUSH8+2*8`($Tbl),$inp # $_end + je .Ldone_avx2 + + xor $a1,$a1 + mov $B,$a3 + xor $C,$a3 # magic + mov $F,$a4 + jmp .Lower_avx2 +.align 16 +.Lower_avx2: +___ + for ($i=0; $i<8; ) { + my $base="+16($Tbl)"; + foreach(bodyx_00_15()) { eval; } + } +$code.=<<___; + lea -$PUSH8($Tbl),$Tbl + cmp %rsp,$Tbl + jae .Lower_avx2 + + mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx + add $a1,$A + #mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp + lea `2*$SZ*($rounds-8)`(%rsp),%rsp +# restore frame pointer to original location at $_rsp +.cfi_cfa_expression $_rsp,deref,+8 + + add $SZ*0($ctx),$A + add $SZ*1($ctx),$B + add $SZ*2($ctx),$C + add $SZ*3($ctx),$D + add $SZ*4($ctx),$E + add $SZ*5($ctx),$F + lea `2*16*$SZ`($inp),$inp # inp+=2 + add $SZ*6($ctx),$G + mov $inp,%r12 + add $SZ*7($ctx),$H + cmp $_end,$inp + + mov $A,$SZ*0($ctx) + cmove %rsp,%r12 # next block or stale data + mov $B,$SZ*1($ctx) + mov $C,$SZ*2($ctx) + mov $D,$SZ*3($ctx) + mov $E,$SZ*4($ctx) + mov $F,$SZ*5($ctx) + mov $G,$SZ*6($ctx) + mov $H,$SZ*7($ctx) + + jbe .Loop_avx2 + lea (%rsp),$Tbl +# temporarily use $Tbl as index to $_rsp +# this avoids the need to save a secondary frame pointer at -8(%rsp) +.cfi_cfa_expression $Tbl+`16*$SZ+3*8`,deref,+8 + +.Ldone_avx2: + mov `16*$SZ+3*8`($Tbl),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper +___ +$code.=<<___ if ($win64); + movaps 16*$SZ+32($Tbl),%xmm6 + movaps 16*$SZ+48($Tbl),%xmm7 + movaps 16*$SZ+64($Tbl),%xmm8 + movaps 16*$SZ+80($Tbl),%xmm9 +___ +$code.=<<___ if ($win64 && $SZ>4); + movaps 16*$SZ+96($Tbl),%xmm10 + movaps 16*$SZ+112($Tbl),%xmm11 +___ 
+$code.=<<___;
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
+.Lepilogue_avx2:
+ ret
+.cfi_endproc
+.size ${func}_avx2,.-${func}_avx2
+___
+}}
 }}}}}

 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
@@ -1547,10 +2350,19 @@ se_handler:
 lea (%rsi,%r10),%r10 # epilogue label
 cmp %r10,%rbx # context->Rip>=epilogue label
 jae .Lin_prologue
-
+___
+$code.=<<___ if ($avx>1);
+ lea .Lavx2_shortcut(%rip),%r10
+ cmp %r10,%rbx # context->Rip<avx2_shortcut
+ jb .Lnot_in_avx2
+
+ and \$-256*$SZ,%rax
+ add \$`2*$SZ*($rounds-8)`,%rax
+.Lnot_in_avx2:
+___
+$code.=<<___;
 mov %rax,%rsi # put aside Rsp
 mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
- lea 48(%rax),%rax

 mov -8(%rax),%rbx
 mov -16(%rax),%rbp
@@ -1601,12 +2413,54 @@ se_handler:
 ret
.size se_handler,.-se_handler
+___

+$code.=<<___ if ($SZ==4 && $shaext);
+.type shaext_handler,\@abi-omnipotent
+.align 16
+shaext_handler:
+ push %rsi
+ push %rdi
+ push %rbx
+ push %rbp
+ push %r12
+ push %r13
+ push %r14
+ push %r15
+ pushfq
+ sub \$64,%rsp
+
+ mov 120($context),%rax # pull context->Rax
+ mov 248($context),%rbx # pull context->Rip
+
+ lea .Lprologue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip<.Lprologue
+ jb .Lin_prologue
+
+ lea .Lepilogue_shaext(%rip),%r10
+ cmp %r10,%rbx # context->Rip>=.Lepilogue
+ jae .Lin_prologue
+
+ lea -8-5*16(%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$10,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+
+ jmp .Lin_prologue
+.size shaext_handler,.-shaext_handler
+___
+
+$code.=<<___;
 .section .pdata
 .align 4
 .rva .LSEH_begin_$func
 .rva .LSEH_end_$func
 .rva .LSEH_info_$func
 ___
+$code.=<<___ if ($SZ==4 && $shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
 $code.=<<___ if ($SZ==4);
 .rva .LSEH_begin_${func}_ssse3
 .rva .LSEH_end_${func}_ssse3
@@ -1635,6 +2489,11 @@ $code.=<<___ if ($avx);
 .rva .LSEH_end_${func}_avx
 .rva .LSEH_info_${func}_avx
 ___
+$code.=<<___ if ($avx>1);
+ .rva .LSEH_begin_${func}_avx2
+ .rva .LSEH_end_${func}_avx2
+ .rva .LSEH_info_${func}_avx2
+___
 $code.=<<___;
 .section .xdata
 .align 8
@@ -1643,6 +2502,11 @@ $code.=<<___;
 .rva se_handler
 .rva .Lprologue,.Lepilogue # HandlerData[]
 ___
+$code.=<<___ if ($SZ==4 && $shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva shaext_handler
+___
 $code.=<<___ if ($SZ==4);
 .LSEH_info_${func}_ssse3:
 .byte 9,0,0,0
@@ -1661,8 +2525,36 @@ $code.=<<___ if ($avx);
 .rva se_handler
 .rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
 ___
+$code.=<<___ if ($avx>1);
+.LSEH_info_${func}_avx2:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
+___
 }

-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-print $code;
+sub sha256op38 {
+ my $instr = shift;
+ my %opcodelet = (
+  "sha256rnds2" => 0xcb,
+  "sha256msg1" => 0xcc,
+  "sha256msg2" => 0xcd );
+
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+  my @opcode=(0x0f,0x38);
+  push @opcode,$opcodelet{$instr};
+  push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
+  return ".byte\t".join(',',@opcode);
+ } else {
+  return $instr."\t".@_[0];
+ }
+}
+
+foreach (split("\n",$code)) {
+ s/\`([^\`]*)\`/eval $1/geo;
+
+ s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
+
+ print $_,"\n";
+}
 close STDOUT;
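
The reordered ROUND_00_15/body_00_15 sequences in this patch lean on the "alternative Maj" trick credited to Pavel Semjanov in the header comment: Maj(a,b,c) equals Ch(a^b,c,b), that is ((a^b)&(b^c))^b, so each round can reuse the b^c value left over from the previous round (the "a^b, b^c in next round" comments). The following standalone Perl sketch, illustrative only and not part of the patch, verifies the identity:

#!/usr/bin/env perl
# Check that Maj(a,b,c) == Ch(a^b,c,b) == ((a^b) & (b^c)) ^ b,
# the form body_00_15 computes with one AND and two XORs while
# carrying "b^c" over into the next round.
use strict;
use warnings;

sub Ch  { my ($x,$y,$z) = @_; ($x & $y) ^ (~$x & $z) }
sub Maj { my ($x,$y,$z) = @_; ($x & $y) ^ ($x & $z) ^ ($y & $z) }

for (1..100000) {
    my ($a,$b,$c) = map { int(rand(2**32)) } 1..3;
    my $ref = Maj($a,$b,$c) & 0xffffffff;
    die "Ch form differs"      if $ref != (Ch($a^$b,$c,$b) & 0xffffffff);
    die "and/xor form differs" if $ref != (((($a^$b) & ($b^$c)) ^ $b) & 0xffffffff);
}
print "Maj(a,b,c) == Ch(a^b,c,b): verified\n";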
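
The Xupdate_256_SSSE3/AVX blocks above vectorize the SHA-256 message schedule four words at a time; in scalar form the recurrence is W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]. A plain-Perl reference, with helper names that are illustrative rather than taken from the patch:

# Scalar message-schedule reference. sigma0/sigma1 use the same
# (7,18,3) and (17,19,10) rotate/shift constants as the $sigma0 and
# $sigma1 arrays in the script.
use strict;
use warnings;

sub ror32  { my ($x,$n) = @_; (($x >> $n) | ($x << (32 - $n))) & 0xffffffff }
sub sigma0 { my $x = shift; ror32($x,7)  ^ ror32($x,18) ^ ($x >> 3)  }
sub sigma1 { my $x = shift; ror32($x,17) ^ ror32($x,19) ^ ($x >> 10) }

sub expand {                            # W[0..15] in, W[0..63] out
    my @W = @_;
    for my $i (16..63) {
        # same data flow as ROUND_16_XX, where the 16-slot ring buffer
        # on the stack holds (i+1)&0xf -> W[i-15], (i+9)&0xf -> W[i-7]
        # and (i+14)&0xf -> W[i-2]
        $W[$i] = (sigma1($W[$i-2]) + $W[$i-7] +
                  sigma0($W[$i-15]) + $W[$i-16]) & 0xffffffff;
    }
    return @W;
}

my @W = expand(0x61626380, (0) x 14, 0x00000018);   # padded one-block "abc"
printf "W[16]=0x%08x W[17]=0x%08x\n", $W[16], $W[17];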
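
The patch also stores every row of $TABLE twice and changes the table strides from 16*$SZ to 16*2*$SZ. Per the November 2012 note, the AVX2 path adds round constants to two interleaved blocks at once with 256-bit vpaddd/vpaddq, so each group of constants must occupy both 128-bit lanes of a 32-byte load. A small illustrative sketch of that mechanical doubling (first K256 row only):

# Emit each row of round constants twice: once for the low lane
# (first block) and once for the high lane (second block).
my @K256_head = (0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5);
for my $lane (1..2) {
    printf "\t.long\t%s\n",
           join(',', map { sprintf "0x%08x", $_ } @K256_head);
}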
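
Finally, the output loop no longer prints $code wholesale: each line is filtered through sha256op38() so that sha256rnds2/sha256msg1/sha256msg2 still assemble on toolchains that predate the SHA Extension mnemonics, by emitting the raw 0F 38 CB/CC/CD opcodes with a register-to-register ModR/M byte. A minimal sketch of the same byte layout, with a hypothetical helper name:

# For AT&T-order operands "insn %xmmS,%xmmD" the ModR/M byte is
# 0xC0 | S | (D << 3), exactly what sha256op38 above constructs.
my %opcodelet = (
    "sha256rnds2" => 0xcb,
    "sha256msg1"  => 0xcc,
    "sha256msg2"  => 0xcd,
);
sub encode {
    my ($instr, $src, $dst) = @_;
    sprintf ".byte\t0x0f,0x38,0x%02x,0x%02x",
            $opcodelet{$instr}, 0xc0 | ($src & 7) | (($dst & 7) << 3);
}
print encode("sha256rnds2", 0, 2), "\n";   # -> .byte 0x0f,0x38,0xcb,0xd0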