X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fsha%2Fasm%2Fsha1-586.pl;h=996707c07b9c278f4a4c633bc532abf3117c42d5;hp=b161d1cc14b73e5e4cb966b33e606a039820d9b0;hb=b55e21b357902959ae8ec0255952402f5ccaa515;hpb=7eb048828008f195fb6edceb8f767622694e7426 diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl index b161d1cc14..996707c07b 100644 --- a/crypto/sha/asm/sha1-586.pl +++ b/crypto/sha/asm/sha1-586.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # [Re]written by Andy Polyakov for the OpenSSL @@ -28,10 +35,9 @@ # P4 +85%(!) +45% # # As you can see Pentium came out as looser:-( Yet I reckoned that -# improvement on P4 outweights the loss and incorporate this +# improvement on P4 outweighs the loss and incorporate this # re-tuned code to 0.9.7 and later. # ---------------------------------------------------------------- -# # August 2009. # @@ -66,9 +72,9 @@ # switch to AVX alone improves performance by as little as 4% in # comparison to SSSE3 code path. But below result doesn't look like # 4% improvement... Trouble is that Sandy Bridge decodes 'ro[rl]' as -# pair of µ-ops, and it's the additional µ-ops, two per round, that +# pair of µ-ops, and it's the additional µ-ops, two per round, that # make it run slower than Core2 and Westmere. But 'sh[rl]d' is decoded -# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with +# as single µ-op by Sandy Bridge and it's replacing 'ro[rl]' with # equivalent 'sh[rl]d' that is responsible for the impressive 5.1 # cycles per processed byte. But 'sh[rl]d' is not something that used # to be fast, nor does it appear to be fast in upcoming Bulldozer @@ -93,24 +99,34 @@ # P4 10.6 - # AMD K8 7.1 - # Core2 7.3 6.0/+22% - -# Atom 12.5 9.3(*)/+35% - # Westmere 7.3 5.5/+33% - # Sandy Bridge 8.8 6.2/+40% 5.1(**)/+73% # Ivy Bridge 7.2 4.8/+51% 4.7(**)/+53% # Haswell 6.5 4.3/+51% 4.1(**)/+58% +# Skylake 6.4 4.1/+55% 4.1(**)/+55% # Bulldozer 11.6 6.0/+92% # VIA Nano 10.6 7.5/+41% +# Atom 12.5 9.3(*)/+35% +# Silvermont 14.5 9.9(*)/+46% +# Goldmont 8.8 6.7/+30% 1.7(***)/+415% # # (*) Loop is 1056 instructions long and expected result is ~8.25. -# It remains mystery [to me] why ILP is limited to 1.7. +# The discrepancy is because of front-end limitations, so +# called MS-ROM penalties, and on Silvermont even rotate's +# limited parallelism. # # (**) As per above comment, the result is for AVX *plus* sh[rl]d. +# +# (***) SHAEXT result $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"sha1-586.pl",$ARGV[$#ARGV] eq "386"); +$output=pop; +open STDOUT,">$output"; + +&asm_init($ARGV[0],$ARGV[$#ARGV] eq "386"); $xmm=$ymm=0; for (@ARGV) { $xmm=1 if (/-DOPENSSL_IA32_SSE2/); } @@ -120,7 +136,7 @@ $ymm=1 if ($xmm && =~ /GNU assembler version ([2-9]\.[0-9]+)/ && $1>=2.19); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && +$ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32n" && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/ && $1>=2.03); # first version supporting AVX @@ -128,8 +144,8 @@ $ymm=1 if ($xmm && !$ymm && $ARGV[0] eq "win32" && `ml 2>&1` =~ /Version ([0-9]+)\./ && $1>=10); # first version supporting AVX -$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /LLVM ([3-9]\.[0-9]+)/ && - $1>=3.0); # first version supporting AVX +$ymm=1 if ($xmm && !$ymm && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|based on LLVM) ([3-9]\.[0-9]+)/ && + $2>=3.0); # first version supporting AVX $shaext=$xmm; ### set to zero if compiling for 1.0.1 @@ -447,7 +463,7 @@ sub sha1msg2 { sha1op38(0xca,@_); } &sub ("esp",32); &movdqu ($ABCD,&QWP(0,$ctx)); - &movd ($E,&QWP(16,$ctx)); + &movd ($E,&DWP(16,$ctx)); &and ("esp",-32); &movdqa ($BSWAP,&QWP(0x50,$tmp1)); # byte-n-word swap @@ -533,7 +549,7 @@ for($i=0;$i<20-4;$i+=2) { # being implemented in SSSE3). Once 8 quadruples or 32 elements are # collected, it switches to routine proposed by Max Locktyukhin. # -# Calculations inevitably require temporary reqisters, and there are +# Calculations inevitably require temporary registers, and there are # no %xmm registers left to spare. For this reason part of the ring # buffer, X[2..4] to be specific, is offloaded to 3 quadriples ring # buffer on the stack. Keep in mind that X[2] is alias X[-6], X[3] - @@ -644,7 +660,7 @@ my $_ror=sub { &ror(@_) }; &jmp (&label("loop")); ###################################################################### -# SSE instruction sequence is first broken to groups of indepentent +# SSE instruction sequence is first broken to groups of independent # instructions, independent in respect to their inputs and shifter # (not all architectures have more than one). Then IALU instructions # are "knitted in" between the SSE groups. Distance is maintained for @@ -653,14 +669,14 @@ my $_ror=sub { &ror(@_) }; # # Temporary registers usage. X[2] is volatile at the entry and at the # end is restored from backtrace ring buffer. X[3] is expected to -# contain current K_XX_XX constant and is used to caclulate X[-1]+K +# contain current K_XX_XX constant and is used to calculate X[-1]+K # from previous round, it becomes volatile the moment the value is # saved to stack for transfer to IALU. X[4] becomes volatile whenever # X[-4] is accumulated and offloaded to backtrace ring buffer, at the # end it is loaded with next K_XX_XX [which becomes X[3] in next # round]... # -sub Xupdate_ssse3_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_ssse3_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1183,7 +1199,7 @@ my $_ror=sub { &shrd(@_[0],@_) }; &and (@T[0],@T[1]); &jmp (&label("loop")); -sub Xupdate_avx_16_31() # recall that $Xi starts wtih 4 +sub Xupdate_avx_16_31() # recall that $Xi starts with 4 { use integer; my $body = shift; my @insns = (&$body,&$body,&$body,&$body); # 40 instructions @@ -1471,3 +1487,5 @@ sub Xtail_avx() &asciz("SHA1 block transform for x86, CRYPTOGAMS by "); &asm_finish(); + +close STDOUT;