X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fmodes%2Fasm%2Fghash-x86.pl;h=63e76c1da67d34964caef3c0f533f69a0b6053fc;hp=0222ede585941e0066a62e1167298046346b20dc;hb=480cd6ab6e994626177de701c418264257954b03;hpb=6c6bdd543d2c6871b7a3a53fb17db3f36b7fa7cf

diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 0222ede585..63e76c1da6 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -7,9 +7,11 @@
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 #
+# March 2010
+#
 # The module implements "4-bit" Galois field multiplication and
 # streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128/256 bytes fixed table]. It has two code paths:
+# per-key table [+64/128 bytes fixed table]. It has two code paths:
 # vanilla x86 and vanilla MMX. Former will be executed on 486 and
 # Pentium, latter on all others. Performance results are for streamed
 # GHASH subroutine and are expressed in cycles per processed byte,
@@ -18,13 +20,13 @@
 #		gcc 2.95.3(*)	MMX assembler	x86 assembler
 #
 # Pentium	100/112(**)	-		50
-# PIII		63 /77		17		24
-# P4		96 /122		33		84(***)
-# Opteron	50 /71		22		30
-# Core2		63 /102		21		28
+# PIII		63 /77		16		24
+# P4		96 /122		30		84(***)
+# Opteron	50 /71		21		30
+# Core2		63 /102		19		28
 #
 # (*)	gcc 3.4.x was observed to generate few percent slower code,
-#	which is one of reasons why 2.95.3 result were chosen;
+#	which is one of reasons why 2.95.3 results were chosen,
 #	another reason is lack of 3.4.x results for older CPUs;
 # (**)	second number is result for code compiled with -fPIC flag,
 #	which is actually more relevant, because assembler code is
@@ -32,8 +34,8 @@
 # (***)	see comment in non-MMX routine for further details;
 #
 # To summarize, it's 2-3 times faster than gcc-generated code. To
-# anchor it to something else SHA1 assembler processes single byte
-# in 11-13 cycles.
+# anchor it to something else SHA1 assembler processes one byte in
+# 11-13 cycles on contemporary x86 cores.
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@@ -52,13 +54,13 @@ $Htbl = "esi";
 
 $unroll = 0;	# Affects x86 loop. Folded loop performs ~7% worse
 		# than unrolled, which has to be weighted against
-		# almost 2x code size reduction. Well, *overall*
-		# code size. x86-specific code shrinks by 7.5x...
+		# 1.7x code size reduction. Well, *overall* 1.7x,
+		# x86-specific code itself shrinks by 2.5x...
 
 sub mmx_loop() {
-# MMX version performs 2.5 times better on P4 (see comment in non-MMX
-# routine for further details), 35% better on Opteron and Core2, 40%
-# better on PIII... In other words effort is considered to be well
+# MMX version performs 2.8 times better on P4 (see comment in non-MMX
+# routine for further details), 40% better on Opteron, 50% better
+# on PIII and Core2... In other words effort is considered to be well
 # spent...
     my $inp = shift;
     my $rem_4bit = shift;
@@ -74,7 +76,7 @@ sub mmx_loop() {
 	&xor	($nlo,$nlo);	# avoid partial register stalls on PIII
 	&mov	($nhi,$Zll);
 	&mov	(&LB($nlo),&LB($nhi));
-	&mov	($cnt,15);
+	&mov	($cnt,14);
 	&shl	(&LB($nlo),4);
 	&and	($nhi,0xf0);
 	&movq	($Zlo,&QWP(8,$Htbl,$nlo));
@@ -85,34 +87,59 @@
 &set_label("mmx_loop",16);
 	&psrlq	($Zlo,4);
 	&and	($rem,0xf);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
 	&movq	($tmp,$Zhi);
 	&psrlq	($Zhi,4);
+	&mov	(&LB($nlo),&BP(0,$inp,$cnt));
 	&dec	($cnt);
-	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
 	&psllq	($tmp,60);
 	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
 	&movd	($rem,$Zlo);
 	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
 	&pxor	($Zlo,$tmp);
 	&js	(&label("mmx_break"));
 
-	&movz	($nhi,&BP(0,$inp,$cnt));
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
 	&psrlq	($Zlo,4);
-	&mov	(&LB($nlo),&LB($nhi));
+	&and	($nhi,0xf0);
 	&movq	($tmp,$Zhi);
-	&shl	(&LB($nlo),4);
 	&psrlq	($Zhi,4);
-	&and	($rem,0xf);
 	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
 	&psllq	($tmp,60);
 	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
 	&movd	($rem,$Zlo);
 	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
 	&pxor	($Zlo,$tmp);
-	&and	($nhi,0xf0);
 	&jmp	(&label("mmx_loop"));
 
 &set_label("mmx_break",16);
+	&shl	(&LB($nlo),4);
+	&and	($rem,0xf);
+	&psrlq	($Zlo,4);
+	&and	($nhi,0xf0);
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nlo));
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nlo));
+	&pxor	($Zlo,$tmp);
+
+	&psrlq	($Zlo,4);
+	&and	($rem,0xf);
+	&pxor	($Zlo,&QWP(8,$Htbl,$nhi));
+	&movq	($tmp,$Zhi);
+	&psrlq	($Zhi,4);
+	&psllq	($tmp,60);
+	&pxor	($Zhi,&QWP(0,$rem_4bit,$rem,8));
+	&movd	($rem,$Zlo);
+	&pxor	($Zhi,&QWP(0,$Htbl,$nhi));
+	&mov	($nhi,$nlo);
+	&pxor	($Zlo,$tmp);
+
 	&psrlq	($Zlo,32);	# lower part of Zlo is already there
 	&movd	($Zhl,$Zhi);
 	&psrlq	($Zhi,32);
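
For orientation, the interleaved MMX instructions above compute the same
nibble-at-a-time ("4-bit" Shoup) multiplication as the portable C code in
crypto/modes/gcm128.c. The sketch below models the 64-bit gcm_gmult_4bit
path; it is a reference model, not a drop-in replacement for the assembler.
It assumes Htable[i] holds i*H as prepared by the module's init routine
(not shown), uses the rem_4bit constants as gcm128.c packs them into the
top 16 bits, and the name gmult_4bit here is illustrative:

#include <stdint.h>

typedef struct { uint64_t hi, lo; } u128;

/* Reduction table as in gcm128.c: rem_4bit[r] is the contribution of the
 * four bits r that fall off the low end of Z when it is shifted right by
 * one nibble, i.e. r*x^128 mod (x^128+x^7+x^2+x+1) in GCM's reflected
 * bit order, packed into the top 16 bits of a 64-bit word. */
static const uint64_t rem_4bit[16] = {
    (uint64_t)0x0000 << 48, (uint64_t)0x1C20 << 48,
    (uint64_t)0x3840 << 48, (uint64_t)0x2460 << 48,
    (uint64_t)0x7080 << 48, (uint64_t)0x6CA0 << 48,
    (uint64_t)0x48C0 << 48, (uint64_t)0x54E0 << 48,
    (uint64_t)0xE100 << 48, (uint64_t)0xFD20 << 48,
    (uint64_t)0xD940 << 48, (uint64_t)0xC560 << 48,
    (uint64_t)0x9180 << 48, (uint64_t)0x8DA0 << 48,
    (uint64_t)0xA9C0 << 48, (uint64_t)0xB5E0 << 48
};

/* Xi <- Xi*H. Htable[i] is assumed to hold i*H, the 256-byte per-key
 * table the module's init code fills in. Bytes of Xi are consumed from
 * 15 down to 0, low nibble before high nibble, as in the MMX loop. */
static void gmult_4bit(uint8_t Xi[16], const u128 Htable[16])
{
    u128 Z;
    uint64_t rem;
    unsigned nlo, nhi;
    int cnt = 15;

    nlo = Xi[15];
    nhi = nlo >> 4;
    nlo &= 0xf;
    Z = Htable[nlo];                    /* first lookup needs no shift */

    for (;;) {
        /* shift Z right one nibble, reduce, fold in Htable[nhi] */
        rem = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt < 0)
            break;

        nlo = Xi[cnt];
        nhi = nlo >> 4;
        nlo &= 0xf;

        /* same step for the low nibble of the next byte */
        rem = Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4) ^ rem_4bit[rem];
        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    for (cnt = 0; cnt < 8; cnt++) {     /* store Z back big-endian */
        Xi[cnt]     = (uint8_t)(Z.hi >> (56 - 8 * cnt));
        Xi[cnt + 8] = (uint8_t)(Z.lo >> (56 - 8 * cnt));
    }
}

Every nibble thus costs one 4-bit right shift of the 128-bit accumulator Z,
one rem_4bit lookup to reduce the four bits shifted out, and one 16-byte
Htable xor. In the MMX loop that same step is split across the $Zlo/$Zhi
register pair, which is why each nibble shows up there as a psrlq/psllq
pair plus two pxor table loads.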