#!/usr/bin/env perl
#
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. Rights for redistribution and usage in source and binary
+# forms are granted according to the OpenSSL license.
+# ====================================================================
+#
# SHA512_Transform_SSE2.
#
# As the name suggests, this is an IA-32 SSE2 implementation of
# SHA512_Transform. But isn't SSE2 a 64-bit instruction set? Is it
# rich enough to implement SHA512? If answer was "no," then you
# wouldn't have been reading this...
#
-# [Preliminary] throughput numbers (larger is better):
-#
-# 2.4GHz P4 1.4GHz AMD32 1.4GHz AMD64
-# SHA256/gcc 38 36 46
-# SHA512/gcc 9 15 72
-# SHA512/sse2 53(*) 51
-# SHA512/icc 21 21
-# SHA256/icc 52 42
+# Throughput performance in MBps (larger is better):
#
-# (*) I.e. it gives ~6x speed-up on P4 if compared to code generated
-# by gcc, and 2.5x over icc. It was worth it:-) Well, one can
-# argue that handcoded *non*-SSE2 implementation would perform
-# better than compiler generated one, and comparison therefore
-# is not exactly fair. As SHA512 puts enormous pressure on IA-32
-# GP register bank, I reckon handcoded version wouldn't perform
-# significantly better than one compiled with icc, ~20% perhaps.
-# So that this code would still outperform it with distinguishing
-# marginal. But feel free to prove me wrong:-)
+# 2.4GHz P4 1.4GHz AMD32 1.4GHz AMD64(*)
+# SHA256/gcc(*) 54 43 59
+# SHA512/gcc 17 23 92
+# SHA512/sse2 61(**) 57(**)
+# SHA512/icc 26 28
+# SHA256/icc(*) 65 54
#
+# (*) AMD64 and SHA256 numbers are presented mostly for amusement or
+# reference purposes.
+# (**) I.e. it gives ~2-3x speed-up if compared with compiler generated
+# code. One can argue that hand-coded *non*-SSE2 implementation
+# would perform better than compiler generated one as well, and
+# that comparison is therefore not exactly fair. Well, as SHA512
+# puts enormous pressure on IA-32 GP register bank, I reckon that
+# hand-coded version wouldn't perform significantly better than
+# one compiled with icc, ~20% perhaps... So that this code would
+# still outperform it by a distinguishing margin. But feel free
+# to prove me wrong:-)
# <appro@fy.chalmers.se>
push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";
# I adhere to 64-bit %mmX registers in order to avoid/not care
# about #GP exceptions on misaligned 128-bit access, most
- # notably in paddq with memory operand.
+ # notably in paddq with memory operand. Not to mention that
+ # SSE2 instructions operating on %mmX can be scheduled every
+ # cycle [and not every second one if operating on %xmmN].
&movq ("mm4",&QWP($Foff,$W512)); # load f
&movq ("mm5",&QWP($Goff,$W512)); # load g
&movq ("mm6",&QWP($Hoff,$W512)); # load h
- &movq (&QWP($Foff,$W512),$E); # f = e
- &movq (&QWP($Goff,$W512),"mm4"); # g = f
- &movq (&QWP($Hoff,$W512),"mm5"); # h = g
&movq ("mm2",$E); # %mm2 is sliding right
&movq ("mm3",$E); # %mm3 is sliding left
&pxor ("mm7","mm2");
&pxor ("mm7","mm3"); # T1=Sigma1_512(e)
+ &movq (&QWP($Foff,$W512),$E); # f = e
+ &movq (&QWP($Goff,$W512),"mm4"); # g = f
+ &movq (&QWP($Hoff,$W512),"mm5"); # h = g
+
&pxor ("mm4","mm5"); # f^=g
&pand ("mm4",$E); # f&=e
&pxor ("mm4","mm5"); # f^=g
&movq ("mm2",&QWP($Boff,$W512)); # load b
&movq ("mm3",&QWP($Coff,$W512)); # load c
&movq ($E,&QWP($Doff,$W512)); # e = d
- &movq (&QWP($Boff,$W512),$A); # b = a
- &movq (&QWP($Coff,$W512),"mm2"); # c = b
- &movq (&QWP($Doff,$W512),"mm3"); # d = c
&paddq ("mm7","mm6"); # T1+=h
&paddq ("mm7",&QWP(0,$K512,$kidx,8)); # T1+=K512[i]
&pxor ("mm6","mm4");
&pxor ("mm6","mm5"); # T2=Sigma0_512(a)
- &movq ("mm4","mm2"); # %mm4=b
- &pand ("mm2",$A); # b&=a
- &pand ("mm4","mm3"); # %mm4&=c
- &pand ("mm3",$A); # c&=a
- &pxor ("mm4","mm2"); # %mm4^=b&a
- &pxor ("mm4","mm3"); # %mm4^=c&a
+ &movq (&QWP($Boff,$W512),$A); # b = a
+ &movq (&QWP($Coff,$W512),"mm2"); # c = b
+ &movq (&QWP($Doff,$W512),"mm3"); # d = c
+
+ &movq ("mm4",$A); # %mm4=a
+ &por ($A,"mm3"); # a=a|c
+ &pand ("mm4","mm3"); # %mm4=a&c
+ &pand ($A,"mm2"); # a=(a|c)&b
+ &por ("mm4",$A); # %mm4=(a&c)|((a|c)&b)
&paddq ("mm6","mm4"); # T2+=Maj(a,b,c)
&movq ($A,"mm7"); # a=T1
&paddq ($A,"mm6"); # a+=T2
}
-$func="SHA512_Transform_SSE2";
+$func="sha512_block_sse2";
&function_begin_B($func);
if (0) {# Caller is expected to check if it's appropriate to
&movdqu ("xmm1",&QWP(16,$Widx));
&movdqu ("xmm2",&QWP(32,$Widx));
&movdqu ("xmm3",&QWP(48,$Widx));
+
+&align(8);
+&set_label("_chunk_loop");
+
&movdqa (&QWP($Aoff,$W512),"xmm0"); # a,b
&movdqa (&QWP($Coff,$W512),"xmm1"); # c,d
&movdqa (&QWP($Eoff,$W512),"xmm2"); # e,f
# Why aren't loops unrolled? It makes sense to unroll if
# execution time for loop body is comparable with branch
- # penalties and/or if whole data-set resides in register
- # bank. Neither is case here...
+ # penalties and/or if whole data-set resides in register bank.
+ # Neither is the case here... Well, it would be possible to
+ # eliminate a few store operations, but it would hardly affect
+ # so to say stop-watch performance, as there are a lot of
+ # available memory slots to fill. It will only relieve some
+ # pressure off memory bus...
-&align(8);
-&set_label("_1st_loop"); # 0-15
# flip input stream byte order...
&mov ("eax",&DWP(0,$data,$Widx,8));
&mov ("ebx",&DWP(4,$data,$Widx,8));
&mov (&DWP(128+0,$W512,$Widx,8),"ebx"); # copy of W512[i]
&mov (&DWP(128+4,$W512,$Widx,8),"eax");
+&align(8);
+&set_label("_1st_loop"); # 0-15
+ # flip input stream byte order...
+ &mov ("eax",&DWP(0+8,$data,$Widx,8));
+ &mov ("ebx",&DWP(4+8,$data,$Widx,8));
+ &bswap ("eax");
+ &bswap ("ebx");
+ &mov (&DWP(0+8,$W512,$Widx,8),"ebx"); # W512[i]
+ &mov (&DWP(4+8,$W512,$Widx,8),"eax");
+ &mov (&DWP(128+0+8,$W512,$Widx,8),"ebx"); # copy of W512[i]
+ &mov (&DWP(128+4+8,$W512,$Widx,8),"eax");
+&set_label("_1st_looplet");
&SHA2_ROUND($Widx,$Widx); &inc($Widx);
-&cmp ($Widx,16)
+&cmp ($Widx,15); # explicit ';' so &jl below is its own statement
&jl (&label("_1st_loop")); # Widx<15: byte-swap next word, then next round
+&je (&label("_1st_looplet")); # Widx==15: one more round without a load;
+ # playing similar trick on 2nd loop
+ # does not improve performance...
$Kidx = "ebx"; # start using %ebx as Kidx
&mov ($Kidx,$Widx);
&movdqu (&QWP(32,$Widx),"xmm2");
&movdqu (&QWP(48,$Widx),"xmm3");
+&add ($data,16*8); # advance input data pointer
+&dec (&DWP(16,"ebp")); # decrement 3rd arg
+&jnz (&label("_chunk_loop"));
+
# epilogue
&emms (); # required for at least ELF and Win32 ABIs
&mov ("edi",&DWP(-12,"ebp"));
&leave ();
&ret ();
-&align(16);
+&align(64);
&set_label("K512"); # Yes! I keep it in the code segment!
&data_word(0xd728ae22,0x428a2f98); # u64
&data_word(0x23ef65cd,0x71374491); # u64