ghash-x86*.pl: fix performance numbers for Core2; as it turned out, the
previous ones were "tainted" by variable clock frequency.
$Tlo1="t6";
$rem="t7"; # $8
#################
$Tlo1="t6";
$rem="t7"; # $8
#################
+$Xi="a0"; # $16, input argument block
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
$nlo="a4"; # $20
$nhi="a5";
$Zhi="t8";
-# argument block for gcm_ghash_4bit
-$inp="a0"; # $16
-$len="a1";
-$Xi ="a2";
-$Htbl="a3";
-
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,4,0,8
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,4,0,8
- $ADDP inp=15,in0 // &inp[15]
+ $ADDP inp=15,in2 // &inp[15]
-{ .mmi; $ADDP end=in1,in0 // &inp[len]
- $ADDP Xi=15,in2 // &Xi[15]
+{ .mmi; $ADDP end=in3,in2 // &inp[len]
+ $ADDP Xi=15,in0 // &Xi[15]
.save ar.lc,prevlc
mov prevlc=ar.lc };;
.save ar.lc,prevlc
mov prevlc=ar.lc };;
-{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
+{ .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
mov mask0xf0=0xf0
.save pr,prevpr
mov prevpr=pr }
mov mask0xf0=0xf0
.save pr,prevpr
mov prevpr=pr }
$Htblo="%l6";
$cnt="%l7";
$Htblo="%l6";
$cnt="%l7";
-$inp="%i0"; # input arguments for gcm_ghash_4bit
-$len="%i1";
-$Xi="%i2";
-$Htbl="%i3";
+$Xi="%i0"; # input argument block
+$Htbl="%i1";
+$inp="%i2";
+$len="%i3";
$code.=<<___;
.section ".text",#alloc,#execinstr
$code.=<<___;
.section ".text",#alloc,#execinstr
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
.size gcm_ghash_4bit,(.-gcm_ghash_4bit)
___
-$Xi="%i0"; # input arguments for gcm_gmult_4bit
-$Htbl="%i1";
# PIII 63 /77 16 24
# P4 96 /122 30 84(***)
# Opteron 50 /71 21 30
# PIII 63 /77 16 24
# P4 96 /122 30 84(***)
# Opteron 50 /71 21 30
#
# (*) gcc 3.4.x was observed to generate a few percent slower code,
# which is one of the reasons why 2.95.3 results were chosen,
#
# (*) gcc 3.4.x was observed to generate a few percent slower code,
# which is one of the reasons why 2.95.3 results were chosen,
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
&lea ("eax",&DWP(&label("rem_4bit")."-".&label("pic_point"),"eax"));
- &mov ($inp,&wparam(0)); # load in
- &mov ($Zlh,&wparam(1)); # load len
- &mov ($Zhh,&wparam(2)); # load Xi
- &mov ($Htbl,&wparam(3)); # load Htable
+ &mov ($Zhh,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ($Zlh,&wparam(3)); # load len
- &mov (&wparam(1),$Zlh); # len to point at the end of input
+ &mov (&wparam(3),$Zlh); # len to point at the end of input
&stack_push(4+1); # +1 for stack alignment
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zhh));
&stack_push(4+1); # +1 for stack alignment
&mov ($Zll,&DWP(12,$Zhh)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zhh));
&mmx_loop("esp","eax");
&lea ($inp,&DWP(16,$inp));
&mmx_loop("esp","eax");
&lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(1));
+ &cmp ($inp,&wparam(3));
&jb (&label("mmx_outer_loop"));
&jb (&label("mmx_outer_loop"));
- &mov ($inp,&wparam(2)); # load Xi
+ &mov ($inp,&wparam(0)); # load Xi
&emms ();
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(4,$inp),$Zhl);
&emms ();
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(4,$inp),$Zhl);
&set_label("x86",16);
}
&stack_push(16+4+1); # +1 for 64-bit alignment
&set_label("x86",16);
}
&stack_push(16+4+1); # +1 for 64-bit alignment
- &mov ($inp,&wparam(0)); # load in
- &mov ("ecx",&wparam(1)); # load len
- &mov ($Zll,&wparam(2)); # load Xi
- &mov ($Htbl,&wparam(3)); # load Htable
+ &mov ($Zll,&wparam(0)); # load Xi
+ &mov ($Htbl,&wparam(1)); # load Htable
+ &mov ($inp,&wparam(2)); # load in
+ &mov ("ecx",&wparam(3)); # load len
- &mov (&wparam(1),"ecx");
+ &mov (&wparam(3),"ecx");
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zll));
&mov ($Zhh,&DWP(0,$Zll)); # load Xi[16]
&mov ($Zhl,&DWP(4,$Zll));
&call ("_x86_gmult_4bit_inner");
} else {
&x86_loop(0);
&call ("_x86_gmult_4bit_inner");
} else {
&x86_loop(0);
- &mov ($inp,&wparam(0));
+ &mov ($inp,&wparam(2));
}
&lea ($inp,&DWP(16,$inp));
}
&lea ($inp,&DWP(16,$inp));
- &cmp ($inp,&wparam(1));
- &mov (&wparam(0),$inp) if (!$unroll);
+ &cmp ($inp,&wparam(3));
+ &mov (&wparam(2),$inp) if (!$unroll);
&jb (&label("x86_outer_loop"));
&jb (&label("x86_outer_loop"));
- &mov ($inp,&wparam(2)); # load Xi
+ &mov ($inp,&wparam(0)); # load Xi
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(8,$inp),$Zlh);
&mov (&DWP(4,$inp),$Zhl);
&mov (&DWP(12,$inp),$Zll);
&mov (&DWP(8,$inp),$Zlh);
&mov (&DWP(4,$inp),$Zhl);
# gcc 3.4.x assembler
#
# Opteron 18.5 10.2 +80%
# gcc 3.4.x assembler
#
# Opteron 18.5 10.2 +80%
$flavour = shift;
$output = shift;
$flavour = shift;
$output = shift;
$tmp="%r10";
$rem_4bit = "%r11";
$tmp="%r10";
$rem_4bit = "%r11";
-# per-function register layout
$Xi="%rdi";
$Htbl="%rsi";
$Xi="%rdi";
$Htbl="%rsi";
+# per-function register layout
$cnt="%rcx";
$rem="%rdx";
$cnt="%rcx";
$rem="%rdx";
# per-function register layout
# per-function register layout
-$inp="%rdi";
-$len="%rsi";
-$Xi="%rdx";
-$Htbl="%rcx";
+$inp="%rdx";
+$len="%rcx";
$cnt="%rbp";
$rem="%r12";
$cnt="%rbp";
$rem="%r12";
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
-static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
+static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
* mostly as reference and a placeholder for possible future
* non-trivial optimization[s]...
*/
* mostly as reference and a placeholder for possible future
* non-trivial optimization[s]...
*/
-static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
+static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
+ const u8 *inp,size_t len)
-void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
-void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
+void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#endif
#define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
-#define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
+#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
* thrashing effect. In other words, the idea is to hash data while it's
* still in L1 cache after the encryption pass... */
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
* thrashing effect. In other words, the idea is to hash data while it's
* still in L1 cache after the encryption pass... */