Skip to content

Commit

Permalink
SPARCv9 assembly pack: harmonize ABI handling (so that it's handled i…
Browse files Browse the repository at this point in the history
…n one place at a time: by the pre-processor in the .S case and by perl in the .s case).
  • Loading branch information
Andy Polyakov committed Oct 25, 2012
1 parent 8ed11a8 commit 1efd583
Show file tree
Hide file tree
Showing 5 changed files with 70 additions and 82 deletions.
26 changes: 9 additions & 17 deletions crypto/bn/asm/sparcv9-gf2m.pl
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,8 @@
# ~100-230% faster than gcc-generated code and ~35-90% faster than
# the pure SPARCv9 code path.

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$locals=16*8;

$code.=<<___;
#include <sparc_arch.h>
.section ".text",#alloc,#execinstr
___
$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___

$tab="%l0";

@T=("%g2","%g3");
Expand All @@ -44,6 +29,13 @@
($lo,$hi,$b)=("%g1",$a8,"%o7"); $a=$lo;

$code.=<<___;
#include <sparc_arch.h>
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif
Expand Down Expand Up @@ -74,7 +66,7 @@
.align 16
.Lsoftware:
save %sp,-$frame-$locals,%sp
save %sp,-STACK_FRAME-$locals,%sp
sllx %i1,32,$a
mov -1,$a12
Expand All @@ -83,7 +75,7 @@
srlx $a12,1,$a48 ! 0x7fff...
or %i4,$b,$b
srlx $a12,2,$a12 ! 0x3fff...
add %sp,$bias+$frame,$tab
add %sp,STACK_BIAS+STACK_FRAME,$tab
sllx $a,2,$a4
mov $a,$a1
Expand Down
22 changes: 9 additions & 13 deletions crypto/md5/asm/md5-sparcv9.pl
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,6 @@
# single-process result on 8-core processor, or ~11GBps per 2.85GHz
# socket.

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$output=shift;
open STDOUT,">$output";

Expand Down Expand Up @@ -198,13 +193,14 @@ sub R3 {
___
}

$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
Expand Down Expand Up @@ -246,7 +242,7 @@ sub R3 {
.word 0x81b02800 ! MD5
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
Expand Down Expand Up @@ -287,15 +283,15 @@ sub R3 {
.word 0x81b02800 ! MD5
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-$frame,%sp
save %sp,-STACK_FRAME,%sp
rd %asi,$saved_asi
wr %g0,0x88,%asi ! ASI_PRIMARY_LITTLE
Expand Down Expand Up @@ -355,7 +351,7 @@ sub R3 {
add $t2,$C,$C
add $CD,$D,$D
srl $B,0,$B ! clruw $B
bne `$bits==64?"%xcc":"%icc"`,.Loop
bne SIZE_T_CC,.Loop
srl $D,0,$D ! clruw $D
st $A,[$ctx+0] ! write out ctx
Expand Down
22 changes: 9 additions & 13 deletions crypto/sha/asm/sha1-sparcv9.pl
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,6 @@
# single-process result on 8-core processor, or ~9GBps per 2.85GHz
# socket.

$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$output=shift;
open STDOUT,">$output";

Expand Down Expand Up @@ -185,13 +180,14 @@ sub BODY_40_59 {
___
}

$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
#ifdef __PIC__
Expand Down Expand Up @@ -231,7 +227,7 @@ sub BODY_40_59 {
.word 0x81b02820 ! SHA1
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhw_loop
bne,pt SIZE_T_CC, .Lhw_loop
nop
.Lhwfinish:
Expand Down Expand Up @@ -271,15 +267,15 @@ sub BODY_40_59 {
.word 0x81b02820 ! SHA1
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
nop
.align 16
.Lsoftware:
save %sp,-$frame,%sp
save %sp,-STACK_FRAME,%sp
sllx $len,6,$len
add $inp,$len,$len
Expand Down Expand Up @@ -359,7 +355,7 @@ sub BODY_40_59 {
add $E,@X[4],$E
st $E,[$ctx+16]
bne `$bits==64?"%xcc":"%icc"`,.Lloop
bne SIZE_T_CC,.Lloop
andn $inp,7,$tmp0
ret
Expand Down
65 changes: 30 additions & 35 deletions crypto/sha/asm/sha512-sparcv9.pl
Original file line number Diff line number Diff line change
Expand Up @@ -49,12 +49,6 @@
# saturates at 11.5x single-process result on 8-core processor, or
# ~11/16GBps per 2.85GHz socket.


$bits=32;
for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
if ($bits==64) { $bias=2047; $frame=192; }
else { $bias=0; $frame=112; }

$output=shift;
open STDOUT,">$output";

Expand Down Expand Up @@ -191,29 +185,29 @@
or @pair[1],$tmp2,$tmp2
`"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
add $h,$tmp2,$T1
$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
$code.=<<___ if ($i==12);
bnz,a,pn %icc,.+8
ld [$inp+128],%l0
___
$code.=<<___ if ($i==15);
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
add $tmp31,32,$tmp0
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
sllx @pair[0],$tmp0,$tmp1
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
srlx @pair[2],$tmp32,@pair[1]
or $tmp1,$tmp2,$tmp2
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
or @pair[1],$tmp2,$tmp2
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
add $h,$tmp2,$T1
$ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
$ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
} if ($SZ==8);

Expand Down Expand Up @@ -349,9 +343,9 @@ sub BODY_00_15 {
or %l3,$tmp0,$tmp0
srlx $tmp0,@sigma0[0],$T1
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
sllx $tmp0,`64-@sigma0[2]`,$tmp1
ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
srlx $tmp0,@sigma0[1],$tmp0
xor $tmp1,$T1,$T1
sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
Expand All @@ -363,9 +357,9 @@ sub BODY_00_15 {
or %l7,$tmp2,$tmp2
srlx $tmp2,@sigma1[0],$tmp1
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
sllx $tmp2,`64-@sigma1[2]`,$tmp0
ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
srlx $tmp2,@sigma1[1],$tmp2
xor $tmp0,$tmp1,$tmp1
sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
Expand All @@ -374,29 +368,30 @@ sub BODY_00_15 {
xor $tmp0,$tmp1,$tmp1
sllx %l4,32,$tmp0
xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
or %l5,$tmp0,$tmp0
ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
sllx %l0,32,$tmp2
add $tmp1,$T1,$T1
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
or %l1,$tmp2,$tmp2
add $tmp0,$T1,$T1 ! +=X[$i+9]
ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
add $tmp2,$T1,$T1 ! +=X[$i]
$ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
$ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
&BODY_00_15(@_);
} if ($SZ==8);

$code.=<<___ if ($bits==64);
.register %g2,#scratch
.register %g3,#scratch
___
$code.=<<___;
#include "sparc_arch.h"
#ifdef __arch64__
.register %g2,#scratch
.register %g3,#scratch
#endif
.section ".text",#alloc,#execinstr
.align 64
Expand Down Expand Up @@ -519,7 +514,7 @@ sub BODY_00_15 {
.word 0x81b02860 ! SHA512
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
bne,pt SIZE_T_CC, .Lhwaligned_loop
nop
.Lhwfinish:
Expand Down Expand Up @@ -579,7 +574,7 @@ sub BODY_00_15 {
.word 0x81b02860 ! SHA512
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f50, %f50, %f18 ! %f18=%f50
ba .Lhwfinish
Expand Down Expand Up @@ -612,7 +607,7 @@ sub BODY_00_15 {
.word 0x81b02840 ! SHA256
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwloop
bne,pt SIZE_T_CC, .Lhwloop
nop
.Lhwfinish:
Expand Down Expand Up @@ -655,7 +650,7 @@ sub BODY_00_15 {
.word 0x81b02840 ! SHA256
bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
bne,pt SIZE_T_CC, .Lhwunaligned_loop
for %f26, %f26, %f10 ! %f10=%f26
ba .Lhwfinish
Expand All @@ -664,7 +659,7 @@ sub BODY_00_15 {
$code.=<<___;
.align 16
.Lsoftware:
save %sp,`-$frame-$locals`,%sp
save %sp,-STACK_FRAME-$locals,%sp
and $inp,`$align-1`,$tmp31
sllx $len,`log(16*$SZ)/log(2)`,$len
andn $inp,`$align-1`,$inp
Expand Down Expand Up @@ -783,7 +778,7 @@ sub BODY_00_15 {
$code.=<<___;
add $inp,`16*$SZ`,$inp ! advance inp
cmp $inp,$len
bne `$bits==64?"%xcc":"%icc"`,.Lloop
bne SIZE_T_CC,.Lloop
sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
ret
Expand Down
17 changes: 13 additions & 4 deletions crypto/sparc_arch.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@
# define __PIC__
#endif

#if defined(__SUNPRO_C) && defined(__sparcv9) && !defined(__arch64__)
# define __arch64__
#endif

#define SPARC_PIC_THUNK(reg) \
.align 32; \
.Lpic_thunk: \
Expand All @@ -53,18 +57,23 @@
add %o7, reg, reg
#endif

#if (defined(__GNUC__) && defined(__arch64__)) || \
(defined(__SUNPRO_C) && defined(__sparcv9))
#if defined(__arch64__)

# define SPARC_LOAD_ADDRESS(SYM, reg) \
setx SYM, %o7, reg;
# define LDPTR ldx
# define LDPTR ldx
# define SIZE_T_CC %xcc
# define STACK_FRAME 192
# define STACK_BIAS 2047

#else

# define SPARC_LOAD_ADDRESS(SYM, reg) \
set SYM, reg;
# define LDPTR ld
# define LDPTR ld
# define SIZE_T_CC %icc
# define STACK_FRAME 112
# define STACK_BIAS 0
# define SPARC_LOAD_ADDRESS_LEAF(SYM,reg,tmp) SPARC_LOAD_ADDRESS(SYM,reg)

#endif
Expand Down

0 comments on commit 1efd583

Please sign in to comment.