#
# SHA256-hw SHA256(*) SHA512
# Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**))
-# Cortex-A5x n/a n/a n/a
+# Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***))
+# Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***))
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.
# (**) The result is a trade-off: it's possible to improve it by
-# 10%, but at the cost of 20% loss on Cortex-A5x.
+# 10% (or by 1 cycle per round), but at the cost of 20% loss
+# on Cortex-A53 (or by 4 cycles per round).
+# (***) Super-impressive coefficients over gcc-generated code are
+# indication of some compiler "pathology", most notably code
+# generated with -mgeneral-regs-only is significanty faster
+# and the gap is only 40-90%.
$flavour=shift;
$output=shift;
-open STDOUT,">$output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
if ($output =~ /512/) {
$BITS=512;
.text
+.extern OPENSSL_armcap_P
.globl $func
.type $func,%function
.align 6
ldp $E,$F,[$ctx,#4*$SZ]
add $num,$inp,$num,lsl#`log(16*$SZ)/log(2)` // end of input
ldp $G,$H,[$ctx,#6*$SZ]
- adr $Ktbl,K$BITS
+ adr $Ktbl,.LK$BITS
stp $ctx,$num,[x29,#96]
.Loop:
.size $func,.-$func
.align 6
-.type K$BITS,%object
-K$BITS:
+.type .LK$BITS,%object
+.LK$BITS:
___
$code.=<<___ if ($SZ==8);
.quad 0x428a2f98d728ae22,0x7137449123ef65cd
.long 0 //terminator
___
$code.=<<___;
-.size K$BITS,.-K$BITS
+.size .LK$BITS,.-.LK$BITS
.align 3
.LOPENSSL_armcap_P:
.quad OPENSSL_armcap_P-.
add x29,sp,#0
ld1.32 {$ABCD,$EFGH},[$ctx]
- adr $Ktbl,K256
+ adr $Ktbl,.LK256
.Loop_hw:
ld1 {@MSG[0]-@MSG[3]},[$inp],#64
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
&&
- sprintf ".long\t0x%08x\t//%s %s",
+ sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5)|($3<<16),
$mnemonic,$arg;
}