Alpha assembler pack update from HEAD.
author    Andy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:56:15 +0000 (20:56 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 14 Nov 2011 20:56:15 +0000 (20:56 +0000)
crypto/sha/asm/sha1-alpha.pl [new file with mode: 0644]

diff --git a/crypto/sha/asm/sha1-alpha.pl b/crypto/sha/asm/sha1-alpha.pl
new file mode 100644 (file)
index 0000000..6c4b925
--- /dev/null
@@ -0,0 +1,322 @@
+#!/usr/bin/env perl
+
+# ====================================================================
+# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+
+# SHA1 block procedure for Alpha.
+
+# On 21264 performance is 33% better than code generated by vendor
+# compiler, and 75% better than GCC [3.4], and in absolute terms is
+# 8.7 cycles per processed byte. Implementation features vectorized
+# byte swap, but not Xupdate.
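+#
+# A note on structure: the three BODY_* subroutines below implement the
+# standard SHA-1 round groups. Rounds 0-19 use Ch(b,c,d)=(b&c)|(~b&d),
+# rounds 20-39 and 60-79 use Parity(b,c,d)=b^c^d, and rounds 40-59 use
+# Maj(b,c,d)=(b&c)|(b&d)|(c&d). Every round computes
+# e += rol32(a,5) + F(b,c,d) + X[i%16] + K and rotates b left by 30.
+# The "forward Xupdate" variants interleave preparation of the next
+# schedule word, X[j] = rol32(X[j-3]^X[j-8]^X[j-14]^X[j-16],1), with
+# the current round, and the byte swap converts two big-endian 32-bit
+# words at a time within one 64-bit register using srl/sll and zapnot
+# byte masks.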
+
+@X=(   "\$0",  "\$1",  "\$2",  "\$3",  "\$4",  "\$5",  "\$6",  "\$7",
+       "\$8",  "\$9",  "\$10", "\$11", "\$12", "\$13", "\$14", "\$15");
+$ctx="a0";     # $16
+$inp="a1";
+$num="a2";
+$A="a3";
+$B="a4";       # 20
+$C="a5";
+$D="t8";
+$E="t9";       @V=($A,$B,$C,$D,$E);
+$t0="t10";     # 24
+$t1="t11";
+$t2="ra";
+$t3="t12";
+$K="AT";       # 28
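+# Note that @X occupies integer registers $0..$15, which includes the
+# callee-saved s0-s5 and fp; hence the save/restore sequence in the
+# prologue and epilogue of sha1_block_data_order below.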
+
+sub BODY_00_19 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i==0);
+       ldq_u   @X[0],0+0($inp)
+       ldq_u   @X[1],0+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<14);
+       ldq_u   @X[$i+2],($i+2)*4+0($inp)
+       ldq_u   @X[$i+3],($i+2)*4+7($inp)
+___
+$code.=<<___ if (!($i&1) && $i<15);
+       extql   @X[$i],$inp,@X[$i]
+       extqh   @X[$i+1],$inp,@X[$i+1]
+
+	or	@X[$i+1],@X[$i],@X[$i]	# pair of 32-bit values is fetched
+
+       srl     @X[$i],24,$t0           # vectorized byte swap
+       srl     @X[$i],8,$t2
+
+       sll     @X[$i],8,$t3
+       sll     @X[$i],24,@X[$i]
+       zapnot  $t0,0x11,$t0
+       zapnot  $t2,0x22,$t2
+
+       zapnot  @X[$i],0x88,@X[$i]
+       or      $t0,$t2,$t0
+       zapnot  $t3,0x44,$t3
+       sll     $a,5,$t1
+
+       or      @X[$i],$t0,@X[$i]
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       zapnot  $a,0xf,$a
+
+       or      @X[$i],$t3,@X[$i]
+       srl     $a,27,$t0
+       bic     $d,$b,$t3
+       sll     $b,30,$b
+
+       extll   @X[$i],4,@X[$i+1]       # extract upper half
+       or      $t2,$t3,$t2
+       addl    @X[$i],$e,$e
+
+       addl    $t1,$e,$e
+       srl     $b,32,$t3
+       zapnot  @X[$i],0xf,@X[$i]
+
+       addl    $t0,$e,$e
+       addl    $t2,$e,$e
+       or      $t3,$b,$b
+___
+$code.=<<___ if (($i&1) && $i<15);
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       zapnot  $a,0xf,$a
+
+       srl     $a,27,$t0
+       addl    @X[$i%16],$e,$e
+       bic     $d,$b,$t3
+       sll     $b,30,$b
+
+       or      $t2,$t3,$t2
+       addl    $t1,$e,$e
+       srl     $b,32,$t3
+       zapnot  @X[$i],0xf,@X[$i]
+
+       addl    $t0,$e,$e
+       addl    $t2,$e,$e
+       or      $t3,$b,$b
+___
+$code.=<<___ if ($i>=15);      # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       and     $b,$c,$t2
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       zapnot  $a,0xf,$a
+       addl    @X[$i%16],$e,$e
+       bic     $d,$b,$t3
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       srl     $a,27,$t0
+       addl    $t1,$e,$e
+       or      $t2,$t3,$t2
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$b
+       addl    $t0,$e,$e
+       srl     @X[$j%16],31,$t1
+
+       addl    $t2,$e,$e
+       srl     $b,32,$t3
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+       or      $t1,@X[$j%16],@X[$j%16]
+___
+}
+
+sub BODY_20_39 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___ if ($i<79);       # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$t3
+       addl    $t1,$e,$e
+       xor     $b,$c,$t2
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       srl     $b,2,$b
+       addl    @X[$i%16],$e,$e
+       xor     $d,$t2,$t2
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       srl     @X[$j%16],31,$t1
+       addl    $t2,$e,$e
+       srl     $a,27,$t0
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       addl    $t0,$e,$e
+       or      $t1,@X[$j%16],@X[$j%16]
+___
+$code.=<<___ if ($i<77);
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+___
+$code.=<<___ if ($i==79);      # with context fetch
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       ldl     @X[0],0($ctx)
+
+       sll     $b,30,$t3
+       addl    $t1,$e,$e
+       xor     $b,$c,$t2
+       ldl     @X[1],4($ctx)
+
+       srl     $b,2,$b
+       addl    @X[$i%16],$e,$e
+       xor     $d,$t2,$t2
+       ldl     @X[2],8($ctx)
+
+       srl     $a,27,$t0
+       addl    $t2,$e,$e
+       ldl     @X[3],12($ctx)
+
+       or      $t3,$b,$b
+       addl    $t0,$e,$e
+       ldl     @X[4],16($ctx)
+___
+}
+
+sub BODY_40_59 {
+my ($i,$a,$b,$c,$d,$e)=@_;
+my $j=$i+1;
+$code.=<<___;  # with forward Xupdate
+       sll     $a,5,$t1
+       addl    $K,$e,$e
+       zapnot  $a,0xf,$a
+       xor     @X[($j+2)%16],@X[$j%16],@X[$j%16]
+
+       srl     $a,27,$t0
+       and     $b,$c,$t2
+       and     $b,$d,$t3
+       xor     @X[($j+8)%16],@X[$j%16],@X[$j%16]
+
+       sll     $b,30,$b
+       addl    $t1,$e,$e
+       xor     @X[($j+13)%16],@X[$j%16],@X[$j%16]
+
+       srl     @X[$j%16],31,$t1
+       addl    $t0,$e,$e
+       or      $t2,$t3,$t2
+       and     $c,$d,$t3
+
+       or      $t2,$t3,$t2
+       srl     $b,32,$t3
+       addl    @X[$i%16],$e,$e
+       addl    @X[$j%16],@X[$j%16],@X[$j%16]
+
+       or      $t3,$b,$b
+       addl    $t2,$e,$e
+       or      $t1,@X[$j%16],@X[$j%16]
+       zapnot  @X[$i%16],0xf,@X[$i%16]
+___
+}
+
+$code=<<___;
+#ifdef __linux__
+#include <asm/regdef.h>
+#else
+#include <asm.h>
+#include <regdef.h>
+#endif
+
+.text
+
+.set   noat
+.set   noreorder
+.globl sha1_block_data_order
+.align 5
+.ent   sha1_block_data_order
+sha1_block_data_order:
+       lda     sp,-64(sp)
+       stq     ra,0(sp)
+       stq     s0,8(sp)
+       stq     s1,16(sp)
+       stq     s2,24(sp)
+       stq     s3,32(sp)
+       stq     s4,40(sp)
+       stq     s5,48(sp)
+       stq     fp,56(sp)
+       .mask   0x0400fe00,-64
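+	# 0x0400fe00 marks s0-s5, fp ($9-$15) and ra ($26) as saved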
+       .frame  sp,64,ra
+       .prologue 0
+
+       ldl     $A,0($ctx)
+       ldl     $B,4($ctx)
+       sll     $num,6,$num
+       ldl     $C,8($ctx)
+       ldl     $D,12($ctx)
+       ldl     $E,16($ctx)
+       addq    $inp,$num,$num
+
+.Lloop:
+       .set    noreorder
+       ldah    $K,23170(zero)
+       zapnot  $B,0xf,$B
+       lda     $K,31129($K)    # K_00_19
+___
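+# Each ldah/lda pair composes a 32-bit SHA-1 round constant, e.g.
+# 23170<<16+31129 = 0x5a827999 for K_00_19; the pairs below yield
+# 0x6ed9eba1, 0x8f1bbcdc and 0xca62c1d6 (modulo 2^32).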
+for ($i=0;$i<20;$i++) { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,28378(zero)
+       lda     $K,-5215($K)    # K_20_39
+___
+for (;$i<40;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,-28900(zero)
+       lda     $K,-17188($K)   # K_40_59
+___
+for (;$i<60;$i++) { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       ldah    $K,-13725(zero)
+       lda     $K,-15914($K)   # K_60_79
+___
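+# rounds 60-79 reuse the Parity logic of BODY_20_39, now with K_60_79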
+for (;$i<80;$i++) { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
+
+$code.=<<___;
+       addl    @X[0],$A,$A
+       addl    @X[1],$B,$B
+       addl    @X[2],$C,$C
+       addl    @X[3],$D,$D
+       addl    @X[4],$E,$E
+       stl     $A,0($ctx)
+       stl     $B,4($ctx)
+       addq    $inp,64,$inp
+       stl     $C,8($ctx)
+       stl     $D,12($ctx)
+       stl     $E,16($ctx)
+       cmpult  $inp,$num,$t1
+       bne     $t1,.Lloop
+
+       .set    noreorder
+       ldq     ra,0(sp)
+       ldq     s0,8(sp)
+       ldq     s1,16(sp)
+       ldq     s2,24(sp)
+       ldq     s3,32(sp)
+       ldq     s4,40(sp)
+       ldq     s5,48(sp)
+       ldq     fp,56(sp)
+       lda     sp,64(sp)
+       ret     (ra)
+.end   sha1_block_data_order
+.ascii "SHA1 block transform for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 2
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;