Add ghash-alpha.pl assembler module.
author Andy Polyakov <appro@openssl.org>
Sat, 10 Apr 2010 13:44:20 +0000 (13:44 +0000)
committer Andy Polyakov <appro@openssl.org>
Sat, 10 Apr 2010 13:44:20 +0000 (13:44 +0000)
crypto/modes/asm/ghash-alpha.pl [new file with mode: 0644]

diff --git a/crypto/modes/asm/ghash-alpha.pl b/crypto/modes/asm/ghash-alpha.pl
new file mode 100644 (file)
index 0000000..d75dc78
--- /dev/null
@@ -0,0 +1,453 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# March 2010
+#
+# The module implements the "4-bit" GCM GHASH function and the
+# underlying single multiplication operation in GF(2^128). "4-bit"
+# means that it uses a 256-byte per-key table [plus a 128-byte shared
+# table]. Even though the loops are aggressively modulo-scheduled with
+# respect to Htbl references and Z.hi updates, targeting 8 cycles per
+# byte, measured performance is ~12 cycles per processed byte on a
+# 21264 CPU. This appears to be a dynamic scheduling "glitch," because
+# uprofile(1) shows a uniform sample distribution, as if every
+# instruction bundle took 1.5 cycles. In other words the code could
+# have been even faster; still, 12 cycles is ~60% better than
+# gcc-generated code and ~80% better than code generated by the vendor
+# compiler.
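+#
+# For reference, each per-nibble step below corresponds roughly to the
+# following C fragment (cf. the generic 4-bit code in
+# crypto/modes/gcm128.c; the names are illustrative, not actual
+# source):
+#
+#	rem  = Z.lo & 0xf;
+#	Z.lo = (Z.hi<<60)|(Z.lo>>4);
+#	Z.hi = (Z.hi>>4) ^ rem_4bit[rem];
+#	Z.hi ^= Htable[nibble].hi;
+#	Z.lo ^= Htable[nibble].lo;
+#
+# i.e. Z is shifted right by one nibble, the four bits shifted out are
+# reduced through the shared rem_4bit table, and the per-key Htable
+# entry selected by the current nibble of Xi is folded in.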
+
+$cnt="v0";     # $0
+$t0="t0";
+$t1="t1";
+$t2="t2";
+$Thi0="t3";    # $4
+$Tlo0="t4";
+$Thi1="t5";
+$Tlo1="t6";
+$rem="t7";     # $8
+#################
+$Xi="a0";      # $16
+$Htbl="a1";
+
+
+$nlo="a4";     # $20
+$nhi="a5";
+$Zhi="t8";
+$Zlo="t9";
+$Xhi="t10";    # $24
+$Xlo="t11";
+$remp="t12";
+$rem_4bit="AT";        # $28
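+# Note that $rem_4bit is mapped to AT, the assembler temporary; this is
+# legitimate only because the code below is assembled under ".set noat",
+# so the assembler never claims $28 for synthesized sequences.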
+
+{ my $N;
+  sub loop() {
+
+       $N++;
+$code.=<<___;
+.align 4
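+	# Each Htbl entry is 16 bytes (hi/lo quadword pair), so a nibble
+	# indexes the table scaled by 16: the high nibble of a byte is
+	# already nibble*16, and the low nibble is shifted left by 4.
+	# The prologue fetches the entries for both nibbles of byte 7 of
+	# $Xlo before entering the modulo-scheduled loop.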
+       extbl   $Xlo,7,$nlo
+       and     $nlo,0xf0,$nhi
+       sll     $nlo,4,$nlo
+       and     $nlo,0xf0,$nlo
+
+       addq    $nlo,$Htbl,$nlo
+       ldq     $Zlo,8($nlo)
+       addq    $nhi,$Htbl,$nhi
+       ldq     $Zhi,0($nlo)
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       lda     $cnt,6(zero)
+       extbl   $Xlo,6,$nlo
+
+       ldq     $Tlo1,8($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+       ldq     $Thi1,0($nhi)
+       srl     $Zlo,4,$Zlo
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       and     $nlo,0xf0,$nhi
+
+       xor     $Tlo1,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+       xor     $Thi1,$Zhi,$Zhi
+       and     $nlo,0xf0,$nlo
+
+       addq    $nlo,$Htbl,$nlo
+       ldq     $Tlo0,8($nlo)
+       addq    $nhi,$Htbl,$nhi
+       ldq     $Thi0,0($nlo)
+
+.Looplo$N:
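+	# Main loop over the remaining bytes of $Xlo: each pass folds in
+	# the pair of table entries fetched for the previous byte while
+	# loading the entries for the next one (modulo scheduling).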
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       subq    $cnt,1,$cnt
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xlo,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       bne     $cnt,.Looplo$N
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       lda     $cnt,7(zero)
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xhi,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       unop
+
+
+.Loophi$N:
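+	# Same modulo-scheduled pattern for the bytes of $Xhi.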
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       subq    $cnt,1,$cnt
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       extbl   $Xhi,$cnt,$nlo
+
+       and     $nlo,0xf0,$nhi
+       xor     $Thi0,$Zhi,$Zhi
+       xor     $Tlo0,$Zlo,$Zlo
+       sll     $nlo,4,$nlo
+
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       and     $nlo,0xf0,$nlo
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+       addq    $nlo,$Htbl,$nlo
+       addq    $nhi,$Htbl,$nhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       ldq     $Tlo0,8($nlo)
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       ldq     $Thi0,0($nlo)
+       bne     $cnt,.Loophi$N
+
+
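+	# Tail: fold in the last pair of table entries and retire the
+	# outstanding 4-bit reductions; nothing more is fetched from Xi.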
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       srl     $Zlo,4,$Zlo
+
+       ldq     $Tlo1,8($nhi)
+       xor     $rem,$Zhi,$Zhi
+       ldq     $Thi1,0($nhi)
+       s8addq  $remp,$rem_4bit,$remp
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $t0,$Zlo,$Zlo
+
+       xor     $Tlo0,$Zlo,$Zlo
+       xor     $Thi0,$Zhi,$Zhi
+
+       and     $Zlo,0x0f,$remp
+       sll     $Zhi,60,$t0
+       srl     $Zlo,4,$Zlo
+
+       s8addq  $remp,$rem_4bit,$remp
+       xor     $rem,$Zhi,$Zhi
+
+       ldq     $rem,0($remp)
+       srl     $Zhi,4,$Zhi
+       xor     $Tlo1,$Zlo,$Zlo
+       xor     $Thi1,$Zhi,$Zhi
+       xor     $t0,$Zlo,$Zlo
+       xor     $rem,$Zhi,$Zhi
+___
+}}
+
+$code=<<___;
+#include <asm.h>
+#include <regdef.h>
+
+.text
+
+.set   noat
+.set   noreorder
+.globl gcm_gmult_4bit
+.align 4
+.ent   gcm_gmult_4bit
+gcm_gmult_4bit:
+       .frame  sp,0,ra
+       .prologue 0
+
+       ldq     $Xlo,8($Xi)
+       ldq     $Xhi,0($Xi)
+
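+	# Materialize &rem_4bit PC-relatively: br deposits the address of
+	# .Lpic1 into $rem_4bit, and lda then adds the link-time constant
+	# rem_4bit-.Lpic1, avoiding a GOT access.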
+       br      $rem_4bit,.Lpic1
+.Lpic1:        lda     $rem_4bit,rem_4bit-.Lpic1($rem_4bit)
+___
+
+       &loop();
+
+$code.=<<___;
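+	# Convert Z to big-endian Xi: the shift-by-8/24 copies are picked
+	# apart by the zapnot masks 0x11/0x22/0x44/0x88, byte-swapping
+	# each 32-bit half, and the srl/sll-by-32 pair then exchanges the
+	# two halves.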
+       srl     $Zlo,24,$t0     # byte swap
+       srl     $Zlo,8,$t1
+
+       sll     $Zlo,8,$t2
+       sll     $Zlo,24,$Zlo
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+
+       zapnot  $Zlo,0x88,$Zlo
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zlo,$t0,$Zlo
+       srl     $Zhi,24,$t0
+       srl     $Zhi,8,$t1
+
+       or      $Zlo,$t2,$Zlo
+       sll     $Zhi,8,$t2
+       sll     $Zhi,24,$Zhi
+
+       srl     $Zlo,32,$Xlo
+       sll     $Zlo,32,$Zlo
+
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+       stq     $Xlo,8($Xi)
+       stq     $Xhi,0($Xi)
+
+       ret     (ra)
+.end   gcm_gmult_4bit
+___
+
+# argument block for gcm_ghash_4bit
+$inp="a0";     # $16
+$len="a1";
+$Xi ="a2";
+$Htbl="a3";
+
+$inhi="s0";
+$inlo="s1";
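+# $inhi/$inlo live in s0/s1, which are callee-saved, hence the stack
+# frame and the save/restore in gcm_ghash_4bit below.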
+
+$code.=<<___;
+.globl gcm_ghash_4bit
+.align 4
+.ent   gcm_ghash_4bit
+gcm_ghash_4bit:
+       lda     sp,-32(sp)
+       stq     ra,0(sp)
+       stq     s0,8(sp)
+       stq     s1,16(sp)
+       .mask   0x04000600,-32
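+	# 0x04000600 = (1<<26)|(1<<10)|(1<<9): ra, s1 and s0 are saved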
+       .frame  sp,32,ra
+       .prologue 0
+
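+	# The input may be arbitrarily aligned: ldq_u fetches the aligned
+	# quadwords overlapping bytes 0-7 and 8-15 of the block, which
+	# are merged at .Louter with extql/extqh.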
+       ldq_u   $inhi,0($inp)
+       ldq_u   $Thi0,7($inp)
+       ldq_u   $inlo,8($inp)
+       ldq_u   $Tlo0,15($inp)
+       ldq     $Xhi,0($Xi)
+       ldq     $Xlo,8($Xi)
+
+       br      $rem_4bit,.Lpic2
+.Lpic2:        lda     $rem_4bit,rem_4bit-.Lpic2($rem_4bit)
+
+.Louter:
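+	# Merge each unaligned quadword pair and xor the 16-byte block
+	# into Xi.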
+       extql   $inhi,$inp,$inhi
+       extqh   $Thi0,$inp,$Thi0
+       or      $inhi,$Thi0,$inhi
+       lda     $inp,16($inp)
+
+       extql   $inlo,$inp,$inlo
+       extqh   $Tlo0,$inp,$Tlo0
+       or      $inlo,$Tlo0,$inlo
+       subq    $len,16,$len
+
+       xor     $Xlo,$inlo,$Xlo
+       xor     $Xhi,$inhi,$Xhi
+___
+
+       &loop();
+
+$code.=<<___;
+       srl     $Zlo,24,$t0     # byte swap
+       srl     $Zlo,8,$t1
+
+       sll     $Zlo,8,$t2
+       sll     $Zlo,24,$Zlo
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+
+       zapnot  $Zlo,0x88,$Zlo
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zlo,$t0,$Zlo
+       srl     $Zhi,24,$t0
+       srl     $Zhi,8,$t1
+
+       or      $Zlo,$t2,$Zlo
+       sll     $Zhi,8,$t2
+       sll     $Zhi,24,$Zhi
+
+       srl     $Zlo,32,$Xlo
+       sll     $Zlo,32,$Zlo
+       beq     $len,.Ldone
+
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+       ldq_u   $inhi,0($inp)
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+       ldq_u   $Thi0,7($inp)
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+       ldq_u   $inlo,8($inp)
+       ldq_u   $Tlo0,15($inp)
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+       br      zero,.Louter
+
+.Ldone:
+       zapnot  $t0,0x11,$t0
+       zapnot  $t1,0x22,$t1
+       or      $Zlo,$Xlo,$Xlo
+
+       zapnot  $Zhi,0x88,$Zhi
+       or      $t0,$t1,$t0
+       zapnot  $t2,0x44,$t2
+
+       or      $Zhi,$t0,$Zhi
+       or      $Zhi,$t2,$Zhi
+
+       srl     $Zhi,32,$Xhi
+       sll     $Zhi,32,$Zhi
+
+       or      $Zhi,$Xhi,$Xhi
+
+       stq     $Xlo,8($Xi)
+       stq     $Xhi,0($Xi)
+
+       .set    noreorder
+       /*ldq   ra,0(sp)*/
+       ldq     s0,8(sp)
+       ldq     s1,16(sp)
+       lda     sp,32(sp)
+       ret     (ra)
+.end   gcm_ghash_4bit
+
+.align 4
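+	# rem_4bit[i] is the carry-less product i*0x1C20 held in the top
+	# 16 bits: the reduction that re-enters Z.hi for the four bits
+	# shifted out of Z.lo on each nibble step.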
+rem_4bit:
+       .quad   0x0000<<48, 0x1C20<<48, 0x3840<<48, 0x2460<<48
+       .quad   0x7080<<48, 0x6CA0<<48, 0x48C0<<48, 0x54E0<<48
+       .quad   0xE100<<48, 0xFD20<<48, 0xD940<<48, 0xC560<<48
+       .quad   0x9180<<48, 0x8DA0<<48, 0xA9C0<<48, 0xB5E0<<48
+.asciiz "GHASH for Alpha, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
+
+___
+$output=shift and open STDOUT,">$output";
+print $code;
+close STDOUT;
+