modes/asm/ghashv8-armx.pl: implement 4x aggregate factor.
author Andy Polyakov <appro@openssl.org>
Fri, 1 Dec 2017 10:59:18 +0000 (11:59 +0100)
committer Andy Polyakov <appro@openssl.org>
Mon, 4 Dec 2017 16:20:25 +0000 (17:20 +0100)
This initial commit is an unoptimized reference version that handles
only input lengths divisible by 4 blocks.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/4830)

crypto/modes/asm/ghashv8-armx.pl

index 7187d28..ef7c747 100644 (file)
 # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
 #
 # June 2014
-# Initial version was developed in tight cooperation with Ard Biesheuvel
-# of Linaro from bits-n-pieces from other assembly modules. Just like
-# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
+# Just like aesv8-armx.pl this module supports both AArch32 and
+# AArch64 execution modes.
 #
 # July 2014
+#
 # Implement 2x aggregated reduction [see ghash-x86.pl for background
 # information].
 #
+# November 2017
+#
+# AArch64 register bank to "accommodate" 4x aggregated reduction...
+#
 # Current performance in cycles per processed byte:
 #
-#              PMULL[2]        32-bit NEON(*)
-# Apple A7     0.92            5.62
-# Cortex-A53   1.01            8.39
-# Cortex-A57   1.17            7.61
-# Denver       0.71            6.02
-# Mongoose     1.10            8.06
-# Kryo         1.16            8.00
+#              64-bit PMULL    32-bit PMULL    32-bit NEON(*)
+# Apple A7                     0.92            5.62
+# Cortex-A53                   1.01            8.39
+# Cortex-A57                   1.17            7.61
+# Denver                       0.71            6.02
+# Mongoose                     1.10            8.06
+# Kryo                         1.16            8.00
 #
 # (*)  presented for reference/comparison purposes;
 
@@ -128,8 +135,56 @@ gcm_init_v8:
        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
-       vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]
+       vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
+___
+if ($flavour =~ /64/) {                         # 64-bit flavours only: also compute H^3 and H^4 and append them to Htable
+my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));       # extra q4-q7 temporaries: the Y* stream (H^4) is interleaved with the X* stream (H^3)
+
+$code.=<<___;
+       @ calculate H^3 and H^4
+       vpmull.p64      $Xl,$H, $H2
+        vpmull.p64     $Yl,$H2,$H2
+       vpmull2.p64     $Xh,$H, $H2
+        vpmull2.p64    $Yh,$H2,$H2
+       vpmull.p64      $Xm,$t0,$t1
+        vpmull.p64     $Ym,$t1,$t1
+
+       vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
+        vext.8         $t1,$Yl,$Yh,#8
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t0
+        veor           $t3,$Yl,$Yh
+        veor           $Ym,$Ym,$t1
+       veor            $Xm,$Xm,$t2
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
+        veor           $Ym,$Ym,$t3
+        vpmull.p64     $t3,$Yl,$xC2
 
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+        vmov           $Yh#lo,$Ym#hi
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+        vmov           $Ym#hi,$Yl#lo
+       veor            $Xl,$Xm,$t2
+        veor           $Yl,$Ym,$t3
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
+        vext.8         $t3,$Yl,$Yl,#8
+       vpmull.p64      $Xl,$Xl,$xC2
+        vpmull.p64     $Yl,$Yl,$xC2
+       veor            $t2,$t2,$Xh
+        veor           $t3,$t3,$Yh
+       veor            $H, $Xl,$t2             @ H^3
+        veor           $H2,$Yl,$t3             @ H^4
+
+       vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
+        vext.8         $t1,$H2,$H2,#8
+       veor            $t0,$t0,$H
+        veor           $t1,$t1,$H2
+       vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
+       vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
+___
+}
+$code.=<<___;
        ret
 .size  gcm_init_v8,.-gcm_init_v8
 ___
@@ -198,6 +253,11 @@ $code.=<<___;
 .align 4
 gcm_ghash_v8:
 ___
+$code.=<<___   if ($flavour =~ /64/);           # 64-bit flavours only: take the 4x path when len has no bits below 64 set
+       bic             $inc,$len,#63
+       cmp             $len,$inc
+       b.eq            .Lgcm_ghash_v8_4x
+___
 $code.=<<___           if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
 ___
@@ -345,7 +405,105 @@ $code.=<<___;
        ret
 .size  gcm_ghash_v8,.-gcm_ghash_v8
 ___
+
+if ($flavour =~ /64/) {                                # 4x subroutine: 64-bit flavours only, consumes 64 bytes of input per .Loop4x iteration
+my ($I0,$j1,$j2,$j3,                                   # q4-q7: the four input blocks loaded on each iteration
+    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));  # q15-q23: rotated inputs, H^3..H^4 table entries, Y accumulators
+
+$code.=<<___;
+.type  gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+       vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
+       vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
+       vmov.i8         $xC2,#0xe1
+       vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
+       vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+       b               .Loop4x
+
+.align 4
+.Loop4x:
+       vld1.64         {$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+       vrev64.8        $j1,$j1
+       vrev64.8        $j2,$j2
+       vrev64.8        $j3,$j3
+       vrev64.8        $I0,$I0
+#endif
+       vext.8          $I3,$j3,$j3,#8
+       vext.8          $I2,$j2,$j2,#8
+       vext.8          $I1,$j1,$j1,#8
+
+       vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
+       veor            $j3,$j3,$I3
+       vpmull2.p64     $Yh,$H,$I3
+       vpmull.p64      $Ym,$Hhl,$j3
+
+       vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
+       veor            $j2,$j2,$I2
+       vpmull2.p64     $I2,$H2,$I2
+       vpmull2.p64     $j2,$Hhl,$j2
+
+       veor            $Yl,$Yl,$t0
+       veor            $Yh,$Yh,$I2
+       veor            $Ym,$Ym,$j2
+
+       vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
+       veor            $j1,$j1,$I1
+       vpmull2.p64     $I1,$H3,$I1
+       vpmull.p64      $j1,$H34,$j1
+
+       veor            $Yl,$Yl,$j3
+       veor            $Yh,$Yh,$I1
+       veor            $Ym,$Ym,$j1
+
+       veor            $t0,$I0,$Xl
+       vext.8          $IN,$t0,$t0,#8
+
+       vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
+       veor            $t0,$t0,$IN
+       vpmull2.p64     $Xh,$H4,$IN
+       vpmull2.p64     $Xm,$H34,$t0
+
+       veor            $Xl,$Xl,$Yl
+       veor            $Xh,$Xh,$Yh
+       veor            $Xm,$Xm,$Ym
+
+       vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t1
+       veor            $Xm,$Xm,$t2
+
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+       veor            $Xl,$Xm,$t2
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
+       vpmull.p64      $Xl,$Xl,$xC2
+       veor            $t2,$t2,$Xh
+       veor            $Xl,$Xl,$t2
+       vext.8          $Xl,$Xl,$Xl,#8
+
+       subs            $len,$len,#64
+       b.ne            .Loop4x
+
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+       vst1.64         {$Xl},[$Xi]             @ write out Xi
+
+       ret
+.size  gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+___
+
 }
+}
+
 $code.=<<___;
 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align  2
@@ -356,7 +514,8 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
        my $arg=shift;
 
        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
-       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+                                            $3<8?$3:$3+8,($4 eq "lo")?0:1;
     }
     foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or