modes/asm/ghashv8-armx.pl: implement 4x aggregate factor.

[openssl.git] / crypto / modes / asm / ghashv8-armx.pl
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl

index 7187d28b78eafe3c4937e2585a7815531397c97e..ef7c74798de562613d2ee7180c5dcab9e62ab452 100644 (file)
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -17,23 +17,30 @@
  # GHASH for ARMv8 Crypto Extension, 64-bit polynomial multiplication.
  #
  # June 2014
-# Initial version was developed in tight cooperation with Ard Biesheuvel
-# of Linaro from bits-n-pieces from other assembly modules. Just like
-# aesv8-armx.pl this module supports both AArch32 and AArch64 execution modes.
+#
+# Initial version was developed in tight cooperation with Ard
+# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
+# Just like aesv8-armx.pl this module supports both AArch32 and
+# AArch64 execution modes.
  #
  # July 2014
+#
  # Implement 2x aggregated reduction [see ghash-x86.pl for background
  # information].
  #
+# November 2017
+#
+# Use AArch64 register bank to "accommodate" 4x aggregated reduction...
+#
  # Current performance in cycles per processed byte:
  #
-#              PMULL[2]        32-bit NEON(*)
-# Apple A7     0.92            5.62
-# Cortex-A53   1.01            8.39
-# Cortex-A57   1.17            7.61
-# Denver       0.71            6.02
-# Mongoose     1.10            8.06
-# Kryo         1.16            8.00
+#              64-bit PMULL    32-bit PMULL    32-bit NEON(*)
+# Apple A7                     0.92            5.62
+# Cortex-A53                   1.01            8.39
+# Cortex-A57                   1.17            7.61
+# Denver                       0.71            6.02
+# Mongoose                     1.10            8.06
+# Kryo                         1.16            8.00
  #
  # (*)  presented for reference/comparison purposes;
  
@@ -128,8 +135,56 @@ gcm_init_v8:
         vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
         veor            $t1,$t1,$H2
         vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
-       vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]
+       vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
+___
+if ($flavour =~ /64/) {
+my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
+
+$code.=<<___;
+       @ calculate H^3 and H^4
+       vpmull.p64      $Xl,$H, $H2
+        vpmull.p64     $Yl,$H2,$H2
+       vpmull2.p64     $Xh,$H, $H2
+        vpmull2.p64    $Yh,$H2,$H2
+       vpmull.p64      $Xm,$t0,$t1
+        vpmull.p64     $Ym,$t1,$t1
+
+       vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
+        vext.8         $t1,$Yl,$Yh,#8
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t0
+        veor           $t3,$Yl,$Yh
+        veor           $Ym,$Ym,$t1
+       veor            $Xm,$Xm,$t2
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
+        veor           $Ym,$Ym,$t3
+        vpmull.p64     $t3,$Yl,$xC2
  
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+        vmov           $Yh#lo,$Ym#hi
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+        vmov           $Ym#hi,$Yl#lo
+       veor            $Xl,$Xm,$t2
+        veor           $Yl,$Ym,$t3
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
+        vext.8         $t3,$Yl,$Yl,#8
+       vpmull.p64      $Xl,$Xl,$xC2
+        vpmull.p64     $Yl,$Yl,$xC2
+       veor            $t2,$t2,$Xh
+        veor           $t3,$t3,$Yh
+       veor            $H, $Xl,$t2             @ H^3
+        veor           $H2,$Yl,$t3             @ H^4
+
+       vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
+        vext.8         $t1,$H2,$H2,#8
+       veor            $t0,$t0,$H
+        veor           $t1,$t1,$H2
+       vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
+       vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
+___
+}
+$code.=<<___;
         ret
  .size  gcm_init_v8,.-gcm_init_v8
  ___
@@ -198,6 +253,11 @@ $code.=<<___;
  .align 4
  gcm_ghash_v8:
  ___
+$code.=<<___   if ($flavour =~ /64/);
+       bic             $inc,$len,#63
+       cmp             $len,$inc
+       b.eq            .Lgcm_ghash_v8_4x
+___
  $code.=<<___           if ($flavour !~ /64/);
         vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
  ___
@@ -345,7 +405,105 @@ $code.=<<___;
         ret
  .size  gcm_ghash_v8,.-gcm_ghash_v8
  ___
+
+if ($flavour =~ /64/) {                                # 4x subroutine
+my ($I0,$j1,$j2,$j3,
+    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
+
+$code.=<<___;
+.type  gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+       vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
+       vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
+       vmov.i8         $xC2,#0xe1
+       vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
+       vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+       b               .Loop4x
+
+.align 4
+.Loop4x:
+       vld1.64         {$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+       vrev64.8        $j1,$j1
+       vrev64.8        $j2,$j2
+       vrev64.8        $j3,$j3
+       vrev64.8        $I0,$I0
+#endif
+       vext.8          $I3,$j3,$j3,#8
+       vext.8          $I2,$j2,$j2,#8
+       vext.8          $I1,$j1,$j1,#8
+
+       vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
+       veor            $j3,$j3,$I3
+       vpmull2.p64     $Yh,$H,$I3
+       vpmull.p64      $Ym,$Hhl,$j3
+
+       vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
+       veor            $j2,$j2,$I2
+       vpmull2.p64     $I2,$H2,$I2
+       vpmull2.p64     $j2,$Hhl,$j2
+
+       veor            $Yl,$Yl,$t0
+       veor            $Yh,$Yh,$I2
+       veor            $Ym,$Ym,$j2
+
+       vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
+       veor            $j1,$j1,$I1
+       vpmull2.p64     $I1,$H3,$I1
+       vpmull.p64      $j1,$H34,$j1
+
+       veor            $Yl,$Yl,$j3
+       veor            $Yh,$Yh,$I1
+       veor            $Ym,$Ym,$j1
+
+       veor            $t0,$I0,$Xl
+       vext.8          $IN,$t0,$t0,#8
+
+       vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
+       veor            $t0,$t0,$IN
+       vpmull2.p64     $Xh,$H4,$IN
+       vpmull2.p64     $Xm,$H34,$t0
+
+       veor            $Xl,$Xl,$Yl
+       veor            $Xh,$Xh,$Yh
+       veor            $Xm,$Xm,$Ym
+
+       vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t1
+       veor            $Xm,$Xm,$t2
+
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+       veor            $Xl,$Xm,$t2
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
+       vpmull.p64      $Xl,$Xl,$xC2
+       veor            $t2,$t2,$Xh
+       veor            $Xl,$Xl,$t2
+       vext.8          $Xl,$Xl,$Xl,#8
+
+       subs            $len,$len,#64
+       b.ne            .Loop4x
+
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+       vst1.64         {$Xl},[$Xi]             @ write out Xi
+
+       ret
+.size  gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+___
+
  }
+}
+
  $code.=<<___;
  .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
  .align  2
@@ -356,7 +514,8 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
         my $arg=shift;
  
         $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
-       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+                                            $3<8?$3:$3+8,($4 eq "lo")?0:1;
      }
      foreach(split("\n",$code)) {
         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or