modes/asm/ghashv8-armx.pl: modulo-schedule loop.
diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
index 55ba7798a8430f780e7226bd50277918bb0a356a..2498a1d646984a7edc223d049e7015bbea050652 100644
--- a/crypto/modes/asm/ghashv8-armx.pl
+++ b/crypto/modes/asm/ghashv8-armx.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # June 2014
 #
 # Initial version was developed in tight cooperation with Ard
-# Biesheuvel <ard.biesheuvel@linaro.org> from bits-n-pieces from
-# other assembly modules. Just like aesv8-armx.pl this module
-# supports both AArch32 and AArch64 execution modes.
+# Biesheuvel of Linaro from bits-n-pieces from other assembly modules.
+# Just like aesv8-armx.pl this module supports both AArch32 and
+# AArch64 execution modes.
 #
 # July 2014
 #
 # Implement 2x aggregated reduction [see ghash-x86.pl for background
 # information].
 #
+# November 2017
+#
+# AArch64 register bank to "accommodate" 4x aggregated reduction...
+#
 # Current performance in cycles per processed byte:
 #
-#              PMULL[2]        32-bit NEON(*)
-# Apple A7     0.92            5.62
-# Cortex-A53   1.01            8.39
-# Cortex-A57   1.17            7.61
+#              64-bit PMULL    32-bit PMULL    32-bit NEON(*)
+# Apple A7                     0.92            5.62
+# Cortex-A53                   1.01            8.39
+# Cortex-A57                   1.17            7.61
+# Denver                       0.71            6.02
+# Mongoose                     1.10            8.06
+# Kryo                         1.16            8.00
 #
 # (*)  presented for reference/comparison purposes;
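The July 2014 and November 2017 notes describe the same trick at different widths: instead of reducing after every block, several carry-less products are XOR-accumulated and one reduction is shared between them, which shortens the critical path and keeps the PMULL/PMULL2 pipes busy. A minimal Python sketch of the 2x identity, in plain polynomial basis and with illustrative helper names (clmul, gf_reduce, gf_mul are mine, not the module's; the assembly operates on a bit-reflected representation, but the algebra is the same):

    import secrets

    POLY = (1 << 128) | 0x87            # x^128 + x^7 + x^2 + x + 1

    def clmul(a, b):                    # carry-less (GF(2)[x]) multiplication
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def gf_reduce(v):                   # long-division reduction modulo POLY
        for i in range(v.bit_length() - 1, 127, -1):
            if (v >> i) & 1:
                v ^= POLY << (i - 128)
        return v

    def gf_mul(a, b):
        return gf_reduce(clmul(a, b))

    H, Xi = secrets.randbits(128), secrets.randbits(128)
    I0, I1 = secrets.randbits(128), secrets.randbits(128)

    # Block-at-a-time GHASH: one multiply-and-reduce per block.
    one_by_one = gf_mul(gf_mul(Xi ^ I0, H) ^ I1, H)

    # 2x aggregated: two independent products, a single reduction per pair.
    H2 = gf_mul(H, H)
    aggregated = gf_reduce(clmul(Xi ^ I0, H2) ^ clmul(I1, H))

    assert one_by_one == aggregated

The 4x code added by this patch applies the same identity with H^4, H^3, H^2 and H, leaving one reduction chain per 64 bytes of input.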
 
@@ -58,7 +72,11 @@ $code=<<___;
 .text
 ___
 $code.=".arch  armv8-a+crypto\n"       if ($flavour =~ /64/);
-$code.=".fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
+$code.=<<___                           if ($flavour !~ /64/);
+.fpu   neon
+.code  32
+#undef __thumb2__
+___
 
 ################################################################################
 # void gcm_init_v8(u128 Htable[16],const u64 H[2]);
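gcm_init_v8 fills this key-dependent table. Conceptually it holds consecutive powers of H together with the XOR of each power's high and low 64-bit halves, which is the operand the Karatsuba middle multiply consumes; this patch extends the table with H^3 and H^4 for the 4x path. A hedged model of the contents only, ignoring the bit-reflection ("twisting") and register packing the real code applies (helper names are mine):

    import secrets

    MASK64 = (1 << 64) - 1
    POLY = (1 << 128) | 0x87

    def clmul(a, b):
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def gf_mul(a, b):
        v = clmul(a, b)
        for i in range(v.bit_length() - 1, 127, -1):
            if (v >> i) & 1:
                v ^= POLY << (i - 128)
        return v

    def fold_halves(h):                 # the "Karatsuba pre-processing" value
        return (h >> 64) ^ (h & MASK64)

    def init_table(H):
        H2 = gf_mul(H, H)
        H3 = gf_mul(H2, H)
        H4 = gf_mul(H3, H)
        # Mirrors the store order in the assembly: H, packed folds of H/H^2,
        # H^2, then (64-bit only, added here) H^3, packed folds of H^3/H^4, H^4.
        return [H, (fold_halves(H), fold_halves(H2)), H2,
                H3, (fold_halves(H3), fold_halves(H4)), H4]

    assert len(init_table(secrets.randbits(128))) == 6

In the assembly the two folded halves of a pair of powers share one 128-bit register ($Hhl for H/H^2, the new $H34 for H^3/H^4), which is why they appear paired above.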
@@ -117,8 +135,56 @@ gcm_init_v8:
        vext.8          $t1,$H2,$H2,#8          @ Karatsuba pre-processing
        veor            $t1,$t1,$H2
        vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
-       vst1.64         {$Hhl-$H2},[x0]         @ store Htable[1..2]
+       vst1.64         {$Hhl-$H2},[x0],#32     @ store Htable[1..2]
+___
+if ($flavour =~ /64/) {
+my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
+
+$code.=<<___;
+       @ calculate H^3 and H^4
+       vpmull.p64      $Xl,$H, $H2
+        vpmull.p64     $Yl,$H2,$H2
+       vpmull2.p64     $Xh,$H, $H2
+        vpmull2.p64    $Yh,$H2,$H2
+       vpmull.p64      $Xm,$t0,$t1
+        vpmull.p64     $Ym,$t1,$t1
+
+       vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
+        vext.8         $t1,$Yl,$Yh,#8
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t0
+        veor           $t3,$Yl,$Yh
+        veor           $Ym,$Ym,$t1
+       veor            $Xm,$Xm,$t2
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
+        veor           $Ym,$Ym,$t3
+        vpmull.p64     $t3,$Yl,$xC2
+
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+        vmov           $Yh#lo,$Ym#hi
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+        vmov           $Ym#hi,$Yl#lo
+       veor            $Xl,$Xm,$t2
+        veor           $Yl,$Ym,$t3
 
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
+        vext.8         $t3,$Yl,$Yl,#8
+       vpmull.p64      $Xl,$Xl,$xC2
+        vpmull.p64     $Yl,$Yl,$xC2
+       veor            $t2,$t2,$Xh
+        veor           $t3,$t3,$Yh
+       veor            $H, $Xl,$t2             @ H^3
+        veor           $H2,$Yl,$t3             @ H^4
+
+       vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
+        vext.8         $t1,$H2,$H2,#8
+       veor            $t0,$t0,$H
+        veor           $t1,$t1,$H2
+       vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
+       vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
+___
+}
+$code.=<<___;
        ret
 .size  gcm_init_v8,.-gcm_init_v8
 ___
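Most of the block added above is the module's standard pattern applied twice in parallel to produce H^3 and H^4: three PMULLs, Karatsuba post-processing, then the two-phase reduction. The "1st phase"/"2nd phase" steps fold the 256-bit product back below 128 bits with two multiplications by a constant; in the module's bit-reflected representation that constant is composed as 0xe1 shifted left by 57, i.e. 0xc2 in the top byte of the lane. In plain polynomial basis the same two-step folding looks like the sketch below (helper names mine, cross-checked against long division):

    import secrets

    MASK128 = (1 << 128) - 1
    R = 0x87                            # x^128 = x^7 + x^2 + x + 1 (mod POLY)

    def clmul(a, b):
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def fold_reduce(v):                 # v is an up-to-256-bit product
        v = (v & MASK128) ^ clmul(v >> 128, R)   # 1st phase: overflow shrinks to 6 bits
        v = (v & MASK128) ^ clmul(v >> 128, R)   # 2nd phase: now fits in 128 bits
        return v

    def gf_reduce(v):                   # reference long division
        POLY = (1 << 128) | R
        for i in range(v.bit_length() - 1, 127, -1):
            if (v >> i) & 1:
                v ^= POLY << (i - 128)
        return v

    a, b = secrets.randbits(128), secrets.randbits(128)
    assert fold_reduce(clmul(a, b)) == gf_reduce(clmul(a, b))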
@@ -143,10 +209,10 @@ gcm_gmult_v8:
 #endif
        vext.8          $IN,$t1,$t1,#8
 
-       vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
+       vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
-       vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
-       vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+       vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
+       vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
 
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
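The three products named in the comments above are the classic Karatsuba decomposition: a full 128x128-bit carry-less multiply built from three 64x64 PMULL/PMULL2 operations plus XORs. A small sketch of the identity (clmul is an illustrative stand-in for PMULL):

    import secrets

    MASK64 = (1 << 64) - 1

    def clmul(a, b):
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    a, b = secrets.randbits(128), secrets.randbits(128)
    a0, a1 = a & MASK64, a >> 64
    b0, b1 = b & MASK64, b >> 64

    lo  = clmul(a0, b0)                       # "H.lo·Xi.lo"  (vpmull.p64)
    hi  = clmul(a1, b1)                       # "H.hi·Xi.hi"  (vpmull2.p64)
    mid = clmul(a0 ^ a1, b0 ^ b1) ^ lo ^ hi   # middle term after post-processing

    assert (hi << 128) ^ (mid << 64) ^ lo == clmul(a, b)

The key-side fold (H.lo+H.hi) is precomputed once in gcm_init_v8 and kept in $Hhl, so only the data-side fold costs an XOR per block.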
@@ -187,6 +253,11 @@ $code.=<<___;
 .align 4
 gcm_ghash_v8:
 ___
+$code.=<<___   if ($flavour =~ /64/);
+       bic             $inc,$len,#63
+       cmp             $len,$inc
+       b.eq            .Lgcm_ghash_v8_4x
+___
 $code.=<<___           if ($flavour !~ /64/);
        vstmdb          sp!,{d8-d15}            @ 32-bit ABI says so
 ___
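The three added instructions divert to the new 4x subroutine only when the length is an exact multiple of 64 bytes; anything else falls through to the existing 2x code. A one-line model of the check (function name is hypothetical):

    # bic inc,len,#63 ; cmp len,inc ; b.eq .Lgcm_ghash_v8_4x
    def takes_4x_path(length):
        return (length & ~63) == length     # true iff length % 64 == 0

    assert takes_4x_path(128) and not takes_4x_path(80)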
@@ -196,13 +267,13 @@ $code.=<<___;
                                                @ loaded value would have
                                                @ to be rotated in order to
                                                @ make it appear as in
-                                               @ alorithm specification
+                                               @ algorithm specification
        subs            $len,$len,#32           @ see if $len is 32 or larger
        mov             $inc,#16                @ $inc is used as post-
                                                @ increment for input pointer;
                                                @ as loop is modulo-scheduled
                                                @ $inc is zeroed just in time
-                                               @ to preclude oversteping
+                                               @ to preclude overstepping
                                                @ inp[len], which means that
                                                @ last block[s] are actually
                                                @ loaded twice, but last
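The comments above describe the modulo-scheduling: loads for the next pair of blocks are issued while the current pair is still being multiplied, so the input pointer runs one pair ahead. To keep that look-ahead from stepping past inp[len], the post-increment is conditionally zeroed (the cclr that follows the subs), which makes the final speculative load simply re-read the last block. A rough model of just those two instructions, not of the full load schedule (assumes lengths that are multiples of 16, as GHASH requires):

    # "subs len,len,#32 ; cclr inc,lo": the "lo" condition is the borrow from
    # the subtraction, so inc collapses to 0 as soon as fewer than 32 bytes
    # were left, freezing the input pointer on the last block.
    def step(len_before):
        len_after = len_before - 32
        inc = 16 if len_after >= 0 else 0
        return len_after, inc

    assert step(64) == (32, 16)   # plenty of data: keep post-incrementing
    assert step(16) == (-16, 0)   # odd trailing block: stop advancing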
@@ -234,7 +305,7 @@ $code.=<<___;
 #endif
        vext.8          $In,$t1,$t1,#8
        veor            $IN,$IN,$Xl             @ I[i]^=Xi
-       vpmull.p64      $Xln,$H,$In             @ H·Ii+1
+       vpmull.p64      $Xln,$H,$In             @ H·Ii+1
        veor            $t1,$t1,$In             @ Karatsuba pre-processing
        vpmull2.p64     $Xhn,$H,$In
        b               .Loop_mod2x_v8
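The vrev64.8/vext.8 pair after each load is the "rotation" the earlier comment refers to: GHASH treats every 16-byte block as a big-endian 128-bit polynomial, so on a little-endian core the loaded vector is byte-reversed within each 64-bit lane and the lanes are swapped (Xi itself is kept in the rotated form, as noted above). A sketch of that fix-up for one input block:

    import struct

    blk = bytes(range(16))

    # What a little-endian vector load yields: two LE 64-bit lanes.
    lane0, lane1 = struct.unpack("<QQ", blk)

    # vrev64.8: byte-reverse within each 64-bit lane.
    rev0 = int.from_bytes(lane0.to_bytes(8, "little"), "big")
    rev1 = int.from_bytes(lane1.to_bytes(8, "little"), "big")

    # vext.8 ...,#8: swap the lanes so the most significant half comes first.
    as_spec = (rev0 << 64) | rev1

    assert as_spec == int.from_bytes(blk, "big")   # the algorithm's view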
@@ -243,14 +314,14 @@ $code.=<<___;
 .Loop_mod2x_v8:
        vext.8          $t2,$IN,$IN,#8
        subs            $len,$len,#32           @ is there more data?
-       vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
+       vpmull.p64      $Xl,$H2,$IN             @ H^2.lo·Xi.lo
        cclr            $inc,lo                 @ is it time to zero $inc?
 
         vpmull.p64     $Xmn,$Hhl,$t1
        veor            $t2,$t2,$IN             @ Karatsuba pre-processing
-       vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
+       vpmull2.p64     $Xh,$H2,$IN             @ H^2.hi·Xi.hi
        veor            $Xl,$Xl,$Xln            @ accumulate
-       vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
+       vpmull2.p64     $Xm,$Hhl,$t2            @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi)
         vld1.64        {$t0},[$inp],$inc       @ load [rotated] I[i+2]
 
        veor            $Xh,$Xh,$Xhn
@@ -275,7 +346,7 @@ $code.=<<___;
         vext.8         $In,$t1,$t1,#8
         vext.8         $IN,$t0,$t0,#8
        veor            $Xl,$Xm,$t2
-        vpmull.p64     $Xln,$H,$In             @ H·Ii+1
+        vpmull.p64     $Xln,$H,$In             @ H·Ii+1
        veor            $IN,$IN,$Xh             @ accumulate $IN early
 
        vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
@@ -299,10 +370,10 @@ $code.=<<___;
        veor            $IN,$IN,$Xl             @ inp^=Xi
        veor            $t1,$t0,$t2             @ $t1 is rotated inp^Xi
 
-       vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
+       vpmull.p64      $Xl,$H,$IN              @ H.lo·Xi.lo
        veor            $t1,$t1,$IN             @ Karatsuba pre-processing
-       vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
-       vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
+       vpmull2.p64     $Xh,$H,$IN              @ H.hi·Xi.hi
+       vpmull.p64      $Xm,$Hhl,$t1            @ (H.lo+H.hi)·(Xi.lo+Xi.hi)
 
        vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
        veor            $t2,$Xl,$Xh
@@ -334,7 +405,172 @@ $code.=<<___;
        ret
 .size  gcm_ghash_v8,.-gcm_ghash_v8
 ___
+
+if ($flavour =~ /64/) {                                # 4x subroutine
+my ($I0,$j1,$j2,$j3,
+    $I1,$I2,$I3,$H3,$H34,$H4,$Yl,$Ym,$Yh) = map("q$_",(4..7,15..23));
+
+$code.=<<___;
+.type  gcm_ghash_v8_4x,%function
+.align 4
+gcm_ghash_v8_4x:
+.Lgcm_ghash_v8_4x:
+       vld1.64         {$Xl},[$Xi]             @ load [rotated] Xi
+       vld1.64         {$H-$H2},[$Htbl],#48    @ load twisted H, ..., H^2
+       vmov.i8         $xC2,#0xe1
+       vld1.64         {$H3-$H4},[$Htbl]       @ load twisted H^3, ..., H^4
+       vshl.u64        $xC2,$xC2,#57           @ compose 0xc2.0 constant
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+
+       vld1.64         {$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+       vrev64.8        $j1,$j1
+       vrev64.8        $j2,$j2
+       vrev64.8        $j3,$j3
+       vrev64.8        $I0,$I0
+#endif
+       vext.8          $I3,$j3,$j3,#8
+       vext.8          $I2,$j2,$j2,#8
+       vext.8          $I1,$j1,$j1,#8
+
+       vpmull.p64      $Yl,$H,$I3              @ H·Ii+3
+       veor            $j3,$j3,$I3
+       vpmull2.p64     $Yh,$H,$I3
+       vpmull.p64      $Ym,$Hhl,$j3
+
+       vpmull.p64      $t0,$H2,$I2             @ H^2·Ii+2
+       veor            $j2,$j2,$I2
+       vpmull2.p64     $I2,$H2,$I2
+       vpmull2.p64     $j2,$Hhl,$j2
+
+       veor            $Yl,$Yl,$t0
+       veor            $Yh,$Yh,$I2
+       veor            $Ym,$Ym,$j2
+
+       vpmull.p64      $j3,$H3,$I1             @ H^3·Ii+1
+       veor            $j1,$j1,$I1
+       vpmull2.p64     $I1,$H3,$I1
+       vpmull.p64      $j1,$H34,$j1
+
+       veor            $Yl,$Yl,$j3
+       veor            $Yh,$Yh,$I1
+       veor            $Ym,$Ym,$j1
+
+       subs            $len,$len,#64
+       b.eq            .Ltail4x
+
+       b               .Loop4x
+
+.align 4
+.Loop4x:
+       veor            $t0,$I0,$Xl
+       vext.8          $IN,$t0,$t0,#8
+
+       vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
+       veor            $t0,$t0,$IN
+       vpmull2.p64     $Xh,$H4,$IN
+       vpmull2.p64     $Xm,$H34,$t0
+
+       veor            $Xl,$Xl,$Yl
+       veor            $Xh,$Xh,$Yh
+       veor            $Xm,$Xm,$Ym
+
+       vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t1
+       veor            $Xm,$Xm,$t2
+
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+       veor            $Xl,$Xm,$t2
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
+       vpmull.p64      $Xl,$Xl,$xC2
+       veor            $t2,$t2,$Xh
+       veor            $Xl,$Xl,$t2
+       vext.8          $Xl,$Xl,$Xl,#8
+
+        vld1.64        {$I0-$j3},[$inp],#64
+#ifndef __ARMEB__
+        vrev64.8       $j1,$j1
+        vrev64.8       $j2,$j2
+        vrev64.8       $j3,$j3
+        vrev64.8       $I0,$I0
+#endif
+        vext.8         $I3,$j3,$j3,#8
+        vext.8         $I2,$j2,$j2,#8
+        vext.8         $I1,$j1,$j1,#8
+
+        vpmull.p64     $Yl,$H,$I3              @ H·Ii+3
+        veor           $j3,$j3,$I3
+        vpmull2.p64    $Yh,$H,$I3
+        vpmull.p64     $Ym,$Hhl,$j3
+
+        vpmull.p64     $t0,$H2,$I2             @ H^2·Ii+2
+        veor           $j2,$j2,$I2
+        vpmull2.p64    $I2,$H2,$I2
+        vpmull2.p64    $j2,$Hhl,$j2
+
+        veor           $Yl,$Yl,$t0
+        veor           $Yh,$Yh,$I2
+        veor           $Ym,$Ym,$j2
+
+        vpmull.p64     $j3,$H3,$I1             @ H^3·Ii+1
+        veor           $j1,$j1,$I1
+        vpmull2.p64    $I1,$H3,$I1
+        vpmull.p64     $j1,$H34,$j1
+
+        veor           $Yl,$Yl,$j3
+        veor           $Yh,$Yh,$I1
+        veor           $Ym,$Ym,$j1
+
+       subs            $len,$len,#64
+       b.ne            .Loop4x
+
+.Ltail4x:
+       veor            $t0,$I0,$Xl
+       vext.8          $IN,$t0,$t0,#8
+
+       vpmull.p64      $Xl,$H4,$IN             @ H^4·(Xi+Ii)
+       veor            $t0,$t0,$IN
+       vpmull2.p64     $Xh,$H4,$IN
+       vpmull2.p64     $Xm,$H34,$t0
+
+       veor            $Xl,$Xl,$Yl
+       veor            $Xh,$Xh,$Yh
+       veor            $Xm,$Xm,$Ym
+
+       vext.8          $t1,$Xl,$Xh,#8          @ Karatsuba post-processing
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t1
+       veor            $Xm,$Xm,$t2
+
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase of reduction
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+       veor            $Xl,$Xm,$t2
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase of reduction
+       vpmull.p64      $Xl,$Xl,$xC2
+       veor            $t2,$t2,$Xh
+       veor            $Xl,$Xl,$t2
+       vext.8          $Xl,$Xl,$Xl,#8
+
+#ifndef __ARMEB__
+       vrev64.8        $Xl,$Xl
+#endif
+       vst1.64         {$Xl},[$Xi]             @ write out Xi
+
+       ret
+.size  gcm_ghash_v8_4x,.-gcm_ghash_v8_4x
+___
+
+}
 }
+
 $code.=<<___;
 .asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
 .align  2
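Taken together, gcm_ghash_v8_4x consumes 64 bytes per iteration: the running digest is folded into the oldest block and multiplied by H^4 while the three newer blocks are multiplied by H^3, H^2 and H, everything is XOR-accumulated, and one two-phase reduction closes the iteration; the assembly additionally modulo-schedules this so the three "Y" products for the next group are computed while the current reduction completes. A reference model of that structure in plain polynomial basis, with illustrative helper names, checked against block-at-a-time GHASH:

    import secrets

    POLY = (1 << 128) | 0x87

    def clmul(a, b):
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def gf_reduce(v):
        for i in range(v.bit_length() - 1, 127, -1):
            if (v >> i) & 1:
                v ^= POLY << (i - 128)
        return v

    def gf_mul(a, b):
        return gf_reduce(clmul(a, b))

    def ghash_4x(Xi, H, blocks):
        assert len(blocks) % 4 == 0              # the b.eq guard in gcm_ghash_v8
        H2 = gf_mul(H, H); H3 = gf_mul(H2, H); H4 = gf_mul(H3, H)
        for i in range(0, len(blocks), 4):
            acc  = clmul(blocks[i] ^ Xi, H4)     # H^4·(Xi+Ii)
            acc ^= clmul(blocks[i + 1], H3)      # H^3·Ii+1
            acc ^= clmul(blocks[i + 2], H2)      # H^2·Ii+2
            acc ^= clmul(blocks[i + 3], H)       # H·Ii+3
            Xi = gf_reduce(acc)                  # one reduction per 64 bytes
        return Xi

    def ghash_1x(Xi, H, blocks):                 # straightforward reference
        for blk in blocks:
            Xi = gf_mul(Xi ^ blk, H)
        return Xi

    H, Xi = secrets.randbits(128), secrets.randbits(128)
    blocks = [secrets.randbits(128) for _ in range(8)]
    assert ghash_4x(Xi, H, blocks) == ghash_1x(Xi, H, blocks)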
@@ -345,7 +581,8 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
        my $arg=shift;
 
        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
-       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1,($2 eq "lo")?0:1,$3,($4 eq "lo")?0:1;
+       sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+                                            $3<8?$3:$3+8,($4 eq "lo")?0:1;
     }
     foreach(split("\n",$code)) {
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
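This hunk makes the vmov-to-ins translation apply the same register renumbering as the general q-to-v substitution a few lines below: q0-q7 keep their index, while q8 and up map to v16 and up, skipping v8-v15, which the AArch64 procedure-call standard treats as callee-saved; that is why the 64-bit flavour needs no vstmdb/vldmia spill like the one at the top of gcm_ghash_v8's 32-bit path. A small Python sketch of the mapping (function name is mine):

    import re

    def q_to_v(m):
        n = int(m.group(1))
        # q0-q7 keep their number; q8+ skip over the callee-saved v8-v15.
        return "v%d.16b" % (n if n < 8 else n + 8)

    line = "vext.8 q9,q3,q3,#8"
    assert re.sub(r"\bq([0-9]+)\b", q_to_v, line) == "vext.8 v17.16b,v3.16b,v3.16b,#8"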
@@ -360,7 +597,7 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                           # old->new style commentary
 
-       # fix up remainig legacy suffixes
+       # fix up remaining legacy suffixes
        s/\.[ui]?8(\s)/$1/o;
        s/\.[uis]?32//o and s/\.16b/\.4s/go;
        m/\.p64/o and s/\.16b/\.1q/o;           # 1st pmull argument
@@ -400,7 +637,7 @@ if ($flavour =~ /64/) {                     ######## 64-bit code
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary
 
-       # fix up remainig new-style suffixes
+       # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;
 
        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o                 or