bn/asm/armv4-gf2m.pl, modes/asm/ghash-armv4.pl: faster multiplication
author Andy Polyakov <appro@openssl.org>
Thu, 24 Apr 2014 08:16:58 +0000 (10:16 +0200)
committer Andy Polyakov <appro@openssl.org>
Thu, 24 Apr 2014 08:24:53 +0000 (10:24 +0200)
algorithm suggested in the following paper:

Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
Polynomial Multiplication on ARM Processors using the NEON Engine.

http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf

crypto/bn/asm/armv4-gf2m.pl
crypto/modes/asm/ghash-armv4.pl
crypto/modes/gcm128.c

index c52e0b75b5b6231c00847ea5451d0965d527feff..c66495040cc59d7984766f83f07c8ed391e7590f 100644 (file)
 # length, more for longer keys. Even though NEON 1x1 multiplication
 # runs in even fewer cycles, ~30, improvement is measurable only on
 # longer keys. One has to optimize code elsewhere to get NEON glow...
+#
+# April 2014
+#
+# Double bn_GF2m_mul_2x2 performance by using the algorithm from the
+# paper referenced below, which improves ECDH and ECDSA verify
+# benchmarks by 18-40%.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
 
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
-
 $code=<<___;
 #include "arm_arch.h"
 
@@ -36,31 +43,6 @@ $code=<<___;
 
 #if __ARM_ARCH__>=7
 .fpu   neon
-
-.type  mul_1x1_neon,%function
-.align 5
-mul_1x1_neon:
-       vshl.u64        `&Dlo("q1")`,d16,#8     @ q1-q3 are slided $a
-       vmull.p8        `&Q("d0")`,d16,d17      @ a·bb
-       vshl.u64        `&Dlo("q2")`,d16,#16
-       vmull.p8        q1,`&Dlo("q1")`,d17     @ a<<8·bb
-       vshl.u64        `&Dlo("q3")`,d16,#24
-       vmull.p8        q2,`&Dlo("q2")`,d17     @ a<<16·bb
-       vshr.u64        `&Dlo("q1")`,#8
-       vmull.p8        q3,`&Dlo("q3")`,d17     @ a<<24·bb
-       vshl.u64        `&Dhi("q1")`,#24
-       veor            d0,`&Dlo("q1")`
-       vshr.u64        `&Dlo("q2")`,#16
-       veor            d0,`&Dhi("q1")`
-       vshl.u64        `&Dhi("q2")`,#16
-       veor            d0,`&Dlo("q2")`
-       vshr.u64        `&Dlo("q3")`,#24
-       veor            d0,`&Dhi("q2")`
-       vshl.u64        `&Dhi("q3")`,#8
-       veor            d0,`&Dlo("q3")`
-       veor            d0,`&Dhi("q3")`
-       bx      lr
-.size  mul_1x1_neon,.-mul_1x1_neon
 #endif
 ___
 ################
@@ -159,8 +141,9 @@ ___
 # void bn_GF2m_mul_2x2(BN_ULONG *r,
 #      BN_ULONG a1,BN_ULONG a0,
 #      BN_ULONG b1,BN_ULONG b0);       # r[3..0]=a1a0·b1b0
-
-($A1,$B1,$A0,$B0,$A1B1,$A0B0)=map("d$_",(18..23));
+{
+my ($r,$t0,$t1,$t2,$t3)=map("q$_",(0..3,8..12));
+my ($a,$b,$k48,$k32,$k16)=map("d$_",(26..31));
 
 $code.=<<___;
 .global        bn_GF2m_mul_2x2
@@ -173,44 +156,58 @@ bn_GF2m_mul_2x2:
        tst     r12,#1
        beq     .Lialu
 
-       veor    $A1,$A1
-       vmov.32 $B1,r3,r3               @ two copies of b1
-       vmov.32 ${A1}[0],r1             @ a1
-
-       veor    $A0,$A0
-       vld1.32 ${B0}[],[sp,:32]        @ two copies of b0
-       vmov.32 ${A0}[0],r2             @ a0
-       mov     r12,lr
-
-       vmov    d16,$A1
-       vmov    d17,$B1
-       bl      mul_1x1_neon            @ a1·b1
-       vmov    $A1B1,d0
-
-       vmov    d16,$A0
-       vmov    d17,$B0
-       bl      mul_1x1_neon            @ a0·b0
-       vmov    $A0B0,d0
-
-       veor    d16,$A0,$A1
-       veor    d17,$B0,$B1
-       veor    $A0,$A0B0,$A1B1
-       bl      mul_1x1_neon            @ (a0+a1)·(b0+b1)
-
-       veor    d0,$A0                  @ (a0+a1)·(b0+b1)-a0·b0-a1·b1
-       vshl.u64 d1,d0,#32
-       vshr.u64 d0,d0,#32
-       veor    $A0B0,d1
-       veor    $A1B1,d0
-       vst1.32 {${A0B0}[0]},[r0,:32]!
-       vst1.32 {${A0B0}[1]},[r0,:32]!
-       vst1.32 {${A1B1}[0]},[r0,:32]!
-       vst1.32 {${A1B1}[1]},[r0,:32]
-       bx      r12
+       ldr             r12, [sp]               @ 5th argument
+       vmov.32         $a, r2, r1
+       vmov.32         $b, r12, r3
+       vmov.i64        $k48, #0x0000ffffffffffff
+       vmov.i64        $k32, #0x00000000ffffffff
+       vmov.i64        $k16, #0x000000000000ffff
+
+       vext.8          $t0#lo, $a, $a, #1      @ A1
+       vmull.p8        $t0, $t0#lo, $b         @ F = A1*B
+       vext.8          $r#lo, $b, $b, #1       @ B1
+       vmull.p8        $r, $a, $r#lo           @ E = A*B1
+       vext.8          $t1#lo, $a, $a, #2      @ A2
+       vmull.p8        $t1, $t1#lo, $b         @ H = A2*B
+       vext.8          $t3#lo, $b, $b, #2      @ B2
+       vmull.p8        $t3, $a, $t3#lo         @ G = A*B2
+       vext.8          $t2#lo, $a, $a, #3      @ A3
+       veor            $t0, $t0, $r            @ L = E + F
+       vmull.p8        $t2, $t2#lo, $b         @ J = A3*B
+       vext.8          $r#lo, $b, $b, #3       @ B3
+       veor            $t1, $t1, $t3           @ M = G + H
+       vmull.p8        $r, $a, $r#lo           @ I = A*B3
+       veor            $t0#lo, $t0#lo, $t0#hi  @ t0 = (L) (P0 + P1) << 8
+       vand            $t0#hi, $t0#hi, $k48
+       vext.8          $t3#lo, $b, $b, #4      @ B4
+       veor            $t1#lo, $t1#lo, $t1#hi  @ t1 = (M) (P2 + P3) << 16
+       vand            $t1#hi, $t1#hi, $k32
+       vmull.p8        $t3, $a, $t3#lo         @ K = A*B4
+       veor            $t2, $t2, $r            @ N = I + J
+       veor            $t0#lo, $t0#lo, $t0#hi
+       veor            $t1#lo, $t1#lo, $t1#hi
+       veor            $t2#lo, $t2#lo, $t2#hi  @ t2 = (N) (P4 + P5) << 24
+       vand            $t2#hi, $t2#hi, $k16
+       vext.8          $t0, $t0, $t0, #15
+       veor            $t3#lo, $t3#lo, $t3#hi  @ t3 = (K) (P6 + P7) << 32
+       vmov.i64        $t3#hi, #0
+       vext.8          $t1, $t1, $t1, #14
+       veor            $t2#lo, $t2#lo, $t2#hi
+       vmull.p8        $r, $a, $b              @ D = A*B
+       vext.8          $t3, $t3, $t3, #12
+       vext.8          $t2, $t2, $t2, #13
+       veor            $t0, $t0, $t1
+       veor            $t2, $t2, $t3
+       veor            $r, $r, $t0
+       veor            $r, $r, $t2
+
+       vst1.32         {$r}, [r0]
+       bx      lr
 .align 4
 .Lialu:
 #endif
 ___
+}
 $ret="r10";    # reassigned 1st argument
 $code.=<<___;
        stmdb   sp!,{r4-r10,lr}
@@ -272,7 +269,12 @@ $code.=<<___;
 .comm  OPENSSL_armcap_P,4,4
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
 close STDOUT;   # enforce flush
index d91586ee2925bb695899b17bb8a7242aa3bf9150..0b0dcc8a68f64b52fb22aa0402dc02dfd0d0e451 100644 (file)
 # Add NEON implementation featuring polynomial multiplication, i.e. no
 # lookup tables involved. On Cortex A8 it was measured to process one
 # byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to the multiplication algorithm suggested in the paper referenced
+# below and combine it with the reduction algorithm from the x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+# 
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
 
 # ====================================================================
 # Note about the "528B" variant. In the ARM case it makes less sense to
@@ -303,115 +317,158 @@ $code.=<<___;
 .size  gcm_gmult_4bit,.-gcm_gmult_4bit
 ___
 {
-my $cnt=$Htbl; # $Htbl is used once in the very beginning
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
 
-my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
-my ($Qhi, $Qlo, $Z,  $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
-
-# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
-# in Zo. Or should I say "top bit", because GHASH is specified in
-# reverse bit order? Otherwise straightforward 128-bt H by one input
-# byte multiplication and modulo-reduction, times 16.
-
-sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
-sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
-sub Q()     { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+       vext.8          $t0#lo, $a, $a, #1      @ A1
+       vmull.p8        $t0, $t0#lo, $b         @ F = A1*B
+       vext.8          $r#lo, $b, $b, #1       @ B1
+       vmull.p8        $r, $a, $r#lo           @ E = A*B1
+       vext.8          $t1#lo, $a, $a, #2      @ A2
+       vmull.p8        $t1, $t1#lo, $b         @ H = A2*B
+       vext.8          $t3#lo, $b, $b, #2      @ B2
+       vmull.p8        $t3, $a, $t3#lo         @ G = A*B2
+       vext.8          $t2#lo, $a, $a, #3      @ A3
+       veor            $t0, $t0, $r            @ L = E + F
+       vmull.p8        $t2, $t2#lo, $b         @ J = A3*B
+       vext.8          $r#lo, $b, $b, #3       @ B3
+       veor            $t1, $t1, $t3           @ M = G + H
+       vmull.p8        $r, $a, $r#lo           @ I = A*B3
+       veor            $t0#lo, $t0#lo, $t0#hi  @ t0 = (L) (P0 + P1) << 8
+       vand            $t0#hi, $t0#hi, $k48
+       vext.8          $t3#lo, $b, $b, #4      @ B4
+       veor            $t1#lo, $t1#lo, $t1#hi  @ t1 = (M) (P2 + P3) << 16
+       vand            $t1#hi, $t1#hi, $k32
+       vmull.p8        $t3, $a, $t3#lo         @ K = A*B4
+       veor            $t2, $t2, $r            @ N = I + J
+       veor            $t0#lo, $t0#lo, $t0#hi
+       veor            $t1#lo, $t1#lo, $t1#hi
+       veor            $t2#lo, $t2#lo, $t2#hi  @ t2 = (N) (P4 + P5) << 24
+       vand            $t2#hi, $t2#hi, $k16
+       vext.8          $t0, $t0, $t0, #15
+       veor            $t3#lo, $t3#lo, $t3#hi  @ t3 = (K) (P6 + P7) << 32
+       vmov.i64        $t3#hi, #0
+       vext.8          $t1, $t1, $t1, #14
+       veor            $t2#lo, $t2#lo, $t2#hi
+       vmull.p8        $r, $a, $b              @ D = A*B
+       vext.8          $t3, $t3, $t3, #12
+       vext.8          $t2, $t2, $t2, #13
+       veor            $t0, $t0, $t1
+       veor            $t2, $t2, $t3
+       veor            $r, $r, $t0
+       veor            $r, $r, $t2
+___
+}
 
 $code.=<<___;
 #if __ARM_ARCH__>=7
 .fpu   neon
 
+.global        gcm_init_neon
+.type  gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+       vld1.64         $IN#hi,[r1,:64]!        @ load H
+       vmov.i8         $t0,#0xe1
+       vld1.64         $IN#lo,[r1,:64]
+       vshl.i64        $t0#hi,#57
+       vshr.u64        $t0#lo,#63              @ t0=0xc2....01
+       vdup.8          $t1,$IN#hi[7]
+       vshr.u64        $Hlo,$IN#lo,#63
+       vshr.s8         $t1,#7                  @ broadcast carry bit
+       vshl.i64        $IN,$IN,#1
+       vand            $t0,$t0,$t1
+       vorr            $IN#hi,$Hlo             @ H<<<=1
+       veor            $IN,$IN,$t0             @ twisted H
+       vstmia          r0,{$IN}
+
+       bx      lr
+.size  gcm_init_neon,.-gcm_init_neon
+
 .global        gcm_gmult_neon
 .type  gcm_gmult_neon,%function
 .align 4
 gcm_gmult_neon:
-       sub             $Htbl,#16               @ point at H in GCM128_CTX
-       vld1.64         `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
-       vmov.i32        $mod,#0xe1              @ our irreducible polynomial
-       vld1.64         `&Dlo("$IN")`,[$Xi,:64]!
-       vshr.u64        $mod,#32
-       vldmia          $Htbl,{$Hhi-$Hlo}       @ load H
-       veor            $zero,$zero
+       vld1.64         $IN#hi,[$Xi,:64]!       @ load Xi
+       vld1.64         $IN#lo,[$Xi,:64]!
+       vmov.i64        $k48,#0x0000ffffffffffff
+       vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
+       vmov.i64        $k32,#0x00000000ffffffff
 #ifdef __ARMEL__
        vrev64.8        $IN,$IN
 #endif
-       veor            $Qpost,$Qpost
-       veor            $R,$R
-       mov             $cnt,#16
-       veor            $Z,$Z
+       vmov.i64        $k16,#0x000000000000ffff
+       veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
        mov             $len,#16
-       veor            $Zo,$Zo
-       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
-       b               .Linner_neon
+       b               .Lgmult_neon
 .size  gcm_gmult_neon,.-gcm_gmult_neon
 
 .global        gcm_ghash_neon
 .type  gcm_ghash_neon,%function
 .align 4
 gcm_ghash_neon:
-       vld1.64         `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
-       vmov.i32        $mod,#0xe1              @ our irreducible polynomial
-       vld1.64         `&Dlo("$Z")`,[$Xi,:64]!
-       vshr.u64        $mod,#32
-       vldmia          $Xi,{$Hhi-$Hlo}         @ load H
-       veor            $zero,$zero
-       nop
+       vld1.64         $Xl#hi,[$Xi,:64]!       @ load Xi
+       vld1.64         $Xl#lo,[$Xi,:64]!
+       vmov.i64        $k48,#0x0000ffffffffffff
+       vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
+       vmov.i64        $k32,#0x00000000ffffffff
 #ifdef __ARMEL__
-       vrev64.8        $Z,$Z
+       vrev64.8        $Xl,$Xl
 #endif
-.Louter_neon:
-       vld1.64         `&Dhi($IN)`,[$inp]!     @ load inp
-       veor            $Qpost,$Qpost
-       vld1.64         `&Dlo($IN)`,[$inp]!
-       veor            $R,$R
-       mov             $cnt,#16
+       vmov.i64        $k16,#0x000000000000ffff
+       veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
+
+.Loop_neon:
+       vld1.64         $IN#hi,[$inp]!          @ load inp
+       vld1.64         $IN#lo,[$inp]!
 #ifdef __ARMEL__
        vrev64.8        $IN,$IN
 #endif
-       veor            $Zo,$Zo
-       veor            $IN,$Z                  @ inp^=Xi
-       veor            $Z,$Z
-       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
-.Linner_neon:
-       subs            $cnt,$cnt,#1
-       vmull.p8        $Qlo,$Hlo,$xi           @ H.lo·Xi[i]
-       vmull.p8        $Qhi,$Hhi,$xi           @ H.hi·Xi[i]
-       vext.8          $IN,$zero,#1            @ IN>>=8
-
-       veor            $Z,$Qpost               @ modulo-scheduled part
-       vshl.i64        `&Dlo("$R")`,#48
-       vdup.8          $xi,`&Dlo("$IN")`[0]    @ broadcast lowest byte
-       veor            $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
-
-       veor            `&Dhi("$Z")`,`&Dlo("$R")`
-       vuzp.8          $Qlo,$Qhi
-       vsli.8          $Zo,$T,#1               @ compose the "carry" byte
-       vext.8          $Z,$zero,#1             @ Z>>=8
-
-       vmull.p8        $R,$Zo,$mod             @ "carry"·0xe1
-       vshr.u8         $Zo,$T,#7               @ save Z's bottom bit
-       vext.8          $Qpost,$Qlo,$zero,#1    @ Qlo>>=8
-       veor            $Z,$Qhi
-       bne             .Linner_neon
-
-       veor            $Z,$Qpost               @ modulo-scheduled artefact
-       vshl.i64        `&Dlo("$R")`,#48
-       veor            `&Dhi("$Z")`,`&Dlo("$R")`
-
-       @ finalization, normalize Z:Zo
-       vand            $Zo,$mod                @ suffices to mask the bit
-       vshr.u64        `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
-       vshl.i64        $Z,#1
+       veor            $IN,$Xl                 @ inp^=Xi
+.Lgmult_neon:
+___
+       &clmul64x64     ($Xl,$Hlo,"$IN#lo");    # H.lo·Xi.lo
+$code.=<<___;
+       veor            $IN#lo,$IN#lo,$IN#hi    @ Karatsuba pre-processing
+___
+       &clmul64x64     ($Xm,$Hhl,"$IN#lo");    # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+       &clmul64x64     ($Xh,$Hhi,"$IN#hi");    # H.hi·Xi.hi
+$code.=<<___;
+       veor            $Xm,$Xm,$Xl             @ Karatsuba post-processing
+       veor            $Xm,$Xm,$Xh
+       veor            $Xl#hi,$Xl#hi,$Xm#lo
+       veor            $Xh#lo,$Xh#lo,$Xm#hi    @ Xh|Xl - 256-bit result
+
+       @ equivalent of reduction_avx from ghash-x86_64.pl
+       vshl.i64        $t1,$Xl,#57             @ 1st phase
+       vshl.i64        $t2,$Xl,#62
+       veor            $t2,$t2,$t1             @
+       vshl.i64        $t1,$Xl,#63
+       veor            $t2, $t2, $t1           @
+       veor            $Xl#hi,$Xl#hi,$t2#lo    @
+       veor            $Xh#lo,$Xh#lo,$t2#hi
+
+       vshr.u64        $t2,$Xl,#1              @ 2nd phase
+       veor            $Xh,$Xh,$Xl
+       veor            $Xl,$Xl,$t2             @
+       vshr.u64        $t2,$t2,#6
+       vshr.u64        $Xl,$Xl,#1              @
+       veor            $Xl,$Xl,$Xh             @
+       veor            $Xl,$Xl,$t2             @
+
        subs            $len,#16
-       vorr            $Z,`&Q("$Zo")`          @ Z=Z:Zo<<1
-       bne             .Louter_neon
+       bne             .Loop_neon
 
 #ifdef __ARMEL__
-       vrev64.8        $Z,$Z
+       vrev64.8        $Xl,$Xl
 #endif
        sub             $Xi,#16 
-       vst1.64         `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
-       vst1.64         `&Dlo("$Z")`,[$Xi,:64]
+       vst1.64         $Xl#hi,[$Xi,:64]!       @ write out Xi
+       vst1.64         $Xl#lo,[$Xi,:64]
 
        bx      lr
 .size  gcm_ghash_neon,.-gcm_ghash_neon
@@ -423,7 +480,12 @@ $code.=<<___;
 .align  2
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
 close STDOUT; # enforce flush
index 253746c5697ad039a721e465d216dbc521037be2..9afef1cf4e1c3b9136328080c971f2440fb4d83b 100644 (file)
@@ -681,6 +681,7 @@ void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len
 #  if __ARM_ARCH__>=7
 #   define GHASH_ASM_ARM
 #   define GCM_FUNCREF_4BIT
+void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 #  endif
@@ -767,6 +768,7 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 #  endif
 # elif defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
+               gcm_init_neon(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {