Merge branch 'master' of git.openssl.org:openssl
[openssl.git] / crypto / modes / asm / ghash-armv4.pl
index 2036f46f40b68a2c065a36ce60cdda2127631431..0023bf994bf33306afa9e2760b3b55176fff7c08 100644 (file)
 # Cortex A8 core and ~25 cycles per processed byte (which was observed
 # to be ~3 times faster than gcc-generated code:-)
 #
+# February 2011
+#
+# Profiler-assisted and platform-specific optimization resulted in 7%
+# improvement on Cortex A8 core and ~23.5 cycles per byte.
+#
+# March 2011
+#
+# Add NEON implementation featuring polynomial multiplication, i.e. no
+# lookup tables involved. On Cortex A8 it was measured to process one
+# byte in 15 cycles or 55% faster than integer-only code.
+#
+# April 2014
+#
+# Switch to multiplication algorithm suggested in paper referred
+# below and combine it with reduction algorithm from x86 module.
+# Performance improvement over previous version varies from 65% on
+# Snapdragon S4 to 110% on Cortex A9. In absolute terms Cortex A8
+# processes one byte in 8.45 cycles, A9 - in 10.2, Snapdragon S4 -
+# in 9.33.
+#
+# Câmara, D.; Gouvêa, C. P. L.; López, J. & Dahab, R.: Fast Software
+# Polynomial Multiplication on ARM Processors using the NEON Engine.
+#
+# http://conradoplg.cryptoland.net/files/2010/12/mocrysen13.pdf
+
+# ====================================================================
 # Note about "528B" variant. In ARM case it makes lesser sense to
 # implement it for following reasons:
 #
@@ -52,6 +78,7 @@ $Xi="r0";     # argument block
 $Htbl="r1";
 $inp="r2";
 $len="r3";
+
 $Zll="r4";     # variables
 $Zlh="r5";
 $Zhl="r6";
@@ -72,8 +99,13 @@ sub Zsmash() {
   my $i=12;
   my @args=@_;
   for ($Zll,$Zlh,$Zhl,$Zhh) {
-    # can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
     $code.=<<___;
+#if __ARM_ARCH__>=7 && defined(__ARMEL__)
+       rev     $_,$_
+       str     $_,[$Xi,#$i]
+#elif defined(__ARMEB__)
+       str     $_,[$Xi,#$i]
+#else
        mov     $Tlh,$_,lsr#8
        strb    $_,[$Xi,#$i+3]
        mov     $Thl,$_,lsr#16
@@ -81,6 +113,7 @@ sub Zsmash() {
        mov     $Thh,$_,lsr#24
        strb    $Thl,[$Xi,#$i+1]
        strb    $Thh,[$Xi,#$i]
+#endif
 ___
     $code.="\t".shift(@args)."\n";
     $i-=4;
@@ -88,6 +121,8 @@ ___
 }
 
 $code=<<___;
+#include "arm_arch.h"
+
 .text
 .code  32
 
@@ -149,41 +184,41 @@ gcm_ghash_4bit:
        and     $nlo,$nlo,#0x0f
        eor     $Zhh,$Zhh,$Tll,lsl#16
 
-.Loop:
+.Linner:
        add     $Thh,$Htbl,$nlo,lsl#4
-       subs    $cnt,$cnt,#1
        and     $nlo,$Zll,#0xf          @ rem
-       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
+       subs    $cnt,$cnt,#1
        add     $nlo,$nlo,$nlo
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
        eor     $Zll,$Tll,$Zll,lsr#4
-       ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
        eor     $Zll,$Zll,$Zlh,lsl#28
        eor     $Zlh,$Tlh,$Zlh,lsr#4
        eor     $Zlh,$Zlh,$Zhl,lsl#28
+       ldrh    $Tll,[sp,$nlo]          @ rem_4bit[rem]
        eor     $Zhl,$Thl,$Zhl,lsr#4
+       ldrplb  $nlo,[$inp,$cnt]
        eor     $Zhl,$Zhl,$Zhh,lsl#28
        eor     $Zhh,$Thh,$Zhh,lsr#4
-       ldrplb  $nlo,[$inp,$cnt]
 
        add     $Thh,$Htbl,$nhi
-       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
        and     $nhi,$Zll,#0xf          @ rem
-       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
        add     $nhi,$nhi,$nhi
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
        eor     $Zll,$Tll,$Zll,lsr#4
-       ldrh    $Tll,[sp,$nhi]          @ rem_4bit[rem]
+       ldrplb  $Tll,[$Xi,$cnt]
        eor     $Zll,$Zll,$Zlh,lsl#28
        eor     $Zlh,$Tlh,$Zlh,lsr#4
-       ldrplb  $nhi,[$Xi,$cnt]
+       ldrh    $Tlh,[sp,$nhi]
        eor     $Zlh,$Zlh,$Zhl,lsl#28
        eor     $Zhl,$Thl,$Zhl,lsr#4
        eor     $Zhl,$Zhl,$Zhh,lsl#28
-       eorpl   $nlo,$nlo,$nhi
+       eorpl   $nlo,$nlo,$Tll
        eor     $Zhh,$Thh,$Zhh,lsr#4
        andpl   $nhi,$nlo,#0xf0
        andpl   $nlo,$nlo,#0x0f
-       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
-       bpl     .Loop
+       eor     $Zhh,$Zhh,$Tlh,lsl#16   @ ^= rem_4bit[rem]
+       bpl     .Linner
 
        ldr     $len,[sp,#32]           @ re-load $len/end
        add     $inp,$inp,#16
@@ -194,10 +229,14 @@ $code.=<<___;
        bne     .Louter
 
        add     sp,sp,#36
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
 .size  gcm_ghash_4bit,.-gcm_ghash_4bit
 
 .global        gcm_gmult_4bit
@@ -231,31 +270,31 @@ gcm_gmult_4bit:
        eor     $Zhh,$Zhh,$Tll,lsl#16
        and     $nlo,$nlo,#0x0f
 
-.Loop2:
+.Loop:
        add     $Thh,$Htbl,$nlo,lsl#4
-       subs    $cnt,$cnt,#1
        and     $nlo,$Zll,#0xf          @ rem
-       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
+       subs    $cnt,$cnt,#1
        add     $nlo,$nlo,$nlo
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nlo]
        eor     $Zll,$Tll,$Zll,lsr#4
-       ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
        eor     $Zll,$Zll,$Zlh,lsl#28
        eor     $Zlh,$Tlh,$Zlh,lsr#4
        eor     $Zlh,$Zlh,$Zhl,lsl#28
+       ldrh    $Tll,[$rem_4bit,$nlo]   @ rem_4bit[rem]
        eor     $Zhl,$Thl,$Zhl,lsr#4
+       ldrplb  $nlo,[$Xi,$cnt]
        eor     $Zhl,$Zhl,$Zhh,lsl#28
        eor     $Zhh,$Thh,$Zhh,lsr#4
-       ldrplb  $nlo,[$Xi,$cnt]
 
        add     $Thh,$Htbl,$nhi
-       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
        and     $nhi,$Zll,#0xf          @ rem
-       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
+       eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
        add     $nhi,$nhi,$nhi
+       ldmia   $Thh,{$Tll-$Thh}        @ load Htbl[nhi]
        eor     $Zll,$Tll,$Zll,lsr#4
-       ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
        eor     $Zll,$Zll,$Zlh,lsl#28
        eor     $Zlh,$Tlh,$Zlh,lsr#4
+       ldrh    $Tll,[$rem_4bit,$nhi]   @ rem_4bit[rem]
        eor     $Zlh,$Zlh,$Zhl,lsl#28
        eor     $Zhl,$Thl,$Zhl,lsr#4
        eor     $Zhl,$Zhl,$Zhh,lsl#28
@@ -263,20 +302,191 @@ gcm_gmult_4bit:
        andpl   $nhi,$nlo,#0xf0
        andpl   $nlo,$nlo,#0x0f
        eor     $Zhh,$Zhh,$Tll,lsl#16   @ ^= rem_4bit[rem]
-       bpl     .Loop2
+       bpl     .Loop
 ___
        &Zsmash();
 $code.=<<___;
+#if __ARM_ARCH__>=5
+       ldmia   sp!,{r4-r11,pc}
+#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
+#endif
 .size  gcm_gmult_4bit,.-gcm_gmult_4bit
-.asciz  "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+___
+{
+my ($Xl,$Xm,$Xh,$IN)=map("q$_",(0..3));
+my ($t0,$t1,$t2,$t3)=map("q$_",(8..12));
+my ($Hlo,$Hhi,$Hhl,$k48,$k32,$k16)=map("d$_",(26..31));
+
+sub clmul64x64 {
+my ($r,$a,$b)=@_;
+$code.=<<___;
+       vext.8          $t0#lo, $a, $a, #1      @ A1
+       vmull.p8        $t0, $t0#lo, $b         @ F = A1*B
+       vext.8          $r#lo, $b, $b, #1       @ B1
+       vmull.p8        $r, $a, $r#lo           @ E = A*B1
+       vext.8          $t1#lo, $a, $a, #2      @ A2
+       vmull.p8        $t1, $t1#lo, $b         @ H = A2*B
+       vext.8          $t3#lo, $b, $b, #2      @ B2
+       vmull.p8        $t3, $a, $t3#lo         @ G = A*B2
+       vext.8          $t2#lo, $a, $a, #3      @ A3
+       veor            $t0, $t0, $r            @ L = E + F
+       vmull.p8        $t2, $t2#lo, $b         @ J = A3*B
+       vext.8          $r#lo, $b, $b, #3       @ B3
+       veor            $t1, $t1, $t3           @ M = G + H
+       vmull.p8        $r, $a, $r#lo           @ I = A*B3
+       veor            $t0#lo, $t0#lo, $t0#hi  @ t0 = (L) (P0 + P1) << 8
+       vand            $t0#hi, $t0#hi, $k48
+       vext.8          $t3#lo, $b, $b, #4      @ B4
+       veor            $t1#lo, $t1#lo, $t1#hi  @ t1 = (M) (P2 + P3) << 16
+       vand            $t1#hi, $t1#hi, $k32
+       vmull.p8        $t3, $a, $t3#lo         @ K = A*B4
+       veor            $t2, $t2, $r            @ N = I + J
+       veor            $t0#lo, $t0#lo, $t0#hi
+       veor            $t1#lo, $t1#lo, $t1#hi
+       veor            $t2#lo, $t2#lo, $t2#hi  @ t2 = (N) (P4 + P5) << 24
+       vand            $t2#hi, $t2#hi, $k16
+       vext.8          $t0, $t0, $t0, #15
+       veor            $t3#lo, $t3#lo, $t3#hi  @ t3 = (K) (P6 + P7) << 32
+       vmov.i64        $t3#hi, #0
+       vext.8          $t1, $t1, $t1, #14
+       veor            $t2#lo, $t2#lo, $t2#hi
+       vmull.p8        $r, $a, $b              @ D = A*B
+       vext.8          $t3, $t3, $t3, #12
+       vext.8          $t2, $t2, $t2, #13
+       veor            $t0, $t0, $t1
+       veor            $t2, $t2, $t3
+       veor            $r, $r, $t0
+       veor            $r, $r, $t2
+___
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu   neon
+
+.global        gcm_init_neon
+.type  gcm_init_neon,%function
+.align 4
+gcm_init_neon:
+       vld1.64         $IN#hi,[r1,:64]!        @ load H
+       vmov.i8         $t0,#0xe1
+       vld1.64         $IN#lo,[r1,:64]
+       vshl.i64        $t0#hi,#57
+       vshr.u64        $t0#lo,#63              @ t0=0xc2....01
+       vdup.8          $t1,$IN#hi[7]
+       vshr.u64        $Hlo,$IN#lo,#63
+       vshr.s8         $t1,#7                  @ broadcast carry bit
+       vshl.i64        $IN,$IN,#1
+       vand            $t0,$t0,$t1
+       vorr            $IN#hi,$Hlo             @ H<<<=1
+       veor            $IN,$IN,$t0             @ twisted H
+       vstmia          r0,{$IN}
+
+       ret                                     @ bx lr
+.size  gcm_init_neon,.-gcm_init_neon
+
+.global        gcm_gmult_neon
+.type  gcm_gmult_neon,%function
+.align 4
+gcm_gmult_neon:
+       vld1.64         $IN#hi,[$Xi,:64]!       @ load Xi
+       vld1.64         $IN#lo,[$Xi,:64]!
+       vmov.i64        $k48,#0x0000ffffffffffff
+       vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
+       vmov.i64        $k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+       vrev64.8        $IN,$IN
+#endif
+       vmov.i64        $k16,#0x000000000000ffff
+       veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
+       mov             $len,#16
+       b               .Lgmult_neon
+.size  gcm_gmult_neon,.-gcm_gmult_neon
+
+.global        gcm_ghash_neon
+.type  gcm_ghash_neon,%function
+.align 4
+gcm_ghash_neon:
+       vld1.64         $Xl#hi,[$Xi,:64]!       @ load Xi
+       vld1.64         $Xl#lo,[$Xi,:64]!
+       vmov.i64        $k48,#0x0000ffffffffffff
+       vldmia          $Htbl,{$Hlo-$Hhi}       @ load twisted H
+       vmov.i64        $k32,#0x00000000ffffffff
+#ifdef __ARMEL__
+       vrev64.8        $Xl,$Xl
+#endif
+       vmov.i64        $k16,#0x000000000000ffff
+       veor            $Hhl,$Hlo,$Hhi          @ Karatsuba pre-processing
+
+.Loop_neon:
+       vld1.64         $IN#hi,[$inp]!          @ load inp
+       vld1.64         $IN#lo,[$inp]!
+#ifdef __ARMEL__
+       vrev64.8        $IN,$IN
+#endif
+       veor            $IN,$Xl                 @ inp^=Xi
+.Lgmult_neon:
+___
+       &clmul64x64     ($Xl,$Hlo,"$IN#lo");    # H.lo·Xi.lo
+$code.=<<___;
+       veor            $IN#lo,$IN#lo,$IN#hi    @ Karatsuba pre-processing
+___
+       &clmul64x64     ($Xm,$Hhl,"$IN#lo");    # (H.lo+H.hi)·(Xi.lo+Xi.hi)
+       &clmul64x64     ($Xh,$Hhi,"$IN#hi");    # H.hi·Xi.hi
+$code.=<<___;
+       veor            $Xm,$Xm,$Xl             @ Karatsuba post-processing
+       veor            $Xm,$Xm,$Xh
+       veor            $Xl#hi,$Xl#hi,$Xm#lo
+       veor            $Xh#lo,$Xh#lo,$Xm#hi    @ Xh|Xl - 256-bit result
+
+       @ equivalent of reduction_avx from ghash-x86_64.pl
+       vshl.i64        $t1,$Xl,#57             @ 1st phase
+       vshl.i64        $t2,$Xl,#62
+       veor            $t2,$t2,$t1             @
+       vshl.i64        $t1,$Xl,#63
+       veor            $t2, $t2, $t1           @
+       veor            $Xl#hi,$Xl#hi,$t2#lo    @
+       veor            $Xh#lo,$Xh#lo,$t2#hi
+
+       vshr.u64        $t2,$Xl,#1              @ 2nd phase
+       veor            $Xh,$Xh,$Xl
+       veor            $Xl,$Xl,$t2             @
+       vshr.u64        $t2,$t2,#6
+       vshr.u64        $Xl,$Xl,#1              @
+       veor            $Xl,$Xl,$Xh             @
+       veor            $Xl,$Xl,$t2             @
+
+       subs            $len,#16
+       bne             .Loop_neon
+
+#ifdef __ARMEL__
+       vrev64.8        $Xl,$Xl
+#endif
+       sub             $Xi,#16 
+       vst1.64         $Xl#hi,[$Xi,:64]!       @ write out Xi
+       vst1.64         $Xl#lo,[$Xi,:64]
+
+       ret                                     @ bx lr
+.size  gcm_ghash_neon,.-gcm_ghash_neon
+#endif
+___
+}
+$code.=<<___;
+.asciz  "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align  2
 ___
 
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;   # make it possible to compile with -march=armv4
-print $code;
+foreach (split("\n",$code)) {
+       s/\`([^\`]*)\`/eval $1/geo;
+
+       s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo       or
+       s/\bret\b/bx    lr/go           or
+       s/\bbx\s+lr\b/.word\t0xe12fff1e/go;    # make it possible to compile with -march=armv4
+
+       print $_,"\n";
+}
 close STDOUT; # enforce flush