sha/asm/sha256-armv4.pl: add NEON code path.
authorAndy Polyakov <appro@openssl.org>
Sat, 4 Jan 2014 17:04:53 +0000 (18:04 +0100)
committerAndy Polyakov <appro@openssl.org>
Sat, 4 Jan 2014 17:04:53 +0000 (18:04 +0100)
(and shave off cycle even from integer-only code)

crypto/sha/asm/sha256-armv4.pl

index 672e571..0fb6fe9 100644 (file)
@@ -1,7 +1,7 @@
 #!/usr/bin/env perl
 
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # February 2011.
 #
 # Profiler-assisted and platform-specific optimization resulted in 16%
-# improvement on Cortex A8 core and ~16.4 cycles per processed byte.
+# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
+
+# September 2013.
+#
+# Add NEON implementation. On Cortex A8 it was measured to process one
+# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
+# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
+# code (meaning that latter performs sub-optimally, nothing was done
+# about it).
 
 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
 open STDOUT,">$output";
@@ -56,10 +64,10 @@ $code.=<<___ if ($i<16);
 # if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
 # endif
-       mov     $t0,$e,ror#$Sigma1[0]
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        rev     $t1,$t1
-       eor     $t0,$t0,$e,ror#$Sigma1[1]
 #else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
@@ -71,9 +79,9 @@ $code.=<<___ if ($i<16);
 # if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
 # endif
-       mov     $t0,$e,ror#$Sigma1[0]
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
-       eor     $t0,$t0,$e,ror#$Sigma1[1]
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
 #endif
 ___
 $code.=<<___;
@@ -81,12 +89,11 @@ $code.=<<___;
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
-       eor     $t0,$t0,$e,ror#$Sigma1[2]       @ Sigma1(e)
+       add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
-       add     $h,$h,$t0                       @ h+=Sigma1(e)
-       eor     $t1,$t1,$g                      @ Ch(e,f,g)
        add     $h,$h,$t2                       @ h+=K256[i]
-       mov     $t0,$a,ror#$Sigma0[0]
+       eor     $t1,$t1,$g                      @ Ch(e,f,g)
+       eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
 #if $i==31
        and     $t2,$t2,#0xff
@@ -104,12 +111,11 @@ $code.=<<___;
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
 #endif
-       eor     $t0,$t0,$a,ror#$Sigma0[1]
+       eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
-       eor     $t0,$t0,$a,ror#$Sigma0[2]       @ Sigma0(a)
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
-       add     $h,$h,$t0                       @ h+=Sigma0(a)
+       add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
 ___
        ($t2,$t3)=($t3,$t2);
@@ -132,9 +138,9 @@ $code.=<<___;
        ldr     $t4,[sp,#`($i+9)%16`*4]
 
        add     $t2,$t2,$t0
-       mov     $t0,$e,ror#$Sigma1[0]           @ from BODY_00_15
+       eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
-       eor     $t0,$t0,$e,ror#$Sigma1[1]       @ from BODY_00_15
+       eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
 ___
        &BODY_00_15(@_);
@@ -166,15 +172,25 @@ K256:
 .word  0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 .word  0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 .size  K256,.-K256
+.word  0                               @ terminator
+.LOPENSSL_armcap:
+.word  OPENSSL_armcap_P-sha256_block_data_order
+.align 5
 
 .global        sha256_block_data_order
 .type  sha256_block_data_order,%function
 sha256_block_data_order:
        sub     r3,pc,#8                @ sha256_block_data_order
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
+#if __ARM_ARCH__>=7
+       ldr     r12,.LOPENSSL_armcap
+       ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
+       tst     r12,#1
+       bne     .LNEON
+#endif
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
-       sub     $Ktbl,r3,#256           @ K256
+       sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
 .Loop:
 # if __ARM_ARCH__>=7
@@ -225,9 +241,294 @@ $code.=<<___;
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
 #endif
+___
+######################################################################
+# NEON stuff
+#
+{{{
+my @X=map("q$_",(0..3));
+my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
+my $Xfer=$t4;
+my $j=0;
+
+sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
+sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
+
+sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
+{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
+  my $arg = pop;
+    $arg = "#$arg" if ($arg*1 eq $arg);
+    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
+}
+
+sub Xupdate()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+       &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T2,$T0,$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T1,$T0,$sigma0[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T2,$T0,32-$sigma0[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vshr_u32       ($T3,$T0,$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T2);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vsli_32        ($T3,$T0,32-$sigma0[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+         &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        while($#insns>=2) { eval(shift(@insns)); }
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub Xpreload()
+{ use integer;
+  my $body = shift;
+  my @insns = (&$body,&$body,&$body,&$body);
+  my ($a,$b,$c,$d,$e,$f,$g,$h);
+
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vld1_32        ("{$T0}","[$Ktbl,:128]!");
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vrev32_8       (@X[0],@X[0]);
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+        eval(shift(@insns));
+       &vadd_i32       ($T0,$T0,@X[0]);
+        foreach (@insns) { eval; }     # remaining instructions
+       &vst1_32        ("{$T0}","[$Xfer,:128]!");
+
+       push(@X,shift(@X));             # "rotate" X[]
+}
+
+sub body_00_15 () {
+       (
+       '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
+       '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
+       '&eor   ($t1,$f,$g)',
+       '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
+       '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
+       '&and   ($t1,$t1,$e)',
+       '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
+       '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
+       '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
+       '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
+       '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
+       '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
+       '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
+       '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
+       '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
+       '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
+       '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
+       '&add   ($d,$d,$h)',                    # d+=h
+       '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
+       '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
+       '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
+       )
+}
+
+$code.=<<___;
+#if __ARM_ARCH__>=7
+.fpu   neon
+.align 4
+.LNEON:
+       stmdb   sp!,{r4-r12,lr}
+
+       mov     $t2,sp
+       sub     sp,sp,#16*4+16          @ alloca
+       sub     $Ktbl,r3,#256+32        @ K256
+       bic     sp,sp,#15               @ align for 128-bit stores
+
+       vld1.8          {@X[0]},[$inp]!
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       vld1.32         {$T0},[$Ktbl,:128]!
+       vld1.32         {$T1},[$Ktbl,:128]!
+       vld1.32         {$T2},[$Ktbl,:128]!
+       vld1.32         {$T3},[$Ktbl,:128]!
+       vrev32.8        @X[0],@X[0]             @ yes, even on
+       str             $ctx,[sp,#64]
+       vrev32.8        @X[1],@X[1]             @ big-endian
+       str             $inp,[sp,#68]
+       mov             $Xfer,sp
+       vrev32.8        @X[2],@X[2]
+       str             $len,[sp,#72]
+       vrev32.8        @X[3],@X[3]
+       str             $t2,[sp,#76]            @ save original sp
+       vadd.i32        $T0,$T0,@X[0]
+       vadd.i32        $T1,$T1,@X[1]
+       vst1.32         {$T0},[$Xfer,:128]!
+       vadd.i32        $T2,$T2,@X[2]
+       vst1.32         {$T1},[$Xfer,:128]!
+       vadd.i32        $T3,$T3,@X[3]
+       vst1.32         {$T2},[$Xfer,:128]!
+       vst1.32         {$T3},[$Xfer,:128]!
+
+       ldmia           $ctx,{$A-$H}
+       sub             $Xfer,$Xfer,#64
+       ldr             $t1,[sp,#0]
+       eor             $t2,$t2,$t2
+       eor             $t3,$B,$C
+       b               .L_00_48
+
+.align 4
+.L_00_48:
+___
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+       &Xupdate(\&body_00_15);
+$code.=<<___;
+       teq     $t1,#0                          @ check for K256 terminator
+       ldr     $t1,[sp,#0]
+       sub     $Xfer,$Xfer,#64
+       bne     .L_00_48
+
+       ldr             $inp,[sp,#68]
+       ldr             $t0,[sp,#72]
+       sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
+       teq             $inp,$t0
+       subeq           $inp,$inp,#64           @ avoid SEGV
+       vld1.8          {@X[0]},[$inp]!         @ load next input block
+       vld1.8          {@X[1]},[$inp]!
+       vld1.8          {@X[2]},[$inp]!
+       vld1.8          {@X[3]},[$inp]!
+       strne           $inp,[sp,#68]
+       mov             $Xfer,sp
+___
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+       &Xpreload(\&body_00_15);
+$code.=<<___;
+       ldr     $t0,[$t1,#0]
+       add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
+       ldr     $t2,[$t1,#4]
+       ldr     $t3,[$t1,#8]
+       ldr     $t4,[$t1,#12]
+       add     $A,$A,$t0                       @ accumulate
+       ldr     $t0,[$t1,#16]
+       add     $B,$B,$t2
+       ldr     $t2,[$t1,#20]
+       add     $C,$C,$t3
+       ldr     $t3,[$t1,#24]
+       add     $D,$D,$t4
+       ldr     $t4,[$t1,#28]
+       add     $E,$E,$t0
+       str     $A,[$t1],#4
+       add     $F,$F,$t2
+       str     $B,[$t1],#4
+       add     $G,$G,$t3
+       str     $C,[$t1],#4
+       add     $H,$H,$t4
+       str     $D,[$t1],#4
+       stmia   $t1,{$E-$H}
+
+       movne   $Xfer,sp
+       ldrne   $t1,[sp,#0]
+       eorne   $t2,$t2,$t2
+       ldreq   sp,[sp,#76]                     @ restore original sp
+       eorne   $t3,$B,$C
+       bne     .L_00_48
+
+       ldmia   sp!,{r4-r12,pc}
+#endif
+___
+}}}
+$code.=<<___;
 .size   sha256_block_data_order,.-sha256_block_data_order
-.asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
+.asciz  "SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
 .align 2
+.comm   OPENSSL_armcap_P,4,4
 ___
 
 $code =~ s/\`([^\`]*)\`/eval $1/gem;