SHA-256 block procedure for ARMv4 (Perl script generating assembly).
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256 block procedure for ARMv4. May 2007.
11
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
15
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
20
21 # February 2011.
22 #
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~16.4 cycles per processed byte.
25
# Pick the first command-line argument that looks like an output file
# name (word chars/dashes, with an extension); other flavour arguments
# are skipped.  If one was given, redirect STDOUT to it so the rest of
# the script can simply print the generated assembly; otherwise keep
# writing to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">",$output or die "can't open $output: $!" if $output;
28
# ARM register allocation.  The three arguments (ctx, inp, len) arrive
# in r0-r2 and are saved on the stack early, after which r0-r3 are
# reused as the scratch registers $t0/$t4/$t1/$t3; the eight working
# variables a..h live in r4-r11 for the whole function.
($ctx,$inp,$len,$T1) = ("r0","r1","r2","r3");
($t0, $t4, $t1, $t3) = ("r0","r1","r2","r3");
($A,$B,$C,$D,$E,$F,$G,$H) = map { "r$_" } (4..11);
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";	# pointer into the K256 constant table

# Rotation/shift amounts for the SHA-256 Sigma/sigma functions
# (FIPS 180-4).  Note the third element of the lower-case sets is a
# logical shift, not a rotate.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
49
# Emit one round of the SHA-256 compression function (rounds 0-15; also
# used as the tail of BODY_16_XX for rounds 16+).  Arguments: round
# index $i and the eight working variables a..h, rotated by the caller
# after each round.  Appends the generated assembly to the global $code.
#
# Scheduling notes (for Cortex A8 dual issue): the Maj(a,b,c) addition
# into h is deferred to the *next* round ("from the past"), and the
# X[i+2]/X[i+15] loads needed by future BODY_16_xx rounds are hoisted
# into rounds 15+.  Round 31 also tests the just-loaded K256 word
# against 0xf2 (low byte of the last constant 0xc67178f2) to detect the
# end of the 64-round schedule.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Rounds 0-15 consume 32 bits of big-endian input: a single ldr+rev on
# ARMv7+, or four ldrb's (alignment-agnostic) elsewhere.  The first
# load of each path is commented out because it was issued earlier as a
# prefetch.  At round 15, $inp is spilled to free r1 for use as $t4.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	mov	$t0,$e,ror#$Sigma1[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	rev	$t1,$t1
	eor	$t0,$t0,$e,ror#$Sigma1[1]
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	mov	$t0,$e,ror#$Sigma1[0]
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#$Sigma1[1]
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t0			@ h+=Sigma1(e)
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$h,$h,$t2			@ h+=K256[i]
	mov	$t0,$a,ror#$Sigma0[0]
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#$Sigma0[1]
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t0,$t0,$a,ror#$Sigma0[2]	@ Sigma0(a)
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0			@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# Swap scratch roles so the deferred Maj() lands in $t2 next round.
	($t2,$t3)=($t3,$t2);
}
117
# Message-schedule expansion for rounds 16..63:
#   X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# On entry $t1/$t4 already hold X[i+1]/X[i+14] (loaded by the previous
# round; the commented ldr's document where they came from).  The new
# X[i] is left in $t1 and the round is finished by BODY_00_15, whose
# first two Sigma1(e) instructions are interleaved here for scheduling.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	mov	$t0,$e,ror#$Sigma1[0]		@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#$Sigma1[1]	@ from BODY_00_15
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
142
# Static prologue: the K256 constant table (addressed PC-relative from
# the entry point), the function entry, argument save, working-variable
# load, and the top of the per-block loop.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Rounds 0-15 are fully unrolled; rounds 16-63 iterate a 16-round
# unrolled body (.Lrounds_16_xx), terminated when round 31's code
# recognizes the low byte (0xf2) of the final K256 word 0xc67178f2.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: after the last 16-round pass (flags set by round 31's cmp),
# pull the context pointer, add the working variables back into the
# hash state, restore $inp / inp+len from the stack, and loop while
# more input remains.  $Ktbl is rewound by 256 bytes (64 words) per
# pass.  The return sequence stays ARMv4-interoperable with Thumb.
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
.asciz	"SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
232
# Post-process and emit: evaluate the `...` compile-time arithmetic
# left in the templates (stack offsets and the like), then replace
# "bx lr" with its raw opcode so the output assembles even with
# -march=armv4 toolchains that reject BX.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush