crypto/sha/asm/sha256-armv4.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # SHA256 block procedure for ARMv4. May 2007.
  11
  12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
  13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
  14 # byte [on single-issue Xscale PXA250 core].
  15
  16 # July 2010.
  17 #
  18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
  19 # Cortex A8 core and ~20 cycles per processed byte.
  20
  21 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
     # $output is now the first command-line argument shaped like "name.ext",
     # or a false value if no argument matched.  STDOUT is redirected only when
     # a name was found; without one the generated code goes to the inherited
     # STDOUT.  A failed open is fatal so that a build cannot silently continue
     # with a missing or stale assembly file.
  22 $output and (open(STDOUT,">",$output) or die "can't open $output: $!");
  23
  24 $ctx="r0";      $t0="r0";         # r0: context pointer on entry, scratch t0 inside the rounds
  25 $inp="r1";                        # r1: input pointer; BODY_16_XX also borrows it as scratch
  26 $len="r2";      $t1="r2";         # r2: length argument on entry, scratch t1 inside the rounds
  27 $T1="r3";                         # r3: message word / per-round accumulator
  28 $A="r4";                          # r4..r11: the eight working variables A..H
  29 $B="r5";
  30 $C="r6";
  31 $D="r7";
  32 $E="r8";
  33 $F="r9";
  34 $G="r10";
  35 $H="r11";
  36 @V=($A,$B,$C,$D,$E,$F,$G,$H);     # rotated by one position after every emitted round
  37 $t2="r12";                        # r12: scratch; receives the K256 word loaded in each round
  38 $Ktbl="r14";                      # r14: running pointer into the K256 table
  39
     # Shift amounts used by the round emitters.  @Sigma0/@Sigma1 hold three
     # rotate-right amounts each (applied to a and e in BODY_00_15).
     # @sigma0/@sigma1 hold two rotate-right amounts followed by one
     # logical-shift-right amount (applied to schedule words in BODY_16_XX).
  40 @Sigma0=( 2,13,22);
  41 @Sigma1=( 6,11,25);
  42 @sigma0=( 7,18, 3);
  43 @sigma1=(17,19,10);
  44
  45 sub BODY_00_15 {
     # Append the ARM code for one SHA-256 round to $code.
     #
     # Arguments:
     #   $i      - round index.  When $i<16 the emitted code first builds the
     #             message word in $T1 from four byte loads at $inp, with the
     #             byte at the lowest address ending up most significant, and
     #             advances $inp by 4.  For $i==15 the advanced $inp is also
     #             saved to stack word 17.  When $i>=16 the caller is expected
     #             to have left the message word in $T1 already.
     #   $a..$h  - names of the registers currently holding the eight working
     #             variables.
     #
     # The common part stores $T1 to stack word $i%16, loads the next K256 word
     # through $Ktbl (post-incremented), accumulates
     #   T1 = X + Sigma1(e) + Ch(e,f,g) + h + K
     # and then writes Sigma0(a) + Maj(a,b,c) + T1 into the register named by
     # $h and adds T1 to the register named by $d.
     #
     # Backquoted fragments in the heredocs are not evaluated here; they stay
     # in $code as text until the final substitution pass over $code.
  46 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  47
  48 $code.=<<___ if ($i<16);
  49         ldrb    $T1,[$inp,#3]                   @ $i
  50         ldrb    $t2,[$inp,#2]
  51         ldrb    $t1,[$inp,#1]
  52         ldrb    $t0,[$inp],#4
  53         orr     $T1,$T1,$t2,lsl#8
  54         orr     $T1,$T1,$t1,lsl#16
  55         orr     $T1,$T1,$t0,lsl#24
  56         `"str   $inp,[sp,#17*4]"        if ($i==15)`
  57 ___
  58 $code.=<<___;
  59         ldr     $t2,[$Ktbl],#4                  @ *K256++
  60         mov     $t0,$e,ror#$Sigma1[0]
  61         str     $T1,[sp,#`$i%16`*4]
  62         eor     $t0,$t0,$e,ror#$Sigma1[1]
  63         eor     $t1,$f,$g
  64         eor     $t0,$t0,$e,ror#$Sigma1[2]       @ Sigma1(e)
  65         and     $t1,$t1,$e
  66         add     $T1,$T1,$t0
  67         eor     $t1,$t1,$g                      @ Ch(e,f,g)
  68         add     $T1,$T1,$h
  69         mov     $h,$a,ror#$Sigma0[0]
  70         add     $T1,$T1,$t1
  71         eor     $h,$h,$a,ror#$Sigma0[1]
  72         add     $T1,$T1,$t2
  73         eor     $h,$h,$a,ror#$Sigma0[2]         @ Sigma0(a)
  74         orr     $t0,$a,$b
  75         and     $t1,$a,$b
  76         and     $t0,$t0,$c
  77         add     $h,$h,$T1
  78         orr     $t0,$t0,$t1                     @ Maj(a,b,c)
  79         add     $d,$d,$T1
  80         add     $h,$h,$t0
  81 ___
  82 }
  83
  84 sub BODY_16_XX {
     # Append the ARM code for one round with index >= 16 to $code.
     #
     # Arguments are the same as for BODY_00_15: the round index $i followed
     # by the register names for the eight working variables.
     #
     # The emitted code treats stack words 0..15 as a ring buffer of message
     # words and computes into $T1
     #   X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])   (indices mod 16)
     # using $inp as an extra scratch register.  BODY_00_15 is then invoked
     # with the unchanged argument list; since $i>=16 it skips the input
     # loading, stores $T1 back to slot $i%16 and emits the round proper.
  85 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  86
  87 $code.=<<___;
  88         ldr     $t1,[sp,#`($i+1)%16`*4]         @ $i
  89         ldr     $t2,[sp,#`($i+14)%16`*4]
  90         ldr     $T1,[sp,#`($i+0)%16`*4]
  91         mov     $t0,$t1,ror#$sigma0[0]
  92         ldr     $inp,[sp,#`($i+9)%16`*4]
  93         eor     $t0,$t0,$t1,ror#$sigma0[1]
  94         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
  95         mov     $t1,$t2,ror#$sigma1[0]
  96         add     $T1,$T1,$t0
  97         eor     $t1,$t1,$t2,ror#$sigma1[1]
  98         add     $T1,$T1,$inp
  99         eor     $t1,$t1,$t2,lsr#$sigma1[2]      @ sigma1(X[i+14])
 100         add     $T1,$T1,$t1
 101 ___
 102         &BODY_00_15(@_);
 103 }
 104
 105 $code=<<___;	# start of the output: K256 table, entry point and frame setup
 106 .text
 107 .code   32
 108
 109 .type   K256,%object
 110 .align  5
 111 K256:
 112 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
 113 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
 114 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
 115 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
 116 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
 117 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
 118 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
 119 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
 120 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
 121 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
 122 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
 123 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
 124 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
 125 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
 126 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
 127 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
 128 .size   K256,.-K256
 129
 130 .global sha256_block_data_order
 131 .type   sha256_block_data_order,%function
 132 sha256_block_data_order:
 133         sub     r3,pc,#8                @ sha256_block_data_order
 134         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
 135         stmdb   sp!,{$ctx,$inp,$len,r4-r12,lr}
 136         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
 137         sub     $Ktbl,r3,#256           @ K256
 138         sub     sp,sp,#16*4             @ alloca(X[16])
 139 .Loop:
 140 ___
     # Notes on the text emitted above:
     #  - K256 is 64 words (256 bytes) placed directly in front of the
     #    function, so its address is derived as "entry address - 256" rather
     #    than loaded from a literal.
     #  - The length argument is scaled by 64 (lsl#6) and added to inp, turning
     #    it into an end-of-input pointer before it is pushed.
     #  - Stack frame inside .Loop, in words from sp: 0..15 = X[16] message
     #    ring buffer, 16 = ctx, 17 = inp, 18 = end of input, then the saved
     #    r4-r12 and lr.
 141 for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
     # The loop above emits rounds 0..15 fully unrolled.  After each round the
     # last element of @V is moved to the front, so the register that just
     # received the new "a" value is passed as $a to the next round and the
     # roles of the other registers shift along with it.
     #
     # Rounds 16..31 are emitted once, after the label below.  $i is not reset,
     # so the second loop continues counting from 16.  The code emitted after
     # these loops branches back to the label to run the remaining rounds.
 142 $code.=".Lrounds_16_xx:\n";
 143 for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
 144 $code.=<<___;	# loop control, state update and function return
 145         and     $t2,$t2,#0xff
 146         cmp     $t2,#0xf2
 147         bne     .Lrounds_16_xx
 148
 149         ldr     $T1,[sp,#16*4]          @ pull ctx
 150         ldr     $t0,[$T1,#0]
 151         ldr     $t1,[$T1,#4]
 152         ldr     $t2,[$T1,#8]
 153         add     $A,$A,$t0
 154         ldr     $t0,[$T1,#12]
 155         add     $B,$B,$t1
 156         ldr     $t1,[$T1,#16]
 157         add     $C,$C,$t2
 158         ldr     $t2,[$T1,#20]
 159         add     $D,$D,$t0
 160         ldr     $t0,[$T1,#24]
 161         add     $E,$E,$t1
 162         ldr     $t1,[$T1,#28]
 163         add     $F,$F,$t2
 164         ldr     $inp,[sp,#17*4]         @ pull inp
 165         ldr     $t2,[sp,#18*4]          @ pull inp+len
 166         add     $G,$G,$t0
 167         add     $H,$H,$t1
 168         stmia   $T1,{$A,$B,$C,$D,$E,$F,$G,$H}
 169         cmp     $inp,$t2
 170         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
 171         bne     .Loop
 172
 173         add     sp,sp,#`16+3`*4 @ destroy frame
 174         ldmia   sp!,{r4-r12,lr}
 175         tst     lr,#1
 176         moveq   pc,lr                   @ be binary compatible with V4, yet
 177         bx      lr                      @ interoperable with Thumb ISA:-)
 178 .size   sha256_block_data_order,.-sha256_block_data_order
 179 .asciz  "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
 180 .align  2
 181 ___
     # Notes on the text emitted above:
     #  - End of the round loop: $t2 still holds the K256 word loaded by the
     #    last emitted round.  Its low byte is compared with 0xf2, the low byte
     #    of the final table entry 0xc67178f2; any other value branches back
     #    for another 16 rounds.
     #  - The eight working variables are added to the state words read from
     #    ctx and the sums are stored back with a single stmia.
     #  - Ktbl is rewound by the 256 bytes consumed, and .Loop repeats until
     #    the saved input pointer equals the saved end-of-input pointer.
     #  - The frame release skips the 16 schedule words plus the three saved
     #    argument words before r4-r12 and lr are popped.
     #  - Return: "moveq pc,lr" is taken when bit 0 of lr is clear, otherwise
     #    execution falls through to "bx lr".
 182
 183 $code =~ s/\`([^\`]*)\`/eval $1/gem;
     # The substitution above replaces every backquoted fragment in $code with
     # the result of evaluating it as Perl (stack offsets, conditional lines).
     # The one below swaps the "bx lr" mnemonic for its raw instruction word.
 184 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
 185 print $code;
     # Buffered write errors (full disk, closed pipe) only surface at close
     # time, so a failed close must abort the run instead of leaving a
     # truncated assembly file behind with a zero exit status.
 186 close STDOUT or die "error closing STDOUT: $!"; # enforce flush