SHA-256 block procedure for ARMv4 (Perl script generating assembly).
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256 block procedure for ARMv4. May 2007.
11
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
15
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
20
21 # February 2011.
22 #
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~16.4 cycles per processed byte.
25
# Pick the first command-line argument that looks like an output file
# name (word chars/dashes, with an extension); other flavour arguments
# are skipped.  If one was given, redirect STDOUT to it so the rest of
# the script can simply print the generated assembly; otherwise keep
# writing to the inherited STDOUT.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">",$output or die "can't open $output: $!" if $output;
28
# ARM register allocation.  The three arguments (ctx, inp, len) arrive
# in r0-r2 and are saved on the stack early, after which r0-r3 are
# reused as the scratch registers $t0/$t4/$t1/$t3; the eight working
# variables a..h live in r4-r11 for the whole function.
($ctx,$inp,$len,$T1) = ("r0","r1","r2","r3");
($t0, $t4, $t1, $t3) = ("r0","r1","r2","r3");
($A,$B,$C,$D,$E,$F,$G,$H) = map { "r$_" } (4..11);
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";	# pointer into the K256 constant table

# Rotation/shift amounts for the SHA-256 Sigma/sigma functions
# (FIPS 180-4).  Note the third element of the lower-case sets is a
# logical shift, not a rotate.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
49
# Emit one round of the SHA-256 compression function (rounds 0-15; also
# used as the tail of BODY_16_XX for rounds 16+).  Arguments: round
# index $i and the eight working variables a..h, rotated by the caller
# after each round.  Appends the generated assembly to the global $code.
#
# Scheduling notes (for Cortex A8 dual issue): the Maj(a,b,c) addition
# into h is deferred to the *next* round ("from the past"), and the
# X[i+2]/X[i+15] loads needed by future BODY_16_xx rounds are hoisted
# into rounds 15+.  Round 31 also tests the just-loaded K256 word
# against 0xf2 (low byte of the last constant 0xc67178f2) to detect the
# end of the 64-round schedule.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Rounds 0-15 consume 32 bits of big-endian input: a single ldr+rev on
# ARMv7+, or four ldrb's (alignment-agnostic) elsewhere.  The first
# load of each path is commented out because it was issued earlier as a
# prefetch.  At round 15, $inp is spilled to free r1 for use as $t4.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	mov	$t0,$e,ror#$Sigma1[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	rev	$t1,$t1
	eor	$t0,$t0,$e,ror#$Sigma1[1]
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	mov	$t0,$e,ror#$Sigma1[0]
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#$Sigma1[1]
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	eor	$t0,$t0,$e,ror#$Sigma1[2]	@ Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t0			@ h+=Sigma1(e)
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	add	$h,$h,$t2			@ h+=K256[i]
	mov	$t0,$a,ror#$Sigma0[0]
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#$Sigma0[1]
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t0,$t0,$a,ror#$Sigma0[2]	@ Sigma0(a)
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0			@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# Swap scratch roles so the deferred Maj() lands in $t2 next round.
	($t2,$t3)=($t3,$t2);
}
117
# Message-schedule expansion for rounds 16..63:
#   X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
# On entry $t1/$t4 already hold X[i+1]/X[i+14] (loaded by the previous
# round; the commented ldr's document where they came from).  The new
# X[i] is left in $t1 and the round is finished by BODY_00_15, whose
# first two Sigma1(e) instructions are interleaved here for scheduling.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	mov	$t0,$e,ror#$Sigma1[0]		@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#$Sigma1[1]	@ from BODY_00_15
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
142
# Static prologue: the K256 constant table (addressed PC-relative from
# the entry point), the function entry, argument save, working-variable
# load, and the top of the per-block loop.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256		@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Rounds 0-15 are fully unrolled; rounds 16-63 iterate a 16-round
# unrolled body (.Lrounds_16_xx), terminated when round 31's code
# recognizes the low byte (0xf2) of the final K256 word 0xc67178f2.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: after the last 16-round pass (flags set by round 31's cmp),
# pull the context pointer, add the working variables back into the
# hash state, restore $inp / inp+len from the stack, and loop while
# more input remains.  $Ktbl is rewound by 256 bytes (64 words) per
# pass.  The return sequence stays ARMv4-interoperable with Thumb.
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
.asciz	"SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___
232
# Post-process and emit: evaluate the `...` compile-time arithmetic
# left in the templates (stack offsets and the like), then replace
# "bx lr" with its raw opcode so the output assembles even with
# -march=armv4 toolchains that reject BX.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT; # enforce flush