Make sha512-armv4.pl byte-order neutral.
[openssl.git] / crypto / sha / asm / sha512-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA512 block procedure for ARMv4. September 2007.
11
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte. 
14
15 # Byte order [in]dependence. =========================================
16 #
17 # Caller is expected to maintain a specific *dword* order in h[0-7],
18 # namely with the most significant dword at the *lower* address, which
19 # is reflected in the two parameters below. *Byte* order within these
20 # dwords is, in turn, whatever byte order is *native* to the platform.
# Byte offsets of the high and low 32-bit halves inside each 64-bit
# context element: the high half sits first, the low half 4 bytes later.
($hi,$lo)=(0,4);
23 # ====================================================================
24
# Register allocation for the generated code.  r0-r12 are handed out in
# order; r13 (stack pointer) and r15 (program counter) are left alone,
# and r14 serves as the pointer into the K512 table.
($ctx,$inp,$len,                # context, input pointer, length/end of input
 $Tlo,$Thi,                     # 64-bit accumulator T (low/high word)
 $Alo,$Ahi,                     # working variable a
 $Elo,$Ehi,                     # working variable e
 $t0,$t1,$t2,$t3)               # scratch
        = map("r$_", 0..12);
$Ktbl="r14";

# Byte offsets of the 64-bit working variables a..h, followed by the slot
# of the current message word X[i]; each slot is eight bytes wide.
($Aoff,$Boff,$Coff,$Doff,$Eoff,$Foff,$Goff,$Hoff,$Xoff)
        = map(8*$_, 0..8);
51
# BODY_00_15($magic) - append the assembly for one SHA-512 round to $code.
#
# The emitted code expects the round's message word in $Tlo:$Thi, a in
# $Alo:$Ahi and e in $Elo:$Ehi; b, c, d, f, g and h are read from the
# stack frame.  It accumulates T += Sigma1(e) + h + Ch(e,f,g) + K[i],
# stores the incoming a and e to their stack slots, leaves d+T in
# $Elo:$Ehi and T+Sigma0(a)+Maj(a,b,c) in $Alo:$Ahi, and finally moves
# sp down and $Ktbl up by 8 bytes each.
#
# $magic is the low byte of the low word of the last K[] constant the
# enclosing loop has to process: when the constant just used matches it,
# bit 0 of $Ktbl is set so that the caller can detect the end of the loop.
#
# No prototype is declared on purpose: the sub takes one argument, and an
# empty "()" prototype is only harmless as long as every caller uses the
# &BODY_00_15(...) form, which bypasses prototype checking.
sub BODY_00_15 {
my $magic = shift;
$code.=<<___;
        ldr     $t2,[sp,#$Hoff+0]       @ h.lo
        ldr     $t3,[sp,#$Hoff+4]       @ h.hi
        @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
        @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
        @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
        mov     $t0,$Elo,lsr#14
        mov     $t1,$Ehi,lsr#14
        eor     $t0,$t0,$Ehi,lsl#18
        eor     $t1,$t1,$Elo,lsl#18
        eor     $t0,$t0,$Elo,lsr#18
        eor     $t1,$t1,$Ehi,lsr#18
        eor     $t0,$t0,$Ehi,lsl#14
        eor     $t1,$t1,$Elo,lsl#14
        eor     $t0,$t0,$Ehi,lsr#9
        eor     $t1,$t1,$Elo,lsr#9
        eor     $t0,$t0,$Elo,lsl#23
        eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3           @ T += h

        ldr     $t0,[sp,#$Foff+0]       @ f.lo
        ldr     $t1,[sp,#$Foff+4]       @ f.hi
        ldr     $t2,[sp,#$Goff+0]       @ g.lo
        ldr     $t3,[sp,#$Goff+4]       @ g.hi
        str     $Elo,[sp,#$Eoff+0]
        str     $Ehi,[sp,#$Eoff+4]
        str     $Alo,[sp,#$Aoff+0]
        str     $Ahi,[sp,#$Aoff+4]

        eor     $t0,$t0,$t2
        eor     $t1,$t1,$t3
        and     $t0,$t0,$Elo
        and     $t1,$t1,$Ehi
        eor     $t0,$t0,$t2
        eor     $t1,$t1,$t3             @ Ch(e,f,g)

        ldr     $t2,[$Ktbl,#4]          @ K[i].lo
        ldr     $t3,[$Ktbl,#0]          @ K[i].hi
        ldr     $Elo,[sp,#$Doff+0]      @ d.lo
        ldr     $Ehi,[sp,#$Doff+4]      @ d.hi

        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3           @ T += K[i]
        adds    $Elo,$Elo,$Tlo
        adc     $Ehi,$Ehi,$Thi          @ d += T

        and     $t0,$t2,#0xff
        teq     $t0,#$magic
        orreq   $Ktbl,$Ktbl,#1

        ldr     $t2,[sp,#$Boff+0]       @ b.lo
        ldr     $t3,[sp,#$Coff+0]       @ c.lo
        @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
        @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
        @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
        mov     $t0,$Alo,lsr#28
        mov     $t1,$Ahi,lsr#28
        eor     $t0,$t0,$Ahi,lsl#4
        eor     $t1,$t1,$Alo,lsl#4
        eor     $t0,$t0,$Ahi,lsr#2
        eor     $t1,$t1,$Alo,lsr#2
        eor     $t0,$t0,$Alo,lsl#30
        eor     $t1,$t1,$Ahi,lsl#30
        eor     $t0,$t0,$Ahi,lsr#7
        eor     $t1,$t1,$Alo,lsr#7
        eor     $t0,$t0,$Alo,lsl#25
        eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Sigma0(a)

        and     $t0,$Alo,$t2
        orr     $Alo,$Alo,$t2
        ldr     $t1,[sp,#$Boff+4]       @ b.hi
        ldr     $t2,[sp,#$Coff+4]       @ c.hi
        and     $Alo,$Alo,$t3
        orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
        and     $t3,$Ahi,$t1
        orr     $Ahi,$Ahi,$t1
        and     $Ahi,$Ahi,$t2
        orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
        adds    $Alo,$Alo,$Tlo
        adc     $Ahi,$Ahi,$Thi          @ h += T

        sub     sp,sp,#8
        add     $Ktbl,$Ktbl,#8
___
}
# Start of the generated module: the K512 table (80 64-bit constants, each
# written as two .words with the high word first), the function prologue,
# the per-block copy of the context values into the stack frame, and the
# .L00_15 loader that assembles one 64-bit message word from eight input
# bytes (byte 0 ends up most significant) and advances $inp by 8.
# The prologue locates K512 as "function address - 640", so it relies on
# the 640-byte table sitting immediately in front of the function.
146 $code=<<___;
147 .text
148 .code   32
149 .type   K512,%object
150 .align  5
151 K512:
152 .word   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
153 .word   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
154 .word   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
155 .word   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
156 .word   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
157 .word   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
158 .word   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
159 .word   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
160 .word   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
161 .word   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
162 .word   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
163 .word   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
164 .word   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
165 .word   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
166 .word   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
167 .word   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
168 .word   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
169 .word   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
170 .word   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
171 .word   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
172 .word   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
173 .word   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
174 .word   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
175 .word   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
176 .word   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
177 .word   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
178 .word   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
179 .word   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
180 .word   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
181 .word   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
182 .word   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
183 .word   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
184 .word   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
185 .word   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
186 .word   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
187 .word   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
188 .word   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
189 .word   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
190 .word   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
191 .word   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
192 .size   K512,.-K512
193
194 .global sha512_block_data_order
195 .type   sha512_block_data_order,%function
196 sha512_block_data_order:
197         sub     r3,pc,#8                @ sha512_block_data_order
198         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
199         stmdb   sp!,{r4-r12,lr}
200         sub     $Ktbl,r3,#640           @ K512
201         sub     sp,sp,#9*8
202
203         ldr     $Elo,[$ctx,#$Eoff+$lo]
204         ldr     $Ehi,[$ctx,#$Eoff+$hi]
205         ldr     $t0, [$ctx,#$Goff+$lo]
206         ldr     $t1, [$ctx,#$Goff+$hi]
207         ldr     $t2, [$ctx,#$Hoff+$lo]
208         ldr     $t3, [$ctx,#$Hoff+$hi]
209 .Loop:
210         str     $t0, [sp,#$Goff+0]
211         str     $t1, [sp,#$Goff+4]
212         str     $t2, [sp,#$Hoff+0]
213         str     $t3, [sp,#$Hoff+4]
214         ldr     $Alo,[$ctx,#$Aoff+$lo]
215         ldr     $Ahi,[$ctx,#$Aoff+$hi]
216         ldr     $Tlo,[$ctx,#$Boff+$lo]
217         ldr     $Thi,[$ctx,#$Boff+$hi]
218         ldr     $t0, [$ctx,#$Coff+$lo]
219         ldr     $t1, [$ctx,#$Coff+$hi]
220         ldr     $t2, [$ctx,#$Doff+$lo]
221         ldr     $t3, [$ctx,#$Doff+$hi]
222         str     $Tlo,[sp,#$Boff+0]
223         str     $Thi,[sp,#$Boff+4]
224         str     $t0, [sp,#$Coff+0]
225         str     $t1, [sp,#$Coff+4]
226         str     $t2, [sp,#$Doff+0]
227         str     $t3, [sp,#$Doff+4]
228         ldr     $Tlo,[$ctx,#$Foff+$lo]
229         ldr     $Thi,[$ctx,#$Foff+$hi]
230         str     $Tlo,[sp,#$Foff+0]
231         str     $Thi,[sp,#$Foff+4]
232
233 .L00_15:
234         ldrb    $Tlo,[$inp,#7]
235         ldrb    $t0, [$inp,#6]
236         ldrb    $t1, [$inp,#5]
237         ldrb    $t2, [$inp,#4]
238         ldrb    $Thi,[$inp,#3]
239         ldrb    $t3, [$inp,#2]
240         orr     $Tlo,$Tlo,$t0,lsl#8
241         ldrb    $t0, [$inp,#1]
242         orr     $Tlo,$Tlo,$t1,lsl#16
243         ldrb    $t1, [$inp],#8
244         orr     $Tlo,$Tlo,$t2,lsl#24
245         orr     $Thi,$Thi,$t3,lsl#8
246         orr     $Thi,$Thi,$t0,lsl#16
247         orr     $Thi,$Thi,$t1,lsl#24
248         str     $Tlo,[sp,#$Xoff+0]
249         str     $Thi,[sp,#$Xoff+4]
250 ___
# Round body for rounds 0-15.  0x94 is the low byte of K[15]
# (0xc19bf174,0xcf692694): once that constant has been used the round body
# sets bit 0 of $Ktbl, which is what terminates the .L00_15 loop.  This
# only works because no earlier constant in the loop has the same low byte.
251         &BODY_00_15(0x94);
# Tail of the .L00_15 loop (repeat until the round body has flagged the end
# in bit 0 of $Ktbl, then clear the flag), followed by the message schedule
# for rounds 16-79:
#   X[i] = sigma0(X[i-15]) + sigma1(X[i-2]) + X[i-7] + X[i-16]
# Because sp drops by 8 after every round, X[i-k] is found at
# sp+$Xoff+8*k.  The backquoted offset expressions are left in the text
# here and evaluated by the post-processing step at the end of the script.
252 $code.=<<___;
253         tst     $Ktbl,#1
254         beq     .L00_15
255         bic     $Ktbl,$Ktbl,#1
256
257 .L16_79:
258         ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
259         ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
260         ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
261         ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
262
263         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
264         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
265         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
266         mov     $Tlo,$t0,lsr#1
267         mov     $Thi,$t1,lsr#1
268         eor     $Tlo,$Tlo,$t1,lsl#31
269         eor     $Thi,$Thi,$t0,lsl#31
270         eor     $Tlo,$Tlo,$t0,lsr#8
271         eor     $Thi,$Thi,$t1,lsr#8
272         eor     $Tlo,$Tlo,$t1,lsl#24
273         eor     $Thi,$Thi,$t0,lsl#24
274         eor     $Tlo,$Tlo,$t0,lsr#7
275         eor     $Thi,$Thi,$t1,lsr#7
276         eor     $Tlo,$Tlo,$t1,lsl#25
277
278         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
279         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
280         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
281         mov     $t0,$t2,lsr#19
282         mov     $t1,$t3,lsr#19
283         eor     $t0,$t0,$t3,lsl#13
284         eor     $t1,$t1,$t2,lsl#13
285         eor     $t0,$t0,$t3,lsr#29
286         eor     $t1,$t1,$t2,lsr#29
287         eor     $t0,$t0,$t2,lsl#3
288         eor     $t1,$t1,$t3,lsl#3
289         eor     $t0,$t0,$t2,lsr#6
290         eor     $t1,$t1,$t3,lsr#6
291         eor     $t0,$t0,$t3,lsl#26
292
293         ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
294         ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
295         adds    $Tlo,$Tlo,$t0
296         adc     $Thi,$Thi,$t1
297
298         ldr     $t0,[sp,#`$Xoff+8*16`+0]
299         ldr     $t1,[sp,#`$Xoff+8*16`+4]
300         adds    $Tlo,$Tlo,$t2
301         adc     $Thi,$Thi,$t3
302         adds    $Tlo,$Tlo,$t0
303         adc     $Thi,$Thi,$t1
304         str     $Tlo,[sp,#$Xoff+0]
305         str     $Thi,[sp,#$Xoff+4]
306 ___
# Round body for rounds 16-79.  0x17 is the low byte of the final constant
# K[79] (0x6c44198c,0x4a475817) and ends the .L16_79 loop the same way
# 0x94 ends the first one.
307         &BODY_00_15(0x17);
# Tail of the .L16_79 loop, then the per-block wrap-up: each of the eight
# working variables (a and e from registers, the rest from the stack frame)
# is added to the matching context element and stored back.  The freshly
# computed e, g and h values stay in $Elo:$Ehi and $t0-$t3, which is where
# the top of .Loop expects them.  sp and $Ktbl are then rewound by the 640
# bytes that the 80 rounds moved them, and the code loops until $inp
# reaches the end pointer kept in $len.  The epilogue drops the frame,
# restores r4-r12 and lr, and returns with "mov pc,lr" when bit 0 of lr is
# clear and with "bx lr" otherwise.
308 $code.=<<___;
309         tst     $Ktbl,#1
310         beq     .L16_79
311         bic     $Ktbl,$Ktbl,#1
312
313         ldr     $Tlo,[sp,#$Boff+0]
314         ldr     $Thi,[sp,#$Boff+4]
315         ldr     $t0, [$ctx,#$Aoff+$lo]
316         ldr     $t1, [$ctx,#$Aoff+$hi]
317         ldr     $t2, [$ctx,#$Boff+$lo]
318         ldr     $t3, [$ctx,#$Boff+$hi]
319         adds    $t0,$Alo,$t0
320         adc     $t1,$Ahi,$t1
321         adds    $t2,$Tlo,$t2
322         adc     $t3,$Thi,$t3
323         str     $t0, [$ctx,#$Aoff+$lo]
324         str     $t1, [$ctx,#$Aoff+$hi]
325         str     $t2, [$ctx,#$Boff+$lo]
326         str     $t3, [$ctx,#$Boff+$hi]
327
328         ldr     $Alo,[sp,#$Coff+0]
329         ldr     $Ahi,[sp,#$Coff+4]
330         ldr     $Tlo,[sp,#$Doff+0]
331         ldr     $Thi,[sp,#$Doff+4]
332         ldr     $t0, [$ctx,#$Coff+$lo]
333         ldr     $t1, [$ctx,#$Coff+$hi]
334         ldr     $t2, [$ctx,#$Doff+$lo]
335         ldr     $t3, [$ctx,#$Doff+$hi]
336         adds    $t0,$Alo,$t0
337         adc     $t1,$Ahi,$t1
338         adds    $t2,$Tlo,$t2
339         adc     $t3,$Thi,$t3
340         str     $t0, [$ctx,#$Coff+$lo]
341         str     $t1, [$ctx,#$Coff+$hi]
342         str     $t2, [$ctx,#$Doff+$lo]
343         str     $t3, [$ctx,#$Doff+$hi]
344
345         ldr     $Tlo,[sp,#$Foff+0]
346         ldr     $Thi,[sp,#$Foff+4]
347         ldr     $t0, [$ctx,#$Eoff+$lo]
348         ldr     $t1, [$ctx,#$Eoff+$hi]
349         ldr     $t2, [$ctx,#$Foff+$lo]
350         ldr     $t3, [$ctx,#$Foff+$hi]
351         adds    $Elo,$Elo,$t0
352         adc     $Ehi,$Ehi,$t1
353         adds    $t2,$Tlo,$t2
354         adc     $t3,$Thi,$t3
355         str     $Elo,[$ctx,#$Eoff+$lo]
356         str     $Ehi,[$ctx,#$Eoff+$hi]
357         str     $t2, [$ctx,#$Foff+$lo]
358         str     $t3, [$ctx,#$Foff+$hi]
359
360         ldr     $Alo,[sp,#$Goff+0]
361         ldr     $Ahi,[sp,#$Goff+4]
362         ldr     $Tlo,[sp,#$Hoff+0]
363         ldr     $Thi,[sp,#$Hoff+4]
364         ldr     $t0, [$ctx,#$Goff+$lo]
365         ldr     $t1, [$ctx,#$Goff+$hi]
366         ldr     $t2, [$ctx,#$Hoff+$lo]
367         ldr     $t3, [$ctx,#$Hoff+$hi]
368         adds    $t0,$Alo,$t0
369         adc     $t1,$Ahi,$t1
370         adds    $t2,$Tlo,$t2
371         adc     $t3,$Thi,$t3
372         str     $t0, [$ctx,#$Goff+$lo]
373         str     $t1, [$ctx,#$Goff+$hi]
374         str     $t2, [$ctx,#$Hoff+$lo]
375         str     $t3, [$ctx,#$Hoff+$hi]
376
377         add     sp,sp,#640
378         sub     $Ktbl,$Ktbl,#640
379
380         teq     $inp,$len
381         bne     .Loop
382
383         add     sp,sp,#8*9              @ destroy frame
384         ldmia   sp!,{r4-r12,lr}
385         tst     lr,#1
386         moveq   pc,lr                   @ be binary compatible with V4, yet
387         bx      lr                      @ interoperable with Thumb ISA:-)
388 .size   sha512_block_data_order,.-sha512_block_data_order
389 .asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
390 ___
391
# Post-process the generated text, then emit it on STDOUT.
#
# 1. Evaluate every backquoted expression (the stack offsets written as
#    arithmetic in the templates) and splice in the resulting value.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. Spell "bx lr" as its raw encoding.  The epilogue only reaches that
#    instruction when bit 0 of lr is set, so the code already runs on
#    plain ARMv4, but an assembler invoked with -march=armv4 rejects the
#    mnemonic itself; the .word form assembles everywhere.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;
print $code;