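# Page header left over from the gitweb view this file was copied from:
#   commit title (truncated): "Yet another ARM update. It appears to be more appropriate to make"
#   path: [openssl.git] / crypto / sha / asm / sha512-armv4.pl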
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte.

# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in h[0-7],
# namely with most significant dword at *lower* address, which is
# reflected in below two parameters. *Byte* order within these dwords
# in turn is whatever *native* byte order on current platform.
$hi=0;
$lo=4;
# ====================================================================

# The first command-line argument names the file that receives the
# generated assembly.  Without an argument the text is written to the
# STDOUT inherited from the caller.  The three-argument form of open is
# used so the name is never interpreted as a mode or a pipe, and a name
# that cannot be opened is a fatal error instead of a silent fallback.
$output=shift;
if (defined($output) && length($output)) {
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation used by the generated code.
$ctx="r0";              # 1st argument: hash context (h[0-7])
$inp="r1";              # 2nd argument: input pointer
$len="r2";              # 3rd argument: block count, turned into end-of-input pointer
$Tlo="r3";              # T accumulator, low dword
$Thi="r4";              # T accumulator, high dword
$Alo="r5";              # working variable a
$Ahi="r6";
$Elo="r7";              # working variable e
$Ehi="r8";
$t0="r9";               # scratch
$t1="r10";
$t2="r11";
$t3="r12";
############    r13 is stack pointer
$Ktbl="r14";            # pointer into K512; bit 0 doubles as end-of-loop flag
############    r15 is program counter

# Byte offsets of the eight 64-bit working variables.  They are used
# both relative to $ctx (together with $hi/$lo) and relative to sp for
# the on-stack copy; $Xoff is the stack slot of the current message
# schedule word X[i].
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

# BODY_00_15($magic) -- append the assembly for one SHA-512 round to $code.
#
# The caller emits this round body twice: once inside the .L00_15 loop and
# once inside the .L16_79 loop.  State the emitted instructions rely on:
#   $Tlo:$Thi   X[i] on entry (the callers leave it in these registers
#               after storing it at [sp,#$Xoff]); accumulates T afterwards
#   $Alo:$Ahi   working variable a
#   $Elo:$Ehi   working variable e
#   [sp,#...]   b, c, d, f, g, h at $Boff..$Hoff; a and e are written to
#               their own slots during the round
#   $Ktbl       pointer to K[i] (high word at +0, low word at +4)
# On exit $Alo:$Ahi and $Elo:$Ehi hold the new a and e, sp has been lowered
# by 8 so that every stack slot is addressed one position later (the slot
# written as a is now read as b, X[i] is now X[i-1], ...), and $Ktbl has
# been advanced to K[i+1].
#
# $magic is the low byte of K[i].lo for the last round of the calling loop.
# When the constant just used matches it, bit 0 of $Ktbl is set; the caller
# tests that bit to leave its loop and then clears it.  The value is
# interpolated into the "teq" immediate in decimal.
#
# The sub takes one argument, so it is declared without a prototype; the
# former empty prototype "()" contradicted that and was only bypassed by
# the &-form used at the call sites, which keeps working unchanged.
sub BODY_00_15 {
my ($magic) = @_;
$code.=<<___;
        ldr     $t2,[sp,#$Hoff+0]       @ h.lo
        ldr     $t3,[sp,#$Hoff+4]       @ h.hi
        @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
        @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
        @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
        mov     $t0,$Elo,lsr#14
        mov     $t1,$Ehi,lsr#14
        eor     $t0,$t0,$Ehi,lsl#18
        eor     $t1,$t1,$Elo,lsl#18
        eor     $t0,$t0,$Elo,lsr#18
        eor     $t1,$t1,$Ehi,lsr#18
        eor     $t0,$t0,$Ehi,lsl#14
        eor     $t1,$t1,$Elo,lsl#14
        eor     $t0,$t0,$Ehi,lsr#9
        eor     $t1,$t1,$Elo,lsr#9
        eor     $t0,$t0,$Elo,lsl#23
        eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3           @ T += h

        ldr     $t0,[sp,#$Foff+0]       @ f.lo
        ldr     $t1,[sp,#$Foff+4]       @ f.hi
        ldr     $t2,[sp,#$Goff+0]       @ g.lo
        ldr     $t3,[sp,#$Goff+4]       @ g.hi
        str     $Elo,[sp,#$Eoff+0]
        str     $Ehi,[sp,#$Eoff+4]
        str     $Alo,[sp,#$Aoff+0]
        str     $Ahi,[sp,#$Aoff+4]

        eor     $t0,$t0,$t2
        eor     $t1,$t1,$t3
        and     $t0,$t0,$Elo
        and     $t1,$t1,$Ehi
        eor     $t0,$t0,$t2
        eor     $t1,$t1,$t3             @ Ch(e,f,g)

        ldr     $t2,[$Ktbl,#4]          @ K[i].lo
        ldr     $t3,[$Ktbl,#0]          @ K[i].hi
        ldr     $Elo,[sp,#$Doff+0]      @ d.lo
        ldr     $Ehi,[sp,#$Doff+4]      @ d.hi

        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3           @ T += K[i]
        adds    $Elo,$Elo,$Tlo
        adc     $Ehi,$Ehi,$Thi          @ d += T

        and     $t0,$t2,#0xff
        teq     $t0,#$magic
        orreq   $Ktbl,$Ktbl,#1

        ldr     $t2,[sp,#$Boff+0]       @ b.lo
        ldr     $t3,[sp,#$Coff+0]       @ c.lo
        @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
        @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
        @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
        mov     $t0,$Alo,lsr#28
        mov     $t1,$Ahi,lsr#28
        eor     $t0,$t0,$Ahi,lsl#4
        eor     $t1,$t1,$Alo,lsl#4
        eor     $t0,$t0,$Ahi,lsr#2
        eor     $t1,$t1,$Alo,lsr#2
        eor     $t0,$t0,$Alo,lsl#30
        eor     $t1,$t1,$Ahi,lsl#30
        eor     $t0,$t0,$Ahi,lsr#7
        eor     $t1,$t1,$Alo,lsr#7
        eor     $t0,$t0,$Alo,lsl#25
        eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1           @ T += Sigma0(a)

        and     $t0,$Alo,$t2
        orr     $Alo,$Alo,$t2
        ldr     $t1,[sp,#$Boff+4]       @ b.hi
        ldr     $t2,[sp,#$Coff+4]       @ c.hi
        and     $Alo,$Alo,$t3
        orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
        and     $t3,$Ahi,$t1
        orr     $Ahi,$Ahi,$t1
        and     $Ahi,$Ahi,$t2
        orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
        adds    $Alo,$Alo,$Tlo
        adc     $Ahi,$Ahi,$Thi          @ h += T

        sub     sp,sp,#8
        add     $Ktbl,$Ktbl,#8
___
}
# Start of the generated assembly text.
#
# K512 holds the 80 SHA-512 round constants as (high word, low word) pairs,
# 640 bytes in total, placed immediately before the function so that the
# prologue can find the table with pc-relative arithmetic (entry - 640).
#
# sha512_block_data_order prologue: $len is turned into an end-of-input
# pointer ($inp + $len*128), r4-r12 and lr are saved, and a 9*8-byte frame
# (a..h plus X[i]) is reserved on the stack.  e is kept in $Elo:$Ehi; g and
# h are pre-loaded into $t0..$t3 because .Loop stores them first.
#
# .Loop (once per input block) copies the working variables from the
# context to the frame, leaving a in $Alo:$Ahi.
# .L00_15 assembles the next big-endian 64-bit input word byte by byte
# into $Tlo:$Thi, stores it as X[i] and falls into the round body.
$code=<<___;
.text
.code   32
.type   K512,%object
.align  5
K512:
.word   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.word   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.word   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.word   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.word   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.word   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.word   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.word   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.word   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.word   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.word   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.word   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.word   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.word   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.word   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.word   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.word   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.word   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.word   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.word   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.word   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.word   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.word   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.word   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.word   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.word   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.word   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.word   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.word   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.word   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.word   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.word   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.word   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.word   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.word   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.word   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.word   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.word   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.word   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.word   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
.size   K512,.-K512

.global sha512_block_data_order
.type   sha512_block_data_order,%function
sha512_block_data_order:
        sub     r3,pc,#8                @ sha512_block_data_order
        add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
        stmdb   sp!,{r4-r12,lr}
        sub     $Ktbl,r3,#640           @ K512
        sub     sp,sp,#9*8

        ldr     $Elo,[$ctx,#$Eoff+$lo]
        ldr     $Ehi,[$ctx,#$Eoff+$hi]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
.Loop:
        str     $t0, [sp,#$Goff+0]
        str     $t1, [sp,#$Goff+4]
        str     $t2, [sp,#$Hoff+0]
        str     $t3, [sp,#$Hoff+4]
        ldr     $Alo,[$ctx,#$Aoff+$lo]
        ldr     $Ahi,[$ctx,#$Aoff+$hi]
        ldr     $Tlo,[$ctx,#$Boff+$lo]
        ldr     $Thi,[$ctx,#$Boff+$hi]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        str     $Tlo,[sp,#$Boff+0]
        str     $Thi,[sp,#$Boff+4]
        str     $t0, [sp,#$Coff+0]
        str     $t1, [sp,#$Coff+4]
        str     $t2, [sp,#$Doff+0]
        str     $t3, [sp,#$Doff+4]
        ldr     $Tlo,[$ctx,#$Foff+$lo]
        ldr     $Thi,[$ctx,#$Foff+$hi]
        str     $Tlo,[sp,#$Foff+0]
        str     $Thi,[sp,#$Foff+4]

.L00_15:
        ldrb    $Tlo,[$inp,#7]
        ldrb    $t0, [$inp,#6]
        ldrb    $t1, [$inp,#5]
        ldrb    $t2, [$inp,#4]
        ldrb    $Thi,[$inp,#3]
        ldrb    $t3, [$inp,#2]
        orr     $Tlo,$Tlo,$t0,lsl#8
        ldrb    $t0, [$inp,#1]
        orr     $Tlo,$Tlo,$t1,lsl#16
        ldrb    $t1, [$inp],#8
        orr     $Tlo,$Tlo,$t2,lsl#24
        orr     $Thi,$Thi,$t3,lsl#8
        orr     $Thi,$Thi,$t0,lsl#16
        orr     $Thi,$Thi,$t1,lsl#24
        str     $Tlo,[sp,#$Xoff+0]
        str     $Thi,[sp,#$Xoff+4]
___
# Round body for rounds 0..15.  0x94 is the low byte of K[15].lo, so the
# end-of-loop flag (bit 0 of $Ktbl) is raised after the 16th round.
# The &-form call is kept deliberately: it is valid whether or not
# BODY_00_15 is declared with a prototype.
        &BODY_00_15(0x94);
# Loop control for .L00_15, followed by the message schedule for rounds
# 16..79:
#   X[i] = sigma0(X[i-15]) + sigma1(X[i-2]) + X[i-7] + X[i-16]
# Because the round body lowers sp by 8 per round, X[i-k] is found at
# [sp,#$Xoff+8*k].  The back-quoted offset expressions are evaluated by
# the substitution pass that runs before the text is printed.
$code.=<<___;
        tst     $Ktbl,#1
        beq     .L00_15
        bic     $Ktbl,$Ktbl,#1

.L16_79:
        ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
        ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
        ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
        ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]

        @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
        @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
        @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
        mov     $Tlo,$t0,lsr#1
        mov     $Thi,$t1,lsr#1
        eor     $Tlo,$Tlo,$t1,lsl#31
        eor     $Thi,$Thi,$t0,lsl#31
        eor     $Tlo,$Tlo,$t0,lsr#8
        eor     $Thi,$Thi,$t1,lsr#8
        eor     $Tlo,$Tlo,$t1,lsl#24
        eor     $Thi,$Thi,$t0,lsl#24
        eor     $Tlo,$Tlo,$t0,lsr#7
        eor     $Thi,$Thi,$t1,lsr#7
        eor     $Tlo,$Tlo,$t1,lsl#25

        @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
        @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
        @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
        mov     $t0,$t2,lsr#19
        mov     $t1,$t3,lsr#19
        eor     $t0,$t0,$t3,lsl#13
        eor     $t1,$t1,$t2,lsl#13
        eor     $t0,$t0,$t3,lsr#29
        eor     $t1,$t1,$t2,lsr#29
        eor     $t0,$t0,$t2,lsl#3
        eor     $t1,$t1,$t3,lsl#3
        eor     $t0,$t0,$t2,lsr#6
        eor     $t1,$t1,$t3,lsr#6
        eor     $t0,$t0,$t3,lsl#26

        ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
        ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1

        ldr     $t0,[sp,#`$Xoff+8*16`+0]
        ldr     $t1,[sp,#`$Xoff+8*16`+4]
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1
        str     $Tlo,[sp,#$Xoff+0]
        str     $Thi,[sp,#$Xoff+4]
___
# Round body for rounds 16..79.  0x17 is the low byte of K[79].lo, so the
# end-of-loop flag is raised after the last round.  The &-form call is
# kept deliberately: it is valid whether or not BODY_00_15 is declared
# with a prototype.
        &BODY_00_15(0x17);
# Loop control for .L16_79, then the per-block wrap-up: the eight working
# variables (a and e from registers, the rest from the frame) are added to
# the context values and stored back; sp and $Ktbl are rewound by the
# 640 bytes (80 rounds * 8) they moved; the code loops until $inp reaches
# the end pointer in $len.  The updated g and h are left in $t0..$t3 and
# the updated e in $Elo:$Ehi, which is what .Loop expects on re-entry.
# The epilogue drops the frame, restores the saved registers and returns
# with "mov pc,lr" when bit 0 of lr is clear, otherwise with "bx lr".
$code.=<<___;
        tst     $Ktbl,#1
        beq     .L16_79
        bic     $Ktbl,$Ktbl,#1

        ldr     $Tlo,[sp,#$Boff+0]
        ldr     $Thi,[sp,#$Boff+4]
        ldr     $t0, [$ctx,#$Aoff+$lo]
        ldr     $t1, [$ctx,#$Aoff+$hi]
        ldr     $t2, [$ctx,#$Boff+$lo]
        ldr     $t3, [$ctx,#$Boff+$hi]
        adds    $t0,$Alo,$t0
        adc     $t1,$Ahi,$t1
        adds    $t2,$Tlo,$t2
        adc     $t3,$Thi,$t3
        str     $t0, [$ctx,#$Aoff+$lo]
        str     $t1, [$ctx,#$Aoff+$hi]
        str     $t2, [$ctx,#$Boff+$lo]
        str     $t3, [$ctx,#$Boff+$hi]

        ldr     $Alo,[sp,#$Coff+0]
        ldr     $Ahi,[sp,#$Coff+4]
        ldr     $Tlo,[sp,#$Doff+0]
        ldr     $Thi,[sp,#$Doff+4]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        adds    $t0,$Alo,$t0
        adc     $t1,$Ahi,$t1
        adds    $t2,$Tlo,$t2
        adc     $t3,$Thi,$t3
        str     $t0, [$ctx,#$Coff+$lo]
        str     $t1, [$ctx,#$Coff+$hi]
        str     $t2, [$ctx,#$Doff+$lo]
        str     $t3, [$ctx,#$Doff+$hi]

        ldr     $Tlo,[sp,#$Foff+0]
        ldr     $Thi,[sp,#$Foff+4]
        ldr     $t0, [$ctx,#$Eoff+$lo]
        ldr     $t1, [$ctx,#$Eoff+$hi]
        ldr     $t2, [$ctx,#$Foff+$lo]
        ldr     $t3, [$ctx,#$Foff+$hi]
        adds    $Elo,$Elo,$t0
        adc     $Ehi,$Ehi,$t1
        adds    $t2,$Tlo,$t2
        adc     $t3,$Thi,$t3
        str     $Elo,[$ctx,#$Eoff+$lo]
        str     $Ehi,[$ctx,#$Eoff+$hi]
        str     $t2, [$ctx,#$Foff+$lo]
        str     $t3, [$ctx,#$Foff+$hi]

        ldr     $Alo,[sp,#$Goff+0]
        ldr     $Ahi,[sp,#$Goff+4]
        ldr     $Tlo,[sp,#$Hoff+0]
        ldr     $Thi,[sp,#$Hoff+4]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
        adds    $t0,$Alo,$t0
        adc     $t1,$Ahi,$t1
        adds    $t2,$Tlo,$t2
        adc     $t3,$Thi,$t3
        str     $t0, [$ctx,#$Goff+$lo]
        str     $t1, [$ctx,#$Goff+$hi]
        str     $t2, [$ctx,#$Hoff+$lo]
        str     $t3, [$ctx,#$Hoff+$hi]

        add     sp,sp,#640
        sub     $Ktbl,$Ktbl,#640

        teq     $inp,$len
        bne     .Loop

        add     sp,sp,#8*9              @ destroy frame
        ldmia   sp!,{r4-r12,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
.size   sha512_block_data_order,.-sha512_block_data_order
.asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___

# Post-process the accumulated assembly text and write it out.
# 1. Each back-quoted span is replaced by the result of evaluating its
#    contents as a Perl expression (used for the X[] frame offsets).
$code =~ s/\`([^\`]*)\`/eval $1/gem;
# 2. "bx lr" is replaced by the literal instruction word.
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
print $code;
# Closing flushes the buffered output; a failure here (full disk, I/O
# error) would otherwise leave a truncated assembly file unnoticed.
close STDOUT or die "error closing STDOUT: $!"; # enforce flush