ARM assembler pack: reschedule instructions for dual-issue pipeline [from HEAD].
[openssl.git] / crypto / sha / asm / sha512-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA512 block procedure for ARMv4. September 2007.
11
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
15 #
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
20
21 # Byte order [in]dependence. =========================================
22 #
23 # Caller is expected to maintain specific *dword* order in h[0-7],
24 # namely with most significant dword at *lower* address, which is
25 # reflected in below two parameters. *Byte* order within these dwords
26 # in turn is whatever *native* byte order on current platform.
# Byte offsets added to a variable's context offset to reach its high and
# low 32-bit halves, e.g. [$ctx,#$Eoff+$hi]: the high half is stored first.
27 $hi=0;
28 $lo=4;
29 # ====================================================================
30
# Select the output file: consume leading command-line arguments until one
# looks like a plain file name ("name.ext", no directory part).  If one was
# found, redirect STDOUT to it; otherwise STDOUT is left untouched and the
# assembly goes wherever STDOUT already points.  The three-argument form of
# open keeps the name from being parsed for a mode, and a failed open is
# fatal instead of being silently ignored.
31 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
32 if (defined($output) && length($output)) { open STDOUT,">",$output or die "can't open $output: $!"; }
33
# Symbolic names for the ARM registers used throughout the generated code.
# $ctx is the base for every load/store of the hash state, $inp walks the
# input, and $len is turned into an end-of-input pointer in the prologue.
# $Tlo/$Thi, $Alo/$Ahi and $Elo/$Ehi each hold one 64-bit quantity as a
# low/high pair of 32-bit registers; $t0-$t3 are scratch.  $Ktbl points into
# the K512 table, and its bit 0 is also used as the round-loop exit flag.
34 $ctx="r0";
35 $inp="r1";
36 $len="r2";
37 $Tlo="r3";
38 $Thi="r4";
39 $Alo="r5";
40 $Ahi="r6";
41 $Elo="r7";
42 $Ehi="r8";
43 $t0="r9";
44 $t1="r10";
45 $t2="r11";
46 $t3="r12";
47 ############    r13 is stack pointer
48 $Ktbl="r14";
49 ############    r15 is program counter
50
# Byte offsets of the eight 64-bit working variables a..h.  The same values
# are used as offsets into the context at $ctx and into the stack frame at
# sp.  $Xoff is the frame slot directly above h, where the message word for
# the current round is stored.
51 $Aoff=8*0;
52 $Boff=8*1;
53 $Coff=8*2;
54 $Doff=8*3;
55 $Eoff=8*4;
56 $Foff=8*5;
57 $Goff=8*6;
58 $Hoff=8*7;
59 $Xoff=8*8;
60
# BODY_00_15($magic): append the assembly for one SHA-512 round to $code.
#
# On entry the generated code expects T ($Tlo/$Thi) to hold the message word
# for this round (both callers store it at [sp,#$Xoff] just before this
# body), e in $Elo/$Ehi and a in $Alo/$Ahi; the remaining working variables
# are read from the stack frame.  The round then
#   - adds Sigma1(e), h, Ch(e,f,g) and K[i] to T;
#   - stores e and a to their frame slots and leaves d+T in $Elo/$Ehi;
#   - adds Sigma0(a) to T and leaves Maj(a,b,c)+T in $Alo/$Ahi;
#   - lowers sp by 8 and advances $Ktbl by 8, so every frame slot written in
#     this round is addressed one slot higher in the next round.
#
# $magic is compared with the low byte of K[i].lo; on a match bit 0 of $Ktbl
# is set.  The callers test that bit ("tst $Ktbl,#1") to leave their loop and
# clear it again with "bic".
#
# The sub is declared without a prototype because it takes one argument; an
# empty "()" prototype is only tolerated by the &-form of the call.
61 sub BODY_00_15 {
62 my $magic = shift;
63 $code.=<<___;
64         ldr     $t2,[sp,#$Hoff+0]       @ h.lo
65         ldr     $t3,[sp,#$Hoff+4]       @ h.hi
66         @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
67         @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
68         @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
69         mov     $t0,$Elo,lsr#14
70         mov     $t1,$Ehi,lsr#14
71         eor     $t0,$t0,$Ehi,lsl#18
72         eor     $t1,$t1,$Elo,lsl#18
73         eor     $t0,$t0,$Elo,lsr#18
74         eor     $t1,$t1,$Ehi,lsr#18
75         eor     $t0,$t0,$Ehi,lsl#14
76         eor     $t1,$t1,$Elo,lsl#14
77         eor     $t0,$t0,$Ehi,lsr#9
78         eor     $t1,$t1,$Elo,lsr#9
79         eor     $t0,$t0,$Elo,lsl#23
80         eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
81         adds    $Tlo,$Tlo,$t0
82         ldr     $t0,[sp,#$Foff+0]       @ f.lo
83         adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
84         ldr     $t1,[sp,#$Foff+4]       @ f.hi
85         adds    $Tlo,$Tlo,$t2
86         ldr     $t2,[sp,#$Goff+0]       @ g.lo
87         adc     $Thi,$Thi,$t3           @ T += h
88         ldr     $t3,[sp,#$Goff+4]       @ g.hi
89
90         eor     $t0,$t0,$t2
91         str     $Elo,[sp,#$Eoff+0]
92         eor     $t1,$t1,$t3
93         str     $Ehi,[sp,#$Eoff+4]
94         and     $t0,$t0,$Elo
95         str     $Alo,[sp,#$Aoff+0]
96         and     $t1,$t1,$Ehi
97         str     $Ahi,[sp,#$Aoff+4]
98         eor     $t0,$t0,$t2
99         ldr     $t2,[$Ktbl,#4]          @ K[i].lo
100         eor     $t1,$t1,$t3             @ Ch(e,f,g)
101         ldr     $t3,[$Ktbl,#0]          @ K[i].hi
102
103         adds    $Tlo,$Tlo,$t0
104         ldr     $Elo,[sp,#$Doff+0]      @ d.lo
105         adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
106         ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
107         adds    $Tlo,$Tlo,$t2
108         adc     $Thi,$Thi,$t3           @ T += K[i]
109         adds    $Elo,$Elo,$Tlo
110         adc     $Ehi,$Ehi,$Thi          @ d += T
111
112         and     $t0,$t2,#0xff
113         teq     $t0,#$magic
114         orreq   $Ktbl,$Ktbl,#1
115
116         ldr     $t2,[sp,#$Boff+0]       @ b.lo
117         ldr     $t3,[sp,#$Coff+0]       @ c.lo
118         @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
119         @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
120         @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
121         mov     $t0,$Alo,lsr#28
122         mov     $t1,$Ahi,lsr#28
123         eor     $t0,$t0,$Ahi,lsl#4
124         eor     $t1,$t1,$Alo,lsl#4
125         eor     $t0,$t0,$Ahi,lsr#2
126         eor     $t1,$t1,$Alo,lsr#2
127         eor     $t0,$t0,$Alo,lsl#30
128         eor     $t1,$t1,$Ahi,lsl#30
129         eor     $t0,$t0,$Ahi,lsr#7
130         eor     $t1,$t1,$Alo,lsr#7
131         eor     $t0,$t0,$Alo,lsl#25
132         eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
133         adds    $Tlo,$Tlo,$t0
134         adc     $Thi,$Thi,$t1           @ T += Sigma0(a)
135
136         and     $t0,$Alo,$t2
137         orr     $Alo,$Alo,$t2
138         ldr     $t1,[sp,#$Boff+4]       @ b.hi
139         ldr     $t2,[sp,#$Coff+4]       @ c.hi
140         and     $Alo,$Alo,$t3
141         orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
142         and     $t3,$Ahi,$t1
143         orr     $Ahi,$Ahi,$t1
144         and     $Ahi,$Ahi,$t2
145         orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
146         adds    $Alo,$Alo,$Tlo
147         adc     $Ahi,$Ahi,$Thi          @ h += T
148
149         sub     sp,sp,#8
150         add     $Ktbl,$Ktbl,#8
151 ___
152 }
# Start of the generated assembly.  This part holds:
#   - K512, the table of 80 64-bit round constants, each emitted as a high
#     word followed by a low word (640 bytes in total);
#   - the function prologue, which locates K512 relative to pc, turns $len
#     into an end-of-input pointer, saves registers and reserves a frame of
#     nine 64-bit slots;
#   - .Loop, which copies the hash state from the context into the frame
#     (a and e are kept in registers);
#   - the head of .L00_15, which assembles the next 8 input bytes, most
#     significant byte first, into T and stores it in the X slot.
# The round body itself is appended by BODY_00_15(0x94); 0x94 is the low
# byte of the sixteenth constant's low word (0xcf692694).
153 $code=<<___;
154 .text
155 .code   32
156 .type   K512,%object
157 .align  5
158 K512:
159 .word   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
160 .word   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
161 .word   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
162 .word   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
163 .word   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
164 .word   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
165 .word   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
166 .word   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
167 .word   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
168 .word   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
169 .word   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
170 .word   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
171 .word   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
172 .word   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
173 .word   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
174 .word   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
175 .word   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
176 .word   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
177 .word   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
178 .word   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
179 .word   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
180 .word   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
181 .word   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
182 .word   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
183 .word   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
184 .word   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
185 .word   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
186 .word   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
187 .word   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
188 .word   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
189 .word   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
190 .word   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
191 .word   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
192 .word   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
193 .word   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
194 .word   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
195 .word   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
196 .word   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
197 .word   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
198 .word   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
199 .size   K512,.-K512
200
201 .global sha512_block_data_order
202 .type   sha512_block_data_order,%function
203 sha512_block_data_order:
204         sub     r3,pc,#8                @ sha512_block_data_order
205         add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
206         stmdb   sp!,{r4-r12,lr}
207         sub     $Ktbl,r3,#640           @ K512
208         sub     sp,sp,#9*8
209
210         ldr     $Elo,[$ctx,#$Eoff+$lo]
211         ldr     $Ehi,[$ctx,#$Eoff+$hi]
212         ldr     $t0, [$ctx,#$Goff+$lo]
213         ldr     $t1, [$ctx,#$Goff+$hi]
214         ldr     $t2, [$ctx,#$Hoff+$lo]
215         ldr     $t3, [$ctx,#$Hoff+$hi]
216 .Loop:
217         str     $t0, [sp,#$Goff+0]
218         str     $t1, [sp,#$Goff+4]
219         str     $t2, [sp,#$Hoff+0]
220         str     $t3, [sp,#$Hoff+4]
221         ldr     $Alo,[$ctx,#$Aoff+$lo]
222         ldr     $Ahi,[$ctx,#$Aoff+$hi]
223         ldr     $Tlo,[$ctx,#$Boff+$lo]
224         ldr     $Thi,[$ctx,#$Boff+$hi]
225         ldr     $t0, [$ctx,#$Coff+$lo]
226         ldr     $t1, [$ctx,#$Coff+$hi]
227         ldr     $t2, [$ctx,#$Doff+$lo]
228         ldr     $t3, [$ctx,#$Doff+$hi]
229         str     $Tlo,[sp,#$Boff+0]
230         str     $Thi,[sp,#$Boff+4]
231         str     $t0, [sp,#$Coff+0]
232         str     $t1, [sp,#$Coff+4]
233         str     $t2, [sp,#$Doff+0]
234         str     $t3, [sp,#$Doff+4]
235         ldr     $Tlo,[$ctx,#$Foff+$lo]
236         ldr     $Thi,[$ctx,#$Foff+$hi]
237         str     $Tlo,[sp,#$Foff+0]
238         str     $Thi,[sp,#$Foff+4]
239
240 .L00_15:
241         ldrb    $Tlo,[$inp,#7]
242         ldrb    $t0, [$inp,#6]
243         ldrb    $t1, [$inp,#5]
244         ldrb    $t2, [$inp,#4]
245         ldrb    $Thi,[$inp,#3]
246         ldrb    $t3, [$inp,#2]
247         orr     $Tlo,$Tlo,$t0,lsl#8
248         ldrb    $t0, [$inp,#1]
249         orr     $Tlo,$Tlo,$t1,lsl#16
250         ldrb    $t1, [$inp],#8
251         orr     $Tlo,$Tlo,$t2,lsl#24
252         orr     $Thi,$Thi,$t3,lsl#8
253         orr     $Thi,$Thi,$t0,lsl#16
254         orr     $Thi,$Thi,$t1,lsl#24
255         str     $Tlo,[sp,#$Xoff+0]
256         str     $Thi,[sp,#$Xoff+4]
257 ___
258         &BODY_00_15(0x94);
# Tail of the .L00_15 loop (repeat until the body has set bit 0 of $Ktbl,
# then clear that bit) followed by the head of .L16_79.  Because sp drops by
# 8 every round, the X slot stored k rounds ago sits at $Xoff+8*k.  The new
# message word is sigma0 of the slot 15 back, plus sigma1 of the slot 2 back,
# plus the slots 7 and 16 back; it is left in T and stored in the current X
# slot.  The backtick-quoted offsets are evaluated as Perl arithmetic when
# $code is post-processed.  The round body is appended by BODY_00_15(0x17);
# 0x17 is the low byte of the last constant's low word (0x4a475817).
259 $code.=<<___;
260         tst     $Ktbl,#1
261         beq     .L00_15
262         bic     $Ktbl,$Ktbl,#1
263
264 .L16_79:
265         ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
266         ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
267         ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
268         ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
269
270         @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
271         @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
272         @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
273         mov     $Tlo,$t0,lsr#1
274         mov     $Thi,$t1,lsr#1
275         eor     $Tlo,$Tlo,$t1,lsl#31
276         eor     $Thi,$Thi,$t0,lsl#31
277         eor     $Tlo,$Tlo,$t0,lsr#8
278         eor     $Thi,$Thi,$t1,lsr#8
279         eor     $Tlo,$Tlo,$t1,lsl#24
280         eor     $Thi,$Thi,$t0,lsl#24
281         eor     $Tlo,$Tlo,$t0,lsr#7
282         eor     $Thi,$Thi,$t1,lsr#7
283         eor     $Tlo,$Tlo,$t1,lsl#25
284
285         @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
286         @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
287         @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
288         mov     $t0,$t2,lsr#19
289         mov     $t1,$t3,lsr#19
290         eor     $t0,$t0,$t3,lsl#13
291         eor     $t1,$t1,$t2,lsl#13
292         eor     $t0,$t0,$t3,lsr#29
293         eor     $t1,$t1,$t2,lsr#29
294         eor     $t0,$t0,$t2,lsl#3
295         eor     $t1,$t1,$t3,lsl#3
296         eor     $t0,$t0,$t2,lsr#6
297         eor     $t1,$t1,$t3,lsr#6
298         eor     $t0,$t0,$t3,lsl#26
299
300         ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
301         ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
302         adds    $Tlo,$Tlo,$t0
303         adc     $Thi,$Thi,$t1
304
305         ldr     $t0,[sp,#`$Xoff+8*16`+0]
306         ldr     $t1,[sp,#`$Xoff+8*16`+4]
307         adds    $Tlo,$Tlo,$t2
308         adc     $Thi,$Thi,$t3
309         adds    $Tlo,$Tlo,$t0
310         adc     $Thi,$Thi,$t1
311         str     $Tlo,[sp,#$Xoff+0]
312         str     $Thi,[sp,#$Xoff+4]
313 ___
314         &BODY_00_15(0x17);
# Tail of the .L16_79 loop and the per-block epilogue.  Once the body has
# set bit 0 of $Ktbl the loop ends and the bit is cleared.  The working
# variables are then added into the context: a and e come from registers,
# the others from the frame slots they were shifted into, and the updated
# g and h are left in $t0-$t3 for the next pass through .Loop.  sp and
# $Ktbl are each moved back by 640 bytes (80 rounds of 8) and the code
# loops until $inp reaches $len.  Finally the frame is released, registers
# are restored, and the function returns with "moveq pc,lr" when bit 0 of
# lr is clear or "bx lr" otherwise.
315 $code.=<<___;
316         tst     $Ktbl,#1
317         beq     .L16_79
318         bic     $Ktbl,$Ktbl,#1
319
320         ldr     $Tlo,[sp,#$Boff+0]
321         ldr     $Thi,[sp,#$Boff+4]
322         ldr     $t0, [$ctx,#$Aoff+$lo]
323         ldr     $t1, [$ctx,#$Aoff+$hi]
324         ldr     $t2, [$ctx,#$Boff+$lo]
325         ldr     $t3, [$ctx,#$Boff+$hi]
326         adds    $t0,$Alo,$t0
327         adc     $t1,$Ahi,$t1
328         adds    $t2,$Tlo,$t2
329         adc     $t3,$Thi,$t3
330         str     $t0, [$ctx,#$Aoff+$lo]
331         str     $t1, [$ctx,#$Aoff+$hi]
332         str     $t2, [$ctx,#$Boff+$lo]
333         str     $t3, [$ctx,#$Boff+$hi]
334
335         ldr     $Alo,[sp,#$Coff+0]
336         ldr     $Ahi,[sp,#$Coff+4]
337         ldr     $Tlo,[sp,#$Doff+0]
338         ldr     $Thi,[sp,#$Doff+4]
339         ldr     $t0, [$ctx,#$Coff+$lo]
340         ldr     $t1, [$ctx,#$Coff+$hi]
341         ldr     $t2, [$ctx,#$Doff+$lo]
342         ldr     $t3, [$ctx,#$Doff+$hi]
343         adds    $t0,$Alo,$t0
344         adc     $t1,$Ahi,$t1
345         adds    $t2,$Tlo,$t2
346         adc     $t3,$Thi,$t3
347         str     $t0, [$ctx,#$Coff+$lo]
348         str     $t1, [$ctx,#$Coff+$hi]
349         str     $t2, [$ctx,#$Doff+$lo]
350         str     $t3, [$ctx,#$Doff+$hi]
351
352         ldr     $Tlo,[sp,#$Foff+0]
353         ldr     $Thi,[sp,#$Foff+4]
354         ldr     $t0, [$ctx,#$Eoff+$lo]
355         ldr     $t1, [$ctx,#$Eoff+$hi]
356         ldr     $t2, [$ctx,#$Foff+$lo]
357         ldr     $t3, [$ctx,#$Foff+$hi]
358         adds    $Elo,$Elo,$t0
359         adc     $Ehi,$Ehi,$t1
360         adds    $t2,$Tlo,$t2
361         adc     $t3,$Thi,$t3
362         str     $Elo,[$ctx,#$Eoff+$lo]
363         str     $Ehi,[$ctx,#$Eoff+$hi]
364         str     $t2, [$ctx,#$Foff+$lo]
365         str     $t3, [$ctx,#$Foff+$hi]
366
367         ldr     $Alo,[sp,#$Goff+0]
368         ldr     $Ahi,[sp,#$Goff+4]
369         ldr     $Tlo,[sp,#$Hoff+0]
370         ldr     $Thi,[sp,#$Hoff+4]
371         ldr     $t0, [$ctx,#$Goff+$lo]
372         ldr     $t1, [$ctx,#$Goff+$hi]
373         ldr     $t2, [$ctx,#$Hoff+$lo]
374         ldr     $t3, [$ctx,#$Hoff+$hi]
375         adds    $t0,$Alo,$t0
376         adc     $t1,$Ahi,$t1
377         adds    $t2,$Tlo,$t2
378         adc     $t3,$Thi,$t3
379         str     $t0, [$ctx,#$Goff+$lo]
380         str     $t1, [$ctx,#$Goff+$hi]
381         str     $t2, [$ctx,#$Hoff+$lo]
382         str     $t3, [$ctx,#$Hoff+$hi]
383
384         add     sp,sp,#640
385         sub     $Ktbl,$Ktbl,#640
386
387         teq     $inp,$len
388         bne     .Loop
389
390         add     sp,sp,#8*9              @ destroy frame
391         ldmia   sp!,{r4-r12,lr}
392         tst     lr,#1
393         moveq   pc,lr                   @ be binary compatible with V4, yet
394         bx      lr                      @ interoperable with Thumb ISA:-)
395 .size   sha512_block_data_order,.-sha512_block_data_order
396 .asciz  "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
397 .align  2
398 ___
399
# Post-process and emit the generated assembly.
# Every `...` span is evaluated as a Perl expression and replaced by its
# value; this folds the computed stack offsets into plain numbers.
400 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# "bx lr" is replaced by a literal .word so that the output can be assembled
# for a target that does not accept the mnemonic.
401 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
402 print $code;
# Buffered write errors only surface at close, so a failure here must be
# fatal; otherwise a truncated output file would be reported as a success.
403 close STDOUT or die "error closing STDOUT: $!"; # enforce flush