#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# XScale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited to 1:-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# find yourself striving, as I did here, merely to achieve IPC
# comparable to that delivered by Cortex A8 [for reference, it's
# 0.5 for ILP of 1, and 1 for higher ILPs].
#
# (*) ILP, instruction-level parallelism, how many instructions
#     *can* execute at the same time. IPC, instructions per cycle,
#     indicates how many instructions actually execute.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================

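# For illustration only, a hedged sketch (not used by the code below,
# and assuming a perl with 64-bit IVs) of how one 64-bit value splits
# across the two 32-bit words selected by HI/LO, which the generated
# code defines as 4/0 on little-endian and 0/4 on big-endian targets:
#
#   my $h  = 0x0123456789abcdef;          # example value of h[0]
#   my $hi = ($h >> 32) & 0xffffffff;     # 0x01234567, loaded via [...,#HI]
#   my $lo =  $h        & 0xffffffff;     # 0x89abcdef, loaded via [...,#LO]
#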
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

$ctx="r0";      # parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############    r13 is stack pointer
$Ktbl="r14";
############    r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

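# The Sigma1/Sigma0 comments inside BODY_00_15 below derive each 64-bit
# rotation from pairs of 32-bit shifts on the lo/hi halves. A hedged
# sanity check of the Sigma1 low-half identity, kept commented out so it
# cannot leak into the assembly output (run it separately; it assumes a
# perl with 64-bit IVs):
#
#   my $x = 0x0123456789abcdef;
#   my ($hi,$lo) = (($x>>32)&0xffffffff, $x&0xffffffff);
#   my $rotr = sub { my ($v,$n)=@_; (($v>>$n)|($v<<(64-$n)))&0xffffffffffffffff };
#   my $ref = $rotr->($x,14)^$rotr->($x,18)^$rotr->($x,41);
#   my $res = ($lo>>14^$hi<<18 ^ $lo>>18^$hi<<14 ^ $hi>>9^$lo<<23)&0xffffffff;
#   die "mismatch" unless $res == ($ref&0xffffffff);   # holds for any $x
#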
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
        @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
        @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
        @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
        mov     $t0,$Elo,lsr#14
        str     $Tlo,[sp,#$Xoff+0]
        mov     $t1,$Ehi,lsr#14
        str     $Thi,[sp,#$Xoff+4]
        eor     $t0,$t0,$Ehi,lsl#18
        ldr     $t2,[sp,#$Hoff+0]       @ h.lo
        eor     $t1,$t1,$Elo,lsl#18
        ldr     $t3,[sp,#$Hoff+4]       @ h.hi
        eor     $t0,$t0,$Elo,lsr#18
        eor     $t1,$t1,$Ehi,lsr#18
        eor     $t0,$t0,$Ehi,lsl#14
        eor     $t1,$t1,$Elo,lsl#14
        eor     $t0,$t0,$Ehi,lsr#9
        eor     $t1,$t1,$Elo,lsr#9
        eor     $t0,$t0,$Elo,lsl#23
        eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
        adds    $Tlo,$Tlo,$t0
        ldr     $t0,[sp,#$Foff+0]       @ f.lo
        adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
        ldr     $t1,[sp,#$Foff+4]       @ f.hi
        adds    $Tlo,$Tlo,$t2
        ldr     $t2,[sp,#$Goff+0]       @ g.lo
        adc     $Thi,$Thi,$t3           @ T += h
        ldr     $t3,[sp,#$Goff+4]       @ g.hi

        eor     $t0,$t0,$t2
        str     $Elo,[sp,#$Eoff+0]
        eor     $t1,$t1,$t3
        str     $Ehi,[sp,#$Eoff+4]
        and     $t0,$t0,$Elo
        str     $Alo,[sp,#$Aoff+0]
        and     $t1,$t1,$Ehi
        str     $Ahi,[sp,#$Aoff+4]
        eor     $t0,$t0,$t2
        ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
        eor     $t1,$t1,$t3             @ Ch(e,f,g)
        ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi

        adds    $Tlo,$Tlo,$t0
        ldr     $Elo,[sp,#$Doff+0]      @ d.lo
        adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
        ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
        adds    $Tlo,$Tlo,$t2
        and     $t0,$t2,#0xff
        adc     $Thi,$Thi,$t3           @ T += K[i]
        adds    $Elo,$Elo,$Tlo
        ldr     $t2,[sp,#$Boff+0]       @ b.lo
        adc     $Ehi,$Ehi,$Thi          @ d += T
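        @ the low byte of K[i].lo doubles as a round marker: 0x94 is the
        @ low byte of K[15].lo and 0x17 that of K[79].lo, each occurring
        @ exactly once per phase; bit 0 of $Ktbl records the match below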
        teq     $t0,#$magic

        ldr     $t3,[sp,#$Coff+0]       @ c.lo
        orreq   $Ktbl,$Ktbl,#1
        @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
        @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
        @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
        mov     $t0,$Alo,lsr#28
        mov     $t1,$Ahi,lsr#28
        eor     $t0,$t0,$Ahi,lsl#4
        eor     $t1,$t1,$Alo,lsl#4
        eor     $t0,$t0,$Ahi,lsr#2
        eor     $t1,$t1,$Alo,lsr#2
        eor     $t0,$t0,$Alo,lsl#30
        eor     $t1,$t1,$Ahi,lsl#30
        eor     $t0,$t0,$Ahi,lsr#7
        eor     $t1,$t1,$Alo,lsr#7
        eor     $t0,$t0,$Alo,lsl#25
        eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
        adds    $Tlo,$Tlo,$t0
        and     $t0,$Alo,$t2
        adc     $Thi,$Thi,$t1           @ T += Sigma0(a)

        ldr     $t1,[sp,#$Boff+4]       @ b.hi
        orr     $Alo,$Alo,$t2
        ldr     $t2,[sp,#$Coff+4]       @ c.hi
        and     $Alo,$Alo,$t3
        and     $t3,$Ahi,$t1
        orr     $Ahi,$Ahi,$t1
        orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
        and     $Ahi,$Ahi,$t2
        adds    $Alo,$Alo,$Tlo
        orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
        sub     sp,sp,#8
        adc     $Ahi,$Ahi,$Thi          @ h += T
        tst     $Ktbl,#1
        add     $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
#endif
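@ WORD64 emits each 64-bit constant as two .word directives in native
@ word order, so the table below reads back correctly as whole 64-bit
@ values on either endianness.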

.text
.code   32
.type   K512,%object
.align  5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size   K512,.-K512
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha512_block_data_order
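@ link-time distance from sha512_block_data_order; added at run time to
@ the pc-derived address of that label to reach OPENSSL_armcap_P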
.skip   32-4

.global sha512_block_data_order
.type   sha512_block_data_order,%function
sha512_block_data_order:
        sub     r3,pc,#8                @ sha512_block_data_order
        add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
#if __ARM_ARCH__>=7
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#1
        bne     .LNEON
#endif
        stmdb   sp!,{r4-r12,lr}
        sub     $Ktbl,r3,#672           @ K512
        sub     sp,sp,#9*8

        ldr     $Elo,[$ctx,#$Eoff+$lo]
        ldr     $Ehi,[$ctx,#$Eoff+$hi]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
.Loop:
        str     $t0, [sp,#$Goff+0]
        str     $t1, [sp,#$Goff+4]
        str     $t2, [sp,#$Hoff+0]
        str     $t3, [sp,#$Hoff+4]
        ldr     $Alo,[$ctx,#$Aoff+$lo]
        ldr     $Ahi,[$ctx,#$Aoff+$hi]
        ldr     $Tlo,[$ctx,#$Boff+$lo]
        ldr     $Thi,[$ctx,#$Boff+$hi]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        str     $Tlo,[sp,#$Boff+0]
        str     $Thi,[sp,#$Boff+4]
        str     $t0, [sp,#$Coff+0]
        str     $t1, [sp,#$Coff+4]
        str     $t2, [sp,#$Doff+0]
        str     $t3, [sp,#$Doff+4]
        ldr     $Tlo,[$ctx,#$Foff+$lo]
        ldr     $Thi,[$ctx,#$Foff+$hi]
        str     $Tlo,[sp,#$Foff+0]
        str     $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
        ldrb    $Tlo,[$inp,#7]
        ldrb    $t0, [$inp,#6]
        ldrb    $t1, [$inp,#5]
        ldrb    $t2, [$inp,#4]
        ldrb    $Thi,[$inp,#3]
        ldrb    $t3, [$inp,#2]
        orr     $Tlo,$Tlo,$t0,lsl#8
        ldrb    $t0, [$inp,#1]
        orr     $Tlo,$Tlo,$t1,lsl#16
        ldrb    $t1, [$inp],#8
        orr     $Tlo,$Tlo,$t2,lsl#24
        orr     $Thi,$Thi,$t3,lsl#8
        orr     $Thi,$Thi,$t0,lsl#16
        orr     $Thi,$Thi,$t1,lsl#24
#else
        ldr     $Tlo,[$inp,#4]
        ldr     $Thi,[$inp],#8
#ifdef __ARMEL__
        rev     $Tlo,$Tlo
        rev     $Thi,$Thi
#endif
#endif
___
        &BODY_00_15(0x94);
$code.=<<___;
        tst     $Ktbl,#1
        beq     .L00_15
        ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
        ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
        bic     $Ktbl,$Ktbl,#1
.L16_79:
        @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
        @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
        @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
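        @ (x)>>7 is a plain shift rather than a rotation, so its high
        @ half takes no bits from lo and the HI line has one term fewer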
        mov     $Tlo,$t0,lsr#1
        ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
        mov     $Thi,$t1,lsr#1
        ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
        eor     $Tlo,$Tlo,$t1,lsl#31
        eor     $Thi,$Thi,$t0,lsl#31
        eor     $Tlo,$Tlo,$t0,lsr#8
        eor     $Thi,$Thi,$t1,lsr#8
        eor     $Tlo,$Tlo,$t1,lsl#24
        eor     $Thi,$Thi,$t0,lsl#24
        eor     $Tlo,$Tlo,$t0,lsr#7
        eor     $Thi,$Thi,$t1,lsr#7
        eor     $Tlo,$Tlo,$t1,lsl#25

        @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
        @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
        @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
        mov     $t0,$t2,lsr#19
        mov     $t1,$t3,lsr#19
        eor     $t0,$t0,$t3,lsl#13
        eor     $t1,$t1,$t2,lsl#13
        eor     $t0,$t0,$t3,lsr#29
        eor     $t1,$t1,$t2,lsr#29
        eor     $t0,$t0,$t2,lsl#3
        eor     $t1,$t1,$t3,lsl#3
        eor     $t0,$t0,$t2,lsr#6
        eor     $t1,$t1,$t3,lsr#6
        ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
        eor     $t0,$t0,$t3,lsl#26

        ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
        adds    $Tlo,$Tlo,$t0
        ldr     $t0,[sp,#`$Xoff+8*16`+0]
        adc     $Thi,$Thi,$t1

        ldr     $t1,[sp,#`$Xoff+8*16`+4]
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1
___
        &BODY_00_15(0x17);
$code.=<<___;
        ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
        ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
        beq     .L16_79
        bic     $Ktbl,$Ktbl,#1

        ldr     $Tlo,[sp,#$Boff+0]
        ldr     $Thi,[sp,#$Boff+4]
        ldr     $t0, [$ctx,#$Aoff+$lo]
        ldr     $t1, [$ctx,#$Aoff+$hi]
        ldr     $t2, [$ctx,#$Boff+$lo]
        ldr     $t3, [$ctx,#$Boff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Aoff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Aoff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Boff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Boff+$hi]

        ldr     $Alo,[sp,#$Coff+0]
        ldr     $Ahi,[sp,#$Coff+4]
        ldr     $Tlo,[sp,#$Doff+0]
        ldr     $Thi,[sp,#$Doff+4]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Coff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Coff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Doff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Doff+$hi]

        ldr     $Tlo,[sp,#$Foff+0]
        ldr     $Thi,[sp,#$Foff+4]
        ldr     $t0, [$ctx,#$Eoff+$lo]
        ldr     $t1, [$ctx,#$Eoff+$hi]
        ldr     $t2, [$ctx,#$Foff+$lo]
        ldr     $t3, [$ctx,#$Foff+$hi]
        adds    $Elo,$Elo,$t0
        str     $Elo,[$ctx,#$Eoff+$lo]
        adc     $Ehi,$Ehi,$t1
        str     $Ehi,[$ctx,#$Eoff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Foff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Foff+$hi]

        ldr     $Alo,[sp,#$Goff+0]
        ldr     $Ahi,[sp,#$Goff+4]
        ldr     $Tlo,[sp,#$Hoff+0]
        ldr     $Thi,[sp,#$Hoff+4]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Goff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Goff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Hoff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Hoff+$hi]

        add     sp,sp,#640
        sub     $Ktbl,$Ktbl,#640

        teq     $inp,$len
        bne     .Loop

        add     sp,sp,#8*9              @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
#else
        ldmia   sp!,{r4-r12,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps

$code.=<<___ if ($i<16 || $i&1);
        vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
#if $i<16
        vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
#endif
        vshr.u64        $t1,$e,#@Sigma1[1]
#if $i>0
         vadd.i64       $a,$Maj                 @ h+=Maj from the past
#endif
        vshr.u64        $t2,$e,#@Sigma1[2]
___
$code.=<<___;
        vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
        vsli.64         $t0,$e,#`64-@Sigma1[0]`
        vsli.64         $t1,$e,#`64-@Sigma1[1]`
        vmov            $Ch,$e
        vsli.64         $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
        vrev64.8        @X[$i],@X[$i]
#endif
        veor            $t1,$t0
        vbsl            $Ch,$f,$g               @ Ch(e,f,g)
        vshr.u64        $t0,$a,#@Sigma0[0]
        veor            $t2,$t1                 @ Sigma1(e)
        vadd.i64        $T1,$Ch,$h
        vshr.u64        $t1,$a,#@Sigma0[1]
        vsli.64         $t0,$a,#`64-@Sigma0[0]`
        vadd.i64        $T1,$t2
        vshr.u64        $t2,$a,#@Sigma0[2]
        vadd.i64        $K,@X[$i%16]
        vsli.64         $t1,$a,#`64-@Sigma0[1]`
        veor            $Maj,$a,$b
        vsli.64         $t2,$a,#`64-@Sigma0[2]`
        veor            $h,$t0,$t1
        vadd.i64        $T1,$K
        vbsl            $Maj,$c,$b              @ Maj(a,b,c)
        veor            $h,$t2                  @ Sigma0(a)
        vadd.i64        $d,$T1
        vadd.i64        $Maj,$T1
        @ vadd.i64      $h,$Maj
___
}
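# Note: each round's final h+=Maj (the commented vadd above) is left
# pending and folded into the next round as "h+=Maj from the past",
# shortening the dependency chain; the last pending add is flushed
# explicitly before the context is accumulated.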

sub NEON_16_79() {
my $i=shift;

if ($i&1)       { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));                        # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));     # temps
my ($d0,$d1,$d2) = map("d$_",(24..26));         # temps from NEON_00_15
my $e=$_[4];                                    # $e from NEON_00_15
$i /= 2;
$code.=<<___;
        vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
        vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
         vadd.i64       @_[0],d30                       @ h+=Maj from the past
        vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
        vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
        vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
        vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
        veor            $s1,$t0
        vshr.u64        $t0,$s0,#@sigma0[0]
        veor            $s1,$t1                         @ sigma1(X[i+14])
        vshr.u64        $t1,$s0,#@sigma0[1]
        vadd.i64        @X[$i%8],$s1
        vshr.u64        $s1,$s0,#@sigma0[2]
        vsli.64         $t0,$s0,#`64-@sigma0[0]`
        vsli.64         $t1,$s0,#`64-@sigma0[1]`
        vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
        veor            $s1,$t0
        vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
        vadd.i64        @X[$i%8],$s0
        vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
        veor            $s1,$t1                         @ sigma0(X[i+1])
        vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
        vadd.i64        @X[$i%8],$s1
___
        &NEON_00_15(2*$i,@_);
}
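
# NEON_16_79 computes the message schedule two rounds at a time in
# 128-bit q registers and interleaves it with the opening instructions
# of the round function (the "from NEON_00_15" lines) to hide latency.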

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu    neon

.align  4
.LNEON:
        dmb                             @ errata #451034 on early Cortex A8
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        sub     $Ktbl,r3,#672           @ K512
        vldmia  $ctx,{$A-$H}            @ load context
.Loop_neon:
___
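# unshift(@V,pop(@V)) rotates the a..h variable list right by one, so
# each unrolled round sees the registers in their new roles without any
# data movement in the generated code.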
for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        mov             $cnt,#4
.L16_79_neon:
        subs            $cnt,#1
___
for(;$i<32;$i++)        { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
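# the 16 unrolled rounds above execute 4 times ($cnt counts down from 4),
# covering rounds 16..79 of the 80-round schedule.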
$code.=<<___;
        bne             .L16_79_neon

         vadd.i64       $A,d30          @ h+=Maj from the past
        vldmia          $ctx,{d24-d31}  @ load context to temp
        vadd.i64        q8,q12          @ vectorized accumulate
        vadd.i64        q9,q13
        vadd.i64        q10,q14
        vadd.i64        q11,q15
        vstmia          $ctx,{$A-$H}    @ save context
        teq             $inp,$len
        sub             $Ktbl,#640      @ rewind K512
        bne             .Loop_neon

        vldmia  sp!,{d8-d15}            @ epilogue
        bx      lr
#endif
___
}
$code.=<<___;
.size   sha512_block_data_order,.-sha512_block_data_order
.asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
.comm   OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
print $code;
close STDOUT or die "error closing STDOUT: $!";  # enforce flush