#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
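#
# To illustrate (informal note, not used by the generator): LO and HI
# are resolved by the preprocessor from arm_arch.h below, so for a
# 64-bit state word stored at byte offset 8*i in the context the two
# halves are addressed roughly as
#
#	h[i].lo -> [ctx, #8*i+LO]	LO==0 on little-endian, 4 otherwise
#	h[i].hi -> [ctx, #8*i+HI]	HI==4 on little-endian, 0 otherwise
#
# i.e. whichever native layout the caller keeps, the correct half is
# picked at assembly time.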

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
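# Informal note: the offsets above describe the scratch frame built on
# the stack. Copies of the eight working variables roughly occupy
# sp+0..sp+63 and the expanded message schedule starts at sp+64
# ($Xoff). Each round drops sp by 8 (see "sub sp,sp,#8" in
# BODY_00_15), so X[] grows downwards and the 640 bytes accumulated
# over 80 rounds are released with "add sp,sp,#640" at the end.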

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff		@ low byte of K[i].lo
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic		@ last K[i] of this segment?

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1		@ if so, flag it in bit 0 of Ktbl
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
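# For orientation (informal note): each invocation of BODY_00_15
# performs one SHA-512 round on 32-bit halves,
#
#	T  = X[i] + h + Sigma1(e) + Ch(e,f,g) + K[i]
#	d += T				(becomes next round's e)
#	T += Sigma0(a) + Maj(a,b,c)	(becomes next round's a)
#
# Maj is formed via the identity Maj(a,b,c) = ((a|b)&c)|(a&b), which is
# what the orr/and/orr sequence above computes, and the "sub sp,sp,#8"
# at the end slides the stack window so that this round's a..g are seen
# as b..h by the next round.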
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1			@ NEON available?
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
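# Informal note on the "magic" argument: it is the low byte of the last
# K[i].lo consumed in the respective segment, 0x94 for K[15]
# (0xcf692694) and 0x17 for K[79] (0x4a475817). BODY_00_15 compares it
# against the K value just loaded and, on a match, sets bit 0 of the K
# pointer; the trailing "tst ...,#1" then lets the beq below fall out
# of the .L00_15 and .L16_79 loops at the right round.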
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
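	@ Informal note: this loop evaluates the standard SHA-512
	@ message schedule
	@   X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])
	@ on 32-bit halves; the X[] ring buffer lives on the stack and
	@ the per-round sp decrement keeps the literal offsets fixed.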
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	veor		$t1,$t0
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vshr.u64	$t1,$a,#@Sigma0[1]
	vadd.i64	$T1,$h,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$T1,$Ch
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	vadd.i64	$T1,$K
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	veor		$Maj,$a,$b
	veor		$h,$t2			@ Sigma0(a)
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	vadd.i64	$h,$T1
	vadd.i64	$d,$T1
	vadd.i64	$h,$Maj
___
}
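# Informal note on the NEON round above: a 64-bit rotate right by n is
# obtained from the vshr.u64/vsli.64 pair (logical shift right by n,
# then shift-left-insert by 64-n into the same register), and vbsl
# realizes both selection functions bitwise: with e as the mask it
# yields Ch(e,f,g), and with a^b as the mask selecting between c and b
# it yields Maj(a,b,c) (if a==b the majority is b, otherwise it is c).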

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
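# Informal note: NEON_16_79 views the 16-entry X[] ring as eight q
# registers and advances the schedule two entries per call, again via
# the standard recurrence X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] +
# sigma1(X[i-2]); the ring indices i+1, i+9 and i+14 above correspond
# to i-15, i-7 and i-2 modulo 16. The first three Sigma1 shifts of the
# following NEON_00_15 round are interleaved here to keep the pipeline
# busy.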

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	bx	lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
close STDOUT;	# enforce flush