#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in
# 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
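#
# To illustrate (a hedged sketch, not used by the generator; split64 is
# an illustrative helper assuming a perl built with 64-bit integers):
# HI and LO are the byte offsets of the high and low 32-bit halves of a
# 64-bit value stored in native byte order, so a little-endian target
# gets LO=0/HI=4 and a big-endian target gets HI=0/LO=4, as defined in
# the preamble emitted below.
#
#	sub split64 {
#	    my $x = shift;				# 64-bit value
#	    return (($x>>32)&0xffffffff, $x&0xffffffff);# (hi,lo) words
#	}
#	# little-endian memory holds the lo word at offset 0 and the hi
#	# word at offset 4; big-endian is the reverse.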

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
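# The values above are byte offsets of the 64-bit working variables
# a..h within the scratch frame carved out on the stack; the message
# schedule X[] follows at $Xoff.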

sub BODY_00_15() {
my $magic = shift;
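# $magic is the least significant byte of the last K[i] in the range a
# given loop covers; the and/teq pair below compares it against the
# current K[i]&0xff and, on a match, orreq sets bit 0 of $Ktbl as an
# end-of-range flag, so no separate round counter is needed.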
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
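# The Sigma/sigma comment lines above spell out how each 64-bit rotate
# is decomposed into shifts of the two 32-bit halves (for disjoint bit
# ranges OR and XOR coincide, so eor serves as orr). A minimal
# reference model of that decomposition, not used by the generator and
# assuming a perl built with 64-bit integers (0 < $n < 64):
#
#	sub rotr64 {
#	    my ($hi,$lo,$n) = @_;		# value is ($hi<<32)|$lo
#	    my $x = (($hi&0xffffffff)<<32)|($lo&0xffffffff);
#	    my $r = (($x>>$n)|($x<<(64-$n))) & 0xffffffffffffffff;
#	    return (($r>>32)&0xffffffff, $r&0xffffffff);  # (hi,lo)
#	}
#	# e.g. for n=14: result.lo == (lo>>14)|(hi<<18) and
#	#                result.hi == (hi>>14)|(lo<<18),
#	# matching the "LO/HI" comment lines emitted above.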
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
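@ WORD64 lays each 64-bit constant out as two 32-bit words in native
@ byte order (lo word first on little-endian, hi word first otherwise),
@ matching the HI/LO offsets used above to address 64-bit halves.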

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif
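@ The word above is the link-time offset from sha512_block_data_order
@ to OPENSSL_armcap_P; adding it to the runtime entry address kept in
@ r3 below yields the capability word without a dynamic relocation.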

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
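# The magic bytes passed to BODY_00_15 are the least significant bytes
# of the last constants in each range: K[15]=0xc19bf174cf692694 gives
# 0x94 for .L00_15, and K[79]=0x6c44198c4a475817 gives 0x17 for
# .L16_79 below.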
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
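# The commented-out "vadd.i64 $h,$Maj" is intentional: the final h+=Maj
# of each round is deferred to the next round's "h+=Maj from the past"
# add (and performed once more after the last round), which keeps it
# off the critical dependency path.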

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
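# Even rounds update the message schedule on 128-bit q registers, so a
# single pass computes two X[] entries at once; the vshr.u64 d24..d26
# lines marked "from NEON_00_15" start the next round early to keep
# the pipeline fed.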

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	ret				@ bx lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

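# Post-processing of the generated text: backticked expressions are
# evaluated first; "bx lr" is then replaced with its literal
# instruction encoding so the integer-only path still assembles with
# -march=armv4, and only after that is the ARMv7-only "ret" spelled as
# "bx lr".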
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT; # enforce flush