# ARMv4 assembly pack: implement support for Thumb2.
# [openssl.git] / crypto / sha / asm / sha512-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 #
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
11
12 # SHA512 block procedure for ARMv4. September 2007.
13
14 # This code is ~4.5 (four and a half) times faster than code generated
15 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
16 # Xscale PXA250 core].
17 #
18 # July 2010.
19 #
20 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
21 # Cortex A8 core and ~40 cycles per processed byte.
22
23 # February 2011.
24 #
25 # Profiler-assisted and platform-specific optimization resulted in 7%
26 # improvement on Coxtex A8 core and ~38 cycles per byte.
27
28 # March 2011.
29 #
30 # Add NEON implementation. On Cortex A8 it was measured to process
31 # one byte in 23.3 cycles or ~60% faster than integer-only code.
32
33 # August 2012.
34 #
35 # Improve NEON performance by 12% on Snapdragon S4. In absolute
36 # terms it's 22.6 cycles per byte, which is disappointing result.
37 # Technical writers asserted that 3-way S4 pipeline can sustain
38 # multiple NEON instructions per cycle, but dual NEON issue could
39 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
40 # for further details. On side note Cortex-A15 processes one byte in
41 # 16 cycles.
42
43 # Byte order [in]dependence. =========================================
44 #
45 # Originally caller was expected to maintain specific *dword* order in
46 # h[0-7], namely with most significant dword at *lower* address, which
47 # was reflected in below two parameters as 0 and 4. Now caller is
48 # expected to maintain native byte order for whole 64-bit values.
# $hi/$lo are emitted symbolically; the output's preprocessor defines
# LO/HI per target endianness, selecting which 32-bit half of each
# 64-bit context word sits at the lower address.
$hi="HI";
$lo="LO";
# ====================================================================

# Command line: an optional "flavour" (target selector forwarded to
# arm-xlate.pl) followed by the output file name.  A lone argument that
# looks like a file name is treated as the output.
$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    # Locate arm-xlate.pl next to this script or in ../../perlasm and
    # pipe our output through it for flavour-specific translation.
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output"
        or die "can't call $xlate: $!";
} else {
    # "void" flavour (or none): write the raw assembly straight out.
    open STDOUT,'>',$output or die "can't open $output: $!";
}

# Register allocation for the integer-only path.  64-bit quantities are
# handled as lo/hi 32-bit register pairs; only a and e (and the running
# T accumulator) stay in registers, the rest of the state lives on the
# stack at the offsets below.
$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

# Byte offsets of the eight 64-bit working variables in the stack frame
# (and in the context for a..h); X[] ring buffer starts at $Xoff.
$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
# Emit one round of the integer-only SHA-512 compression function.
# 64-bit words are handled as lo/hi 32-bit register pairs; a and e are
# kept in registers while b,c,d,f,g,h are read/written on the stack.
# Each round also stores the incoming message word ($Tlo/$Thi) into the
# X[] ring buffer, moves sp down by 8 and advances $Ktbl by 8.
# $magic is matched against the low byte of K[i].lo to detect the last
# round of the current phase (0x94 = low byte of K[15], 0x17 = low byte
# of K[79], per the K512 table below); on match, bit 0 of $Ktbl is set
# as a flag that the emitted "tst $Ktbl,#1" loop tests.
sub BODY_00_15() {
my $magic = shift;	# end-of-phase sentinel byte
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#if __ARM_ARCH__>=7
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
# Build the output: preamble macros (arch selection, endian-dependent
# LO/HI/WORD64 helpers), the K512 constant table, and the integer-only
# sha512_block_data_order entry point with its outer block loop and
# byte-order-independent message load.
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif

.text
#if defined(__thumb2__) && !defined(__APPLE__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#1
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);	# rounds 0..15; 0x94 = low byte of K[15]
# Bit 0 of $Ktbl, set by the sentinel match above, terminates .L00_15;
# the message-schedule loop .L16_79 then computes
# X[i] = sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i+16]
# from the on-stack X[] ring buffer before each round.
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
	&BODY_00_15(0x17);	# rounds 16..79; 0x17 = low byte of K[79]
# Epilogue: leave .L16_79 on the sentinel flag, fold the working
# variables back into the context (h[i] += work[i]), rewind sp/$Ktbl by
# the 80*8 bytes consumed by the rounds, and loop over remaining blocks.
$code.=<<___;
#if __ARM_ARCH__>=7
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___
496
497 {
498 my @Sigma0=(28,34,39);
499 my @Sigma1=(14,18,41);
500 my @sigma0=(1, 8, 7);
501 my @sigma1=(19,61,6);
502
503 my $Ktbl="r3";
504 my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch
505
506 my @X=map("d$_",(0..15));
507 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
508
# Emit one round of the NEON SHA-512 compression function.
# The first heredoc (Sigma1 shifts, message load for rounds <16) is
# emitted only for $i<16 or odd $i: for even rounds >=16 those shifts
# were already interleaved by NEON_16_79 into d24..d26.
# Note the deferred "h+=Maj from the past": Maj(a,b,c) is not folded
# into h at the end of this round (the vadd.i64 is commented out) but
# is added into $a at the start of the NEXT round instead.
sub NEON_00_15() {
my $i=shift;	# round index 0..79
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;	# working variables for this round
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
556
# Rounds 16..79 of the NEON path.  Odd rounds delegate directly to
# NEON_00_15.  Even rounds first emit the 2x-vectorized message
# schedule update X[i] += sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9]
# (viewing @X as eight q-registers, so one update covers two rounds),
# with the next round's Sigma1 shifts (d24..d26) interleaved to hide
# latency, then fall through to NEON_00_15 for the round proper.
sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
595
# Emit the NEON entry point sha512_block_data_order_neon.  The 80
# rounds come out as 16 explicit rounds followed by an assembly loop
# executed 4 times over 16 emitted rounds (16 + 4*16 = 80).  @V is
# rotated after each round so the same Perl variables always name the
# correct working registers.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }	# rounds 0..15
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }	# 16 rounds per pass
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
# Trailing identification string; outside the kernel build also reserve
# the OPENSSL_armcap_P capability word referenced by .LOPENSSL_armcap.
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___
647
# Post-process the accumulated assembly and emit it on STDOUT.

# Evaluate the `...` arithmetic placeholders (e.g. the stack offsets
# computed in the round templates) embedded in the assembly.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx\tlr/gm;	# "ret" -> "bx lr" (rewritten above in turn)

# Reproduce this script's leading comment banner at the top of the
# output, converting '#' comments to the assembler's '@' style.
open my $self, '<', $0 or die "can't open $0: $!";
while (<$self>) {
	next if (/^#!/);		# skip the shebang line
	last if (!s/^#/@/ and !/^$/);	# stop at the first non-comment, non-blank line
	print;
}
close $self;

print $code;
close STDOUT or die "error closing STDOUT: $!";	# enforce flush