#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed; see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below
# as 0 and 4. Now the caller is expected to maintain native byte
# order for whole 64-bit values.
$hi="HI";
$lo="LO";
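#
# For example, with h[0] = 0x0123456789abcdef stored at address p, a
# little-endian build reads h[0].lo = 0x89abcdef from p+$lo = p+0 and
# h[0].hi = 0x01234567 from p+$hi = p+4, while a big-endian build finds
# the same halves at the swapped offsets.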
# ====================================================================

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";      # parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############    r13 is stack pointer
$Ktbl="r14";
############    r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

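# Stack frame: bytes 0..63 (sp+#$Aoff..sp+#$Hoff+4) hold per-round copies of
# the state dwords a..h, with a and e also kept live in registers; the
# message schedule X[] starts at sp+#$Xoff. Each round stores T at sp+#$Xoff
# and then drops sp by 8, so X[] grows downwards; after 80 rounds sp has
# moved 640 bytes, which the main loop rewinds with "add sp,sp,#640".
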
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
        @ Sigma1(x)     (ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
        @ LO            lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
        @ HI            hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
        mov     $t0,$Elo,lsr#14
        str     $Tlo,[sp,#$Xoff+0]
        mov     $t1,$Ehi,lsr#14
        str     $Thi,[sp,#$Xoff+4]
        eor     $t0,$t0,$Ehi,lsl#18
        ldr     $t2,[sp,#$Hoff+0]       @ h.lo
        eor     $t1,$t1,$Elo,lsl#18
        ldr     $t3,[sp,#$Hoff+4]       @ h.hi
        eor     $t0,$t0,$Elo,lsr#18
        eor     $t1,$t1,$Ehi,lsr#18
        eor     $t0,$t0,$Ehi,lsl#14
        eor     $t1,$t1,$Elo,lsl#14
        eor     $t0,$t0,$Ehi,lsr#9
        eor     $t1,$t1,$Elo,lsr#9
        eor     $t0,$t0,$Elo,lsl#23
        eor     $t1,$t1,$Ehi,lsl#23     @ Sigma1(e)
        adds    $Tlo,$Tlo,$t0
        ldr     $t0,[sp,#$Foff+0]       @ f.lo
        adc     $Thi,$Thi,$t1           @ T += Sigma1(e)
        ldr     $t1,[sp,#$Foff+4]       @ f.hi
        adds    $Tlo,$Tlo,$t2
        ldr     $t2,[sp,#$Goff+0]       @ g.lo
        adc     $Thi,$Thi,$t3           @ T += h
        ldr     $t3,[sp,#$Goff+4]       @ g.hi

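        @ Ch(e,f,g) is computed per 32-bit half as ((f^g)&e)^g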
        eor     $t0,$t0,$t2
        str     $Elo,[sp,#$Eoff+0]
        eor     $t1,$t1,$t3
        str     $Ehi,[sp,#$Eoff+4]
        and     $t0,$t0,$Elo
        str     $Alo,[sp,#$Aoff+0]
        and     $t1,$t1,$Ehi
        str     $Ahi,[sp,#$Aoff+4]
        eor     $t0,$t0,$t2
        ldr     $t2,[$Ktbl,#$lo]        @ K[i].lo
        eor     $t1,$t1,$t3             @ Ch(e,f,g)
        ldr     $t3,[$Ktbl,#$hi]        @ K[i].hi

        adds    $Tlo,$Tlo,$t0
        ldr     $Elo,[sp,#$Doff+0]      @ d.lo
        adc     $Thi,$Thi,$t1           @ T += Ch(e,f,g)
        ldr     $Ehi,[sp,#$Doff+4]      @ d.hi
        adds    $Tlo,$Tlo,$t2
        and     $t0,$t2,#0xff
        adc     $Thi,$Thi,$t3           @ T += K[i]
        adds    $Elo,$Elo,$Tlo
        ldr     $t2,[sp,#$Boff+0]       @ b.lo
        adc     $Ehi,$Ehi,$Thi          @ d += T
        teq     $t0,#$magic

        ldr     $t3,[sp,#$Coff+0]       @ c.lo
#if __ARM_ARCH__>=7
        it      eq                      @ Thumb2 thing, sanity check in ARM
#endif
        orreq   $Ktbl,$Ktbl,#1
        @ Sigma0(x)     (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
        @ LO            lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
        @ HI            hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
        mov     $t0,$Alo,lsr#28
        mov     $t1,$Ahi,lsr#28
        eor     $t0,$t0,$Ahi,lsl#4
        eor     $t1,$t1,$Alo,lsl#4
        eor     $t0,$t0,$Ahi,lsr#2
        eor     $t1,$t1,$Alo,lsr#2
        eor     $t0,$t0,$Alo,lsl#30
        eor     $t1,$t1,$Ahi,lsl#30
        eor     $t0,$t0,$Ahi,lsr#7
        eor     $t1,$t1,$Alo,lsr#7
        eor     $t0,$t0,$Alo,lsl#25
        eor     $t1,$t1,$Ahi,lsl#25     @ Sigma0(a)
        adds    $Tlo,$Tlo,$t0
        and     $t0,$Alo,$t2
        adc     $Thi,$Thi,$t1           @ T += Sigma0(a)

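        @ Maj(a,b,c) = (a&b)|((a|b)&c), interleaved below per 32-bit half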
        ldr     $t1,[sp,#$Boff+4]       @ b.hi
        orr     $Alo,$Alo,$t2
        ldr     $t2,[sp,#$Coff+4]       @ c.hi
        and     $Alo,$Alo,$t3
        and     $t3,$Ahi,$t1
        orr     $Ahi,$Ahi,$t1
        orr     $Alo,$Alo,$t0           @ Maj(a,b,c).lo
        and     $Ahi,$Ahi,$t2
        adds    $Alo,$Alo,$Tlo
        orr     $Ahi,$Ahi,$t3           @ Maj(a,b,c).hi
        sub     sp,sp,#8
        adc     $Ahi,$Ahi,$Thi          @ h += T
        tst     $Ktbl,#1
        add     $Ktbl,$Ktbl,#8
___
}
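
# A host-side sanity sketch for the split Sigma computations above; it is
# not part of the build and assumes a Perl with 64-bit integers:
#
#   sub ROTR { my($x,$n)=@_; ($x>>$n | $x<<(64-$n)) & 0xffff_ffff_ffff_ffff }
#   my $e   = 0x0123456789abcdef;
#   my $ref = ROTR($e,14) ^ ROTR($e,18) ^ ROTR($e,41);          # Sigma1(e)
#   my ($hi,$lo) = (($e>>32) & 0xffff_ffff, $e & 0xffff_ffff);
#   my $slo = ($lo>>14^$hi<<18 ^ $lo>>18^$hi<<14 ^ $hi>>9^$lo<<23)
#             & 0xffff_ffff;
#   die "bad decomposition" unless $slo == ($ref & 0xffff_ffff);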
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH   vstmdb  sp!,{d8-d15}
# define VFP_ABI_POP    vldmia  sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)        .word   lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)        .word   hi0,lo0, hi1,lo1
#endif

.text
#if __ARM_ARCH__<7 || defined(__APPLE__)
.code   32
#else
.syntax unified
# ifdef __thumb2__
#  define adrl adr
.thumb
# else
.code   32
# endif
#endif

.type   K512,%object
.align  5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size   K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lsha512_block_data_order
.skip   32-4
#else
.skip   32
#endif

.global sha512_block_data_order
.type   sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7
        sub     r3,pc,#8                @ sha512_block_data_order
#else
        adr     r3,sha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
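        @ r3 holds the runtime address of .Lsha512_block_data_order and the
        @ word at .LOPENSSL_armcap its offset to OPENSSL_armcap_P, so the
        @ capability word is fetched position-independently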
#ifdef  __APPLE__
        ldr     r12,[r12]
#endif
        tst     r12,#1
        bne     .LNEON
#endif
        add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
        stmdb   sp!,{r4-r12,lr}
        sub     $Ktbl,r3,#672           @ K512
        sub     sp,sp,#9*8

        ldr     $Elo,[$ctx,#$Eoff+$lo]
        ldr     $Ehi,[$ctx,#$Eoff+$hi]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
.Loop:
        str     $t0, [sp,#$Goff+0]
        str     $t1, [sp,#$Goff+4]
        str     $t2, [sp,#$Hoff+0]
        str     $t3, [sp,#$Hoff+4]
        ldr     $Alo,[$ctx,#$Aoff+$lo]
        ldr     $Ahi,[$ctx,#$Aoff+$hi]
        ldr     $Tlo,[$ctx,#$Boff+$lo]
        ldr     $Thi,[$ctx,#$Boff+$hi]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        str     $Tlo,[sp,#$Boff+0]
        str     $Thi,[sp,#$Boff+4]
        str     $t0, [sp,#$Coff+0]
        str     $t1, [sp,#$Coff+4]
        str     $t2, [sp,#$Doff+0]
        str     $t3, [sp,#$Doff+4]
        ldr     $Tlo,[$ctx,#$Foff+$lo]
        ldr     $Thi,[$ctx,#$Foff+$hi]
        str     $Tlo,[sp,#$Foff+0]
        str     $Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
        ldrb    $Tlo,[$inp,#7]
        ldrb    $t0, [$inp,#6]
        ldrb    $t1, [$inp,#5]
        ldrb    $t2, [$inp,#4]
        ldrb    $Thi,[$inp,#3]
        ldrb    $t3, [$inp,#2]
        orr     $Tlo,$Tlo,$t0,lsl#8
        ldrb    $t0, [$inp,#1]
        orr     $Tlo,$Tlo,$t1,lsl#16
        ldrb    $t1, [$inp],#8
        orr     $Tlo,$Tlo,$t2,lsl#24
        orr     $Thi,$Thi,$t3,lsl#8
        orr     $Thi,$Thi,$t0,lsl#16
        orr     $Thi,$Thi,$t1,lsl#24
#else
        ldr     $Tlo,[$inp,#4]
        ldr     $Thi,[$inp],#8
#ifdef __ARMEL__
        rev     $Tlo,$Tlo
        rev     $Thi,$Thi
#endif
#endif
___
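# 0x94 is the low byte of K[15].lo (0xcf692694): when BODY_00_15 loads a K
# value whose low byte matches its $magic argument, it sets bit 0 of $Ktbl
# to flag the final round of the current phase; 0x17 further down matches
# K[79].lo (0x4a475817) and terminates the 16..79 loop.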
        &BODY_00_15(0x94);
$code.=<<___;
        tst     $Ktbl,#1
        beq     .L00_15
        ldr     $t0,[sp,#`$Xoff+8*(16-1)`+0]
        ldr     $t1,[sp,#`$Xoff+8*(16-1)`+4]
        bic     $Ktbl,$Ktbl,#1
.L16_79:
        @ sigma0(x)     (ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
        @ LO            lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
        @ HI            hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
        mov     $Tlo,$t0,lsr#1
        ldr     $t2,[sp,#`$Xoff+8*(16-14)`+0]
        mov     $Thi,$t1,lsr#1
        ldr     $t3,[sp,#`$Xoff+8*(16-14)`+4]
        eor     $Tlo,$Tlo,$t1,lsl#31
        eor     $Thi,$Thi,$t0,lsl#31
        eor     $Tlo,$Tlo,$t0,lsr#8
        eor     $Thi,$Thi,$t1,lsr#8
        eor     $Tlo,$Tlo,$t1,lsl#24
        eor     $Thi,$Thi,$t0,lsl#24
        eor     $Tlo,$Tlo,$t0,lsr#7
        eor     $Thi,$Thi,$t1,lsr#7
        eor     $Tlo,$Tlo,$t1,lsl#25

        @ sigma1(x)     (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
        @ LO            lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
        @ HI            hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
        mov     $t0,$t2,lsr#19
        mov     $t1,$t3,lsr#19
        eor     $t0,$t0,$t3,lsl#13
        eor     $t1,$t1,$t2,lsl#13
        eor     $t0,$t0,$t3,lsr#29
        eor     $t1,$t1,$t2,lsr#29
        eor     $t0,$t0,$t2,lsl#3
        eor     $t1,$t1,$t3,lsl#3
        eor     $t0,$t0,$t2,lsr#6
        eor     $t1,$t1,$t3,lsr#6
        ldr     $t2,[sp,#`$Xoff+8*(16-9)`+0]
        eor     $t0,$t0,$t3,lsl#26

        ldr     $t3,[sp,#`$Xoff+8*(16-9)`+4]
        adds    $Tlo,$Tlo,$t0
        ldr     $t0,[sp,#`$Xoff+8*16`+0]
        adc     $Thi,$Thi,$t1

        ldr     $t1,[sp,#`$Xoff+8*16`+4]
        adds    $Tlo,$Tlo,$t2
        adc     $Thi,$Thi,$t3
        adds    $Tlo,$Tlo,$t0
        adc     $Thi,$Thi,$t1
___
        &BODY_00_15(0x17);
$code.=<<___;
#if __ARM_ARCH__>=7
        ittt    eq                      @ Thumb2 thing, sanity check in ARM
#endif
        ldreq   $t0,[sp,#`$Xoff+8*(16-1)`+0]
        ldreq   $t1,[sp,#`$Xoff+8*(16-1)`+4]
        beq     .L16_79
        bic     $Ktbl,$Ktbl,#1

        ldr     $Tlo,[sp,#$Boff+0]
        ldr     $Thi,[sp,#$Boff+4]
        ldr     $t0, [$ctx,#$Aoff+$lo]
        ldr     $t1, [$ctx,#$Aoff+$hi]
        ldr     $t2, [$ctx,#$Boff+$lo]
        ldr     $t3, [$ctx,#$Boff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Aoff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Aoff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Boff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Boff+$hi]

        ldr     $Alo,[sp,#$Coff+0]
        ldr     $Ahi,[sp,#$Coff+4]
        ldr     $Tlo,[sp,#$Doff+0]
        ldr     $Thi,[sp,#$Doff+4]
        ldr     $t0, [$ctx,#$Coff+$lo]
        ldr     $t1, [$ctx,#$Coff+$hi]
        ldr     $t2, [$ctx,#$Doff+$lo]
        ldr     $t3, [$ctx,#$Doff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Coff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Coff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Doff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Doff+$hi]

        ldr     $Tlo,[sp,#$Foff+0]
        ldr     $Thi,[sp,#$Foff+4]
        ldr     $t0, [$ctx,#$Eoff+$lo]
        ldr     $t1, [$ctx,#$Eoff+$hi]
        ldr     $t2, [$ctx,#$Foff+$lo]
        ldr     $t3, [$ctx,#$Foff+$hi]
        adds    $Elo,$Elo,$t0
        str     $Elo,[$ctx,#$Eoff+$lo]
        adc     $Ehi,$Ehi,$t1
        str     $Ehi,[$ctx,#$Eoff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Foff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Foff+$hi]

        ldr     $Alo,[sp,#$Goff+0]
        ldr     $Ahi,[sp,#$Goff+4]
        ldr     $Tlo,[sp,#$Hoff+0]
        ldr     $Thi,[sp,#$Hoff+4]
        ldr     $t0, [$ctx,#$Goff+$lo]
        ldr     $t1, [$ctx,#$Goff+$hi]
        ldr     $t2, [$ctx,#$Hoff+$lo]
        ldr     $t3, [$ctx,#$Hoff+$hi]
        adds    $t0,$Alo,$t0
        str     $t0, [$ctx,#$Goff+$lo]
        adc     $t1,$Ahi,$t1
        str     $t1, [$ctx,#$Goff+$hi]
        adds    $t2,$Tlo,$t2
        str     $t2, [$ctx,#$Hoff+$lo]
        adc     $t3,$Thi,$t3
        str     $t3, [$ctx,#$Hoff+$hi]

        add     sp,sp,#640
        sub     $Ktbl,$Ktbl,#640

        teq     $inp,$len
        bne     .Loop

        add     sp,sp,#8*9              @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r12,pc}
#else
        ldmia   sp!,{r4-r12,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";  # volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));   # temps
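# NEON has no 64-bit rotate, so ROTR(x,n) is synthesized below as
# "vshr.u64 d,x,#n" followed by "vsli.64 d,x,#(64-n)": the shift-left-and-
# insert deposits the low n bits of x at the top of d while preserving the
# x>>n already there.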

$code.=<<___ if ($i<16 || $i&1);
        vshr.u64        $t0,$e,#@Sigma1[0]      @ $i
#if $i<16
        vld1.64         {@X[$i%16]},[$inp]!     @ handles unaligned
#endif
        vshr.u64        $t1,$e,#@Sigma1[1]
#if $i>0
         vadd.i64       $a,$Maj                 @ h+=Maj from the past
#endif
        vshr.u64        $t2,$e,#@Sigma1[2]
___
$code.=<<___;
        vld1.64         {$K},[$Ktbl,:64]!       @ K[i++]
        vsli.64         $t0,$e,#`64-@Sigma1[0]`
        vsli.64         $t1,$e,#`64-@Sigma1[1]`
        vmov            $Ch,$e
        vsli.64         $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
        vrev64.8        @X[$i],@X[$i]
#endif
        veor            $t1,$t0
        vbsl            $Ch,$f,$g               @ Ch(e,f,g)
        vshr.u64        $t0,$a,#@Sigma0[0]
        veor            $t2,$t1                 @ Sigma1(e)
        vadd.i64        $T1,$Ch,$h
        vshr.u64        $t1,$a,#@Sigma0[1]
        vsli.64         $t0,$a,#`64-@Sigma0[0]`
        vadd.i64        $T1,$t2
        vshr.u64        $t2,$a,#@Sigma0[2]
        vadd.i64        $K,@X[$i%16]
        vsli.64         $t1,$a,#`64-@Sigma0[1]`
        veor            $Maj,$a,$b
        vsli.64         $t2,$a,#`64-@Sigma0[2]`
        veor            $h,$t0,$t1
        vadd.i64        $T1,$K
        vbsl            $Maj,$c,$b              @ Maj(a,b,c)
        veor            $h,$t2                  @ Sigma0(a)
        vadd.i64        $d,$T1
        vadd.i64        $Maj,$T1
        @ vadd.i64      $h,$Maj
___
}
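
# The final "h += Maj" of each round is deliberately left in $Maj (d30) and
# folded into the following round as the "h+=Maj from the past" add, which
# shortens the critical path; the last one outstanding is caught up before
# the context is stored back.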

sub NEON_16_79() {
my $i=shift;

if ($i&1)       { &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));                        # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));     # temps
my ($d0,$d1,$d2) = map("d$_",(24..26));         # temps from NEON_00_15
my $e=@_[4];                                    # $e from NEON_00_15
$i /= 2;
$code.=<<___;
        vshr.u64        $t0,@X[($i+7)%8],#@sigma1[0]
        vshr.u64        $t1,@X[($i+7)%8],#@sigma1[1]
         vadd.i64       @_[0],d30                       @ h+=Maj from the past
        vshr.u64        $s1,@X[($i+7)%8],#@sigma1[2]
        vsli.64         $t0,@X[($i+7)%8],#`64-@sigma1[0]`
        vext.8          $s0,@X[$i%8],@X[($i+1)%8],#8    @ X[i+1]
        vsli.64         $t1,@X[($i+7)%8],#`64-@sigma1[1]`
        veor            $s1,$t0
        vshr.u64        $t0,$s0,#@sigma0[0]
        veor            $s1,$t1                         @ sigma1(X[i+14])
        vshr.u64        $t1,$s0,#@sigma0[1]
        vadd.i64        @X[$i%8],$s1
        vshr.u64        $s1,$s0,#@sigma0[2]
        vsli.64         $t0,$s0,#`64-@sigma0[0]`
        vsli.64         $t1,$s0,#`64-@sigma0[1]`
        vext.8          $s0,@X[($i+4)%8],@X[($i+5)%8],#8        @ X[i+9]
        veor            $s1,$t0
        vshr.u64        $d0,$e,#@Sigma1[0]              @ from NEON_00_15
        vadd.i64        @X[$i%8],$s0
        vshr.u64        $d1,$e,#@Sigma1[1]              @ from NEON_00_15
        veor            $s1,$t1                         @ sigma0(X[i+1])
        vshr.u64        $d2,$e,#@Sigma1[2]              @ from NEON_00_15
        vadd.i64        @X[$i%8],$s1
___
        &NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha512_block_data_order_neon
.type   sha512_block_data_order_neon,%function
.align  4
sha512_block_data_order_neon:
.LNEON:
        dmb                             @ errata #451034 on early Cortex A8
        add     $len,$inp,$len,lsl#7    @ len to point at the end of inp
        adr     $Ktbl,K512
        VFP_ABI_PUSH
        vldmia  $ctx,{$A-$H}            @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)    { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        mov             $cnt,#4
.L16_79_neon:
        subs            $cnt,#1
___
for(;$i<32;$i++)        { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        bne             .L16_79_neon

         vadd.i64       $A,d30          @ h+=Maj from the past
        vldmia          $ctx,{d24-d31}  @ load context to temp
        vadd.i64        q8,q12          @ vectorized accumulate
        vadd.i64        q9,q13
        vadd.i64        q10,q14
        vadd.i64        q11,q15
        vstmia          $ctx,{$A-$H}    @ save context
        teq             $inp,$len
        sub             $Ktbl,#640      @ rewind K512
        bne             .Loop_neon

        VFP_ABI_POP
        ret                             @ bx lr
.size   sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}

$code.=<<___;
.asciz  "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;    # make it possible to compile with -march=armv4
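# (0xe12fff1e is the ARM machine encoding of "bx lr"; emitting it as data
# lets an assembler invoked with -march=armv4 accept the module, while the
# epilogue's run-time test keeps plain ARMv4 cores safe.)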
$code =~ s/\bret\b/bx   lr/gm;

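# Reproduce this file's leading comment block at the top of the generated
# assembler output, rewriting Perl's '#' comments as the assembler's '@'.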
open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

print $code;
close STDOUT; # enforce flush