#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
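# Illustration only, never called by the generator (a sketch assuming
# a perl with 64-bit integers): whatever the target byte order, the
# 32-bit word a caller finds at byte offset LO of h[i] is the low half
# of the 64-bit value and the word at HI is the high half.
sub _lo32_sketch { return  $_[0]      & 0xffffffff; }	# word at h[i]+LO
sub _hi32_sketch { return ($_[0]>>32) & 0xffffffff; }	# word at h[i]+HI
# On little-endian ARM (__ARMEL__) LO is 0 and HI is 4; on big-endian
# targets the two offsets swap, which is all $hi/$lo above capture.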

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;

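# Stack frame note (derived from the code below): the integer path
# keeps copies of the state dwords at $Aoff..$Hoff and the incoming
# message dword at $Xoff; BODY_00_15 ends with "sub sp,sp,#8", so the
# expanded X[] entries accumulate as a sliding window below the frame,
# and the epilogue's "add sp,sp,#640" (80 rounds times 8 bytes)
# rewinds them in one step.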
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
#ifdef	__thumb2__
	it	eq			@ Thumb2 thing, sanity check in ARM
#endif
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
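
# Illustration only, never called by the generator: a perl model of
# the half-register decomposition BODY_00_15 emits for Sigma1(e),
# a sketch assuming a perl with 64-bit integers.  Each 64-bit rotation
# becomes one shift/xor pair per 32-bit half; rotations by 32 or more,
# such as ROTR(e,41), additionally swap the roles of the two halves,
# exactly as the LO/HI comment lines above describe.
sub _Sigma1_halves_sketch {
my ($ehi,$elo) = @_;			# e as two 32-bit halves
my $lo = ($elo>>14|$ehi<<18) ^ ($elo>>18|$ehi<<14) ^ ($ehi>>9|$elo<<23);
my $hi = ($ehi>>14|$elo<<18) ^ ($ehi>>18|$elo<<14) ^ ($elo>>9|$ehi<<23);
return ($hi&0xffffffff, $lo&0xffffffff);
}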

my $_word = ($flavour =~ /win/ ? "DCDU" : ".word");

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define VFP_ABI_POP
#endif

#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	$_word	hi0,lo0, hi1,lo1
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code	32
#endif

.text

.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha512_block_data_order
# endif
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha512_block_data_order
#else
	adr	r3,.Lsha512_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
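# 0x94 is the least significant byte of K[15] (0xc19bf174cf692694);
# BODY_00_15 compares the low byte of the K dword it just loaded
# against this value and sets bit 0 of $Ktbl when the 16th round is
# reached, so no separate loop counter is needed.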
	&BODY_00_15(0x94);
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
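# Same trick as above: 0x17 is the least significant byte of K[79]
# (0x6c44198c4a475817), so bit 0 of $Ktbl gets set on the final round.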
	&BODY_00_15(0x17);
$code.=<<___;
#ifdef	__thumb2__
	ittt	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640
	sub	$Ktbl,$Ktbl,#640

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha512_block_data_order,.-sha512_block_data_order
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
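
# A note on the idiom above, with an illustration-only perl model
# (never called, assumes a perl with 64-bit integers): NEON has no
# 64-bit rotate, so each ROTR64(x,n) is synthesized from a
# vshr.u64/vsli.64 pair; vshr writes x>>n, then vsli shift-left-inserts
# x<<(64-n) on top while preserving the low bits vshr deposited.
sub _rotr64_sketch {
my ($x,$n) = @_;			# valid for 0 < $n < 64
return (($x>>$n) | ($x<<(64-$n))) & 0xffffffffffffffff;
}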

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}
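
# The message schedule above is 2x-vectorized: @X is re-viewed as
# eight 128-bit q registers, so one pass expands two consecutive X[]
# dwords at once and NEON_16_79 only does schedule work on even
# rounds, interleaving it with the Sigma1 shifts of the round function
# it then emits via NEON_00_15.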

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
.align	4
sha512_block_data_order_neon:
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	adr	$Ktbl,K512
	VFP_ABI_PUSH
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	VFP_ABI_POP
	ret				@ bx lr
.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
#endif
___
}
$code.=<<___;
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

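# Post-processing: the first substitution below evaluates the `...`
# arithmetic embedded in the assembly.  The second rewrites the
# literal "bx lr" in the integer path as .word 0xe12fff1e, the ARM
# encoding of that instruction, so the file still assembles with
# -march=armv4.  It runs before the third, so the NEON path's "ret"
# becomes a real "bx lr", which is safe there since NEON implies
# ARMv7 or later.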
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;

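# The loop below copies this script's own leading comment block (the
# license and performance notes above) into the generated assembly,
# re-prefixed with the assembler's '@' comment character.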
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

print $code;
close STDOUT or die "error closing STDOUT"; # enforce flush