3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
12 # SHA512 block procedure for ARMv4. September 2007.
14 # This code is ~4.5 (four and a half) times faster than code generated
15 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
16 # Xscale PXA250 core].
20 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
21 # Cortex A8 core and ~40 cycles per processed byte.
25 # Profiler-assisted and platform-specific optimization resulted in 7%
26 # improvement on Cortex A8 core and ~38 cycles per byte.
30 # Add NEON implementation. On Cortex A8 it was measured to process
31 # one byte in 23.3 cycles or ~60% faster than integer-only code.
35 # Improve NEON performance by 12% on Snapdragon S4. In absolute
36 # terms it's 22.6 cycles per byte, which is a disappointing result.
37 # Technical writers asserted that 3-way S4 pipeline can sustain
38 # multiple NEON instructions per cycle, but dual NEON issue could
39 # not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
40 # for further details. On a side note, Cortex-A15 processes one byte in
43 # Byte order [in]dependence. =========================================
45 # Originally caller was expected to maintain specific *dword* order in
46 # h[0-7], namely with most significant dword at *lower* address, which
47 # was reflected in the two parameters below as 0 and 4. Now caller is
48 # expected to maintain native byte order for whole 64-bit values.
51 # ====================================================================
54 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
55 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
57 if ($flavour && $flavour ne "void") {
58 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
59 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
60 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
61 die "can't locate arm-xlate.pl";
63 open STDOUT,"| \"$^X\" $xlate $flavour $output";
65 open STDOUT,">$output";
68 $ctx="r0"; # parameter block
82 ############ r13 is stack pointer
84 ############ r15 is program counter
99 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
100 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
101 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
103 str $Tlo,[sp,#$Xoff+0]
105 str $Thi,[sp,#$Xoff+4]
106 eor $t0,$t0,$Ehi,lsl#18
107 ldr $t2,[sp,#$Hoff+0] @ h.lo
108 eor $t1,$t1,$Elo,lsl#18
109 ldr $t3,[sp,#$Hoff+4] @ h.hi
110 eor $t0,$t0,$Elo,lsr#18
111 eor $t1,$t1,$Ehi,lsr#18
112 eor $t0,$t0,$Ehi,lsl#14
113 eor $t1,$t1,$Elo,lsl#14
114 eor $t0,$t0,$Ehi,lsr#9
115 eor $t1,$t1,$Elo,lsr#9
116 eor $t0,$t0,$Elo,lsl#23
117 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
119 ldr $t0,[sp,#$Foff+0] @ f.lo
120 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
121 ldr $t1,[sp,#$Foff+4] @ f.hi
123 ldr $t2,[sp,#$Goff+0] @ g.lo
124 adc $Thi,$Thi,$t3 @ T += h
125 ldr $t3,[sp,#$Goff+4] @ g.hi
128 str $Elo,[sp,#$Eoff+0]
130 str $Ehi,[sp,#$Eoff+4]
132 str $Alo,[sp,#$Aoff+0]
134 str $Ahi,[sp,#$Aoff+4]
136 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
137 eor $t1,$t1,$t3 @ Ch(e,f,g)
138 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
141 ldr $Elo,[sp,#$Doff+0] @ d.lo
142 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
143 ldr $Ehi,[sp,#$Doff+4] @ d.hi
146 adc $Thi,$Thi,$t3 @ T += K[i]
148 ldr $t2,[sp,#$Boff+0] @ b.lo
149 adc $Ehi,$Ehi,$Thi @ d += T
152 ldr $t3,[sp,#$Coff+0] @ c.lo
154 it eq @ Thumb2 thing, sanity check in ARM
157 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
158 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
159 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
162 eor $t0,$t0,$Ahi,lsl#4
163 eor $t1,$t1,$Alo,lsl#4
164 eor $t0,$t0,$Ahi,lsr#2
165 eor $t1,$t1,$Alo,lsr#2
166 eor $t0,$t0,$Alo,lsl#30
167 eor $t1,$t1,$Ahi,lsl#30
168 eor $t0,$t0,$Ahi,lsr#7
169 eor $t1,$t1,$Alo,lsr#7
170 eor $t0,$t0,$Alo,lsl#25
171 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
174 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
176 ldr $t1,[sp,#$Boff+4] @ b.hi
178 ldr $t2,[sp,#$Coff+4] @ c.hi
182 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
185 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
187 adc $Ahi,$Ahi,$Thi @ h += T
194 # include "arm_arch.h"
195 # define VFP_ABI_PUSH vstmdb sp!,{d8-d15}
196 # define VFP_ABI_POP vldmia sp!,{d8-d15}
198 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
199 # define __ARM_MAX_ARCH__ 7
200 # define VFP_ABI_PUSH
207 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
211 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
215 #if defined(__thumb2__)
226 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
227 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
228 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
229 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
230 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
231 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
232 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
233 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
234 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
235 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
236 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
237 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
238 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
239 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
240 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
241 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
242 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
243 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
244 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
245 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
246 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
247 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
248 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
249 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
250 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
251 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
252 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
253 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
254 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
255 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
256 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
257 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
258 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
259 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
260 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
261 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
262 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
263 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
264 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
265 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
267 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
269 .word OPENSSL_armcap_P-.Lsha512_block_data_order
275 .global sha512_block_data_order
276 .type sha512_block_data_order,%function
277 sha512_block_data_order:
278 .Lsha512_block_data_order:
279 #if __ARM_ARCH__<7 && !defined(__thumb2__)
280 sub r3,pc,#8 @ sha512_block_data_order
282 adr r3,.Lsha512_block_data_order
284 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
285 ldr r12,.LOPENSSL_armcap
286 ldr r12,[r3,r12] @ OPENSSL_armcap_P
293 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
294 stmdb sp!,{r4-r12,lr}
295 sub $Ktbl,r3,#672 @ K512
298 ldr $Elo,[$ctx,#$Eoff+$lo]
299 ldr $Ehi,[$ctx,#$Eoff+$hi]
300 ldr $t0, [$ctx,#$Goff+$lo]
301 ldr $t1, [$ctx,#$Goff+$hi]
302 ldr $t2, [$ctx,#$Hoff+$lo]
303 ldr $t3, [$ctx,#$Hoff+$hi]
305 str $t0, [sp,#$Goff+0]
306 str $t1, [sp,#$Goff+4]
307 str $t2, [sp,#$Hoff+0]
308 str $t3, [sp,#$Hoff+4]
309 ldr $Alo,[$ctx,#$Aoff+$lo]
310 ldr $Ahi,[$ctx,#$Aoff+$hi]
311 ldr $Tlo,[$ctx,#$Boff+$lo]
312 ldr $Thi,[$ctx,#$Boff+$hi]
313 ldr $t0, [$ctx,#$Coff+$lo]
314 ldr $t1, [$ctx,#$Coff+$hi]
315 ldr $t2, [$ctx,#$Doff+$lo]
316 ldr $t3, [$ctx,#$Doff+$hi]
317 str $Tlo,[sp,#$Boff+0]
318 str $Thi,[sp,#$Boff+4]
319 str $t0, [sp,#$Coff+0]
320 str $t1, [sp,#$Coff+4]
321 str $t2, [sp,#$Doff+0]
322 str $t3, [sp,#$Doff+4]
323 ldr $Tlo,[$ctx,#$Foff+$lo]
324 ldr $Thi,[$ctx,#$Foff+$hi]
325 str $Tlo,[sp,#$Foff+0]
326 str $Thi,[sp,#$Foff+4]
336 orr $Tlo,$Tlo,$t0,lsl#8
338 orr $Tlo,$Tlo,$t1,lsl#16
340 orr $Tlo,$Tlo,$t2,lsl#24
341 orr $Thi,$Thi,$t3,lsl#8
342 orr $Thi,$Thi,$t0,lsl#16
343 orr $Thi,$Thi,$t1,lsl#24
357 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
358 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
361 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
362 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
363 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
365 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
367 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
368 eor $Tlo,$Tlo,$t1,lsl#31
369 eor $Thi,$Thi,$t0,lsl#31
370 eor $Tlo,$Tlo,$t0,lsr#8
371 eor $Thi,$Thi,$t1,lsr#8
372 eor $Tlo,$Tlo,$t1,lsl#24
373 eor $Thi,$Thi,$t0,lsl#24
374 eor $Tlo,$Tlo,$t0,lsr#7
375 eor $Thi,$Thi,$t1,lsr#7
376 eor $Tlo,$Tlo,$t1,lsl#25
378 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
379 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
380 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
383 eor $t0,$t0,$t3,lsl#13
384 eor $t1,$t1,$t2,lsl#13
385 eor $t0,$t0,$t3,lsr#29
386 eor $t1,$t1,$t2,lsr#29
387 eor $t0,$t0,$t2,lsl#3
388 eor $t1,$t1,$t3,lsl#3
389 eor $t0,$t0,$t2,lsr#6
390 eor $t1,$t1,$t3,lsr#6
391 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
392 eor $t0,$t0,$t3,lsl#26
394 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
396 ldr $t0,[sp,#`$Xoff+8*16`+0]
399 ldr $t1,[sp,#`$Xoff+8*16`+4]
408 ittt eq @ Thumb2 thing, sanity check in ARM
410 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
411 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
415 ldr $Tlo,[sp,#$Boff+0]
416 ldr $Thi,[sp,#$Boff+4]
417 ldr $t0, [$ctx,#$Aoff+$lo]
418 ldr $t1, [$ctx,#$Aoff+$hi]
419 ldr $t2, [$ctx,#$Boff+$lo]
420 ldr $t3, [$ctx,#$Boff+$hi]
422 str $t0, [$ctx,#$Aoff+$lo]
424 str $t1, [$ctx,#$Aoff+$hi]
426 str $t2, [$ctx,#$Boff+$lo]
428 str $t3, [$ctx,#$Boff+$hi]
430 ldr $Alo,[sp,#$Coff+0]
431 ldr $Ahi,[sp,#$Coff+4]
432 ldr $Tlo,[sp,#$Doff+0]
433 ldr $Thi,[sp,#$Doff+4]
434 ldr $t0, [$ctx,#$Coff+$lo]
435 ldr $t1, [$ctx,#$Coff+$hi]
436 ldr $t2, [$ctx,#$Doff+$lo]
437 ldr $t3, [$ctx,#$Doff+$hi]
439 str $t0, [$ctx,#$Coff+$lo]
441 str $t1, [$ctx,#$Coff+$hi]
443 str $t2, [$ctx,#$Doff+$lo]
445 str $t3, [$ctx,#$Doff+$hi]
447 ldr $Tlo,[sp,#$Foff+0]
448 ldr $Thi,[sp,#$Foff+4]
449 ldr $t0, [$ctx,#$Eoff+$lo]
450 ldr $t1, [$ctx,#$Eoff+$hi]
451 ldr $t2, [$ctx,#$Foff+$lo]
452 ldr $t3, [$ctx,#$Foff+$hi]
454 str $Elo,[$ctx,#$Eoff+$lo]
456 str $Ehi,[$ctx,#$Eoff+$hi]
458 str $t2, [$ctx,#$Foff+$lo]
460 str $t3, [$ctx,#$Foff+$hi]
462 ldr $Alo,[sp,#$Goff+0]
463 ldr $Ahi,[sp,#$Goff+4]
464 ldr $Tlo,[sp,#$Hoff+0]
465 ldr $Thi,[sp,#$Hoff+4]
466 ldr $t0, [$ctx,#$Goff+$lo]
467 ldr $t1, [$ctx,#$Goff+$hi]
468 ldr $t2, [$ctx,#$Hoff+$lo]
469 ldr $t3, [$ctx,#$Hoff+$hi]
471 str $t0, [$ctx,#$Goff+$lo]
473 str $t1, [$ctx,#$Goff+$hi]
475 str $t2, [$ctx,#$Hoff+$lo]
477 str $t3, [$ctx,#$Hoff+$hi]
485 add sp,sp,#8*9 @ destroy frame
487 ldmia sp!,{r4-r12,pc}
489 ldmia sp!,{r4-r12,lr}
491 moveq pc,lr @ be binary compatible with V4, yet
492 bx lr @ interoperable with Thumb ISA:-)
494 .size sha512_block_data_order,.-sha512_block_data_order
# SHA-512 round-function rotation amounts per FIPS 180-4:
# Sigma0/Sigma1 are the "big sigma" functions applied to a and e in each
# round; sigma0/sigma1 are the "small sigma" message-schedule functions.
# NOTE(review): the third entry of sigma0/sigma1 (7 and 6) is a plain
# right SHIFT in the spec, not a rotation — presumably the emitter below
# treats it accordingly; confirm against the shift/rotate code it drives.
498 my @Sigma0=(28,34,39);
499 my @Sigma1=(14,18,41);
500 my @sigma0=(1, 8, 7);
501 my @sigma1=(19,61,6);
504 my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
# NEON register assignment: d0-d15 hold the 16-entry message schedule X[],
# d16-d23 hold the working variables a..h (rotated through @V each round).
506 my @X=map("d$_",(0..15));
507 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
511 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
512 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
514 $code.=<<___ if ($i<16 || $i&1);
515 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
517 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
519 vshr.u64 $t1,$e,#@Sigma1[1]
521 vadd.i64 $a,$Maj @ h+=Maj from the past
523 vshr.u64 $t2,$e,#@Sigma1[2]
526 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
527 vsli.64 $t0,$e,#`64-@Sigma1[0]`
528 vsli.64 $t1,$e,#`64-@Sigma1[1]`
530 vsli.64 $t2,$e,#`64-@Sigma1[2]`
531 #if $i<16 && defined(__ARMEL__)
532 vrev64.8 @X[$i],@X[$i]
535 vbsl $Ch,$f,$g @ Ch(e,f,g)
536 vshr.u64 $t0,$a,#@Sigma0[0]
537 veor $t2,$t1 @ Sigma1(e)
539 vshr.u64 $t1,$a,#@Sigma0[1]
540 vsli.64 $t0,$a,#`64-@Sigma0[0]`
542 vshr.u64 $t2,$a,#@Sigma0[2]
543 vadd.i64 $K,@X[$i%16]
544 vsli.64 $t1,$a,#`64-@Sigma0[1]`
546 vsli.64 $t2,$a,#`64-@Sigma0[2]`
549 vbsl $Maj,$c,$b @ Maj(a,b,c)
550 veor $h,$t2 @ Sigma0(a)
560 if ($i&1) { &NEON_00_15($i,@_); return; }
562 # 2x-vectorized, therefore runs every 2nd round
563 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
564 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
565 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
566 my $e=@_[4]; # $e from NEON_00_15
569 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
570 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
571 vadd.i64 @_[0],d30 @ h+=Maj from the past
572 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
573 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
574 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
575 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
577 vshr.u64 $t0,$s0,#@sigma0[0]
578 veor $s1,$t1 @ sigma1(X[i+14])
579 vshr.u64 $t1,$s0,#@sigma0[1]
580 vadd.i64 @X[$i%8],$s1
581 vshr.u64 $s1,$s0,#@sigma0[2]
582 vsli.64 $t0,$s0,#`64-@sigma0[0]`
583 vsli.64 $t1,$s0,#`64-@sigma0[1]`
584 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
586 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
587 vadd.i64 @X[$i%8],$s0
588 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
589 veor $s1,$t1 @ sigma0(X[i+1])
590 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
591 vadd.i64 @X[$i%8],$s1
593 &NEON_00_15(2*$i,@_);
597 #if __ARM_MAX_ARCH__>=7
601 .global sha512_block_data_order_neon
602 .type sha512_block_data_order_neon,%function
604 sha512_block_data_order_neon:
606 dmb @ errata #451034 on early Cortex A8
607 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
610 vldmia $ctx,{$A-$H} @ load context
613 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
619 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
623 vadd.i64 $A,d30 @ h+=Maj from the past
624 vldmia $ctx,{d24-d31} @ load context to temp
625 vadd.i64 q8,q12 @ vectorized accumulate
629 vstmia $ctx,{$A-$H} @ save context
631 sub $Ktbl,#640 @ rewind K512
636 .size sha512_block_data_order_neon,.-sha512_block_data_order_neon
641 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
643 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
644 .comm OPENSSL_armcap_P,4,4
# Post-process the generated assembly before it is written out:
# evaluate any Perl expression enclosed in backticks (used above for
# compile-time arithmetic such as `64-@Sigma1[0]`) and splice in the result.
648 $code =~ s/\`([^\`]*)\`/eval $1/gem;
# Encode "bx lr" as a raw opcode so the output still assembles with
# -march=armv4 toolchains that reject the mnemonic.
649 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
# Any remaining "ret" becomes "bx lr"; running after the substitution above
# means these occurrences are left as the mnemonic intentionally.
650 $code =~ s/\bret\b/bx lr/gm;
655 last if (!s/^#/@/ and !/^$/);
661 close STDOUT; # enforce flush