# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.
#
# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte in

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
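#
# A minimal added illustration (not part of the original comment) of what
# "native" order means here: on a little-endian build the halves of each
# 64-bit h[i] are expected as
#	h[i].lo at byte offset 0, h[i].hi at byte offset 4
# with the two offsets swapped on a big-endian build, so the $lo/$hi-based
# loads in the code always fetch the halves in the machine's own layout.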
# ====================================================================

if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	# parameter block
############ r13 is stack pointer
############ r15 is program counter

	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
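	@ Added explanatory sketch (comment only, not generated code): with e kept
	@ as two 32-bit halves {Ehi,Elo}, a single 64-bit rotation such as
	@ ROTR(e,14) becomes
	@	lo' = (Elo>>14) ^ (Ehi<<18)
	@	hi' = (Ehi>>14) ^ (Elo<<18)
	@ while a rotation by 32+n (e.g. ROTR(e,41) = 32+9) swaps the roles of the
	@ halves first, which is where the hi>>9^lo<<23 terms above come from.
	@ The eor chain below folds all three rotations into $t0/$t1.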
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi
	str	$Elo,[sp,#$Eoff+0]
	str	$Ehi,[sp,#$Eoff+4]
	str	$Alo,[sp,#$Aoff+0]
	str	$Ahi,[sp,#$Aoff+4]
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adc	$Thi,$Thi,$t3		@ T += K[i]
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	it	eq			@ Thumb2 thing, sanity check in ARM

	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	adc	$Ahi,$Ahi,$Thi		@ h += T

# include "arm_arch.h"
# define VFP_ABI_PUSH	vstmdb	sp!,{d8-d15}
# define VFP_ABI_POP	vldmia	sp!,{d8-d15}
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
# define VFP_ABI_PUSH
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
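@ Added note (comment only): with the little-endian variant of WORD64 above,
@ e.g. WORD64(0x428a2f98,0xd728ae22,...) is emitted as
@ .word 0xd728ae22,0x428a2f98, i.e. low word at the lower address, so each
@ 64-bit K[i] sits in memory in native order and is fetched with the same
@ lo/hi offsets as the hash state.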
#if __ARM_ARCH__<7 || defined(__APPLE__)
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)

#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.word	OPENSSL_armcap_P-.Lsha512_block_data_order

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
.Lsha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	adr	r3,sha512_block_data_order
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P

	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

	orr	$Tlo,$Tlo,$t0,lsl#8
	orr	$Tlo,$Tlo,$t1,lsl#16
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24

	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]

	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
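	@ Added sketch (comment only): the first two terms above are true
	@ rotations and split across the halves exactly as Sigma1 earlier; the
	@ (x)>>7 term is a plain 64-bit shift, i.e.
	@	lo' = (lo>>7) ^ (hi<<25),  hi' = hi>>7
	@ which is why the HI line carries no matching <<25 cross term.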
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26
	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	ittt	eq			@ Thumb2 thing, sanity check in ARM
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	str	$t0, [$ctx,#$Aoff+$lo]
	str	$t1, [$ctx,#$Aoff+$hi]
	str	$t2, [$ctx,#$Boff+$lo]
	str	$t3, [$ctx,#$Boff+$hi]
	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$t0, [$ctx,#$Coff+$lo]
	str	$t1, [$ctx,#$Coff+$hi]
	str	$t2, [$ctx,#$Doff+$lo]
	str	$t3, [$ctx,#$Doff+$hi]
	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	str	$Elo,[$ctx,#$Eoff+$lo]
	str	$Ehi,[$ctx,#$Eoff+$hi]
	str	$t2, [$ctx,#$Foff+$lo]
	str	$t3, [$ctx,#$Foff+$hi]
	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	str	$t0, [$ctx,#$Goff+$lo]
	str	$t1, [$ctx,#$Goff+$hi]
	str	$t2, [$ctx,#$Hoff+$lo]
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#8*9		@ destroy frame
	ldmia	sp!,{r4-r12,pc}
	ldmia	sp!,{r4-r12,lr}
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
.size	sha512_block_data_order,.-sha512_block_data_order

my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
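
# Added note (comment only, describing the code below rather than changing
# it): in the NEON path a 64-bit rotation ROTR(x,n) is synthesised from a
# shift-and-insert pair, roughly
#	vshr.u64	dD,dX,#n	@ dD  = x >> n
#	vsli.64		dD,dX,#64-n	@ dD |= x << (64-n), i.e. ROTR(x,n)
# The @Sigma0/@Sigma1 entries above are the rotation amounts fed to such
# pairs; the last @sigma0/@sigma1 entries (7 and 6) are plain shifts and
# only get the vshr half.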
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));

my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
	vshr.u64	$t1,$e,#@Sigma1[1]
	vadd.i64	$a,$Maj			@ h+=Maj from the past
	vshr.u64	$t2,$e,#@Sigma1[2]
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15

	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1

	&NEON_00_15(2*$i,@_);

#if __ARM_MAX_ARCH__>=7
.global	sha512_block_data_order_neon
.type	sha512_block_data_order_neon,%function
sha512_block_data_order_neon:
	dmb				@ errata #451034 on early Cortex A8
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
	vldmia	$ctx,{$A-$H}		@ load context

for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }

	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vstmia	$ctx,{$A-$H}		@ save context
	sub	$Ktbl,#640		@ rewind K512

.size	sha512_block_data_order_neon,.-sha512_block_data_order_neon
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4

$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
	last if (!s/^#/@/ and !/^$/);
close STDOUT;			# enforce flush