# sha/asm/sha256-armv4.pl: add NEON code path.
# [openssl.git] / crypto / sha / asm / sha256-armv4.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).
# Pick the first command-line argument that looks like an output file
# name ("foo.S") and redirect all generated code to it; with no such
# argument the code is written to stdout.
# Fix: the original used an unchecked two-argument open (which also
# attempted `open ">"` when no filename was supplied); use the
# three-argument form and fail loudly.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
if (defined $output) {
	open STDOUT, '>', $output or die "can't open $output: $!";
}

# Register allocation.  $t0/$t4/$t1 deliberately alias $ctx/$inp/$len:
# by the time the temporaries are live, the aliased values are dead or
# have been spilled to the stack (see "make room for $t4" and the
# "pull ctx"/"pull inp" loads in the round code below).
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);	# working variables, rotated each round
$t2="r12";
$Ktbl="r14";			# pointer into the K256 constant table

# SHA-256 rotation/shift amounts: @Sigma* are the big (rotate-only)
# round functions, @sigma* the small message-schedule functions whose
# third entry is a logical shift rather than a rotation.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Emit one SHA-256 round for the integer-only path; used directly for
# rounds 0..15 and as the tail of BODY_16_XX for rounds 16..31.
# Arguments: round index $i and the eight working variables in their
# current rotation.  Appends ARM assembly text to the global $code.
#
# Scheduling notes (all visible in the emitted text below):
#  - The previous round's h+=Maj(a,b,c) is deferred into this round
#    (the "from the past" additions) to improve instruction pairing;
#    hence the ($t2,$t3) swap at the bottom.
#  - For $i<16 the message word is fetched here: word load + rev on
#    ARMv7+, four byte loads + orr otherwise.
#  - At $i==15 the input pointer is spilled to [sp,#17*4] so its
#    register can be reused as $t4 by BODY_16_XX.
#  - At $i==31 the low byte of the K256 word just fetched is compared
#    against 0xf2 (last byte of 0xc67178f2) to detect the end of the
#    constant table and terminate the .Lrounds_16_xx loop.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

# Message word load: only emitted for the first 16 rounds.
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	rev	$t1,$t1
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
# Round function proper: h += X[i] + K[i] + Sigma1(e) + Ch(e,f,g),
# d += h, h += Sigma0(a) [+ Maj(a,b,c), deferred to the next round].
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);	# Maj in $t3 this round is consumed via $t2 next round
}

# Emit one SHA-256 round for rounds 16..31: first compute the message
# schedule word X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
# into $t1, then fall through to BODY_00_15 for the round function.
# The `@ ldr` lines are commented out because those loads were already
# issued at the tail of the previous round (the "from future BODY_16_xx"
# loads in BODY_00_15).
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);	# round function shared with rounds 0..15
}

# Integer-only implementation: K256 constant table, armcap-based NEON
# dispatch, and the unrolled round loop.  The terminating ".word 0"
# after K256 is what the $i==31 check in BODY_00_15 detects.
$code=<<___;
#include "arm_arch.h"

.text
.code	32

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha256_block_data_order
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
	sub	r3,pc,#8		@ sha256_block_data_order
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
#if __ARM_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Rounds 0..15 fully unrolled; @V is rotated so that each round's
# working variables land in the right registers.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
# Rounds 16..31 are emitted once; at run time this body loops (the
# cmp #0xf2 / bne below) until the K256 terminator is reached, thereby
# covering rounds 16..63.
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add working variables back into the context, advance to the
# next 64-byte block or restore callee-saved registers and return.
$code.=<<___;
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___
######################################################################
# NEON stuff
#
{{{
# Lexicals shared by the NEON code generators below.
my @X=map("q$_",(0..3));	# q0-q3: the current 16 message-schedule words
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");	# scratch
my $Xfer=$t4;			# pointer into the stack X[]+K[] transfer area
my $j=0;			# round counter advanced by body_00_15

# Map a NEON quad register name "qN" to its low double half "d(2N)";
# anything that does not look like a q-register yields "".
sub Dlo()
{
    my $qreg = shift;
    return "" unless $qreg =~ m|q([1]?[0-9])|;
    return "d" . ($1 * 2);
}
# Map a NEON quad register name "qN" to its high double half "d(2N+1)";
# anything that does not look like a q-register yields "".
sub Dhi()
{
    my $qreg = shift;
    return "" unless $qreg =~ m|q([1]?[0-9])|;
    return "d" . ($1 * 2 + 1);
}

# Thunk any otherwise-undefined sub call (e.g. &vadd_i32, &vext_8) into
# an assembly mnemonic: strip the package qualifier, turn the first '_'
# into '.', and append "\tmnemonic\targ,...,lastarg" to $code.  A purely
# numeric last argument is prefixed with '#' to make it an immediate.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);	# numeric literal -> immediate operand
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Emit the NEON message-schedule update for four rounds:
# X[0..3] += sigma0(X[1..4]) + sigma1(X[14..17]) + X[9..12], then store
# K[j..j+3]+X[0..3] to the stack transfer area ([$Xfer]) for the scalar
# round code to consume.  Four rounds' worth of scalar instructions,
# produced by &$body (body_00_15), are interleaved one at a time between
# the vector instructions to exploit dual issue.  NEON rotates are built
# from vshr+vsli pairs; sigma1 is done on d-registers because only two
# lanes (then the next two) are available at a time.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 4 rounds of scalar code strings
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }	# drain all but 2 scalar insns
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Emit four rounds without a schedule update (used for the final 16
# rounds of a block): byte-swap the freshly loaded input words
# (vrev32.8) and precompute K[j..j+3]+X[0..3] into the stack transfer
# area, interleaved with four rounds of scalar code from &$body.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 4 rounds of scalar code strings
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# One scalar SHA-256 round, returned as a list of perl-code strings so
# Xupdate/Xpreload can interleave them one at a time between NEON
# instructions.  Same dataflow as the integer-only BODY_00_15,
# including the one-round deferral of h+=Maj(a,b,c) and the ($t2,$t3)
# register swap.  The conditional &ldr entries fetch the next X[i]+K[i]
# word from the stack transfer area (or ctx pointer at $j==31).
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

# NEON entry point and driver loop, reached via the armcap test in the
# integer code above.  The original sp is kept in $t2 and saved at
# [sp,#76] because sp is realigned (bic #15) for 128-bit stores; ctx,
# inp and inp+len are spilled to [sp,#64/68/72].
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu	neon
.align	4
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	mov	$t2,sp
	sub	sp,sp,#16*4+16		@ alloca
	sub	$Ktbl,r3,#256+32	@ K256
	bic	sp,sp,#15		@ align for 128-bit stores

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
# 4 x Xupdate = 16 schedule-update rounds per .L_00_48 iteration; the
# loop repeats until the K256 terminator word is fetched (teq below).
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
# Final 16 rounds, overlapped with loading/byte-swapping the next block.
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
# Add working variables back into ctx; loop to next block or restore
# the saved sp and return.
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
#endif
___
}}}
# Trailer (size directive, ident string, armcap capability word), then
# post-process and emit the generated assembly.
$code.=<<___;
.size	sha256_block_data_order,.-sha256_block_data_order
.asciz	"SHA256 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
.comm	OPENSSL_armcap_P,4,4
___

# Evaluate the `...` arithmetic placeholders embedded in the assembly text.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
print $code;
# Fix: check close() -- buffered write errors to the output file would
# otherwise be silently lost.
close STDOUT or die "error closing STDOUT: $!";	# enforce flush