ARM assembly pack: get ARMv7 instruction endianness right.
crypto/sha/asm/sha256-armv4.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles, or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
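
# Note: the argument registers double as scratch registers ($t0, $t4 and
# $t1 alias r0, r1 and r2). This is safe because the integer-only prologue
# below saves $ctx, $inp and the end-of-input pointer on the stack (at
# sp+#16*4, sp+#17*4 and sp+#18*4) and the epilogue reloads them from there.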

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
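
# These are the rotation/shift amounts from FIPS 180-4:
#
#   Sigma0(x) = ROTR^2(x)  ^ ROTR^13(x) ^ ROTR^22(x)
#   Sigma1(x) = ROTR^6(x)  ^ ROTR^11(x) ^ ROTR^25(x)
#   sigma0(x) = ROTR^7(x)  ^ ROTR^18(x) ^ SHR^3(x)
#   sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ SHR^10(x)
#
# The integer code below rotates by the *differences* first, e.g. for
# Sigma1(e) it emits
#
#   eor  $t0,$e,$e,ror#(11-6)
#   eor  $t0,$t0,$e,ror#(25-6)
#   add  $h,$h,$t0,ror#6
#
# so the final rotation by Sigma1[0] comes for free as a shifted operand
# of the add.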

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        rev     $t1,$t1
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
}
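
# Each call to BODY_00_15 emits one SHA-256 round:
#
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#   d += T1;  h = T1 + Sigma0(a) + Maj(a,b,c)
#
# after which the register list @V rotates so this h becomes next round's a.
# Ch and Maj use the usual two-operation identities
#
#   Ch(e,f,g)  = ((f^g)&e)^g
#   Maj(a,b,c) = ((b^c)&(a^b))^b
#
# and the final "h+=Maj(a,b,c)" is deferred: the $t2/$t3 swap at the end of
# every round lets the next round pick it up as "add $a,$a,$t2 ... from the
# past".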

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}
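
# BODY_16_XX extends the message schedule in the 16-word circular buffer
# kept on the stack:
#
#   X[i&15] = sigma1(X[(i+14)&15]) + X[(i+9)&15]
#           + sigma0(X[(i+1)&15])  + X[(i+0)&15]
#
# i.e. the standard W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
# recurrence expressed modulo 16, and then falls through to the common round
# body above.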

$code=<<___;
#include "arm_arch.h"

.text
.code   32

.type   K256,%object
.align  5
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha256_block_data_order
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
        sub     r3,pc,#8                @ sha256_block_data_order
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
#if __ARM_ARCH__>=7
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#ARMV8_SHA256
        bne     .LARMv8
        tst     r12,#ARMV7_NEON
        bne     .LNEON
#endif
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
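
# Dlo/Dhi map a NEON quad register onto its two double-register aliases
# (qN overlays d(2N) and d(2N+1)), so for example Dlo("q8") is "d16" and
# Dhi("q8") is "d17"; this lets the 64-bit halves of the schedule be
# addressed directly.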

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
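
# AUTOLOAD turns any otherwise-undefined sub call into an emitted
# instruction: underscores become dots and a numeric last argument gets a
# '#' prefix, so for instance
#
#   &vshr_u32 ($T2,$T0,$sigma0[0]);
#
# appends "\tvshr.u32\tq10,q8,#7\n" to $code (with the register names
# assigned to $T2/$T0 above).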

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}
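
# Xupdate advances the message schedule four words at a time: @X[0..3] hold
# the current 16 schedule words in four q registers, vext forms the
# unaligned X[1..4] and X[9..12] views, and each vshr/vsli pair builds one
# of the sigma0/sigma1 rotations.  The scalar round code supplied by &$body
# is interleaved one instruction at a time via eval(shift(@insns)) so the
# integer and NEON pipelines overlap; the schedule words with K256 already
# added are stored through $Xfer for the scalar rounds to pick up.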

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}
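
# body_00_15 returns the scalar round as a list of strings which Xupdate
# and Xpreload eval one at a time between NEON instructions.  The $t1 load
# near the end fetches the next X[i]+K[i] word from the stack; at $j==15 it
# instead reads the word at [$Ktbl] (later tested against the zero
# terminator that follows K256) and at $j==31 the saved ctx pointer at
# [sp,#64].  The closing string rotates @V and swaps $t2/$t3 just like the
# integer-only code above.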

$code.=<<___;
#if __ARM_ARCH__>=7
.fpu    neon

.type   sha256_block_data_order_neon,%function
.align  4
sha256_block_data_order_neon:
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        mov     $t2,sp
        sub     sp,sp,#16*4+16          @ alloca
        sub     $Ktbl,r3,#256+32        @ K256
        bic     sp,sp,#15               @ align for 128-bit stores

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        subeq           $inp,$inp,#64           @ avoid SEGV
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
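
# The ARMv8 path keeps the whole state in two q registers (ABCD/EFGH) and
# leans on the Crypto Extensions: sha256h/sha256h2 each perform four rounds
# on the two state halves given a pre-added W+K quad, while
# sha256su0/sha256su1 extend the message schedule four words at a time.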

$code.=<<___;
#if __ARM_ARCH__>=7
.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
        vld1.32 {$ABCD,$EFGH},[$ctx]
        sub     $Ktbl,r3,#sha256_block_data_order-K256

.Loop_v8:
        vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
        vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
        vld1.32         {$W0},[$Ktbl]!
        vrev32.8        @MSG[0],@MSG[0]
        vrev32.8        @MSG[1],@MSG[1]
        vrev32.8        @MSG[2],@MSG[2]
        vrev32.8        @MSG[3],@MSG[3]
        vmov            $ABCD_SAVE,$ABCD        @ offload
        vmov            $EFGH_SAVE,$EFGH
        teq             $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        sha256su0       @MSG[0],@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0
        sha256su1       @MSG[0],@MSG[2],@MSG[3]
___
        ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vld1.32         {$W0},[$Ktbl]!
        vadd.i32        $W1,$W1,@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vld1.32         {$W1},[$Ktbl]
        vadd.i32        $W0,$W0,@MSG[2]
        sub             $Ktbl,$Ktbl,#256-16     @ rewind
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vadd.i32        $W1,$W1,@MSG[3]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
        vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
        bne             .Loop_v8

        vst1.32         {$ABCD,$EFGH},[$ctx]

        ret             @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
.comm   OPENSSL_armcap_P,4,4
___

{   my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<17)|(($2&8)<<4)
                                         |(($3&7)<<1) |(($3&8)<<2);
            # ARMv7 instructions are always encoded little-endian, hence
            # the explicit byte order below. The correct solution would be
            # the .inst directive, but older assemblers don't implement it:-(
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
}
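
# unsha256 hand-assembles the four Crypto Extension mnemonics that older
# assemblers don't know: the q-register numbers are folded into the D:Vd,
# N:Vn and M:Vm fields of the base opcodes above, and the resulting word is
# emitted as .byte, least-significant byte first, so the encoding stays
# little-endian regardless of the target's data endianness (which is what
# this commit fixes).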

foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

        s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT; # enforce flush