b0ae93633f71b00fd12116f58699d2d74d9d364c
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # SHA256 block procedure for ARMv4. May 2007.
11
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
15
16 # July 2010.
17 #
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
20
21 # February 2011.
22 #
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
25
26 # September 2013.
27 #
28 # Add NEON implementation. On Cortex A8 it was measured to process one
29 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
30 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
31 # code (meaning that latter performs sub-optimally, nothing was done
32 # about it).
33
34 # May 2014.
35 #
36 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
37
# Consume command-line arguments until one looks like a file name
# ("name.ext": a word character, then word characters or dashes, a dot and an
# extension) or the argument list runs out, then redirect STDOUT to it.
# Everything the script prints at the end therefore lands in that file.
# NOTE(review): the result of open is not checked.
38 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
39 open STDOUT,">$output";
40
# Register names used by the integer-only code.  $ctx/$inp/$len share their
# registers with the scratch names $t0/$t4/$t1: the integer code pushes
# ctx, inp and len on the stack at entry and reloads them from there, so the
# registers are free for temporaries inside the rounds.
41 $ctx="r0";      $t0="r0";
42 $inp="r1";      $t4="r1";
43 $len="r2";      $t1="r2";
44 $T1="r3";       $t3="r3";
# The eight working variables a..h.
45 $A="r4";
46 $B="r5";
47 $C="r6";
48 $D="r7";
49 $E="r8";
50 $F="r9";
51 $G="r10";
52 $H="r11";
# @V is rotated by one position after every emitted round.
53 @V=($A,$B,$C,$D,$E,$F,$G,$H);
54 $t2="r12";
# Pointer that walks through the K256 constant table.
55 $Ktbl="r14";
56
# Shift amounts of the four SHA-256 bit-mixing functions.  For Sigma0/Sigma1
# all three entries are rotations; the emitted code applies the second and
# third relative to the first and rotates by the first when accumulating.
# For sigma0/sigma1 the first two entries are used as rotations and the third
# as a logical right shift.
57 @Sigma0=( 2,13,22);
58 @Sigma1=( 6,11,25);
59 @sigma0=( 7,18, 3);
60 @sigma1=(17,19,10);
61
# Append to $code the assembly for one SHA-256 round.
#   $i       round index; it selects which optional parts are emitted and is
#            also interpolated into the "#if $i==..." lines below, which are
#            left for the C preprocessor to evaluate on the generated file
#   $a..$h   register names holding the eight working variables in their
#            current rotation
# Maj(a,b,c) is not added to $h inside the round (see the commented-out add
# at the end): it is left in $t3 and, after the $t2/$t3 swap, added at the
# start of the next round ("from the past").
62 sub BODY_00_15 {
63 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
64
# Rounds 0..15 only: finish the big-endian input word in $t1 (rev on ARMv7+,
# byte loads combined with orr otherwise; the first load was issued by the
# previous round or at .Loop) and start Sigma1(e) in $t0.  For later rounds
# BODY_16_XX has already produced $t1 and $t0 before calling here.
65 $code.=<<___ if ($i<16);
66 #if __ARM_ARCH__>=7
67         @ ldr   $t1,[$inp],#4                   @ $i
68 # if $i==15
69         str     $inp,[sp,#17*4]                 @ make room for $t4
70 # endif
71         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
72         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
73         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
74         rev     $t1,$t1
75 #else
76         @ ldrb  $t1,[$inp,#3]                   @ $i
77         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
78         ldrb    $t2,[$inp,#2]
79         ldrb    $t0,[$inp,#1]
80         orr     $t1,$t1,$t2,lsl#8
81         ldrb    $t2,[$inp],#4
82         orr     $t1,$t1,$t0,lsl#16
83 # if $i==15
84         str     $inp,[sp,#17*4]                 @ make room for $t4
85 # endif
86         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
87         orr     $t1,$t1,$t2,lsl#24
88         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
89 #endif
90 ___
# Part common to every round: fetch the next K256 constant, store X[i] in its
# slot of the 16-word stack buffer, accumulate
# h += X[i] + Sigma1(e) + K256[i] + Ch(e,f,g), then d += h and h += Sigma0(a).
# The "#if $i==31" lines compare the low byte of the constant just loaded
# with 0xf2, the low byte of the last K256 entry; the resulting flags are
# consumed after the round loop.  Rounds below 15 prefetch the next input
# word, the others preload two schedule words for the following round.
91 $code.=<<___;
92         ldr     $t2,[$Ktbl],#4                  @ *K256++
93         add     $h,$h,$t1                       @ h+=X[i]
94         str     $t1,[sp,#`$i%16`*4]
95         eor     $t1,$f,$g
96         add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
97         and     $t1,$t1,$e
98         add     $h,$h,$t2                       @ h+=K256[i]
99         eor     $t1,$t1,$g                      @ Ch(e,f,g)
100         eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
101         add     $h,$h,$t1                       @ h+=Ch(e,f,g)
102 #if $i==31
103         and     $t2,$t2,#0xff
104         cmp     $t2,#0xf2                       @ done?
105 #endif
106 #if $i<15
107 # if __ARM_ARCH__>=7
108         ldr     $t1,[$inp],#4                   @ prefetch
109 # else
110         ldrb    $t1,[$inp,#3]
111 # endif
112         eor     $t2,$a,$b                       @ a^b, b^c in next round
113 #else
114         ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
115         eor     $t2,$a,$b                       @ a^b, b^c in next round
116         ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
117 #endif
118         eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
119         and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
120         add     $d,$d,$h                        @ d+=h
121         eor     $t3,$t3,$b                      @ Maj(a,b,c)
122         add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
123         @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
124 ___
# Exchange the roles of $t2 and $t3 for the next emitted round, so that its
# "add $a,$a,$t2" picks up the Maj value computed above.
125         ($t2,$t3)=($t3,$t2);
126 }
127
# Append to $code the assembly for one round with index 16 or above: first the
# message-schedule update, then the common round body via BODY_00_15.
# Takes the same arguments as BODY_00_15.  The schedule lives in a 16-word
# circular buffer on the stack (indices are taken modulo 16).  On entry $t1
# and $t4 are expected to hold X[i+1] and X[i+14]; those loads were issued by
# the previous round (see the commented-out ldr lines).  The emitted code
# computes
#   X[i] = sigma0(X[i+1]) + sigma1(X[i+14]) + X[i] + X[i+9]
# in $t1 and starts Sigma1(e) in $t0.
128 sub BODY_16_XX {
129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
130
131 $code.=<<___;
132         @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
133         @ ldr   $t4,[sp,#`($i+14)%16`*4]
134         mov     $t0,$t1,ror#$sigma0[0]
135         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
136         mov     $t2,$t4,ror#$sigma1[0]
137         eor     $t0,$t0,$t1,ror#$sigma0[1]
138         eor     $t2,$t2,$t4,ror#$sigma1[1]
139         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
140         ldr     $t1,[sp,#`($i+0)%16`*4]
141         eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
142         ldr     $t4,[sp,#`($i+9)%16`*4]
143
144         add     $t2,$t2,$t0
145         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
146         add     $t1,$t1,$t2
147         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
148         add     $t1,$t1,$t4                     @ X[i]
149 ___
# The rest of the round is identical to rounds 0..15; since $i>=16 the
# input-loading part of BODY_00_15 is skipped.
150         &BODY_00_15(@_);
151 }
152
# Start of the generated assembly: the K256 round-constant table, a zero
# terminator word, the offset used to locate OPENSSL_armcap_P, and the entry
# point sha256_block_data_order with its integer-only prologue.  When
# __ARM_MAX_ARCH__>=7 the entry point first tests OPENSSL_armcap_P and
# branches to the ARMv8 or NEON code paths.  Text between backticks is
# evaluated as Perl by the post-processing loop at the end of this script.
153 $code=<<___;
154 #include "arm_arch.h"
155
156 .text
157 .code   32
158
159 .type   K256,%object
160 .align  5
161 K256:
162 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
163 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
164 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
167 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
170 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
171 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
172 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
173 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
174 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
175 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
176 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
177 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
178 .size   K256,.-K256
179 .word   0                               @ terminator
180 #if __ARM_MAX_ARCH__>=7
181 .LOPENSSL_armcap:
182 .word   OPENSSL_armcap_P-sha256_block_data_order
183 #endif
184 .align  5
185
186 .global sha256_block_data_order
187 .type   sha256_block_data_order,%function
188 sha256_block_data_order:
189         sub     r3,pc,#8                @ sha256_block_data_order
190         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
191 #if __ARM_MAX_ARCH__>=7
192         ldr     r12,.LOPENSSL_armcap
193         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
194         tst     r12,#ARMV8_SHA256
195         bne     .LARMv8
196         tst     r12,#ARMV7_NEON
197         bne     .LNEON
198 #endif
199         stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
200         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
201         sub     $Ktbl,r3,#256+32        @ K256
202         sub     sp,sp,#16*4             @ alloca(X[16])
203 .Loop:
204 # if __ARM_ARCH__>=7
205         ldr     $t1,[$inp],#4
206 # else
207         ldrb    $t1,[$inp,#3]
208 # endif
209         eor     $t3,$B,$C               @ magic
210         eor     $t2,$t2,$t2
211 ___
# Rounds 0..15 are emitted once per block; rounds 16..31 form the body of the
# .Lrounds_16_xx loop.  After each emitted round @V is rotated by one
# position, so the register that held h is passed as a to the next round.
212 for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
213 $code.=".Lrounds_16_xx:\n";
214 for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Loop control and epilogue.  The flags tested by ldreq/bne come from the cmp
# emitted in round 31.  Once the last K256 entry has been consumed the
# working variables are added to the state at ctx and stored back, and the
# outer .Loop repeats until inp reaches inp+len.  The final return sequence
# depends on __ARM_ARCH__.
215 $code.=<<___;
216         ldreq   $t3,[sp,#16*4]          @ pull ctx
217         bne     .Lrounds_16_xx
218
219         add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
220         ldr     $t0,[$t3,#0]
221         ldr     $t1,[$t3,#4]
222         ldr     $t2,[$t3,#8]
223         add     $A,$A,$t0
224         ldr     $t0,[$t3,#12]
225         add     $B,$B,$t1
226         ldr     $t1,[$t3,#16]
227         add     $C,$C,$t2
228         ldr     $t2,[$t3,#20]
229         add     $D,$D,$t0
230         ldr     $t0,[$t3,#24]
231         add     $E,$E,$t1
232         ldr     $t1,[$t3,#28]
233         add     $F,$F,$t2
234         ldr     $inp,[sp,#17*4]         @ pull inp
235         ldr     $t2,[sp,#18*4]          @ pull inp+len
236         add     $G,$G,$t0
237         add     $H,$H,$t1
238         stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
239         cmp     $inp,$t2
240         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
241         bne     .Loop
242
243         add     sp,sp,#`16+3`*4 @ destroy frame
244 #if __ARM_ARCH__>=5
245         ldmia   sp!,{r4-r11,pc}
246 #else
247         ldmia   sp!,{r4-r11,lr}
248         tst     lr,#1
249         moveq   pc,lr                   @ be binary compatible with V4, yet
250         bx      lr                      @ interoperable with Thumb ISA:-)
251 #endif
252 .size   sha256_block_data_order,.-sha256_block_data_order
253 ___
254 ######################################################################
255 # NEON stuff
256 #
257 {{{
# NEON register allocation, lexical to the enclosing {{{ }}} block.
#   @X        q0..q3, the message-schedule words
#   $T0..$T3  q8..q11, quad-word temporaries (this lexical $T1 shadows the
#             global $T1 inside the block)
#   $T4,$T5   d24,d25, double-word temporaries
#   $Xfer     same register as $t4; pointer into the on-stack area where the
#             X[i]+K256[i] sums are stored for the scalar rounds to load
#   $j        running round counter maintained by the body_00_15 snippets
258 my @X=map("q$_",(0..3));
259 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
260 my $Xfer=$t4;
261 my $j=0;
262
# Map a quad register name to the name of its low (Dlo) or high (Dhi)
# double-word half: "qN" becomes "d(2N)" or "d(2N+1)" respectively.  An
# argument that does not contain a q register name yields the empty string.
263 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
264 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
265
# Catch-all for calls to undefined subs such as &vadd_i32(...) or &eor(...):
# the sub name becomes the instruction mnemonic (package qualifier removed,
# first "_" turned into ".", so vadd_i32 is emitted as vadd.i32), and the
# arguments become its comma-separated operands.  A last argument that is a
# plain number is emitted as an immediate with a leading "#".  The resulting
# line is appended to $code.
266 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
267 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
268   my $arg = pop;
269     $arg = "#$arg" if ($arg*1 eq $arg);
270     $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
271 }
272
# Emit the NEON instructions that compute the next four message-schedule
# words into X[0], add the matching K256 constants and store the sums through
# $Xfer, interleaved with the scalar instructions of four rounds.
#   $body   code reference returning a list of Perl snippets (strings); it is
#           called four times, once per round, and every eval(shift(@insns))
#           below runs the next snippet, which emits scalar code through
#           AUTOLOAD.
# The lexicals $a..$h declared here are the variables the snippets assign
# from @V.  The interleaving order determines the instruction schedule of the
# generated code, so statements must not be reordered.  On return @X has been
# rotated by one position.
273 sub Xupdate()
274 { use integer;
275   my $body = shift;
276   my @insns = (&$body,&$body,&$body,&$body);
277   my ($a,$b,$c,$d,$e,$f,$g,$h);
278
279         &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
280          eval(shift(@insns));
281          eval(shift(@insns));
282          eval(shift(@insns));
283         &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
284          eval(shift(@insns));
285          eval(shift(@insns));
286          eval(shift(@insns));
287         &vshr_u32       ($T2,$T0,$sigma0[0]);
288          eval(shift(@insns));
289          eval(shift(@insns));
290         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
291          eval(shift(@insns));
292          eval(shift(@insns));
293         &vshr_u32       ($T1,$T0,$sigma0[2]);
294          eval(shift(@insns));
295          eval(shift(@insns));
296         &vsli_32        ($T2,$T0,32-$sigma0[0]);
297          eval(shift(@insns));
298          eval(shift(@insns));
299         &vshr_u32       ($T3,$T0,$sigma0[1]);
300          eval(shift(@insns));
301          eval(shift(@insns));
302         &veor           ($T1,$T1,$T2);
303          eval(shift(@insns));
304          eval(shift(@insns));
305         &vsli_32        ($T3,$T0,32-$sigma0[1]);
306          eval(shift(@insns));
307          eval(shift(@insns));
308           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
309          eval(shift(@insns));
310          eval(shift(@insns));
311         &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
312          eval(shift(@insns));
313          eval(shift(@insns));
314           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
315          eval(shift(@insns));
316          eval(shift(@insns));
317           &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
318          eval(shift(@insns));
319          eval(shift(@insns));
320         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
321          eval(shift(@insns));
322          eval(shift(@insns));
323           &veor         ($T5,$T5,$T4);
324          eval(shift(@insns));
325          eval(shift(@insns));
326           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
327          eval(shift(@insns));
328          eval(shift(@insns));
329           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
330          eval(shift(@insns));
331          eval(shift(@insns));
332           &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
333          eval(shift(@insns));
334          eval(shift(@insns));
335         &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
336          eval(shift(@insns));
337          eval(shift(@insns));
338           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
339          eval(shift(@insns));
340          eval(shift(@insns));
341           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
342          eval(shift(@insns));
343          eval(shift(@insns));
344           &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
345          eval(shift(@insns));
346          eval(shift(@insns));
347           &veor         ($T5,$T5,$T4);
348          eval(shift(@insns));
349          eval(shift(@insns));
350           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
351          eval(shift(@insns));
352          eval(shift(@insns));
353         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
354          eval(shift(@insns));
355          eval(shift(@insns));
356           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
357          eval(shift(@insns));
358          eval(shift(@insns));
359           &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
360          eval(shift(@insns));
361          eval(shift(@insns));
362         &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
363          eval(shift(@insns));
364          eval(shift(@insns));
365         &vadd_i32       ($T0,$T0,@X[0]);
# Drain the remaining scalar snippets except the last two, which are emitted
# after the store below.
366          while($#insns>=2) { eval(shift(@insns)); }
367         &vst1_32        ("{$T0}","[$Xfer,:128]!");
368          eval(shift(@insns));
369          eval(shift(@insns));
370
371         push(@X,shift(@X));             # "rotate" X[]
372 }
373
# Counterpart of Xupdate for the rounds that follow the .L_00_48 loop, where
# no further schedule words are needed.  Emits the scalar instructions of
# four rounds interleaved with NEON code that byte-swaps X[0] (holding words
# of the input block loaded just before these calls), adds the matching K256
# constants and stores the sums through $Xfer.
#   $body   code reference returning a list of Perl snippets, as in Xupdate
# On return @X has been rotated by one position.
374 sub Xpreload()
375 { use integer;
376   my $body = shift;
377   my @insns = (&$body,&$body,&$body,&$body);
378   my ($a,$b,$c,$d,$e,$f,$g,$h);
379
380          eval(shift(@insns));
381          eval(shift(@insns));
382          eval(shift(@insns));
383          eval(shift(@insns));
384         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
385          eval(shift(@insns));
386          eval(shift(@insns));
387          eval(shift(@insns));
388          eval(shift(@insns));
389         &vrev32_8       (@X[0],@X[0]);
390          eval(shift(@insns));
391          eval(shift(@insns));
392          eval(shift(@insns));
393          eval(shift(@insns));
394         &vadd_i32       ($T0,$T0,@X[0]);
395          foreach (@insns) { eval; }     # remaining instructions
396         &vst1_32        ("{$T0}","[$Xfer,:128]!");
397
398         push(@X,shift(@X));             # "rotate" X[]
399 }
400
# Return the scalar part of one round as a list of Perl snippets (strings).
# Xupdate and Xpreload eval them one at a time, in order, between NEON
# instructions; each snippet emits its instruction(s) through AUTOLOAD.
# Elements are separated by commas; strings joined with "." form a single
# snippet that emits several things at once.  The first snippet binds $a..$h
# to the current rotation of @V.  The conditional ldr snippets load the next
# X[i]+K256[i] sum from the stack, the word at [$Ktbl] when $j is 15, or the
# word at [sp,#64] when $j is 31.  The last snippet advances $j, rotates @V
# and swaps $t2 and $t3 for the next round.
401 sub body_00_15 () {
402         (
403         '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
404         '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
405         '&eor   ($t1,$f,$g)',
406         '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
407         '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
408         '&and   ($t1,$t1,$e)',
409         '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
410         '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
411         '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
412         '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
413         '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
414         '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
415         '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
416         '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
417         '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
418         '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
419         '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
420         '&add   ($d,$d,$h)',                    # d+=h
421         '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
422         '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
423         '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
424         )
425 }
426
# NEON code path, assembled only when __ARM_MAX_ARCH__>=7.  Prologue: save
# registers, carve a 16-byte-aligned stack frame, load and byte-swap the
# first input block, add the first sixteen K256 constants and store the sums
# at the bottom of the frame.  ctx, inp, the end-of-input pointer and the
# original sp are kept at sp+64, sp+68, sp+72 and sp+76.
427 $code.=<<___;
428 #if __ARM_MAX_ARCH__>=7
429 .arch   armv7-a
430 .fpu    neon
431
432 .type   sha256_block_data_order_neon,%function
433 .align  4
434 sha256_block_data_order_neon:
435 .LNEON:
436         stmdb   sp!,{r4-r12,lr}
437
438         mov     $t2,sp
439         sub     sp,sp,#16*4+16          @ alloca
440         sub     $Ktbl,r3,#256+32        @ K256
441         bic     sp,sp,#15               @ align for 128-bit stores
442
443         vld1.8          {@X[0]},[$inp]!
444         vld1.8          {@X[1]},[$inp]!
445         vld1.8          {@X[2]},[$inp]!
446         vld1.8          {@X[3]},[$inp]!
447         vld1.32         {$T0},[$Ktbl,:128]!
448         vld1.32         {$T1},[$Ktbl,:128]!
449         vld1.32         {$T2},[$Ktbl,:128]!
450         vld1.32         {$T3},[$Ktbl,:128]!
451         vrev32.8        @X[0],@X[0]             @ yes, even on
452         str             $ctx,[sp,#64]
453         vrev32.8        @X[1],@X[1]             @ big-endian
454         str             $inp,[sp,#68]
455         mov             $Xfer,sp
456         vrev32.8        @X[2],@X[2]
457         str             $len,[sp,#72]
458         vrev32.8        @X[3],@X[3]
459         str             $t2,[sp,#76]            @ save original sp
460         vadd.i32        $T0,$T0,@X[0]
461         vadd.i32        $T1,$T1,@X[1]
462         vst1.32         {$T0},[$Xfer,:128]!
463         vadd.i32        $T2,$T2,@X[2]
464         vst1.32         {$T1},[$Xfer,:128]!
465         vadd.i32        $T3,$T3,@X[3]
466         vst1.32         {$T2},[$Xfer,:128]!
467         vst1.32         {$T3},[$Xfer,:128]!
468
469         ldmia           $ctx,{$A-$H}
470         sub             $Xfer,$Xfer,#64
471         ldr             $t1,[sp,#0]
472         eor             $t2,$t2,$t2
473         eor             $t3,$B,$C
474         b               .L_00_48
475
476 .align  4
477 .L_00_48:
478 ___
# Body of the .L_00_48 loop: four Xupdate calls emit sixteen rounds together
# with the schedule update for the following sixteen.
479         &Xupdate(\&body_00_15);
480         &Xupdate(\&body_00_15);
481         &Xupdate(\&body_00_15);
482         &Xupdate(\&body_00_15);
# Loop control: $t1 holds the word the round snippets loaded from [$Ktbl];
# the zero terminator after the K256 table ends the loop.  The next input
# block is then loaded; when inp already equals the end pointer the previous
# block is re-read instead so that the loads stay inside the input.
483 $code.=<<___;
484         teq     $t1,#0                          @ check for K256 terminator
485         ldr     $t1,[sp,#0]
486         sub     $Xfer,$Xfer,#64
487         bne     .L_00_48
488
489         ldr             $inp,[sp,#68]
490         ldr             $t0,[sp,#72]
491         sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
492         teq             $inp,$t0
493         subeq           $inp,$inp,#64           @ avoid SEGV
494         vld1.8          {@X[0]},[$inp]!         @ load next input block
495         vld1.8          {@X[1]},[$inp]!
496         vld1.8          {@X[2]},[$inp]!
497         vld1.8          {@X[3]},[$inp]!
498         strne           $inp,[sp,#68]
499         mov             $Xfer,sp
500 ___
# Sixteen further rounds, interleaved with preparing the X+K sums of the
# block that was just loaded.
501         &Xpreload(\&body_00_15);
502         &Xpreload(\&body_00_15);
503         &Xpreload(\&body_00_15);
504         &Xpreload(\&body_00_15);
# Epilogue: $t1 holds the word the round snippets loaded from [sp,#64], where
# ctx was stored.  The working variables are added to the state and written
# back.  The conditional instructions that follow rely on the flags of the
# earlier teq of inp against the end pointer: either start over at .L_00_48
# for the next block or restore the original sp and return.
505 $code.=<<___;
506         ldr     $t0,[$t1,#0]
507         add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
508         ldr     $t2,[$t1,#4]
509         ldr     $t3,[$t1,#8]
510         ldr     $t4,[$t1,#12]
511         add     $A,$A,$t0                       @ accumulate
512         ldr     $t0,[$t1,#16]
513         add     $B,$B,$t2
514         ldr     $t2,[$t1,#20]
515         add     $C,$C,$t3
516         ldr     $t3,[$t1,#24]
517         add     $D,$D,$t4
518         ldr     $t4,[$t1,#28]
519         add     $E,$E,$t0
520         str     $A,[$t1],#4
521         add     $F,$F,$t2
522         str     $B,[$t1],#4
523         add     $G,$G,$t3
524         str     $C,[$t1],#4
525         add     $H,$H,$t4
526         str     $D,[$t1],#4
527         stmia   $t1,{$E-$H}
528
529         movne   $Xfer,sp
530         ldrne   $t1,[sp,#0]
531         eorne   $t2,$t2,$t2
532         ldreq   sp,[sp,#76]                     @ restore original sp
533         eorne   $t3,$B,$C
534         bne     .L_00_48
535
536         ldmia   sp!,{r4-r12,pc}
537 .size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
538 #endif
539 ___
540 }}}
541 ######################################################################
542 # ARMv8 stuff
543 #
544 {{{
# Register allocation for the ARMv8 code path, lexical to the enclosing
# {{{ }}} block: the hash state in q0/q1, a scratch copy of ABCD in q2, the
# message block in q8..q11, two alternating X+K registers in q12/q13 and the
# saved state in q14/q15.  $Ktbl is redefined here as r3.
545 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
546 my @MSG=map("q$_",(8..11));
547 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
548 my $Ktbl="r3";
549
# Prologue and start of the per-block loop: load the state, point $Ktbl at
# K256 (r3 holds the address of sha256_block_data_order on entry), load and
# byte-swap one input block and save the state.  The teq of inp against the
# end pointer sets the flags tested by the bne at the bottom of the loop.
550 $code.=<<___;
551 #if __ARM_MAX_ARCH__>=7
552 .type   sha256_block_data_order_armv8,%function
553 .align  5
554 sha256_block_data_order_armv8:
555 .LARMv8:
556         vld1.32 {$ABCD,$EFGH},[$ctx]
557         sub     $Ktbl,r3,#sha256_block_data_order-K256
558
559 .Loop_v8:
560         vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
561         vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
562         vld1.32         {$W0},[$Ktbl]!
563         vrev32.8        @MSG[0],@MSG[0]
564         vrev32.8        @MSG[1],@MSG[1]
565         vrev32.8        @MSG[2],@MSG[2]
566         vrev32.8        @MSG[3],@MSG[3]
567         vmov            $ABCD_SAVE,$ABCD        @ offload
568         vmov            $EFGH_SAVE,$EFGH
569         teq             $inp,$len
570 ___
# Twelve groups that each consume one quad-word of K256 constants and also
# update the message schedule with sha256su0/sha256su1.  After every group
# the two X+K registers are exchanged and @MSG is rotated.
571 for($i=0;$i<12;$i++) {
572 $code.=<<___;
573         vld1.32         {$W1},[$Ktbl]!
574         vadd.i32        $W0,$W0,@MSG[0]
575         sha256su0       @MSG[0],@MSG[1]
576         vmov            $abcd,$ABCD
577         sha256h         $ABCD,$EFGH,$W0
578         sha256h2        $EFGH,$abcd,$W0
579         sha256su1       @MSG[0],@MSG[2],@MSG[3]
580 ___
581         ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
582 }
# Four final groups without schedule update, then add the saved state, loop
# while input remains, store the state and return.  The sha256* mnemonics and
# "ret" are rewritten by the post-processing loop at the end of this script.
583 $code.=<<___;
584         vld1.32         {$W1},[$Ktbl]!
585         vadd.i32        $W0,$W0,@MSG[0]
586         vmov            $abcd,$ABCD
587         sha256h         $ABCD,$EFGH,$W0
588         sha256h2        $EFGH,$abcd,$W0
589
590         vld1.32         {$W0},[$Ktbl]!
591         vadd.i32        $W1,$W1,@MSG[1]
592         vmov            $abcd,$ABCD
593         sha256h         $ABCD,$EFGH,$W1
594         sha256h2        $EFGH,$abcd,$W1
595
596         vld1.32         {$W1},[$Ktbl]
597         vadd.i32        $W0,$W0,@MSG[2]
598         sub             $Ktbl,$Ktbl,#256-16     @ rewind
599         vmov            $abcd,$ABCD
600         sha256h         $ABCD,$EFGH,$W0
601         sha256h2        $EFGH,$abcd,$W0
602
603         vadd.i32        $W1,$W1,@MSG[3]
604         vmov            $abcd,$ABCD
605         sha256h         $ABCD,$EFGH,$W1
606         sha256h2        $EFGH,$abcd,$W1
607
608         vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
609         vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
610         bne             .Loop_v8
611
612         vst1.32         {$ABCD,$EFGH},[$ctx]
613
614         ret             @ bx lr
615 .size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
616 #endif
617 ___
618 }}}
# Trailer of the generated file: an identification string and, when the
# maximum targeted architecture is ARMv7 or newer, a common-symbol definition
# of OPENSSL_armcap_P.  The guard must test __ARM_MAX_ARCH__, the same macro
# that guards every other reference to OPENSSL_armcap_P in this file (the
# .LOPENSSL_armcap offset word and the run-time capability dispatch); an
# undefined macro name in "#if" evaluates to 0 and would silently drop the
# .comm line.
619 $code.=<<___;
620 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
621 .align  2
622 #if __ARM_MAX_ARCH__>=7
623 .comm   OPENSSL_armcap_P,4,4
624 #endif
625 ___
626
# Hand encoding of the SHA-256 instructions so that the output can be built
# with assemblers that do not know the mnemonics.  %opcode holds the base
# instruction word for each mnemonic.
627 {   my  %opcode = (
628         "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
629         "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
630
    # Translate one sha256* instruction into a ".byte" directive.
    #   $mnemonic   key of %opcode
    #   $arg        operand string with two or three q registers, e.g.
    #               "q0,q1,q12" or "q8,q9"
    # The register numbers are merged into the base word ($1 = first
    # register, $2 = optional middle register, $3 = last register) and the
    # word is returned as four bytes, least significant first, with the
    # original instruction in a trailing comment.  If $arg does not match the
    # pattern no directive is produced.
631     sub unsha256 {
632         my ($mnemonic,$arg)=@_;
633
634         if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
635             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
636                                          |(($2&7)<<17)|(($2&8)<<4)
637                                          |(($3&7)<<1) |(($3&8)<<2);
638             # since ARMv7 instructions are always encoded little-endian.
639             # correct solution is to use .inst directive, but older
640             # assemblers don't implement it:-(
641             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
642                         $word&0xff,($word>>8)&0xff,
643                         ($word>>16)&0xff,($word>>24)&0xff,
644                         $mnemonic,$arg;
645         }
646     }
647 }
648
# Post-process the accumulated assembly line by line and print it to STDOUT
# (redirected to the output file at the top of the script).
649 foreach (split($/,$code)) {
650
# Replace every `expr` with the result of evaluating expr as Perl.
651         s/\`([^\`]*)\`/eval $1/geo;
652
# Replace sha256* instructions on q registers with their .byte encoding.
653         s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
654
# Turn "ret" into "bx lr"; on lines without "ret", turn a literal "bx lr"
# into the equivalent instruction word.
655         s/\bret\b/bx    lr/go           or
656         s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
657
658         print $_,"\n";
659 }
660
661 close STDOUT; # enforce flush