sha/asm/sha256-armv4.pl: adapt for use in Linux kernel context.
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte
# [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

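# A minimal reference sketch of the SHA-256 functions whose rotate/shift
# amounts are tabulated above.  The helper names ror32/*_ref are illustrative
# additions of mine and are not used by the code generator below.
sub ror32      { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub Sigma0_ref { my $x=shift; ror32($x,2)^ror32($x,13)^ror32($x,22); }  # @Sigma0
sub Sigma1_ref { my $x=shift; ror32($x,6)^ror32($x,11)^ror32($x,25); }  # @Sigma1
sub sigma0_ref { my $x=shift; ror32($x,7)^ror32($x,18)^($x>>3);      }  # @sigma0
sub sigma1_ref { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10);    }  # @sigma1
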
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        rev     $t1,$t1
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
}

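# BODY_16_XX below computes the message-schedule word for rounds 16..63,
#       X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2])  (mod 2^32),
# from the 16-entry circular window kept on the stack, then falls through to
# BODY_00_15 for the round proper.  In both bodies the first instruction adds
# $t2, the Maj(a,b,c) "from the past": each round's Maj is folded into what
# was h (now a) one round late, which shortens the critical path.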
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if __ARM_ARCH__<7
.code   32
#else
.syntax unified
# ifdef __thumb2__
.thumb
# else
.code   32
# endif
#endif

.type   K256,%object
.align  5
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-sha256_block_data_order
#endif
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
#if __ARM_ARCH__<7
        sub     r3,pc,#8                @ sha256_block_data_order
#else
        adr     r3,sha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
        tst     r12,#ARMV8_SHA256
        bne     .LARMv8
        tst     r12,#ARMV7_NEON
        bne     .LNEON
#endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
        ite     eq                      @ Thumb2 thing, sanity check in ARM
#endif
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

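# The AUTOLOAD thunk above lets NEON instructions be written as Perl calls:
# the sub name becomes the mnemonic (with "_" turned into "."), and the last
# argument becomes an immediate if it is numeric.  E.g. with $T2="q10" and
# $T0="q8" as defined above, &vshr_u32($T2,$T0,$sigma0[0]) appends
# "\tvshr.u32\tq10,q8,#7\n" to $code.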
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}

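# Xupdate() above emits the NEON schedule update for four message words and,
# via eval, interleaves it with four scalar round bodies so vector and integer
# instructions can dual-issue.  Xpreload() below plays the same trick for the
# first 16 words of the next block, which only need a byte swap and a K256 add.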
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha256_block_data_order_neon
.type   sha256_block_data_order_neon,%function
.align  4
sha256_block_data_order_neon:
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        sub     $H,sp,#16*4+16
        adr     $Ktbl,K256
        bic     $H,$H,#15               @ align for 128-bit stores
        mov     $t2,sp
        mov     sp,$H                   @ alloca
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
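# Layout of the NEON stack frame set up above: sp..sp+63 is the X[]+K256
# transfer area ($Xfer), while sp+64, sp+68, sp+72 and sp+76 hold ctx, inp,
# the end-of-input pointer and the caller's original sp respectively.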
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        it              eq
        subeq           $inp,$inp,#64           @ avoid SEGV
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        it              ne
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        ittte   ne
        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        itt     ne
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# ifdef __thumb2__
#  define INST(a,b,c,d) .byte   c,d|0xc,a,b
# else
#  define INST(a,b,c,d) .byte   a,b,c,d
# endif

.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
        vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __thumb2__
        adr     $Ktbl,.LARMv8
        sub     $Ktbl,$Ktbl,#.LARMv8-K256
# else
        adrl    $Ktbl,K256
# endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

.Loop_v8:
        vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
        vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
        vld1.32         {$W0},[$Ktbl]!
        vrev32.8        @MSG[0],@MSG[0]
        vrev32.8        @MSG[1],@MSG[1]
        vrev32.8        @MSG[2],@MSG[2]
        vrev32.8        @MSG[3],@MSG[3]
        vmov            $ABCD_SAVE,$ABCD        @ offload
        vmov            $EFGH_SAVE,$EFGH
        teq             $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        sha256su0       @MSG[0],@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0
        sha256su1       @MSG[0],@MSG[2],@MSG[3]
___
        ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
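# The loop above emits twelve 4-round groups that both advance the message
# schedule (sha256su0/sha256su1) and compress (sha256h/sha256h2); the last
# four groups below need no further schedule update, for 16*4=64 rounds
# per block.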
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vld1.32         {$W0},[$Ktbl]!
        vadd.i32        $W1,$W1,@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vld1.32         {$W1},[$Ktbl]
        vadd.i32        $W0,$W0,@MSG[2]
        sub             $Ktbl,$Ktbl,#256-16     @ rewind
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vadd.i32        $W1,$W1,@MSG[3]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
        vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
        it              ne
        bne             .Loop_v8

        vst1.32         {$ABCD,$EFGH},[$ctx]

        ret             @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

{   my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<17)|(($2&8)<<4)
                                         |(($3&7)<<1) |(($3&8)<<2);
            # ARMv7 instructions are always encoded little-endian, so emit
            # the word as explicit bytes; the proper solution is the .inst
            # directive, but older assemblers don't implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
}

foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

        s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT; # enforce flush