sha/asm/sha256-armv4.pl: fix compile issue in kernel
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 #
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
11
12 # SHA256 block procedure for ARMv4. May 2007.
13
14 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
15 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16 # byte [on single-issue Xscale PXA250 core].
17
18 # July 2010.
19 #
20 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
21 # Cortex A8 core and ~20 cycles per processed byte.
22
23 # February 2011.
24 #
25 # Profiler-assisted and platform-specific optimization resulted in 16%
26 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
27
28 # September 2013.
29 #
30 # Add NEON implementation. On Cortex A8 it was measured to process one
31 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33 # code (meaning that latter performs sub-optimally, nothing was done
34 # about it).
35
36 # May 2014.
37 #
38 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
39
# Pick the output file name: the first command-line argument that looks like
# "name.ext".  Arguments in front of it that do not match are consumed and
# ignored.  $output ends up false when no such argument is present.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}

# Redirect STDOUT to that file.  Without a file name the generated code keeps
# going to the existing STDOUT; a file that cannot be created is a hard error
# instead of being silently ignored (which would send the assembly to the
# wrong place and leave the build with a missing or stale output file).
if ($output) {
    open(STDOUT,'>',$output) or die "can't open $output: $!";
}
42
# Symbolic names for the ARM core registers used by the integer code path.
# r0-r3 are each known under two names: the argument/scratch name used by
# the prologue and a temporary name used inside the round bodies.
($ctx,$inp,$len,$T1) = map("r$_",(0..3));
($t0,$t4,$t1,$t3)    = ($ctx,$inp,$len,$T1);    # same registers, temporary roles

# Eight working variables a..h; @V is rotated by the round generators.
($A,$B,$C,$D,$E,$F,$G,$H) = map("r$_",(4..11));
@V    = ($A,$B,$C,$D,$E,$F,$G,$H);
$t2   = "r12";
$Ktbl = "r14";

# Rotate/shift amounts used when emitting the Sigma and sigma functions.
@Sigma0 = ( 2,13,22);
@Sigma1 = ( 6,11,25);
@sigma0 = ( 7,18, 3);
@sigma1 = (17,19,10);
63
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the assembly for one SHA-256 round to $code.  $i is the round
# index, $a..$h are the register names currently playing the roles of the
# eight working variables.
#
# For $i<16 the first heredoc is emitted as well: it assembles the input
# word into $t1 (a single word load plus "rev" on ARMv7+, four byte loads
# otherwise) and computes Sigma1(e) into $t0.  When called for $i>=16 the
# caller (BODY_16_XX) has already emitted code leaving X[i] in $t1 and
# Sigma1(e) in $t0.
#
# The "#if $i==..." lines are not Perl: $i is interpolated and the result is
# left in the output as preprocessor conditionals.
#
# Maj(a,b,c) is not added to h in this round ("from the past" add at the
# top of the next round does it), which is why the Perl-level names $t2 and
# $t3 are swapped at the end of every call.
64 sub BODY_00_15 {
65 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
66
# Input load / byte swap and Sigma1(e); only for the first 16 rounds.
67 $code.=<<___ if ($i<16);
68 #if __ARM_ARCH__>=7
69         @ ldr   $t1,[$inp],#4                   @ $i
70 # if $i==15
71         str     $inp,[sp,#17*4]                 @ make room for $t4
72 # endif
73         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
74         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
75         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
76 # ifndef __ARMEB__
77         rev     $t1,$t1
78 # endif
79 #else
80         @ ldrb  $t1,[$inp,#3]                   @ $i
81         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
82         ldrb    $t2,[$inp,#2]
83         ldrb    $t0,[$inp,#1]
84         orr     $t1,$t1,$t2,lsl#8
85         ldrb    $t2,[$inp],#4
86         orr     $t1,$t1,$t0,lsl#16
87 # if $i==15
88         str     $inp,[sp,#17*4]                 @ make room for $t4
89 # endif
90         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
91         orr     $t1,$t1,$t2,lsl#24
92         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
93 #endif
94 ___
# Round proper: store X[i] in the 16-word stack buffer, accumulate into h,
# and prefetch what the next round needs (next input word for $i<15, two
# schedule words from the stack otherwise).  For $i==31 the low byte of the
# K256 word just loaded is compared with 0xf2 to detect the last constant.
95 $code.=<<___;
96         ldr     $t2,[$Ktbl],#4                  @ *K256++
97         add     $h,$h,$t1                       @ h+=X[i]
98         str     $t1,[sp,#`$i%16`*4]
99         eor     $t1,$f,$g
100         add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
101         and     $t1,$t1,$e
102         add     $h,$h,$t2                       @ h+=K256[i]
103         eor     $t1,$t1,$g                      @ Ch(e,f,g)
104         eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
105         add     $h,$h,$t1                       @ h+=Ch(e,f,g)
106 #if $i==31
107         and     $t2,$t2,#0xff
108         cmp     $t2,#0xf2                       @ done?
109 #endif
110 #if $i<15
111 # if __ARM_ARCH__>=7
112         ldr     $t1,[$inp],#4                   @ prefetch
113 # else
114         ldrb    $t1,[$inp,#3]
115 # endif
116         eor     $t2,$a,$b                       @ a^b, b^c in next round
117 #else
118         ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
119         eor     $t2,$a,$b                       @ a^b, b^c in next round
120         ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
121 #endif
122         eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
123         and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
124         add     $d,$d,$h                        @ d+=h
125         eor     $t3,$t3,$b                      @ Maj(a,b,c)
126         add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
127         @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
128 ___
# Exchange the two temporaries' names for the next round (see header).
129         ($t2,$t3)=($t3,$t2);
130 }
131
# BODY_16_XX($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Round generator for rounds that need the message schedule.  Takes the same
# arguments as BODY_00_15.  It first emits code that combines
# sigma0(X[i+1]), sigma1(X[i+14]), X[i] and X[i+9] (all indices modulo 16
# into the stack buffer) into the new X[i] in $t1, and computes Sigma1(e)
# into $t0, then hands over to BODY_00_15 for the round itself.
#
# $t1 and $t4 are expected to hold X[i+1] and X[i+14] on entry; the loads
# are shown commented out below because the previous round already issued
# them ("from future BODY_16_xx" in BODY_00_15).
132 sub BODY_16_XX {
133 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
134
135 $code.=<<___;
136         @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
137         @ ldr   $t4,[sp,#`($i+14)%16`*4]
138         mov     $t0,$t1,ror#$sigma0[0]
139         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
140         mov     $t2,$t4,ror#$sigma1[0]
141         eor     $t0,$t0,$t1,ror#$sigma0[1]
142         eor     $t2,$t2,$t4,ror#$sigma1[1]
143         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
144         ldr     $t1,[sp,#`($i+0)%16`*4]
145         eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
146         ldr     $t4,[sp,#`($i+9)%16`*4]
147
148         add     $t2,$t2,$t0
149         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
150         add     $t1,$t1,$t2
151         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
152         add     $t1,$t1,$t4                     @ X[i]
153 ___
# Same argument list is forwarded; with $i>=16 BODY_00_15 skips its
# input-load heredoc and emits only the round proper.
154         &BODY_00_15(@_);
155 }
156
# Start of the generated assembly: preprocessor glue, the K256 constant
# table (followed by a zero word used as terminator) and the prologue of
# sha256_block_data_order up to the first instructions of .Loop.
# Note that this statement assigns (=) rather than appends to $code.
157 $code=<<___;
158 #ifndef __KERNEL__
159 # include "arm_arch.h"
160 #else
161 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
162 # define __ARM_MAX_ARCH__ 7
163 #endif
164
165 .text
166 #if __ARM_ARCH__<7
167 .code   32
168 #else
169 .syntax unified
170 # ifdef __thumb2__
171 #  define adrl adr
172 .thumb
173 # else
174 .code   32
175 # endif
176 #endif
177
178 .type   K256,%object
179 .align  5
180 K256:
181 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
182 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
183 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
184 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
185 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
186 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
187 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
188 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
189 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
190 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
191 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
192 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
193 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
194 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
195 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
196 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
197 .size   K256,.-K256
198 .word   0                               @ terminator
199 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
200 .LOPENSSL_armcap:
201 .word   OPENSSL_armcap_P-sha256_block_data_order
202 #endif
203 .align  5
204
205 .global sha256_block_data_order
206 .type   sha256_block_data_order,%function
207 sha256_block_data_order:
208 #if __ARM_ARCH__<7
209         sub     r3,pc,#8                @ sha256_block_data_order
210 #else
211         adr     r3,sha256_block_data_order
212 #endif
213 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
214         ldr     r12,.LOPENSSL_armcap
215         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
216         tst     r12,#ARMV8_SHA256
217         bne     .LARMv8
218         tst     r12,#ARMV7_NEON
219         bne     .LNEON
220 #endif
221         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
222         stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
223         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
224         sub     $Ktbl,r3,#256+32        @ K256
225         sub     sp,sp,#16*4             @ alloca(X[16])
226 .Loop:
227 # if __ARM_ARCH__>=7
228         ldr     $t1,[$inp],#4
229 # else
230         ldrb    $t1,[$inp,#3]
231 # endif
232         eor     $t3,$B,$C               @ magic
233         eor     $t2,$t2,$t2
234 ___
# Rounds 0..15, fully unrolled.  After each round @V is rotated right by
# one element, so every register name takes the next working-variable role
# in the following round.
235 for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
# Rounds with message schedule: 16 of them are emitted once under the label
# .Lrounds_16_xx; the generated code branches back to that label until the
# "cmp $t2,#0xf2" emitted by BODY_00_15 for $i==31 reports equality.
236 $code.=".Lrounds_16_xx:\n";
237 for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
# Epilogue: add the working variables to the context words, store them
# back, loop over remaining input blocks, then tear down the frame and
# return.
238 $code.=<<___;
239 #if __ARM_ARCH__>=7
240         ite     eq                      @ Thumb2 thing, sanity check in ARM
241 #endif
242         ldreq   $t3,[sp,#16*4]          @ pull ctx
243         bne     .Lrounds_16_xx
244
245         add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
246         ldr     $t0,[$t3,#0]
247         ldr     $t1,[$t3,#4]
248         ldr     $t2,[$t3,#8]
249         add     $A,$A,$t0
250         ldr     $t0,[$t3,#12]
251         add     $B,$B,$t1
252         ldr     $t1,[$t3,#16]
253         add     $C,$C,$t2
254         ldr     $t2,[$t3,#20]
255         add     $D,$D,$t0
256         ldr     $t0,[$t3,#24]
257         add     $E,$E,$t1
258         ldr     $t1,[$t3,#28]
259         add     $F,$F,$t2
260         ldr     $inp,[sp,#17*4]         @ pull inp
261         ldr     $t2,[sp,#18*4]          @ pull inp+len
262         add     $G,$G,$t0
263         add     $H,$H,$t1
264         stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
265         cmp     $inp,$t2
266         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
267         bne     .Loop
268
269         add     sp,sp,#`16+3`*4 @ destroy frame
270 #if __ARM_ARCH__>=5
271         ldmia   sp!,{r4-r11,pc}
272 #else
273         ldmia   sp!,{r4-r11,lr}
274         tst     lr,#1
275         moveq   pc,lr                   @ be binary compatible with V4, yet
276         bx      lr                      @ interoperable with Thumb ISA:-)
277 #endif
278 .size   sha256_block_data_order,.-sha256_block_data_order
279 ___
280 ######################################################################
281 # NEON stuff
282 #
283 {{{
# Names local to the NEON code path.
# @X: q0..q3, the four message-schedule vectors; Xupdate/Xpreload rotate
# this array after every call.
284 my @X=map("q$_",(0..3));
# Scratch vector registers.  Note that $T1 here shadows the integer-path
# global of the same name for the rest of this scope.
285 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
# Pointer used for the vst1.32 stores of X[]+K[] words; same register as $t4.
286 my $Xfer=$t4;
# Round counter advanced by the snippets returned from body_00_15.
287 my $j=0;
288
# Dlo("qN") returns "d(2N)", the even-numbered D register that aliases the
# lower doubleword of NEON register qN (e.g. "q12" -> "d24").  Returns ""
# when the argument contains no q-register name.
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was wrong and only went unnoticed because every
# caller uses the prototype-bypassing &Dlo(...) form.
sub Dlo {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2) : "";
}
# Dhi("qN") returns "d(2N+1)", the odd-numbered D register that aliases the
# upper doubleword of NEON register qN (e.g. "q12" -> "d25").  Returns ""
# when the argument contains no q-register name.
#
# Declared without a prototype: the sub takes one argument, so the former
# empty prototype "()" was wrong and only went unnoticed because every
# caller uses the prototype-bypassing &Dhi(...) form.
sub Dhi {
    my ($qreg) = @_;
    return $qreg =~ m|q([1]?[0-9])| ? "d".($1*2+1) : "";
}
291
# Catch-all for calls to undefined subs: a call such as
# &vext_8("q8","q0","q1",4) appends the line "\tvext.8\tq8,q0,q1,#4\n" to
# $code.  The sub name supplies the mnemonic (first "_" becomes "."), the
# arguments supply the operands, and a purely numeric last operand is
# turned into an immediate by prefixing "#".
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{
    (my $mnemonic = $AUTOLOAD) =~ s/.*:://;     # drop the package qualifier
    $mnemonic =~ s/_/\./;                       # e.g. vadd_i32 -> vadd.i32

    my $last = pop;                             # final operand, removed from @_
    if ($last*1 eq $last) {                     # numeric => immediate operand
        $last = "#$last";
    }

    my @operands = (@_,$last);
    $code .= "\t$mnemonic\t".join(',',@operands)."\n";
}
298
# Xupdate(\&body)
#
# Emits the NEON code that computes the next four message-schedule words
# into @X[0], adds the corresponding K256 words (loaded through $Ktbl) and
# stores the sums through $Xfer, interleaved with scalar round code.
#
# $body is a code reference returning a list of strings of Perl code (see
# body_00_15); it is called four times and the resulting snippets are
# eval'ed a few at a time between the NEON instructions, so the emitted
# instruction streams are interleaved.  Both the NEON instructions written
# as &vxxx(...) calls and the eval'ed snippets end up in $code through
# AUTOLOAD.
#
# On return @X has been rotated left by one element.
299 sub Xupdate()
300 { use integer;
301   my $body = shift;
302   my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned from @V by the eval'ed snippets; declared here so the
# string evals see them.
303   my ($a,$b,$c,$d,$e,$f,$g,$h);
304
305         &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
306          eval(shift(@insns));
307          eval(shift(@insns));
308          eval(shift(@insns));
309         &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
310          eval(shift(@insns));
311          eval(shift(@insns));
312          eval(shift(@insns));
313         &vshr_u32       ($T2,$T0,$sigma0[0]);
314          eval(shift(@insns));
315          eval(shift(@insns));
316         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
317          eval(shift(@insns));
318          eval(shift(@insns));
319         &vshr_u32       ($T1,$T0,$sigma0[2]);
320          eval(shift(@insns));
321          eval(shift(@insns));
322         &vsli_32        ($T2,$T0,32-$sigma0[0]);
323          eval(shift(@insns));
324          eval(shift(@insns));
325         &vshr_u32       ($T3,$T0,$sigma0[1]);
326          eval(shift(@insns));
327          eval(shift(@insns));
328         &veor           ($T1,$T1,$T2);
329          eval(shift(@insns));
330          eval(shift(@insns));
331         &vsli_32        ($T3,$T0,32-$sigma0[1]);
332          eval(shift(@insns));
333          eval(shift(@insns));
334           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
335          eval(shift(@insns));
336          eval(shift(@insns));
337         &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
338          eval(shift(@insns));
339          eval(shift(@insns));
340           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
341          eval(shift(@insns));
342          eval(shift(@insns));
343           &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
344          eval(shift(@insns));
345          eval(shift(@insns));
346         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
347          eval(shift(@insns));
348          eval(shift(@insns));
349           &veor         ($T5,$T5,$T4);
350          eval(shift(@insns));
351          eval(shift(@insns));
352           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
353          eval(shift(@insns));
354          eval(shift(@insns));
355           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
356          eval(shift(@insns));
357          eval(shift(@insns));
358           &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
359          eval(shift(@insns));
360          eval(shift(@insns));
361         &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
362          eval(shift(@insns));
363          eval(shift(@insns));
364           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
365          eval(shift(@insns));
366          eval(shift(@insns));
367           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
368          eval(shift(@insns));
369          eval(shift(@insns));
370           &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
371          eval(shift(@insns));
372          eval(shift(@insns));
373           &veor         ($T5,$T5,$T4);
374          eval(shift(@insns));
375          eval(shift(@insns));
376           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
377          eval(shift(@insns));
378          eval(shift(@insns));
379         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
380          eval(shift(@insns));
381          eval(shift(@insns));
382           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
383          eval(shift(@insns));
384          eval(shift(@insns));
385           &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
386          eval(shift(@insns));
387          eval(shift(@insns));
388         &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
389          eval(shift(@insns));
390          eval(shift(@insns));
391         &vadd_i32       ($T0,$T0,@X[0]);
# Drain the scalar snippets down to the last two, which are emitted after
# the store below.
392          while($#insns>=2) { eval(shift(@insns)); }
393         &vst1_32        ("{$T0}","[$Xfer,:128]!");
394          eval(shift(@insns));
395          eval(shift(@insns));
396
397         push(@X,shift(@X));             # "rotate" X[]
398 }
399
# Xpreload(\&body)
#
# Counterpart of Xupdate that performs no schedule computation: it emits a
# byte swap of @X[0] (vrev32.8), adds the K256 words loaded through $Ktbl
# and stores the sums through $Xfer, interleaved with the scalar snippets
# of four rounds obtained from $body (a code reference returning a list of
# strings of Perl code, eval'ed here).  The driver code calls it after
# loading a fresh input block into @X.
#
# On return @X has been rotated left by one element.
400 sub Xpreload()
401 { use integer;
402   my $body = shift;
403   my @insns = (&$body,&$body,&$body,&$body);
# Lexicals assigned from @V by the eval'ed snippets.
404   my ($a,$b,$c,$d,$e,$f,$g,$h);
405
406          eval(shift(@insns));
407          eval(shift(@insns));
408          eval(shift(@insns));
409          eval(shift(@insns));
410         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
411          eval(shift(@insns));
412          eval(shift(@insns));
413          eval(shift(@insns));
414          eval(shift(@insns));
415         &vrev32_8       (@X[0],@X[0]);
416          eval(shift(@insns));
417          eval(shift(@insns));
418          eval(shift(@insns));
419          eval(shift(@insns));
420         &vadd_i32       ($T0,$T0,@X[0]);
421          foreach (@insns) { eval; }     # remaining instructions
422         &vst1_32        ("{$T0}","[$Xfer,:128]!");
423
424         push(@X,shift(@X));             # "rotate" X[]
425 }
426
# body_00_15()
#
# Returns one round of scalar code as a list of strings of Perl code, for
# Xupdate/Xpreload to eval one at a time.  Evaluating a string emits the
# instruction(s) it names through AUTOLOAD.  Some list elements are several
# statements joined with "." and are therefore eval'ed together.
#
# The first snippet copies @V into $a..$h (lexicals of the evaluating sub);
# the last one advances the round counter $j, rotates @V right by one and
# swaps the names $t2/$t3.  The load emitted in the middle depends on $j:
# normally the next word from the stack buffer, the word at [$Ktbl] when
# $j==15, and the word at [sp,#64] when $j==31.
427 sub body_00_15 () {
428         (
429         '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
430         '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
431         '&eor   ($t1,$f,$g)',
432         '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
433         '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
434         '&and   ($t1,$t1,$e)',
435         '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
436         '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
437         '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
438         '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
439         '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
440         '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
441         '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
442         '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
443         '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
444         '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
445         '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
446         '&add   ($d,$d,$h)',                    # d+=h
447         '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
448         '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
449         '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
450         )
451 }
452
# NEON entry point and prologue: carve an aligned scratch area below sp,
# load and byte-swap the first input block, store X[0..15]+K[0..15] into
# the scratch area, and save ctx, inp, end-of-input and the original sp at
# [sp,#64], [sp,#68], [sp,#72] and [sp,#76] respectively.
453 $code.=<<___;
454 #if __ARM_MAX_ARCH__>=7
455 .arch   armv7-a
456 .fpu    neon
457
458 .global sha256_block_data_order_neon
459 .type   sha256_block_data_order_neon,%function
460 .align  4
461 sha256_block_data_order_neon:
462 .LNEON:
463         stmdb   sp!,{r4-r12,lr}
464
465         sub     $H,sp,#16*4+16
466         adrl    $Ktbl,K256
467         bic     $H,$H,#15               @ align for 128-bit stores
468         mov     $t2,sp
469         mov     sp,$H                   @ alloca
470         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
471
472         vld1.8          {@X[0]},[$inp]!
473         vld1.8          {@X[1]},[$inp]!
474         vld1.8          {@X[2]},[$inp]!
475         vld1.8          {@X[3]},[$inp]!
476         vld1.32         {$T0},[$Ktbl,:128]!
477         vld1.32         {$T1},[$Ktbl,:128]!
478         vld1.32         {$T2},[$Ktbl,:128]!
479         vld1.32         {$T3},[$Ktbl,:128]!
480         vrev32.8        @X[0],@X[0]             @ yes, even on
481         str             $ctx,[sp,#64]
482         vrev32.8        @X[1],@X[1]             @ big-endian
483         str             $inp,[sp,#68]
484         mov             $Xfer,sp
485         vrev32.8        @X[2],@X[2]
486         str             $len,[sp,#72]
487         vrev32.8        @X[3],@X[3]
488         str             $t2,[sp,#76]            @ save original sp
489         vadd.i32        $T0,$T0,@X[0]
490         vadd.i32        $T1,$T1,@X[1]
491         vst1.32         {$T0},[$Xfer,:128]!
492         vadd.i32        $T2,$T2,@X[2]
493         vst1.32         {$T1},[$Xfer,:128]!
494         vadd.i32        $T3,$T3,@X[3]
495         vst1.32         {$T2},[$Xfer,:128]!
496         vst1.32         {$T3},[$Xfer,:128]!
497
498         ldmia           $ctx,{$A-$H}
499         sub             $Xfer,$Xfer,#64
500         ldr             $t1,[sp,#0]
501         eor             $t2,$t2,$t2
502         eor             $t3,$B,$C
503         b               .L_00_48
504
505 .align  4
506 .L_00_48:
507 ___
# Body of the .L_00_48 loop: four Xupdate calls, each emitting four scalar
# rounds interleaved with the schedule update for four words.  During these
# calls $j runs from 0 to 15, so the snippet for $j==15 emits the load from
# [$Ktbl] whose result is tested against the K256 terminator just below.
508         &Xupdate(\&body_00_15);
509         &Xupdate(\&body_00_15);
510         &Xupdate(\&body_00_15);
511         &Xupdate(\&body_00_15);
# Loop control for .L_00_48, then fetch the next input block (re-reading
# the last block when the end of input has been reached, see "avoid SEGV").
512 $code.=<<___;
513         teq     $t1,#0                          @ check for K256 terminator
514         ldr     $t1,[sp,#0]
515         sub     $Xfer,$Xfer,#64
516         bne     .L_00_48
517
518         ldr             $inp,[sp,#68]
519         ldr             $t0,[sp,#72]
520         sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
521         teq             $inp,$t0
522         it              eq
523         subeq           $inp,$inp,#64           @ avoid SEGV
524         vld1.8          {@X[0]},[$inp]!         @ load next input block
525         vld1.8          {@X[1]},[$inp]!
526         vld1.8          {@X[2]},[$inp]!
527         vld1.8          {@X[3]},[$inp]!
528         it              ne
529         strne           $inp,[sp,#68]
530         mov             $Xfer,sp
531 ___
# Sixteen more scalar rounds, interleaved with byte-swapping the freshly
# loaded block and storing its X[]+K[] words.  $j runs from 16 to 31 here,
# so the snippet for $j==31 loads the word saved at [sp,#64] (ctx) into $t1.
532         &Xpreload(\&body_00_15);
533         &Xpreload(\&body_00_15);
534         &Xpreload(\&body_00_15);
535         &Xpreload(\&body_00_15);
# Accumulate the working variables into the context addressed by $t1, then
# either start over on the next block or restore sp and return.
536 $code.=<<___;
537         ldr     $t0,[$t1,#0]
538         add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
539         ldr     $t2,[$t1,#4]
540         ldr     $t3,[$t1,#8]
541         ldr     $t4,[$t1,#12]
542         add     $A,$A,$t0                       @ accumulate
543         ldr     $t0,[$t1,#16]
544         add     $B,$B,$t2
545         ldr     $t2,[$t1,#20]
546         add     $C,$C,$t3
547         ldr     $t3,[$t1,#24]
548         add     $D,$D,$t4
549         ldr     $t4,[$t1,#28]
550         add     $E,$E,$t0
551         str     $A,[$t1],#4
552         add     $F,$F,$t2
553         str     $B,[$t1],#4
554         add     $G,$G,$t3
555         str     $C,[$t1],#4
556         add     $H,$H,$t4
557         str     $D,[$t1],#4
558         stmia   $t1,{$E-$H}
559
560         ittte   ne
561         movne   $Xfer,sp
562         ldrne   $t1,[sp,#0]
563         eorne   $t2,$t2,$t2
564         ldreq   sp,[sp,#76]                     @ restore original sp
565         itt     ne
566         eorne   $t3,$B,$C
567         bne     .L_00_48
568
569         ldmia   sp!,{r4-r12,pc}
570 .size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
571 #endif
572 ___
573 }}}
574 ######################################################################
575 # ARMv8 stuff
576 #
577 {{{
# Register names for the ARMv8 code path: state halves and a copy (q0..q2),
# four message vectors (q8..q11), two K+W scratch vectors and two
# saved-state vectors (q12..q15).
578 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
579 my @MSG=map("q$_",(8..11));
580 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
# Within this scope the K256 pointer lives in r3, not in the global $Ktbl.
581 my $Ktbl="r3";
582
# Entry point and per-block prologue: load the state, locate K256, then for
# every block load and byte-swap 64 input bytes and save the incoming state.
# The sha256* mnemonics written below are replaced with INST(...) byte
# sequences by the post-processing loop at the end of the script.
583 $code.=<<___;
584 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
585
586 # ifdef __thumb2__
587 #  define INST(a,b,c,d) .byte   c,d|0xc,a,b
588 # else
589 #  define INST(a,b,c,d) .byte   a,b,c,d
590 # endif
591
592 .type   sha256_block_data_order_armv8,%function
593 .align  5
594 sha256_block_data_order_armv8:
595 .LARMv8:
596         vld1.32 {$ABCD,$EFGH},[$ctx]
597 # ifdef __thumb2__
598         adr     $Ktbl,.LARMv8
599         sub     $Ktbl,$Ktbl,#.LARMv8-K256
600 # else
601         adrl    $Ktbl,K256
602 # endif
603         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
604
605 .Loop_v8:
606         vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
607         vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
608         vld1.32         {$W0},[$Ktbl]!
609         vrev32.8        @MSG[0],@MSG[0]
610         vrev32.8        @MSG[1],@MSG[1]
611         vrev32.8        @MSG[2],@MSG[2]
612         vrev32.8        @MSG[3],@MSG[3]
613         vmov            $ABCD_SAVE,$ABCD        @ offload
614         vmov            $EFGH_SAVE,$EFGH
615         teq             $inp,$len
616 ___
# Twelve unrolled groups, each with one sha256h/sha256h2 pair plus the
# message-schedule update (sha256su0/sha256su1).  After each group the
# names $W0/$W1 are swapped and @MSG is rotated left, so the next group
# addresses the next registers.
617 for($i=0;$i<12;$i++) {
618 $code.=<<___;
619         vld1.32         {$W1},[$Ktbl]!
620         vadd.i32        $W0,$W0,@MSG[0]
621         sha256su0       @MSG[0],@MSG[1]
622         vmov            $abcd,$ABCD
623         sha256h         $ABCD,$EFGH,$W0
624         sha256h2        $EFGH,$abcd,$W0
625         sha256su1       @MSG[0],@MSG[2],@MSG[3]
626 ___
627         ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
628 }
# Four final groups without schedule update, then add the saved state,
# loop while input remains (flags from the teq above) and store the result.
629 $code.=<<___;
630         vld1.32         {$W1},[$Ktbl]!
631         vadd.i32        $W0,$W0,@MSG[0]
632         vmov            $abcd,$ABCD
633         sha256h         $ABCD,$EFGH,$W0
634         sha256h2        $EFGH,$abcd,$W0
635
636         vld1.32         {$W0},[$Ktbl]!
637         vadd.i32        $W1,$W1,@MSG[1]
638         vmov            $abcd,$ABCD
639         sha256h         $ABCD,$EFGH,$W1
640         sha256h2        $EFGH,$abcd,$W1
641
642         vld1.32         {$W1},[$Ktbl]
643         vadd.i32        $W0,$W0,@MSG[2]
644         sub             $Ktbl,$Ktbl,#256-16     @ rewind
645         vmov            $abcd,$ABCD
646         sha256h         $ABCD,$EFGH,$W0
647         sha256h2        $EFGH,$abcd,$W0
648
649         vadd.i32        $W1,$W1,@MSG[3]
650         vmov            $abcd,$ABCD
651         sha256h         $ABCD,$EFGH,$W1
652         sha256h2        $EFGH,$abcd,$W1
653
654         vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
655         vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
656         it              ne
657         bne             .Loop_v8
658
659         vst1.32         {$ABCD,$EFGH},[$ctx]
660
661         ret             @ bx lr
662 .size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
663 #endif
664 ___
665 }}}
# Trailer of the generated file: identification string and, outside kernel
# builds, a common-symbol definition for OPENSSL_armcap_P.  The "\@" keeps
# Perl from interpolating an array inside the heredoc.
666 $code.=<<___;
667 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
668 .align  2
669 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
670 .comm   OPENSSL_armcap_P,4,4
671 #endif
672 ___
673
# Copy this script's leading comment block into the output as assembler
# comments: the "#!" line is skipped, a leading "#" is rewritten to "@",
# empty lines are kept, and copying stops at the first line that is neither
# a comment nor empty.
#
# The script is opened with three-argument open and a lexical handle so
# that an unusual $0 cannot be interpreted as an open mode.  Failure to
# read the script is reported on STDERR and only costs the banner; code
# generation continues.
if (open(my $self,'<',$0)) {
        while (my $line = <$self>) {
                next if ($line =~ /^#!/);
                last if ($line !~ s/^#/@/ and $line !~ /^$/);
                print $line;
        }
        close $self;
} else {
        warn "can't read $0 to copy the banner: $!\n";
}
681
{   # Base encodings of the ARMv8 SHA-256 instructions; register fields are
    # OR-ed in by unsha256 below.
    my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    # unsha256($mnemonic,$arg)
    #
    # Hand-assembles one SHA-256 instruction for assemblers that do not know
    # the mnemonics.  $arg is the operand list, either "qD,qN,qM" or
    # "qD,qM".  Returns an "INST(b0,b1,b2,b3)" macro invocation carrying
    # the four bytes of the encoding, least significant first, followed by
    # the original instruction as an assembler comment.
    #
    # If the mnemonic is not in %opcode or the operands cannot be parsed,
    # the instruction text is returned as "$mnemonic\t$arg" so that the
    # assembler gets to see it.  Previously such input was replaced with an
    # empty string (silently dropping the instruction) or encoded with an
    # opcode of zero.
    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        return "$mnemonic\t$arg" unless (exists $opcode{$mnemonic});
        return "$mnemonic\t$arg"
            unless ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/);

        # The middle operand is absent in the two-operand form; it then
        # contributes no bits.
        my ($qd,$qn,$qm) = ($1, defined($2) ? $2 : 0, $3);

        my $word = $opcode{$mnemonic}|(($qd&7)<<13)|(($qd&8)<<19)
                                     |(($qn&7)<<17)|(($qn&8)<<4)
                                     |(($qm&7)<<1) |(($qm&8)<<2);
        # since ARMv7 instructions are always encoded little-endian.
        # correct solution is to use .inst directive, but older
        # assemblers don't implement it:-(
        return sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
    }
}
703
# Final pass over the accumulated assembly, one line at a time:
#  1. `...` spans are replaced with the result of evaluating their contents
#     as Perl (used for constant arithmetic in the templates);
#  2. SHA-256 instructions with q-register operands are replaced with their
#     hand-assembled INST(...) form;
#  3. "ret" becomes "bx lr"; a line that had no "ret" but contains a literal
#     "bx lr" gets that replaced with the instruction word instead.
foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/ge;

        s/\bret\b/bx    lr/g            or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/g;      # make it possible to compile with -march=armv4

        print $_,"\n";
}

# Closing flushes the buffered output.  Check the result so that a write
# error (disk full, broken pipe) fails the build instead of leaving a
# truncated assembly file behind with a successful exit status.
close STDOUT or die "error closing STDOUT: $!";