ARMv4 assembly pack: allow Thumb2 even in iOS build,
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "absolute"
# terms is ~2250 cycles per 64-byte block or ~35 cycles per byte [on
# single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

$flavour = shift;
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
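
# Typical invocations (a hedged sketch; flavour names such as "linux32" or
# "ios32" are perlasm schemes from the OpenSSL build system and arm-xlate.pl,
# listed here as an assumption rather than exhaustively):
#
#   perl sha256-armv4.pl linux32 sha256-armv4.S   # translate via arm-xlate.pl
#   perl sha256-armv4.pl sha256-armv4.S           # no flavour: emit as-is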

$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
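
# These are the FIPS 180-4 rotate/shift amounts. The integer code below
# leans on rotation distributing over XOR, e.g.
#
#   Sigma1(e) = ROTR(e,6) ^ ROTR(e,11) ^ ROTR(e,25)
#             = ROTR(e ^ ROTR(e,5) ^ ROTR(e,19), 6)
#
# which is why only the differences $Sigma1[1]-$Sigma1[0] etc. appear in
# the eor instructions, with the final ror#$Sigma1[0] folded into the add
# that consumes the result. Plain-Perl reference, for illustration only:
#
#   sub ROTR   { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
#   sub Sigma1 { my $e=shift; ROTR($e^ROTR($e,5)^ROTR($e,19),6) }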

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
# ifndef __ARMEB__
        rev     $t1,$t1
# endif
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
}
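
# For reference, each generated round computes, in plain-Perl terms
# (Sigma/Ch/Maj helpers as in FIPS 180-4; illustration only):
#
#   $h += $X[$i] + $K256[$i] + Sigma1($e) + Ch($e,$f,$g);
#   $d += $h;
#   $h += Sigma0($a) + Maj($a,$b,$c);
#
# with @V rotated between rounds. The Maj() term is deliberately deferred:
# it lands in the following round's "h+=Maj(a,b,c) from the past" add,
# which shortens the critical dependency chain.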

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}
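
# BODY_16_XX walks the message schedule through a 16-word circular buffer
# on the stack, i.e. the standard recurrence
# W[i] = W[i-16] + sigma0(W[i-15]) + W[i-7] + sigma1(W[i-2]) becomes
# (plain-Perl sketch, ROTR as above; illustration only):
#
#   sub sigma0 { my $x=shift; ROTR($x,7)  ^ ROTR($x,18) ^ ($x>>3)  }
#   sub sigma1 { my $x=shift; ROTR($x,17) ^ ROTR($x,19) ^ ($x>>10) }
#   $X[$i%16] += sigma0($X[($i+1)%16]) + $X[($i+9)%16]
#             +  sigma1($X[($i+14)%16]);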

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax unified
.thumb
# define adrl adr
#else
.code   32
#endif

.type   K256,%object
.align  5
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word   OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r3,pc,#8                @ sha256_block_data_order
#else
        adr     r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
#ifdef  __APPLE__
        ldr     r12,[r12]
#endif
        tst     r12,#ARMV8_SHA256
        bne     .LARMv8
        tst     r12,#ARMV7_NEON
        bne     .LNEON
#endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
        ite     eq                      @ Thumb2 thing, sanity check in ARM
#endif
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
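
# So, within this block, a call such as
#
#   &vshr_u32 ($T2,$T0,$sigma0[0]);
#
# appends "\tvshr.u32\tq10,q8,#7\n" to $code: the sub name turns into the
# mnemonic (first "_" becoming "."), and a numeric final argument gets a
# "#" prefix.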

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}
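
# In vector terms Xupdate advances four schedule words per call (reference
# sketch with W[] as 32-bit lanes; illustration only):
#
#   W[0..3] += W[9..12] + sigma0(W[1..4]);
#   W[0..1] += sigma1(W[14..15]);   # low half first, because its result...
#   W[2..3] += sigma1(W[16..17]);   # ...is the sigma1() input for the high half
#
# The sigma1() dependency is why the 128-bit update is split into two
# 64-bit halves, and why the NEON work is interleaved with four rounds'
# worth of scalar instructions supplied by $body.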

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha256_block_data_order_neon
.type   sha256_block_data_order_neon,%function
.align  4
sha256_block_data_order_neon:
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        sub     $H,sp,#16*4+16
        adr     $Ktbl,K256
        bic     $H,$H,#15               @ align for 128-bit stores
        mov     $t2,sp
        mov     sp,$H                   @ alloca
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        it              eq
        subeq           $inp,$inp,#64           @ avoid SEGV
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        it              ne
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        ittte   ne
        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        itt     ne
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d) .byte   c,d|0xc,a,b
# else
#  define INST(a,b,c,d) .byte   a,b,c,d
# endif

.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
        vld1.32 {$ABCD,$EFGH},[$ctx]
# ifdef __APPLE__
        sub     $Ktbl,$Ktbl,#256+32
# elif  defined(__thumb2__)
        adr     $Ktbl,.LARMv8
        sub     $Ktbl,$Ktbl,#.LARMv8-K256
# else
        adrl    $Ktbl,K256
# endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

.Loop_v8:
        vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
        vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
        vld1.32         {$W0},[$Ktbl]!
        vrev32.8        @MSG[0],@MSG[0]
        vrev32.8        @MSG[1],@MSG[1]
        vrev32.8        @MSG[2],@MSG[2]
        vrev32.8        @MSG[3],@MSG[3]
        vmov            $ABCD_SAVE,$ABCD        @ offload
        vmov            $EFGH_SAVE,$EFGH
        teq             $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        sha256su0       @MSG[0],@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0
        sha256su1       @MSG[0],@MSG[2],@MSG[3]
___
        ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vld1.32         {$W0},[$Ktbl]!
        vadd.i32        $W1,$W1,@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vld1.32         {$W1},[$Ktbl]
        vadd.i32        $W0,$W0,@MSG[2]
        sub             $Ktbl,$Ktbl,#256-16     @ rewind
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vadd.i32        $W1,$W1,@MSG[3]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
        vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
        it              ne
        bne             .Loop_v8

        vst1.32         {$ABCD,$EFGH},[$ctx]

        ret             @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

{   my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<17)|(($2&8)<<4)
                                         |(($3&7)<<1) |(($3&8)<<2);
            # ARMv7 instructions are always encoded little-endian, hence
            # the byte-reversed emission below. The correct solution is to
            # use the .inst directive, but older assemblers don't
            # implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
}
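
# Worked example (hand-computed here, so treat it as an illustration): for
# "sha256su0 q8,q9" the register numbers 8 and 9 are scattered into the
# split Vd/Vm fields of 0xf3ba03c0, giving the word 0xf3fa03e2, which is
# emitted byte-reversed as INST(0xe2,0x03,0xfa,0xf3) so that little-endian
# instruction fetch sees the right word in both ARM and (via the INST
# macro above) Thumb2 modes.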

foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

        s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT; # enforce flush