#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

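# The first argument is an optional perlasm "flavour" (e.g. linux32 or
# ios32), followed by the output file name; a flavour other than "void"
# pipes the generated code through arm-xlate.pl, otherwise raw output
# is written directly.
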
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}

$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";
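
# Note that r0-r3 double as scratch registers: $ctx, $inp and $len are
# pushed on the stack early on (ending up at [sp,#16*4] and onwards),
# after which the low registers are recycled as $t0/$t4/$t1/$t3 inside
# the round bodies.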

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
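
# The tables above hold the rotate/shift amounts of the SHA-256
# Sigma/sigma functions from FIPS 180-4; the last element of
# @sigma0/@sigma1 is a logical shift rather than a rotate, which is why
# the scalar code below uses lsr for it. A rough plain-Perl reference
# model (illustrative only, never called by the generator; the ROTR32
# and *_ref names are not part of the original code):
sub ROTR32	{ my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n)))&0xffffffff }
sub Sigma0_ref	{ my $x=shift; ROTR32($x,$Sigma0[0])^ROTR32($x,$Sigma0[1])^ROTR32($x,$Sigma0[2]) }
sub sigma0_ref	{ my $x=shift; ROTR32($x,$sigma0[0])^ROTR32($x,$sigma0[1])^($x>>$sigma0[2]) }
# Sigma1_ref/sigma1_ref would be identical with @Sigma1/@sigma1.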

sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
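
# Note the Maj(a,b,c) trick above: the final "add $h,$h,$t3" is left
# commented out and the value is instead folded into the accumulator at
# the start of the *next* round ("h+=Maj(a,b,c) from the past"), which
# takes the addition off the critical path; the ($t2,$t3) swap on exit
# keeps the pending Maj value in a consistent register.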

sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
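
# BODY_16_XX updates the 16-word message schedule kept on the stack in
# place: X[i%16] += sigma0(X[(i+1)%16]) + sigma1(X[(i+14)%16]) +
# X[(i+9)%16], which is the standard W[i] = sigma1(W[i-2]) + W[i-7] +
# sigma0(W[i-15]) + W[i-16] recurrence with indices reduced mod 16,
# before falling through to the common round body.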

$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
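
# Every NEON quad register qN aliases the double registers d(2N) and
# d(2N+1); Dlo/Dhi map a "qN" name to its low/high half, e.g.
# Dlo("q1") yields "d2" and Dhi("q1") yields "d3".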

sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
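
# AUTOLOAD turns any call to an undefined sub into the corresponding
# assembler line: underscores become dots and a numeric last argument
# gets an immediate "#" prefix, so e.g. &vext_8($T0,@X[0],@X[1],4)
# appends "vext.8	q8,q0,q1,#4" to $code.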

sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
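
# Xupdate emits one 4-word message-schedule step as NEON code while
# interleaving the scalar instructions of four round bodies (supplied
# by $body) between the vector ops, so NEON latency is hidden behind
# integer ALU work.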

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
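
# Xpreload is the tail counterpart of Xupdate for the last 16 rounds:
# no schedule update is needed any more, only loading the next K256
# batch, byte-swapping the freshly loaded input block and queueing
# X[i]+K[i] for the following iteration.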

sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
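
# body_00_15 returns a list of code-generating strings rather than
# emitting anything itself; Xupdate/Xpreload above eval them a few at
# a time, which is what allows the scalar rounds to be spliced between
# the NEON instructions.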

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, hence
	    # the reversed byte order below. The correct solution is to
	    # use the .inst directive, but older assemblers don't
	    # implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}
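
# For example, "sha256h q0,q0,q0" has all register fields zero, so it
# assembles to the bare opcode 0xf3000c40 and is emitted as
# "INST(0x40,0x0c,0x00,0xf3)	@ sha256h q0,q0,q0".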

foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT; # enforce flush