efee1fb1f3b4023d82e0b13472ddd7e8f10d0dfc
[openssl.git] / crypto / sha / asm / sha256-armv4.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 #
9 # Permission to use under GPL terms is granted.
10 # ====================================================================
11
12 # SHA256 block procedure for ARMv4. May 2007.
13
14 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
15 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
16 # byte [on single-issue Xscale PXA250 core].
17
18 # July 2010.
19 #
20 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
21 # Cortex A8 core and ~20 cycles per processed byte.
22
23 # February 2011.
24 #
25 # Profiler-assisted and platform-specific optimization resulted in 16%
26 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
27
28 # September 2013.
29 #
30 # Add NEON implementation. On Cortex A8 it was measured to process one
31 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
32 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
33 # code (meaning that latter performs sub-optimally, nothing was done
34 # about it).
35
36 # May 2014.
37 #
38 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
39
40 $flavour = shift;
41 if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
42 else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }
43
44 if ($flavour && $flavour ne "void") {
45     $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46     ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
47     ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
48     die "can't locate arm-xlate.pl";
49
50     open STDOUT,"| \"$^X\" $xlate $flavour $output";
51 } else {
52     open STDOUT,">$output";
53 }
54
55 $ctx="r0";      $t0="r0";
56 $inp="r1";      $t4="r1";
57 $len="r2";      $t1="r2";
58 $T1="r3";       $t3="r3";
59 $A="r4";
60 $B="r5";
61 $C="r6";
62 $D="r7";
63 $E="r8";
64 $F="r9";
65 $G="r10";
66 $H="r11";
67 @V=($A,$B,$C,$D,$E,$F,$G,$H);
68 $t2="r12";
69 $Ktbl="r14";
70
71 @Sigma0=( 2,13,22);
72 @Sigma1=( 6,11,25);
73 @sigma0=( 7,18, 3);
74 @sigma1=(17,19,10);
75
76 sub BODY_00_15 {
77 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
78
79 $code.=<<___ if ($i<16);
80 #if __ARM_ARCH__>=7
81         @ ldr   $t1,[$inp],#4                   @ $i
82 # if $i==15
83         str     $inp,[sp,#17*4]                 @ make room for $t4
84 # endif
85         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
86         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
87         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
88 # ifndef __ARMEB__
89         rev     $t1,$t1
90 # endif
91 #else
92         @ ldrb  $t1,[$inp,#3]                   @ $i
93         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
94         ldrb    $t2,[$inp,#2]
95         ldrb    $t0,[$inp,#1]
96         orr     $t1,$t1,$t2,lsl#8
97         ldrb    $t2,[$inp],#4
98         orr     $t1,$t1,$t0,lsl#16
99 # if $i==15
100         str     $inp,[sp,#17*4]                 @ make room for $t4
101 # endif
102         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
103         orr     $t1,$t1,$t2,lsl#24
104         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
105 #endif
106 ___
107 $code.=<<___;
108         ldr     $t2,[$Ktbl],#4                  @ *K256++
109         add     $h,$h,$t1                       @ h+=X[i]
110         str     $t1,[sp,#`$i%16`*4]
111         eor     $t1,$f,$g
112         add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
113         and     $t1,$t1,$e
114         add     $h,$h,$t2                       @ h+=K256[i]
115         eor     $t1,$t1,$g                      @ Ch(e,f,g)
116         eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
117         add     $h,$h,$t1                       @ h+=Ch(e,f,g)
118 #if $i==31
119         and     $t2,$t2,#0xff
120         cmp     $t2,#0xf2                       @ done?
121 #endif
122 #if $i<15
123 # if __ARM_ARCH__>=7
124         ldr     $t1,[$inp],#4                   @ prefetch
125 # else
126         ldrb    $t1,[$inp,#3]
127 # endif
128         eor     $t2,$a,$b                       @ a^b, b^c in next round
129 #else
130         ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
131         eor     $t2,$a,$b                       @ a^b, b^c in next round
132         ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
133 #endif
134         eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
135         and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
136         add     $d,$d,$h                        @ d+=h
137         eor     $t3,$t3,$b                      @ Maj(a,b,c)
138         add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
139         @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
140 ___
141         ($t2,$t3)=($t3,$t2);
142 }
143
144 sub BODY_16_XX {
145 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
146
147 $code.=<<___;
148         @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
149         @ ldr   $t4,[sp,#`($i+14)%16`*4]
150         mov     $t0,$t1,ror#$sigma0[0]
151         add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
152         mov     $t2,$t4,ror#$sigma1[0]
153         eor     $t0,$t0,$t1,ror#$sigma0[1]
154         eor     $t2,$t2,$t4,ror#$sigma1[1]
155         eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
156         ldr     $t1,[sp,#`($i+0)%16`*4]
157         eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
158         ldr     $t4,[sp,#`($i+9)%16`*4]
159
160         add     $t2,$t2,$t0
161         eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
162         add     $t1,$t1,$t2
163         eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
164         add     $t1,$t1,$t4                     @ X[i]
165 ___
166         &BODY_00_15(@_);
167 }
168
169 $code=<<___;
170 #ifndef __KERNEL__
171 # include "arm_arch.h"
172 #else
173 # define __ARM_ARCH__ __LINUX_ARM_ARCH__
174 # define __ARM_MAX_ARCH__ 7
175 #endif
176
177 .text
178 #if __ARM_ARCH__<7
179 .code   32
180 #else
181 .syntax unified
182 # if defined(__thumb2__) && !defined(__APPLE__)
183 #  define adrl adr
184 .thumb
185 # else
186 .code   32
187 # endif
188 #endif
189
190 .type   K256,%object
191 .align  5
192 K256:
193 .word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
194 .word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
195 .word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
196 .word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
197 .word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
198 .word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
199 .word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
200 .word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
201 .word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
202 .word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
203 .word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
204 .word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
205 .word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
206 .word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
207 .word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
208 .word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
209 .size   K256,.-K256
210 .word   0                               @ terminator
211 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
212 .LOPENSSL_armcap:
213 .word   OPENSSL_armcap_P-.Lsha256_block_data_order
214 #endif
215 .align  5
216
217 .global sha256_block_data_order
218 .type   sha256_block_data_order,%function
219 sha256_block_data_order:
220 .Lsha256_block_data_order:
221 #if __ARM_ARCH__<7
222         sub     r3,pc,#8                @ sha256_block_data_order
223 #else
224         adr     r3,sha256_block_data_order
225 #endif
226 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
227         ldr     r12,.LOPENSSL_armcap
228         ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
229 #ifdef  __APPLE__
230         ldr     r12,[r12]
231 #endif
232         tst     r12,#ARMV8_SHA256
233         bne     .LARMv8
234         tst     r12,#ARMV7_NEON
235         bne     .LNEON
236 #endif
237         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
238         stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
239         ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
240         sub     $Ktbl,r3,#256+32        @ K256
241         sub     sp,sp,#16*4             @ alloca(X[16])
242 .Loop:
243 # if __ARM_ARCH__>=7
244         ldr     $t1,[$inp],#4
245 # else
246         ldrb    $t1,[$inp,#3]
247 # endif
248         eor     $t3,$B,$C               @ magic
249         eor     $t2,$t2,$t2
250 ___
251 for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
252 $code.=".Lrounds_16_xx:\n";
253 for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
254 $code.=<<___;
255 #if __ARM_ARCH__>=7
256         ite     eq                      @ Thumb2 thing, sanity check in ARM
257 #endif
258         ldreq   $t3,[sp,#16*4]          @ pull ctx
259         bne     .Lrounds_16_xx
260
261         add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
262         ldr     $t0,[$t3,#0]
263         ldr     $t1,[$t3,#4]
264         ldr     $t2,[$t3,#8]
265         add     $A,$A,$t0
266         ldr     $t0,[$t3,#12]
267         add     $B,$B,$t1
268         ldr     $t1,[$t3,#16]
269         add     $C,$C,$t2
270         ldr     $t2,[$t3,#20]
271         add     $D,$D,$t0
272         ldr     $t0,[$t3,#24]
273         add     $E,$E,$t1
274         ldr     $t1,[$t3,#28]
275         add     $F,$F,$t2
276         ldr     $inp,[sp,#17*4]         @ pull inp
277         ldr     $t2,[sp,#18*4]          @ pull inp+len
278         add     $G,$G,$t0
279         add     $H,$H,$t1
280         stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
281         cmp     $inp,$t2
282         sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
283         bne     .Loop
284
285         add     sp,sp,#`16+3`*4 @ destroy frame
286 #if __ARM_ARCH__>=5
287         ldmia   sp!,{r4-r11,pc}
288 #else
289         ldmia   sp!,{r4-r11,lr}
290         tst     lr,#1
291         moveq   pc,lr                   @ be binary compatible with V4, yet
292         bx      lr                      @ interoperable with Thumb ISA:-)
293 #endif
294 .size   sha256_block_data_order,.-sha256_block_data_order
295 ___
296 ######################################################################
297 # NEON stuff
298 #
299 {{{
300 my @X=map("q$_",(0..3));
301 my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
302 my $Xfer=$t4;
303 my $j=0;
304
305 sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
306 sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
307
308 sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
309 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
310   my $arg = pop;
311     $arg = "#$arg" if ($arg*1 eq $arg);
312     $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
313 }
314
315 sub Xupdate()
316 { use integer;
317   my $body = shift;
318   my @insns = (&$body,&$body,&$body,&$body);
319   my ($a,$b,$c,$d,$e,$f,$g,$h);
320
321         &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
322          eval(shift(@insns));
323          eval(shift(@insns));
324          eval(shift(@insns));
325         &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
326          eval(shift(@insns));
327          eval(shift(@insns));
328          eval(shift(@insns));
329         &vshr_u32       ($T2,$T0,$sigma0[0]);
330          eval(shift(@insns));
331          eval(shift(@insns));
332         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
333          eval(shift(@insns));
334          eval(shift(@insns));
335         &vshr_u32       ($T1,$T0,$sigma0[2]);
336          eval(shift(@insns));
337          eval(shift(@insns));
338         &vsli_32        ($T2,$T0,32-$sigma0[0]);
339          eval(shift(@insns));
340          eval(shift(@insns));
341         &vshr_u32       ($T3,$T0,$sigma0[1]);
342          eval(shift(@insns));
343          eval(shift(@insns));
344         &veor           ($T1,$T1,$T2);
345          eval(shift(@insns));
346          eval(shift(@insns));
347         &vsli_32        ($T3,$T0,32-$sigma0[1]);
348          eval(shift(@insns));
349          eval(shift(@insns));
350           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
351          eval(shift(@insns));
352          eval(shift(@insns));
353         &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
354          eval(shift(@insns));
355          eval(shift(@insns));
356           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
357          eval(shift(@insns));
358          eval(shift(@insns));
359           &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
360          eval(shift(@insns));
361          eval(shift(@insns));
362         &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
363          eval(shift(@insns));
364          eval(shift(@insns));
365           &veor         ($T5,$T5,$T4);
366          eval(shift(@insns));
367          eval(shift(@insns));
368           &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
369          eval(shift(@insns));
370          eval(shift(@insns));
371           &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
372          eval(shift(@insns));
373          eval(shift(@insns));
374           &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
375          eval(shift(@insns));
376          eval(shift(@insns));
377         &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
378          eval(shift(@insns));
379          eval(shift(@insns));
380           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
381          eval(shift(@insns));
382          eval(shift(@insns));
383           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
384          eval(shift(@insns));
385          eval(shift(@insns));
386           &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
387          eval(shift(@insns));
388          eval(shift(@insns));
389           &veor         ($T5,$T5,$T4);
390          eval(shift(@insns));
391          eval(shift(@insns));
392           &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
393          eval(shift(@insns));
394          eval(shift(@insns));
395         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
396          eval(shift(@insns));
397          eval(shift(@insns));
398           &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
399          eval(shift(@insns));
400          eval(shift(@insns));
401           &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
402          eval(shift(@insns));
403          eval(shift(@insns));
404         &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
405          eval(shift(@insns));
406          eval(shift(@insns));
407         &vadd_i32       ($T0,$T0,@X[0]);
408          while($#insns>=2) { eval(shift(@insns)); }
409         &vst1_32        ("{$T0}","[$Xfer,:128]!");
410          eval(shift(@insns));
411          eval(shift(@insns));
412
413         push(@X,shift(@X));             # "rotate" X[]
414 }
415
416 sub Xpreload()
417 { use integer;
418   my $body = shift;
419   my @insns = (&$body,&$body,&$body,&$body);
420   my ($a,$b,$c,$d,$e,$f,$g,$h);
421
422          eval(shift(@insns));
423          eval(shift(@insns));
424          eval(shift(@insns));
425          eval(shift(@insns));
426         &vld1_32        ("{$T0}","[$Ktbl,:128]!");
427          eval(shift(@insns));
428          eval(shift(@insns));
429          eval(shift(@insns));
430          eval(shift(@insns));
431         &vrev32_8       (@X[0],@X[0]);
432          eval(shift(@insns));
433          eval(shift(@insns));
434          eval(shift(@insns));
435          eval(shift(@insns));
436         &vadd_i32       ($T0,$T0,@X[0]);
437          foreach (@insns) { eval; }     # remaining instructions
438         &vst1_32        ("{$T0}","[$Xfer,:128]!");
439
440         push(@X,shift(@X));             # "rotate" X[]
441 }
442
443 sub body_00_15 () {
444         (
445         '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
446         '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
447         '&eor   ($t1,$f,$g)',
448         '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
449         '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
450         '&and   ($t1,$t1,$e)',
451         '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
452         '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
453         '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
454         '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
455         '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
456         '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
457         '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
458         '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
459         '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
460         '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
461         '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
462         '&add   ($d,$d,$h)',                    # d+=h
463         '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
464         '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
465         '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
466         )
467 }
468
469 $code.=<<___;
470 #if __ARM_MAX_ARCH__>=7
471 .arch   armv7-a
472 .fpu    neon
473
474 .global sha256_block_data_order_neon
475 .type   sha256_block_data_order_neon,%function
476 .align  4
477 sha256_block_data_order_neon:
478 .LNEON:
479         stmdb   sp!,{r4-r12,lr}
480
481         sub     $H,sp,#16*4+16
482         adr     $Ktbl,K256
483         bic     $H,$H,#15               @ align for 128-bit stores
484         mov     $t2,sp
485         mov     sp,$H                   @ alloca
486         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
487
488         vld1.8          {@X[0]},[$inp]!
489         vld1.8          {@X[1]},[$inp]!
490         vld1.8          {@X[2]},[$inp]!
491         vld1.8          {@X[3]},[$inp]!
492         vld1.32         {$T0},[$Ktbl,:128]!
493         vld1.32         {$T1},[$Ktbl,:128]!
494         vld1.32         {$T2},[$Ktbl,:128]!
495         vld1.32         {$T3},[$Ktbl,:128]!
496         vrev32.8        @X[0],@X[0]             @ yes, even on
497         str             $ctx,[sp,#64]
498         vrev32.8        @X[1],@X[1]             @ big-endian
499         str             $inp,[sp,#68]
500         mov             $Xfer,sp
501         vrev32.8        @X[2],@X[2]
502         str             $len,[sp,#72]
503         vrev32.8        @X[3],@X[3]
504         str             $t2,[sp,#76]            @ save original sp
505         vadd.i32        $T0,$T0,@X[0]
506         vadd.i32        $T1,$T1,@X[1]
507         vst1.32         {$T0},[$Xfer,:128]!
508         vadd.i32        $T2,$T2,@X[2]
509         vst1.32         {$T1},[$Xfer,:128]!
510         vadd.i32        $T3,$T3,@X[3]
511         vst1.32         {$T2},[$Xfer,:128]!
512         vst1.32         {$T3},[$Xfer,:128]!
513
514         ldmia           $ctx,{$A-$H}
515         sub             $Xfer,$Xfer,#64
516         ldr             $t1,[sp,#0]
517         eor             $t2,$t2,$t2
518         eor             $t3,$B,$C
519         b               .L_00_48
520
521 .align  4
522 .L_00_48:
523 ___
524         &Xupdate(\&body_00_15);
525         &Xupdate(\&body_00_15);
526         &Xupdate(\&body_00_15);
527         &Xupdate(\&body_00_15);
528 $code.=<<___;
529         teq     $t1,#0                          @ check for K256 terminator
530         ldr     $t1,[sp,#0]
531         sub     $Xfer,$Xfer,#64
532         bne     .L_00_48
533
534         ldr             $inp,[sp,#68]
535         ldr             $t0,[sp,#72]
536         sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
537         teq             $inp,$t0
538         it              eq
539         subeq           $inp,$inp,#64           @ avoid SEGV
540         vld1.8          {@X[0]},[$inp]!         @ load next input block
541         vld1.8          {@X[1]},[$inp]!
542         vld1.8          {@X[2]},[$inp]!
543         vld1.8          {@X[3]},[$inp]!
544         it              ne
545         strne           $inp,[sp,#68]
546         mov             $Xfer,sp
547 ___
548         &Xpreload(\&body_00_15);
549         &Xpreload(\&body_00_15);
550         &Xpreload(\&body_00_15);
551         &Xpreload(\&body_00_15);
552 $code.=<<___;
553         ldr     $t0,[$t1,#0]
554         add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
555         ldr     $t2,[$t1,#4]
556         ldr     $t3,[$t1,#8]
557         ldr     $t4,[$t1,#12]
558         add     $A,$A,$t0                       @ accumulate
559         ldr     $t0,[$t1,#16]
560         add     $B,$B,$t2
561         ldr     $t2,[$t1,#20]
562         add     $C,$C,$t3
563         ldr     $t3,[$t1,#24]
564         add     $D,$D,$t4
565         ldr     $t4,[$t1,#28]
566         add     $E,$E,$t0
567         str     $A,[$t1],#4
568         add     $F,$F,$t2
569         str     $B,[$t1],#4
570         add     $G,$G,$t3
571         str     $C,[$t1],#4
572         add     $H,$H,$t4
573         str     $D,[$t1],#4
574         stmia   $t1,{$E-$H}
575
576         ittte   ne
577         movne   $Xfer,sp
578         ldrne   $t1,[sp,#0]
579         eorne   $t2,$t2,$t2
580         ldreq   sp,[sp,#76]                     @ restore original sp
581         itt     ne
582         eorne   $t3,$B,$C
583         bne     .L_00_48
584
585         ldmia   sp!,{r4-r12,pc}
586 .size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
587 #endif
588 ___
589 }}}
590 ######################################################################
591 # ARMv8 stuff
592 #
593 {{{
594 my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
595 my @MSG=map("q$_",(8..11));
596 my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
597 my $Ktbl="r3";
598
599 $code.=<<___;
600 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
601
602 # if defined(__thumb2__) && !defined(__APPLE__)
603 #  define INST(a,b,c,d) .byte   c,d|0xc,a,b
604 # else
605 #  define INST(a,b,c,d) .byte   a,b,c,d
606 # endif
607
608 .type   sha256_block_data_order_armv8,%function
609 .align  5
610 sha256_block_data_order_armv8:
611 .LARMv8:
612         vld1.32 {$ABCD,$EFGH},[$ctx]
613 # ifdef __APPLE__
614         sub     $Ktbl,$Ktbl,#256+32
615 # elif  defined(__thumb2__)
616         adr     $Ktbl,.LARMv8
617         sub     $Ktbl,$Ktbl,#.LARMv8-K256
618 # else
619         adrl    $Ktbl,K256
620 # endif
621         add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
622
623 .Loop_v8:
624         vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
625         vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
626         vld1.32         {$W0},[$Ktbl]!
627         vrev32.8        @MSG[0],@MSG[0]
628         vrev32.8        @MSG[1],@MSG[1]
629         vrev32.8        @MSG[2],@MSG[2]
630         vrev32.8        @MSG[3],@MSG[3]
631         vmov            $ABCD_SAVE,$ABCD        @ offload
632         vmov            $EFGH_SAVE,$EFGH
633         teq             $inp,$len
634 ___
635 for($i=0;$i<12;$i++) {
636 $code.=<<___;
637         vld1.32         {$W1},[$Ktbl]!
638         vadd.i32        $W0,$W0,@MSG[0]
639         sha256su0       @MSG[0],@MSG[1]
640         vmov            $abcd,$ABCD
641         sha256h         $ABCD,$EFGH,$W0
642         sha256h2        $EFGH,$abcd,$W0
643         sha256su1       @MSG[0],@MSG[2],@MSG[3]
644 ___
645         ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
646 }
647 $code.=<<___;
648         vld1.32         {$W1},[$Ktbl]!
649         vadd.i32        $W0,$W0,@MSG[0]
650         vmov            $abcd,$ABCD
651         sha256h         $ABCD,$EFGH,$W0
652         sha256h2        $EFGH,$abcd,$W0
653
654         vld1.32         {$W0},[$Ktbl]!
655         vadd.i32        $W1,$W1,@MSG[1]
656         vmov            $abcd,$ABCD
657         sha256h         $ABCD,$EFGH,$W1
658         sha256h2        $EFGH,$abcd,$W1
659
660         vld1.32         {$W1},[$Ktbl]
661         vadd.i32        $W0,$W0,@MSG[2]
662         sub             $Ktbl,$Ktbl,#256-16     @ rewind
663         vmov            $abcd,$ABCD
664         sha256h         $ABCD,$EFGH,$W0
665         sha256h2        $EFGH,$abcd,$W0
666
667         vadd.i32        $W1,$W1,@MSG[3]
668         vmov            $abcd,$ABCD
669         sha256h         $ABCD,$EFGH,$W1
670         sha256h2        $EFGH,$abcd,$W1
671
672         vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
673         vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
674         it              ne
675         bne             .Loop_v8
676
677         vst1.32         {$ABCD,$EFGH},[$ctx]
678
679         ret             @ bx lr
680 .size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
681 #endif
682 ___
683 }}}
684 $code.=<<___;
685 .asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
686 .align  2
687 #if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
688 .comm   OPENSSL_armcap_P,4,4
689 #endif
690 ___
691
692 open SELF,$0;
693 while(<SELF>) {
694         next if (/^#!/);
695         last if (!s/^#/@/ and !/^$/);
696         print;
697 }
698 close SELF;
699
700 {   my  %opcode = (
701         "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
702         "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );
703
704     sub unsha256 {
705         my ($mnemonic,$arg)=@_;
706
707         if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
708             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
709                                          |(($2&7)<<17)|(($2&8)<<4)
710                                          |(($3&7)<<1) |(($3&8)<<2);
711             # since ARMv7 instructions are always encoded little-endian.
712             # correct solution is to use .inst directive, but older
713             # assemblers don't implement it:-(
714             sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
715                         $word&0xff,($word>>8)&0xff,
716                         ($word>>16)&0xff,($word>>24)&0xff,
717                         $mnemonic,$arg;
718         }
719     }
720 }
721
722 foreach (split($/,$code)) {
723
724         s/\`([^\`]*)\`/eval $1/geo;
725
726         s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
727
728         s/\bret\b/bx    lr/go           or
729         s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4
730
731         print $_,"\n";
732 }
733
734 close STDOUT; # enforce flush