#! /usr/bin/env perl
# Copyright 2007-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.
# Performance is ~2x better than gcc 3.4 generated code and in
# "absolute" terms is ~2250 cycles per 64-byte block or ~35 cycles
# per byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
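#
# Illustrative invocations (editorial note; flavour names are assumed to
# be the usual ones handled by arm-xlate.pl, not normative):
#   perl sha256-armv4.pl linux32 sha256-armv4.S    # pipe through arm-xlate.pl
#   perl sha256-armv4.pl void                      # raw output to stdout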
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

$ctx="r0";      $t0="r0";
$inp="r1";      $t4="r1";
$len="r2";      $t1="r2";
$T1="r3";       $t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

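# Editorial note: the four arrays above are the rotate/shift amounts of
# the FIPS 180-4 functions, e.g. Sigma0(x)=ROTR^2(x)^ROTR^13(x)^ROTR^22(x)
# and sigma0(x)=ROTR^7(x)^ROTR^18(x)^SHR^3(x). A plain-Perl sketch of them
# follows; the ref_* helpers are illustrative only and never called by
# the generator.
sub ref_ror32 { my ($x,$n)=@_; $x&=0xffffffff; (($x>>$n)|($x<<(32-$n)))&0xffffffff; }
sub ref_Sigma0 { my $x=shift; ref_ror32($x,$Sigma0[0])^ref_ror32($x,$Sigma0[1])^ref_ror32($x,$Sigma0[2]); }
sub ref_Sigma1 { my $x=shift; ref_ror32($x,$Sigma1[0])^ref_ror32($x,$Sigma1[1])^ref_ror32($x,$Sigma1[2]); }
sub ref_sigma0 { my $x=shift; ref_ror32($x,$sigma0[0])^ref_ror32($x,$sigma0[1])^(($x&0xffffffff)>>$sigma0[2]); }
sub ref_sigma1 { my $x=shift; ref_ror32($x,$sigma1[0])^ref_ror32($x,$sigma1[1])^(($x&0xffffffff)>>$sigma1[2]); }
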
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
        @ ldr   $t1,[$inp],#4                   @ $i
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
# ifndef __ARMEB__
        rev     $t1,$t1
# endif
#else
        @ ldrb  $t1,[$inp,#3]                   @ $i
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        ldrb    $t2,[$inp,#2]
        ldrb    $t0,[$inp,#1]
        orr     $t1,$t1,$t2,lsl#8
        ldrb    $t2,[$inp],#4
        orr     $t1,$t1,$t0,lsl#16
# if $i==15
        str     $inp,[sp,#17*4]                 @ make room for $t4
# endif
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
        orr     $t1,$t1,$t2,lsl#24
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
#endif
___
$code.=<<___;
        ldr     $t2,[$Ktbl],#4                  @ *K256++
        add     $h,$h,$t1                       @ h+=X[i]
        str     $t1,[sp,#`$i%16`*4]
        eor     $t1,$f,$g
        add     $h,$h,$t0,ror#$Sigma1[0]        @ h+=Sigma1(e)
        and     $t1,$t1,$e
        add     $h,$h,$t2                       @ h+=K256[i]
        eor     $t1,$t1,$g                      @ Ch(e,f,g)
        eor     $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
        add     $h,$h,$t1                       @ h+=Ch(e,f,g)
#if $i==31
        and     $t2,$t2,#0xff
        cmp     $t2,#0xf2                       @ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4                   @ prefetch
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t2,$a,$b                       @ a^b, b^c in next round
#else
        ldr     $t1,[sp,#`($i+2)%16`*4]         @ from future BODY_16_xx
        eor     $t2,$a,$b                       @ a^b, b^c in next round
        ldr     $t4,[sp,#`($i+15)%16`*4]        @ from future BODY_16_xx
#endif
        eor     $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`  @ Sigma0(a)
        and     $t3,$t3,$t2                     @ (b^c)&=(a^b)
        add     $d,$d,$h                        @ d+=h
        eor     $t3,$t3,$b                      @ Maj(a,b,c)
        add     $h,$h,$t0,ror#$Sigma0[0]        @ h+=Sigma0(a)
        @ add   $h,$h,$t3                       @ h+=Maj(a,b,c)
___
        ($t2,$t3)=($t3,$t2);
}

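# Editorial sketch (never called): the round computed by BODY_00_15 in
# plain Perl, using the ref_* helpers above. Note the assembly defers
# "h+=Maj(a,b,c)" into the following round, which is why ($t2,$t3) are
# swapped at the end of each call.
sub ref_round {
    my ($w,$k,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
    my $T1=($h+ref_Sigma1($e)+(($e&$f)^(~$e&$g))+$k+$w)&0xffffffff;  # +Ch(e,f,g)
    my $T2=(ref_Sigma0($a)+(($a&$b)^($a&$c)^($b&$c)))&0xffffffff;    # +Maj(a,b,c)
    (($T1+$T2)&0xffffffff,$a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);    # new a..h
}
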
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
        @ ldr   $t1,[sp,#`($i+1)%16`*4]         @ $i
        @ ldr   $t4,[sp,#`($i+14)%16`*4]
        mov     $t0,$t1,ror#$sigma0[0]
        add     $a,$a,$t2                       @ h+=Maj(a,b,c) from the past
        mov     $t2,$t4,ror#$sigma1[0]
        eor     $t0,$t0,$t1,ror#$sigma0[1]
        eor     $t2,$t2,$t4,ror#$sigma1[1]
        eor     $t0,$t0,$t1,lsr#$sigma0[2]      @ sigma0(X[i+1])
        ldr     $t1,[sp,#`($i+0)%16`*4]
        eor     $t2,$t2,$t4,lsr#$sigma1[2]      @ sigma1(X[i+14])
        ldr     $t4,[sp,#`($i+9)%16`*4]

        add     $t2,$t2,$t0
        eor     $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`   @ from BODY_00_15
        add     $t1,$t1,$t2
        eor     $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`  @ Sigma1(e)
        add     $t1,$t1,$t4                     @ X[i]
___
        &BODY_00_15(@_);
}

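# Editorial sketch (never called): the message-schedule recurrence that
# BODY_16_XX evaluates in-place in the 16-word circular buffer on the
# stack, W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]:
sub ref_schedule {
    my @W=@_;                           # the 16 words of one input block
    for (my $i=16;$i<64;$i++) {
        $W[$i]=(ref_sigma1($W[$i-2])+$W[$i-7]
               +ref_sigma0($W[$i-15])+$W[$i-16])&0xffffffff;
    }
    @W;                                 # all 64 expanded words
}
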
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax unified
.thumb
#else
.code   32
#endif

.text

.type   K256,%object
.align  5
K256:
.word   0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word   0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word   0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word   0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word   0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word   0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word   0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word   0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word   0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word   0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word   0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word   0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word   0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word   0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word   0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word   0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size   K256,.-K256
.word   0                               @ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef _WIN32
.word   OPENSSL_armcap_P
# else
.word   OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align  5

.global sha256_block_data_order
.type   sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
        sub     r3,pc,#8                @ sha256_block_data_order
#else
        adr     r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
        ldr     r12,.LOPENSSL_armcap
# if !defined(_WIN32)
        ldr     r12,[r3,r12]            @ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
        ldr     r12,[r12]
# endif
        tst     r12,#ARMV8_SHA256
        bne     .LARMv8
        tst     r12,#ARMV7_NEON
        bne     .LNEON
#endif
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
        stmdb   sp!,{$ctx,$inp,$len,r4-r11,lr}
        ldmia   $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
        sub     $Ktbl,r3,#256+32        @ K256
        sub     sp,sp,#16*4             @ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
        ldr     $t1,[$inp],#4
# else
        ldrb    $t1,[$inp,#3]
# endif
        eor     $t3,$B,$C               @ magic
        eor     $t2,$t2,$t2
___
for($i=0;$i<16;$i++)    { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)       { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef  __thumb2__
        ite     eq                      @ Thumb2 thing, sanity check in ARM
#endif
        ldreq   $t3,[sp,#16*4]          @ pull ctx
        bne     .Lrounds_16_xx

        add     $A,$A,$t2               @ h+=Maj(a,b,c) from the past
        ldr     $t0,[$t3,#0]
        ldr     $t1,[$t3,#4]
        ldr     $t2,[$t3,#8]
        add     $A,$A,$t0
        ldr     $t0,[$t3,#12]
        add     $B,$B,$t1
        ldr     $t1,[$t3,#16]
        add     $C,$C,$t2
        ldr     $t2,[$t3,#20]
        add     $D,$D,$t0
        ldr     $t0,[$t3,#24]
        add     $E,$E,$t1
        ldr     $t1,[$t3,#28]
        add     $F,$F,$t2
        ldr     $inp,[sp,#17*4]         @ pull inp
        ldr     $t2,[sp,#18*4]          @ pull inp+len
        add     $G,$G,$t0
        add     $H,$H,$t1
        stmia   $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
        cmp     $inp,$t2
        sub     $Ktbl,$Ktbl,#256        @ rewind Ktbl
        bne     .Loop

        add     sp,sp,#`16+3`*4 @ destroy frame
#if __ARM_ARCH__>=5
        ldmia   sp!,{r4-r11,pc}
#else
        ldmia   sp!,{r4-r11,lr}
        tst     lr,#1
        moveq   pc,lr                   @ be binary compatible with V4, yet
        bx      lr                      @ interoperable with Thumb ISA:-)
#endif
.size   sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

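# For example (editorial note), &vshr_u32("q10","q8",7) resolves through
# AUTOLOAD to the text "vshr.u32 q10,q8,#7": underscores become dots and
# a bare numeric final argument gains the "#" immediate prefix.
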
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

        &vext_8         ($T0,@X[0],@X[1],4);    # X[1..4]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vext_8         ($T1,@X[2],@X[3],4);    # X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T2,$T0,$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += X[9..12]
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T1,$T0,$sigma0[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T2,$T0,32-$sigma0[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vshr_u32       ($T3,$T0,$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T2);
         eval(shift(@insns));
         eval(shift(@insns));
        &vsli_32        ($T3,$T0,32-$sigma0[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
        &veor           ($T1,$T1,$T3);          # sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dhi(@X[3]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (@X[0],@X[0],$T1);      # X[0..3] += sigma0(X[1..4])
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dhi(@X[3]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dhi(@X[3]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[0]);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T5,&Dlo(@X[0]),$sigma1[2]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);
         eval(shift(@insns));
         eval(shift(@insns));
          &vshr_u32     ($T4,&Dlo(@X[0]),$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
          &vsli_32      ($T4,&Dlo(@X[0]),32-$sigma1[1]);
         eval(shift(@insns));
         eval(shift(@insns));
          &veor         ($T5,$T5,$T4);          # sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         while($#insns>=2) { eval(shift(@insns)); }
        &vst1_32        ("{$T0}","[$Xfer,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));

        push(@X,shift(@X));             # "rotate" X[]
}

sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vld1_32        ("{$T0}","[$Ktbl,:128]!");
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vrev32_8       (@X[0],@X[0]);
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
         eval(shift(@insns));
        &vadd_i32       ($T0,$T0,@X[0]);
         foreach (@insns) { eval; }     # remaining instructions
        &vst1_32        ("{$T0}","[$Xfer,:128]!");

        push(@X,shift(@X));             # "rotate" X[]
}

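# Editorial note: body_00_15 below returns a list of Perl snippets, one
# scalar-round instruction each; Xupdate/Xpreload above eval() four
# rounds' worth of them between the NEON message-schedule instructions,
# interleaving integer ALU work with the vector pipeline.
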
sub body_00_15 () {
        (
        '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
        '&add   ($h,$h,$t1)',                   # h+=X[i]+K[i]
        '&eor   ($t1,$f,$g)',
        '&eor   ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
        '&add   ($a,$a,$t2)',                   # h+=Maj(a,b,c) from the past
        '&and   ($t1,$t1,$e)',
        '&eor   ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',   # Sigma1(e)
        '&eor   ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
        '&eor   ($t1,$t1,$g)',                  # Ch(e,f,g)
        '&add   ($h,$h,$t2,"ror#$Sigma1[0]")',  # h+=Sigma1(e)
        '&eor   ($t2,$a,$b)',                   # a^b, b^c in next round
        '&eor   ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',   # Sigma0(a)
        '&add   ($h,$h,$t1)',                   # h+=Ch(e,f,g)
        '&ldr   ($t1,sprintf "[sp,#%d]",4*(($j+1)&15))  if (($j&15)!=15);'.
        '&ldr   ($t1,"[$Ktbl]")                         if ($j==15);'.
        '&ldr   ($t1,"[sp,#64]")                        if ($j==31)',
        '&and   ($t3,$t3,$t2)',                 # (b^c)&=(a^b)
        '&add   ($d,$d,$h)',                    # d+=h
        '&add   ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
        '&eor   ($t3,$t3,$b)',                  # Maj(a,b,c)
        '$j++;  unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
        )
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch   armv7-a
.fpu    neon

.global sha256_block_data_order_neon
.type   sha256_block_data_order_neon,%function
.align  5
.skip   16
sha256_block_data_order_neon:
.LNEON:
        stmdb   sp!,{r4-r12,lr}

        sub     $H,sp,#16*4+16
        adr     $Ktbl,K256
        bic     $H,$H,#15               @ align for 128-bit stores
        mov     $t2,sp
        mov     sp,$H                   @ alloca
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp

        vld1.8          {@X[0]},[$inp]!
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        vld1.32         {$T0},[$Ktbl,:128]!
        vld1.32         {$T1},[$Ktbl,:128]!
        vld1.32         {$T2},[$Ktbl,:128]!
        vld1.32         {$T3},[$Ktbl,:128]!
        vrev32.8        @X[0],@X[0]             @ yes, even on
        str             $ctx,[sp,#64]
        vrev32.8        @X[1],@X[1]             @ big-endian
        str             $inp,[sp,#68]
        mov             $Xfer,sp
        vrev32.8        @X[2],@X[2]
        str             $len,[sp,#72]
        vrev32.8        @X[3],@X[3]
        str             $t2,[sp,#76]            @ save original sp
        vadd.i32        $T0,$T0,@X[0]
        vadd.i32        $T1,$T1,@X[1]
        vst1.32         {$T0},[$Xfer,:128]!
        vadd.i32        $T2,$T2,@X[2]
        vst1.32         {$T1},[$Xfer,:128]!
        vadd.i32        $T3,$T3,@X[3]
        vst1.32         {$T2},[$Xfer,:128]!
        vst1.32         {$T3},[$Xfer,:128]!

        ldmia           $ctx,{$A-$H}
        sub             $Xfer,$Xfer,#64
        ldr             $t1,[sp,#0]
        eor             $t2,$t2,$t2
        eor             $t3,$B,$C
        b               .L_00_48

.align  4
.L_00_48:
___
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
        &Xupdate(\&body_00_15);
$code.=<<___;
        teq     $t1,#0                          @ check for K256 terminator
        ldr     $t1,[sp,#0]
        sub     $Xfer,$Xfer,#64
        bne     .L_00_48

        ldr             $inp,[sp,#68]
        ldr             $t0,[sp,#72]
        sub             $Ktbl,$Ktbl,#256        @ rewind $Ktbl
        teq             $inp,$t0
        it              eq
        subeq           $inp,$inp,#64           @ avoid SEGV
        vld1.8          {@X[0]},[$inp]!         @ load next input block
        vld1.8          {@X[1]},[$inp]!
        vld1.8          {@X[2]},[$inp]!
        vld1.8          {@X[3]},[$inp]!
        it              ne
        strne           $inp,[sp,#68]
        mov             $Xfer,sp
___
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
        &Xpreload(\&body_00_15);
$code.=<<___;
        ldr     $t0,[$t1,#0]
        add     $A,$A,$t2                       @ h+=Maj(a,b,c) from the past
        ldr     $t2,[$t1,#4]
        ldr     $t3,[$t1,#8]
        ldr     $t4,[$t1,#12]
        add     $A,$A,$t0                       @ accumulate
        ldr     $t0,[$t1,#16]
        add     $B,$B,$t2
        ldr     $t2,[$t1,#20]
        add     $C,$C,$t3
        ldr     $t3,[$t1,#24]
        add     $D,$D,$t4
        ldr     $t4,[$t1,#28]
        add     $E,$E,$t0
        str     $A,[$t1],#4
        add     $F,$F,$t2
        str     $B,[$t1],#4
        add     $G,$G,$t3
        str     $C,[$t1],#4
        add     $H,$H,$t4
        str     $D,[$t1],#4
        stmia   $t1,{$E-$H}

        ittte   ne
        movne   $Xfer,sp
        ldrne   $t1,[sp,#0]
        eorne   $t2,$t2,$t2
        ldreq   sp,[sp,#76]                     @ restore original sp
        itt     ne
        eorne   $t3,$B,$C
        bne     .L_00_48

        ldmia   sp!,{r4-r12,pc}
.size   sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d) $_byte  c,d|0xc,a,b
# else
#  define INST(a,b,c,d) $_byte  a,b,c,d
# endif

.type   sha256_block_data_order_armv8,%function
.align  5
sha256_block_data_order_armv8:
.LARMv8:
        vld1.32 {$ABCD,$EFGH},[$ctx]
        sub     $Ktbl,$Ktbl,#256+32
        add     $len,$inp,$len,lsl#6    @ len to point at the end of inp
        b       .Loop_v8

.align  4
.Loop_v8:
        vld1.8          {@MSG[0]-@MSG[1]},[$inp]!
        vld1.8          {@MSG[2]-@MSG[3]},[$inp]!
        vld1.32         {$W0},[$Ktbl]!
        vrev32.8        @MSG[0],@MSG[0]
        vrev32.8        @MSG[1],@MSG[1]
        vrev32.8        @MSG[2],@MSG[2]
        vrev32.8        @MSG[3],@MSG[3]
        vmov            $ABCD_SAVE,$ABCD        @ offload
        vmov            $EFGH_SAVE,$EFGH
        teq             $inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        sha256su0       @MSG[0],@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0
        sha256su1       @MSG[0],@MSG[2],@MSG[3]
___
        ($W0,$W1)=($W1,$W0);    push(@MSG,shift(@MSG));
}
$code.=<<___;
        vld1.32         {$W1},[$Ktbl]!
        vadd.i32        $W0,$W0,@MSG[0]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vld1.32         {$W0},[$Ktbl]!
        vadd.i32        $W1,$W1,@MSG[1]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vld1.32         {$W1},[$Ktbl]
        vadd.i32        $W0,$W0,@MSG[2]
        sub             $Ktbl,$Ktbl,#256-16     @ rewind
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W0
        sha256h2        $EFGH,$abcd,$W0

        vadd.i32        $W1,$W1,@MSG[3]
        vmov            $abcd,$ABCD
        sha256h         $ABCD,$EFGH,$W1
        sha256h2        $EFGH,$abcd,$W1

        vadd.i32        $ABCD,$ABCD,$ABCD_SAVE
        vadd.i32        $EFGH,$EFGH,$EFGH_SAVE
        it              ne
        bne             .Loop_v8

        vst1.32         {$ABCD,$EFGH},[$ctx]

        ret             @ bx lr
.size   sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align  2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/@/ and !/^$/);
        print;
}
close SELF;

{   my  %opcode = (
        "sha256h"       => 0xf3000c40,  "sha256h2"      => 0xf3100c40,
        "sha256su0"     => 0xf3ba03c0,  "sha256su1"     => 0xf3200c40   );

    sub unsha256 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<17)|(($2&8)<<4)
                                         |(($3&7)<<1) |(($3&8)<<2);
            # the word is emitted byte-by-byte since ARMv7 instructions
            # are always encoded little-endian. The correct solution is
            # to use the .inst directive, but older assemblers don't
            # implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }
}

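# Worked example (editorial note): "sha256h q0,q1,q12" has base opcode
# 0xf3000c40; the register fields add (1<<17)|(4<<1)|(8<<2), giving
# 0xf3020c68, which unsha256() emits little-endian as
# INST(0x68,0x0c,0x02,0xf3).
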
foreach (split($/,$code)) {

        s/\`([^\`]*)\`/eval $1/geo;

        s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

        s/\bret\b/bx    lr/go           or
        s/\bbx\s+lr\b/.word\t0xe12fff1e/go;     # make it possible to compile with -march=armv4

        print $_,"\n";
}

close STDOUT; # enforce flush