#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].
#
# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# August 2012.
#
# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, and for NEON-only sequences IPC(*) was found to
# be limited to 1:-( 0.33 and 0.66 were measured for sequences with
# ILPs(*) of 1 and 2 respectively. This in turn means that you can
# even find yourself striving, as I did here, to achieve an IPC
# on par with the one delivered by Cortex A8 [for reference, it's
# 0.5 for ILP of 1, and 1 for higher ILPs].
#
# (*) ILP, instruction-level parallelism, is how many instructions
#     *can* execute at the same time. IPC, instructions per cycle,
#     indicates how many instructions actually execute.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
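# An illustrative C model (a sketch only, not part of this file) of
# what "native byte order for whole 64-bit values" means for the two
# halves of h[i], assuming <stdint.h> types:
#
#	union { uint64_t h; uint32_t w[2]; } x;
#
# On little-endian x.w[0], at offset 0, holds the low dword and x.w[1],
# at offset 4, the high one; on big-endian it is the other way around.
# This is why the generated code below defines LO/HI as 0/4 or 4/0.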

while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

$ctx="r0";	# parameter block
$inp="r1";
$len="r2";

$Tlo="r3";
$Thi="r4";
$Alo="r5";
$Ahi="r6";
$Elo="r7";
$Ehi="r8";
$t0="r9";
$t1="r10";
$t2="r11";
$t3="r12";
############	r13 is stack pointer
$Ktbl="r14";
############	r15 is program counter

$Aoff=8*0;
$Boff=8*1;
$Coff=8*2;
$Doff=8*3;
$Eoff=8*4;
$Foff=8*5;
$Goff=8*6;
$Hoff=8*7;
$Xoff=8*8;
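# Stack frame sketch: sp+0..63 hold the working copy of a..h (eight
# dwords, at the $Aoff..$Hoff offsets above) and the X schedule starts
# at sp+$Xoff.  Every round ends with "sub sp,sp,#8", so [sp,#$Xoff]
# always addresses the most recently stored X dword; the 80*8=640
# bytes accumulated this way are released by a single "add sp,sp,#640"
# after the last round.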

sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
	@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18)  ^ ROTR((x),41))
	@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
	@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
	mov	$t0,$Elo,lsr#14
	str	$Tlo,[sp,#$Xoff+0]
	mov	$t1,$Ehi,lsr#14
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	adds	$Tlo,$Tlo,$t2
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi

	eor	$t0,$t0,$t2
	str	$Elo,[sp,#$Eoff+0]
	eor	$t1,$t1,$t3
	str	$Ehi,[sp,#$Eoff+4]
	and	$t0,$t0,$Elo
	str	$Alo,[sp,#$Aoff+0]
	and	$t1,$t1,$Ehi
	str	$Ahi,[sp,#$Aoff+4]
	eor	$t0,$t0,$t2
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi

	adds	$Tlo,$Tlo,$t0
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adds	$Tlo,$Tlo,$t2
	and	$t0,$t2,#0xff
	adc	$Thi,$Thi,$t3		@ T += K[i]
	adds	$Elo,$Elo,$Tlo
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	teq	$t0,#$magic
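	@ the low byte of K[i].lo doubles as a round counter: within
	@ rounds 0-15 it equals the magic byte 0x94 only for K[15], and
	@ within rounds 16-79 it equals 0x17 only for K[79], so a match
	@ here flags the last iteration by setting bit 0 of $Ktbl below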

	ldr	$t3,[sp,#$Coff+0]	@ c.lo
	orreq	$Ktbl,$Ktbl,#1
	@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
	@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
	@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	mov	$t0,$Alo,lsr#28
	mov	$t1,$Ahi,lsr#28
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adds	$Tlo,$Tlo,$t0
	and	$t0,$Alo,$t2
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)

	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	orr	$Alo,$Alo,$t2
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	and	$Alo,$Alo,$t3
	and	$t3,$Ahi,$t1
	orr	$Ahi,$Ahi,$t1
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	and	$Ahi,$Ahi,$t2
	adds	$Alo,$Alo,$Tlo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	sub	sp,sp,#8
	adc	$Ahi,$Ahi,$Thi		@ h += T
	tst	$Ktbl,#1
	add	$Ktbl,$Ktbl,#8
___
}
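# For orientation, a C model of the round BODY_00_15 emits (an
# illustrative sketch only; ROTR64 and sha512_round are names made up
# here, and the real code keeps a..h as lo/hi 32-bit pairs, splitting
# each 64-bit shift into the shift pairs seen above):
#
#	#include <stdint.h>
#
#	#define ROTR64(x,n) (((x)>>(n)) | ((x)<<(64-(n))))
#
#	static void sha512_round(uint64_t S[8], uint64_t Ki, uint64_t Xi)
#	{
#	    uint64_t a=S[0],b=S[1],c=S[2],d=S[3],e=S[4],f=S[5],g=S[6],h=S[7];
#	    uint64_t T = h + (ROTR64(e,14)^ROTR64(e,18)^ROTR64(e,41))	/* Sigma1(e) */
#	                   + (((f^g)&e)^g)				/* Ch(e,f,g)  */
#	                   + Ki + Xi;
#	    uint64_t Maj = ((a|b)&c)|(a&b);	/* == (a&b)^(a&c)^(b&c) */
#	    uint64_t S0  = ROTR64(a,28)^ROTR64(a,34)^ROTR64(a,39);	/* Sigma0(a) */
#	    S[7]=g; S[6]=f; S[5]=e; S[4]=d+T;
#	    S[3]=c; S[2]=b; S[1]=a; S[0]=T+S0+Maj;
#	}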
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
#endif
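@ e.g. on little-endian WORD64(0x428a2f98,0xd728ae22,...) emits
@ ".word 0xd728ae22,0x428a2f98,...", so that a doubleword load sees
@ the native-order value 0x428a2f98d728ae22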

.text
.code	32
.type	K512,%object
.align	5
K512:
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size	K512,.-K512
#if __ARM_MAX_ARCH__>=7
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-sha512_block_data_order
.skip	32-4
#else
.skip	32
#endif

.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
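	@ the .word at .LOPENSSL_armcap holds the link-time distance
	@ from sha512_block_data_order to OPENSSL_armcap_P; r3 was set
	@ to the function's runtime address above (in ARM mode pc reads
	@ as the current instruction + 8, hence the "sub r3,pc,#8"), so
	@ [r3,r12] reaches the capability word position-independently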
	tst	r12,#1
	bne	.LNEON
#endif
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
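	@ K512 is 80 dwords = 640 bytes and is followed by a 32-byte
	@ pad (the .skip above), so the table starts 672 bytes before
	@ the entry point whose address r3 holds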
	sub	sp,sp,#9*8

	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
.Loop:
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]

.L00_15:
#if __ARM_ARCH__<7
	ldrb	$Tlo,[$inp,#7]
	ldrb	$t0, [$inp,#6]
	ldrb	$t1, [$inp,#5]
	ldrb	$t2, [$inp,#4]
	ldrb	$Thi,[$inp,#3]
	ldrb	$t3, [$inp,#2]
	orr	$Tlo,$Tlo,$t0,lsl#8
	ldrb	$t0, [$inp,#1]
	orr	$Tlo,$Tlo,$t1,lsl#16
	ldrb	$t1, [$inp],#8
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
#else
	ldr	$Tlo,[$inp,#4]
	ldr	$Thi,[$inp],#8
#ifdef __ARMEL__
	rev	$Tlo,$Tlo
	rev	$Thi,$Thi
#endif
#endif
___
	&BODY_00_15(0x94);	# magic 0x94 == low byte of K[15].lo (0xcf692694)
$code.=<<___;
	tst	$Ktbl,#1
	beq	.L00_15
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	bic	$Ktbl,$Ktbl,#1
.L16_79:
	@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
	@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
	@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	mov	$Tlo,$t0,lsr#1
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	mov	$Thi,$t1,lsr#1
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25

	@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
	@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
	@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
	mov	$t0,$t2,lsr#19
	mov	$t1,$t3,lsr#19
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26

	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	adds	$Tlo,$Tlo,$t0
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	adc	$Thi,$Thi,$t1

	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	adds	$Tlo,$Tlo,$t2
	adc	$Thi,$Thi,$t3
	adds	$Tlo,$Tlo,$t0
	adc	$Thi,$Thi,$t1
___
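# As a C sketch (using the illustrative ROTR64 from the comment after
# BODY_00_15), the block above computes the message schedule
#
#	sigma0 = ROTR64(X[i-15],1) ^ ROTR64(X[i-15],8) ^ (X[i-15]>>7);
#	sigma1 = ROTR64(X[i-2],19) ^ ROTR64(X[i-2],61) ^ (X[i-2]>>6);
#	X[i]   = X[i-16] + sigma0 + X[i-7] + sigma1;
#
# over a 16-dword ring kept on the stack; because the frame grows by 8
# each round, [sp,#$Xoff+8*k] addresses X[i-k] without explicit index
# arithmetic.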
	&BODY_00_15(0x17);	# magic 0x17 == low byte of K[79].lo (0x4a475817)
$code.=<<___;
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	beq	.L16_79
	bic	$Ktbl,$Ktbl,#1

	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Aoff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Aoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Boff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Boff+$hi]

	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Coff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Coff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Doff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Doff+$hi]

	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	adds	$Elo,$Elo,$t0
	str	$Elo,[$ctx,#$Eoff+$lo]
	adc	$Ehi,$Ehi,$t1
	str	$Ehi,[$ctx,#$Eoff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Foff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Foff+$hi]

	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	adds	$t0,$Alo,$t0
	str	$t0, [$ctx,#$Goff+$lo]
	adc	$t1,$Ahi,$t1
	str	$t1, [$ctx,#$Goff+$hi]
	adds	$t2,$Tlo,$t2
	str	$t2, [$ctx,#$Hoff+$lo]
	adc	$t3,$Thi,$t3
	str	$t3, [$ctx,#$Hoff+$hi]

	add	sp,sp,#640		@ "pop" the X dwords pushed by 80 rounds
	sub	$Ktbl,$Ktbl,#640	@ rewind K512

	teq	$inp,$len
	bne	.Loop

	add	sp,sp,#8*9		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r12,pc}
#else
	ldmia	sp!,{r4-r12,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
___

{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);

my $Ktbl="r3";
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch

my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
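# @V is rotated by one after each round (see the unshift/pop in the
# loops below), which renames a..h at generation time instead of
# moving 64-bit values between registers at run time.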

sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps

$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
#if $i<16
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
#endif
	vshr.u64	$t1,$e,#@Sigma1[1]
#if $i>0
	 vadd.i64	$a,$Maj			@ h+=Maj from the past
#endif
	vshr.u64	$t2,$e,#@Sigma1[2]
___
$code.=<<___;
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vmov		$Ch,$e
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
#endif
	veor		$t1,$t0
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vadd.i64	$T1,$Ch,$h
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vadd.i64	$T1,$t2
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	veor		$Maj,$a,$b
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	veor		$h,$t0,$t1
	vadd.i64	$T1,$K
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
	vadd.i64	$d,$T1
	vadd.i64	$Maj,$T1
	@ vadd.i64	$h,$Maj
___
}
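# NEON has no 64-bit rotate, so ROTR64(x,n) is synthesized above from a
# shift/insert pair: "vshr.u64 t,x,#n" puts x>>n into t, and
# "vsli.64 t,x,#64-n" shifts x left by 64-n and inserts it, leaving the
# low 64-n bits intact, i.e. t = (x>>n)|(x<<(64-n)).  Note also that
# the final h+=Maj of a round is deferred to the following round (the
# "h+=Maj from the past" adds) to shorten each round's dependency chain.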

sub NEON_16_79() {
my $i=shift;

if ($i&1)	{ &NEON_00_15($i,@_); return; }

# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
$i /= 2;
$code.=<<___;
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	 vadd.i64	@_[0],d30			@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	veor		$s1,$t0
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1				@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	veor		$s1,$t0
	vshr.u64	$d0,$e,#@Sigma1[0]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]		@ from NEON_00_15
	veor		$s1,$t1				@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]		@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
___
	&NEON_00_15(2*$i,@_);
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.align	4
.LNEON:
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
.Loop_neon:
___
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mov		$cnt,#4
.L16_79_neon:
	subs		$cnt,#1
___
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	bne		.L16_79_neon

	 vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia		$ctx,{d24-d31}	@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vadd.i64	q9,q13
	vadd.i64	q10,q14
	vadd.i64	q11,q15
	vstmia		$ctx,{$A-$H}	@ save context
	teq		$inp,$len
	sub		$Ktbl,#640	@ rewind K512
	bne		.Loop_neon

	vldmia	sp!,{d8-d15}		@ epilogue
	ret				@ bx lr
#endif
___
}
$code.=<<___;
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
#endif
___

$code =~ s/\`([^\`]*)\`/eval $1/gem;		# constant-fold `...` expressions
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx	lr/gm;
print $code;
close STDOUT; # enforce flush