# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# SHA512 block procedure for ARMv4. September 2007.

# This code is ~4.5 (four and a half) times faster than code generated
# by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
# Xscale PXA250 core].

# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.

# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~38 cycles per byte.

# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 23.3 cycles or ~60% faster than integer-only code.

# Improve NEON performance by 12% on Snapdragon S4. In absolute
# terms it's 22.6 cycles per byte, which is a disappointing result.
# Technical writers asserted that the 3-way S4 pipeline can sustain
# multiple NEON instructions per cycle, but dual NEON issue could
# not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html
# for further details. On a side note, Cortex-A15 processes one byte
# in 16 cycles.

# Byte order [in]dependence. =========================================
#
# Originally the caller was expected to maintain a specific *dword*
# order in h[0-7], namely with the most significant dword at the
# *lower* address, which was reflected in the two parameters below as
# 0 and 4. Now the caller is expected to maintain native byte order
# for whole 64-bit values.
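#
# For illustration only, a minimal C sketch of the layout now assumed
# (hypothetical struct; OpenSSL's real SHA512_CTX carries more fields
# than shown):
#
#	typedef struct {
#		uint64_t h[8];	/* a..h, each in native byte order */
#	} CTX_SKETCH;
#
# This module then reads each h[i] as two 32-bit halves at byte
# offsets 8*i+$lo and 8*i+$hi, so the caller no longer swaps dwords.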
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output" or die "can't open $output: $!";

$ctx="r0";	# parameter block
############	r13 is stack pointer
############	r15 is program counter
@ Sigma1(x)	(ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO		lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI		hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
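@ (how the LO/HI rows follow: a 64-bit value kept as two 32-bit
@ halves rotates right by n<32 as, in a hedged C sketch,
@	lo' = (lo >> n) | (hi << (32-n));
@	hi' = (hi >> n) | (lo << (32-n));
@ and for n>=32 one rotates by n-32 with the halves swapped, which
@ is where hi>>9^lo<<23 for ROTR((x),41) comes from: 41-32=9, 32-9=23)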
	str	$Tlo,[sp,#$Xoff+0]
	str	$Thi,[sp,#$Xoff+4]
	eor	$t0,$t0,$Ehi,lsl#18
	ldr	$t2,[sp,#$Hoff+0]	@ h.lo
	eor	$t1,$t1,$Elo,lsl#18
	ldr	$t3,[sp,#$Hoff+4]	@ h.hi
	eor	$t0,$t0,$Elo,lsr#18
	eor	$t1,$t1,$Ehi,lsr#18
	eor	$t0,$t0,$Ehi,lsl#14
	eor	$t1,$t1,$Elo,lsl#14
	eor	$t0,$t0,$Ehi,lsr#9
	eor	$t1,$t1,$Elo,lsr#9
	eor	$t0,$t0,$Elo,lsl#23
	eor	$t1,$t1,$Ehi,lsl#23	@ Sigma1(e)
	ldr	$t0,[sp,#$Foff+0]	@ f.lo
	adc	$Thi,$Thi,$t1		@ T += Sigma1(e)
	ldr	$t1,[sp,#$Foff+4]	@ f.hi
	ldr	$t2,[sp,#$Goff+0]	@ g.lo
	adc	$Thi,$Thi,$t3		@ T += h
	ldr	$t3,[sp,#$Goff+4]	@ g.hi
	str	$Elo,[sp,#$Eoff+0]
	str	$Ehi,[sp,#$Eoff+4]
	str	$Alo,[sp,#$Aoff+0]
	str	$Ahi,[sp,#$Aoff+4]
	ldr	$t2,[$Ktbl,#$lo]	@ K[i].lo
	eor	$t1,$t1,$t3		@ Ch(e,f,g)
	ldr	$t3,[$Ktbl,#$hi]	@ K[i].hi
	ldr	$Elo,[sp,#$Doff+0]	@ d.lo
	adc	$Thi,$Thi,$t1		@ T += Ch(e,f,g)
	ldr	$Ehi,[sp,#$Doff+4]	@ d.hi
	adc	$Thi,$Thi,$t3		@ T += K[i]
	ldr	$t2,[sp,#$Boff+0]	@ b.lo
	adc	$Ehi,$Ehi,$Thi		@ d += T
	ldr	$t3,[sp,#$Coff+0]	@ c.lo
@ Sigma0(x)	(ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO		lo>>28^hi<<4  ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI		hi>>28^lo<<4  ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
	eor	$t0,$t0,$Ahi,lsl#4
	eor	$t1,$t1,$Alo,lsl#4
	eor	$t0,$t0,$Ahi,lsr#2
	eor	$t1,$t1,$Alo,lsr#2
	eor	$t0,$t0,$Alo,lsl#30
	eor	$t1,$t1,$Ahi,lsl#30
	eor	$t0,$t0,$Ahi,lsr#7
	eor	$t1,$t1,$Alo,lsr#7
	eor	$t0,$t0,$Alo,lsl#25
	eor	$t1,$t1,$Ahi,lsl#25	@ Sigma0(a)
	adc	$Thi,$Thi,$t1		@ T += Sigma0(a)
	ldr	$t1,[sp,#$Boff+4]	@ b.hi
	ldr	$t2,[sp,#$Coff+4]	@ c.hi
	orr	$Alo,$Alo,$t0		@ Maj(a,b,c).lo
	orr	$Ahi,$Ahi,$t3		@ Maj(a,b,c).hi
	adc	$Ahi,$Ahi,$Thi		@ h += T
#include "arm_arch.h"
# define WORD64(hi0,lo0,hi1,lo1)	.word	lo0,hi0, lo1,hi1
# define WORD64(hi0,lo0,hi1,lo1)	.word	hi0,lo0, hi1,lo1
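@ (worked example, for illustration only: K[0]=0x428a2f98d728ae22 is
@ emitted by the little-endian variant as ".word 0xd728ae22,0x428a2f98",
@ low word first, so 32-bit loads at offsets #0 and #4 return K[i].lo
@ and K[i].hi respectively; the big-endian variant stores the high
@ word first to the same effect)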
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
#if __ARM_MAX_ARCH__>=7
.word	OPENSSL_armcap_P-sha512_block_data_order
.global	sha512_block_data_order
.type	sha512_block_data_order,%function
sha512_block_data_order:
	sub	r3,pc,#8		@ sha512_block_data_order
	add	$len,$inp,$len,lsl#7	@ len to point at the end of inp (len is in 128-byte blocks)
#if __ARM_MAX_ARCH__>=7
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	stmdb	sp!,{r4-r12,lr}
	sub	$Ktbl,r3,#672		@ K512
	ldr	$Elo,[$ctx,#$Eoff+$lo]
	ldr	$Ehi,[$ctx,#$Eoff+$hi]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	str	$t0, [sp,#$Goff+0]
	str	$t1, [sp,#$Goff+4]
	str	$t2, [sp,#$Hoff+0]
	str	$t3, [sp,#$Hoff+4]
	ldr	$Alo,[$ctx,#$Aoff+$lo]
	ldr	$Ahi,[$ctx,#$Aoff+$hi]
	ldr	$Tlo,[$ctx,#$Boff+$lo]
	ldr	$Thi,[$ctx,#$Boff+$hi]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$Tlo,[sp,#$Boff+0]
	str	$Thi,[sp,#$Boff+4]
	str	$t0, [sp,#$Coff+0]
	str	$t1, [sp,#$Coff+4]
	str	$t2, [sp,#$Doff+0]
	str	$t3, [sp,#$Doff+4]
	ldr	$Tlo,[$ctx,#$Foff+$lo]
	ldr	$Thi,[$ctx,#$Foff+$hi]
	str	$Tlo,[sp,#$Foff+0]
	str	$Thi,[sp,#$Foff+4]
	orr	$Tlo,$Tlo,$t0,lsl#8
	orr	$Tlo,$Tlo,$t1,lsl#16
	orr	$Tlo,$Tlo,$t2,lsl#24
	orr	$Thi,$Thi,$t3,lsl#8
	orr	$Thi,$Thi,$t0,lsl#16
	orr	$Thi,$Thi,$t1,lsl#24
	ldr	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldr	$t1,[sp,#`$Xoff+8*(16-1)`+4]
@ sigma0(x)	(ROTR((x),1)  ^ ROTR((x),8)  ^ ((x)>>7))
@ LO		lo>>1^hi<<31  ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI		hi>>1^lo<<31  ^ hi>>8^lo<<24 ^ hi>>7
	ldr	$t2,[sp,#`$Xoff+8*(16-14)`+0]
	ldr	$t3,[sp,#`$Xoff+8*(16-14)`+4]
	eor	$Tlo,$Tlo,$t1,lsl#31
	eor	$Thi,$Thi,$t0,lsl#31
	eor	$Tlo,$Tlo,$t0,lsr#8
	eor	$Thi,$Thi,$t1,lsr#8
	eor	$Tlo,$Tlo,$t1,lsl#24
	eor	$Thi,$Thi,$t0,lsl#24
	eor	$Tlo,$Tlo,$t0,lsr#7
	eor	$Thi,$Thi,$t1,lsr#7
	eor	$Tlo,$Tlo,$t1,lsl#25
@ sigma1(x)	(ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
@ LO		lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
@ HI		hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
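@ (note: the last term of sigma0/sigma1 is a plain shift rather than
@ a rotation, so only the LO row gains a complementary lsl term; as
@ a hedged C sketch for ((x)>>6):
@	lo' = (lo >> 6) | (hi << 26);
@	hi' =  hi >> 6;
@ which is why the HI rows above end one operation short)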
	eor	$t0,$t0,$t3,lsl#13
	eor	$t1,$t1,$t2,lsl#13
	eor	$t0,$t0,$t3,lsr#29
	eor	$t1,$t1,$t2,lsr#29
	eor	$t0,$t0,$t2,lsl#3
	eor	$t1,$t1,$t3,lsl#3
	eor	$t0,$t0,$t2,lsr#6
	eor	$t1,$t1,$t3,lsr#6
	ldr	$t2,[sp,#`$Xoff+8*(16-9)`+0]
	eor	$t0,$t0,$t3,lsl#26
	ldr	$t3,[sp,#`$Xoff+8*(16-9)`+4]
	ldr	$t0,[sp,#`$Xoff+8*16`+0]
	ldr	$t1,[sp,#`$Xoff+8*16`+4]
	ldreq	$t0,[sp,#`$Xoff+8*(16-1)`+0]
	ldreq	$t1,[sp,#`$Xoff+8*(16-1)`+4]
	ldr	$Tlo,[sp,#$Boff+0]
	ldr	$Thi,[sp,#$Boff+4]
	ldr	$t0, [$ctx,#$Aoff+$lo]
	ldr	$t1, [$ctx,#$Aoff+$hi]
	ldr	$t2, [$ctx,#$Boff+$lo]
	ldr	$t3, [$ctx,#$Boff+$hi]
	str	$t0, [$ctx,#$Aoff+$lo]
	str	$t1, [$ctx,#$Aoff+$hi]
	str	$t2, [$ctx,#$Boff+$lo]
	str	$t3, [$ctx,#$Boff+$hi]
	ldr	$Alo,[sp,#$Coff+0]
	ldr	$Ahi,[sp,#$Coff+4]
	ldr	$Tlo,[sp,#$Doff+0]
	ldr	$Thi,[sp,#$Doff+4]
	ldr	$t0, [$ctx,#$Coff+$lo]
	ldr	$t1, [$ctx,#$Coff+$hi]
	ldr	$t2, [$ctx,#$Doff+$lo]
	ldr	$t3, [$ctx,#$Doff+$hi]
	str	$t0, [$ctx,#$Coff+$lo]
	str	$t1, [$ctx,#$Coff+$hi]
	str	$t2, [$ctx,#$Doff+$lo]
	str	$t3, [$ctx,#$Doff+$hi]
	ldr	$Tlo,[sp,#$Foff+0]
	ldr	$Thi,[sp,#$Foff+4]
	ldr	$t0, [$ctx,#$Eoff+$lo]
	ldr	$t1, [$ctx,#$Eoff+$hi]
	ldr	$t2, [$ctx,#$Foff+$lo]
	ldr	$t3, [$ctx,#$Foff+$hi]
	str	$Elo,[$ctx,#$Eoff+$lo]
	str	$Ehi,[$ctx,#$Eoff+$hi]
	str	$t2, [$ctx,#$Foff+$lo]
	str	$t3, [$ctx,#$Foff+$hi]
	ldr	$Alo,[sp,#$Goff+0]
	ldr	$Ahi,[sp,#$Goff+4]
	ldr	$Tlo,[sp,#$Hoff+0]
	ldr	$Thi,[sp,#$Hoff+4]
	ldr	$t0, [$ctx,#$Goff+$lo]
	ldr	$t1, [$ctx,#$Goff+$hi]
	ldr	$t2, [$ctx,#$Hoff+$lo]
	ldr	$t3, [$ctx,#$Hoff+$hi]
	str	$t0, [$ctx,#$Goff+$lo]
	str	$t1, [$ctx,#$Goff+$hi]
	str	$t2, [$ctx,#$Hoff+$lo]
	str	$t3, [$ctx,#$Hoff+$hi]
	add	sp,sp,#8*9		@ destroy frame
	ldmia	sp!,{r4-r12,pc}
	ldmia	sp!,{r4-r12,lr}
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $cnt="r12";	# volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31));	# temps
$code.=<<___ if ($i<16 || $i&1);
	vshr.u64	$t0,$e,#@Sigma1[0]	@ $i
	vld1.64		{@X[$i%16]},[$inp]!	@ handles unaligned
	vshr.u64	$t1,$e,#@Sigma1[1]
	vadd.i64	$a,$Maj			@ h+=Maj from the past
	vshr.u64	$t2,$e,#@Sigma1[2]
	vld1.64		{$K},[$Ktbl,:64]!	@ K[i++]
	vsli.64		$t0,$e,#`64-@Sigma1[0]`
	vsli.64		$t1,$e,#`64-@Sigma1[1]`
	vsli.64		$t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
	vrev64.8	@X[$i],@X[$i]
	vbsl		$Ch,$f,$g		@ Ch(e,f,g)
	vshr.u64	$t0,$a,#@Sigma0[0]
	veor		$t2,$t1			@ Sigma1(e)
	vshr.u64	$t1,$a,#@Sigma0[1]
	vsli.64		$t0,$a,#`64-@Sigma0[0]`
	vshr.u64	$t2,$a,#@Sigma0[2]
	vadd.i64	$K,@X[$i%16]
	vsli.64		$t1,$a,#`64-@Sigma0[1]`
	vsli.64		$t2,$a,#`64-@Sigma0[2]`
	vbsl		$Maj,$c,$b		@ Maj(a,b,c)
	veor		$h,$t2			@ Sigma0(a)
if ($i&1)	{ &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7));			# view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15));	# temps
my ($d0,$d1,$d2) = map("d$_",(24..26));		# temps from NEON_00_15
my $e=@_[4];					# $e from NEON_00_15
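# A hedged reading of the interleave (names as in this file): with @X
# re-viewed as q-registers, each NEON_16_79 call advances the message
# schedule by two 64-bit words at once (sigma0/sigma1 computed on full
# q-registers), then feeds both rounds through NEON_00_15(2*$i,@_);
# the Sigma1 shifts of the upcoming round are pre-started into d24-d26
# ($d0-$d2) below so they overlap with the schedule arithmetic.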
	vshr.u64	$t0,@X[($i+7)%8],#@sigma1[0]
	vshr.u64	$t1,@X[($i+7)%8],#@sigma1[1]
	vadd.i64	@_[0],d30		@ h+=Maj from the past
	vshr.u64	$s1,@X[($i+7)%8],#@sigma1[2]
	vsli.64		$t0,@X[($i+7)%8],#`64-@sigma1[0]`
	vext.8		$s0,@X[$i%8],@X[($i+1)%8],#8	@ X[i+1]
	vsli.64		$t1,@X[($i+7)%8],#`64-@sigma1[1]`
	vshr.u64	$t0,$s0,#@sigma0[0]
	veor		$s1,$t1			@ sigma1(X[i+14])
	vshr.u64	$t1,$s0,#@sigma0[1]
	vadd.i64	@X[$i%8],$s1
	vshr.u64	$s1,$s0,#@sigma0[2]
	vsli.64		$t0,$s0,#`64-@sigma0[0]`
	vsli.64		$t1,$s0,#`64-@sigma0[1]`
	vext.8		$s0,@X[($i+4)%8],@X[($i+5)%8],#8	@ X[i+9]
	vshr.u64	$d0,$e,#@Sigma1[0]	@ from NEON_00_15
	vadd.i64	@X[$i%8],$s0
	vshr.u64	$d1,$e,#@Sigma1[1]	@ from NEON_00_15
	veor		$s1,$t1			@ sigma0(X[i+1])
	vshr.u64	$d2,$e,#@Sigma1[2]	@ from NEON_00_15
	vadd.i64	@X[$i%8],$s1
&NEON_00_15(2*$i,@_);
#if __ARM_MAX_ARCH__>=7
	dmb				@ errata #451034 on early Cortex A8
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	sub	$Ktbl,r3,#672		@ K512
	vldmia	$ctx,{$A-$H}		@ load context
for($i=0;$i<16;$i++)	{ &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
for(;$i<32;$i++)	{ &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
	vadd.i64	$A,d30		@ h+=Maj from the past
	vldmia	$ctx,{d24-d31}		@ load context to temp
	vadd.i64	q8,q12		@ vectorized accumulate
	vstmia	$ctx,{$A-$H}		@ save context
	sub	$Ktbl,#640	@ rewind K512
	vldmia	sp!,{d8-d15}		@ epilogue
.size	sha512_block_data_order,.-sha512_block_data_order
.asciz	"SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7
.comm	OPENSSL_armcap_P,4,4
$code =~ s/\`([^\`]*)\`/eval $1/gem;
$code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm;	# make it possible to compile with -march=armv4
$code =~ s/\bret\b/bx lr/gm;
close STDOUT;	# enforce flush