3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA512 block procedure for ARMv4. September 2007.
12 # This code is ~4.5 (four and a half) times faster than code generated
13 # by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue
14 # Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 6% improvement on
19 # Cortex A8 core and ~40 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 7%
24 # improvement on Cortex A8 core and ~38 cycles per byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process
29 # one byte in 23.3 cycles or ~60% faster than integer-only code.
31 # Byte order [in]dependence. =========================================
33 # Originally caller was expected to maintain specific *dword* order in
34 # h[0-7], namely with most significant dword at *lower* address, which
35 # was reflected in the two parameters below as 0 and 4. Now caller is
36 # expected to maintain native byte order for whole 64-bit values.
39 # ====================================================================
41 while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}	# scan argv for something that looks like an output file name
42 open STDOUT,">",$output or die "can't open $output: $!";	# 3-arg open, checked: all generated code goes to this file
44 $ctx="r0"; # parameter block
58 ############ r13 is stack pointer
60 ############ r15 is program counter
75 @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
76 @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
77 @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
79 str $Tlo,[sp,#$Xoff+0]
81 str $Thi,[sp,#$Xoff+4]
82 eor $t0,$t0,$Ehi,lsl#18	@ lo: hi<<18 (ROTR 14 spill)
83 ldr $t2,[sp,#$Hoff+0] @ h.lo
84 eor $t1,$t1,$Elo,lsl#18	@ hi: lo<<18 (ROTR 14 spill)
85 ldr $t3,[sp,#$Hoff+4] @ h.hi
86 eor $t0,$t0,$Elo,lsr#18	@ lo: lo>>18 (ROTR 18)
87 eor $t1,$t1,$Ehi,lsr#18	@ hi: hi>>18 (ROTR 18)
88 eor $t0,$t0,$Ehi,lsl#14	@ lo: hi<<14 (ROTR 18 spill)
89 eor $t1,$t1,$Elo,lsl#14	@ hi: lo<<14 (ROTR 18 spill)
90 eor $t0,$t0,$Ehi,lsr#9	@ lo: hi>>9  (ROTR 41)
91 eor $t1,$t1,$Elo,lsr#9	@ hi: lo>>9  (ROTR 41)
92 eor $t0,$t0,$Elo,lsl#23	@ lo: lo<<23 (ROTR 41 spill)
93 eor $t1,$t1,$Ehi,lsl#23 @ Sigma1(e)
95 ldr $t0,[sp,#$Foff+0] @ f.lo
96 adc $Thi,$Thi,$t1 @ T += Sigma1(e)
97 ldr $t1,[sp,#$Foff+4] @ f.hi
99 ldr $t2,[sp,#$Goff+0] @ g.lo
100 adc $Thi,$Thi,$t3 @ T += h
101 ldr $t3,[sp,#$Goff+4] @ g.hi
104 str $Elo,[sp,#$Eoff+0]
106 str $Ehi,[sp,#$Eoff+4]
108 str $Alo,[sp,#$Aoff+0]
110 str $Ahi,[sp,#$Aoff+4]
112 ldr $t2,[$Ktbl,#$lo] @ K[i].lo
113 eor $t1,$t1,$t3 @ Ch(e,f,g)
114 ldr $t3,[$Ktbl,#$hi] @ K[i].hi
117 ldr $Elo,[sp,#$Doff+0] @ d.lo
118 adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
119 ldr $Ehi,[sp,#$Doff+4] @ d.hi
122 adc $Thi,$Thi,$t3 @ T += K[i]
124 ldr $t2,[sp,#$Boff+0] @ b.lo
125 adc $Ehi,$Ehi,$Thi @ d += T
128 ldr $t3,[sp,#$Coff+0] @ c.lo
130 @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
131 @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
132 @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
135 eor $t0,$t0,$Ahi,lsl#4	@ lo: hi<<4  (ROTR 28 spill)
136 eor $t1,$t1,$Alo,lsl#4	@ hi: lo<<4  (ROTR 28 spill)
137 eor $t0,$t0,$Ahi,lsr#2	@ lo: hi>>2  (ROTR 34)
138 eor $t1,$t1,$Alo,lsr#2	@ hi: lo>>2  (ROTR 34)
139 eor $t0,$t0,$Alo,lsl#30	@ lo: lo<<30 (ROTR 34 spill)
140 eor $t1,$t1,$Ahi,lsl#30	@ hi: hi<<30 (ROTR 34 spill)
141 eor $t0,$t0,$Ahi,lsr#7	@ lo: hi>>7  (ROTR 39)
142 eor $t1,$t1,$Alo,lsr#7	@ hi: lo>>7  (ROTR 39)
143 eor $t0,$t0,$Alo,lsl#25	@ lo: lo<<25 (ROTR 39 spill)
144 eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
147 adc $Thi,$Thi,$t1 @ T += Sigma0(a)
149 ldr $t1,[sp,#$Boff+4] @ b.hi
151 ldr $t2,[sp,#$Coff+4] @ c.hi
155 orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
158 orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
160 adc $Ahi,$Ahi,$Thi @ h += T
166 #include "arm_arch.h"
170 # define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
174 # define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
182 WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)	@ K512[0..79]: SHA-512 round constants (FIPS 180-4), two 64-bit values per WORD64
183 WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
184 WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
185 WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
186 WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
187 WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
188 WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
189 WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
190 WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
191 WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
192 WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
193 WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
194 WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
195 WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
196 WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
197 WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
198 WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
199 WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
200 WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
201 WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
202 WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
203 WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
204 WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
205 WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
206 WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
207 WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
208 WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
209 WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
210 WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
211 WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
212 WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
213 WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
214 WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
215 WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
216 WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
217 WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
218 WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
219 WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
220 WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
221 WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)	@ last pair: K512[78..79]
224 .word OPENSSL_armcap_P-sha512_block_data_order
227 .global sha512_block_data_order
228 .type sha512_block_data_order,%function
229 sha512_block_data_order:
230 sub r3,pc,#8 @ sha512_block_data_order
231 add $len,$inp,$len,lsl#7 @ len to point at the end of inp
233 ldr r12,.LOPENSSL_armcap
234 ldr r12,[r3,r12] @ OPENSSL_armcap_P
238 stmdb sp!,{r4-r12,lr}
239 sub $Ktbl,r3,#672 @ K512
242 ldr $Elo,[$ctx,#$Eoff+$lo]
243 ldr $Ehi,[$ctx,#$Eoff+$hi]
244 ldr $t0, [$ctx,#$Goff+$lo]
245 ldr $t1, [$ctx,#$Goff+$hi]
246 ldr $t2, [$ctx,#$Hoff+$lo]
247 ldr $t3, [$ctx,#$Hoff+$hi]
249 str $t0, [sp,#$Goff+0]
250 str $t1, [sp,#$Goff+4]
251 str $t2, [sp,#$Hoff+0]
252 str $t3, [sp,#$Hoff+4]
253 ldr $Alo,[$ctx,#$Aoff+$lo]
254 ldr $Ahi,[$ctx,#$Aoff+$hi]
255 ldr $Tlo,[$ctx,#$Boff+$lo]
256 ldr $Thi,[$ctx,#$Boff+$hi]
257 ldr $t0, [$ctx,#$Coff+$lo]
258 ldr $t1, [$ctx,#$Coff+$hi]
259 ldr $t2, [$ctx,#$Doff+$lo]
260 ldr $t3, [$ctx,#$Doff+$hi]
261 str $Tlo,[sp,#$Boff+0]
262 str $Thi,[sp,#$Boff+4]
263 str $t0, [sp,#$Coff+0]
264 str $t1, [sp,#$Coff+4]
265 str $t2, [sp,#$Doff+0]
266 str $t3, [sp,#$Doff+4]
267 ldr $Tlo,[$ctx,#$Foff+$lo]
268 ldr $Thi,[$ctx,#$Foff+$hi]
269 str $Tlo,[sp,#$Foff+0]
270 str $Thi,[sp,#$Foff+4]
280 orr $Tlo,$Tlo,$t0,lsl#8
282 orr $Tlo,$Tlo,$t1,lsl#16
284 orr $Tlo,$Tlo,$t2,lsl#24
285 orr $Thi,$Thi,$t3,lsl#8
286 orr $Thi,$Thi,$t0,lsl#16
287 orr $Thi,$Thi,$t1,lsl#24
301 ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
302 ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
305 @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
306 @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
307 @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
309 ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
311 ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
312 eor $Tlo,$Tlo,$t1,lsl#31	@ lo: hi<<31 (ROTR 1 spill)
313 eor $Thi,$Thi,$t0,lsl#31	@ hi: lo<<31 (ROTR 1 spill)
314 eor $Tlo,$Tlo,$t0,lsr#8	@ lo: lo>>8  (ROTR 8)
315 eor $Thi,$Thi,$t1,lsr#8	@ hi: hi>>8  (ROTR 8)
316 eor $Tlo,$Tlo,$t1,lsl#24	@ lo: hi<<24 (ROTR 8 spill)
317 eor $Thi,$Thi,$t0,lsl#24	@ hi: lo<<24 (ROTR 8 spill)
318 eor $Tlo,$Tlo,$t0,lsr#7	@ lo: lo>>7  (SHR 7)
319 eor $Thi,$Thi,$t1,lsr#7	@ hi: hi>>7  (SHR 7)
320 eor $Tlo,$Tlo,$t1,lsl#25	@ lo: hi<<25 (SHR 7 spill)
322 @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))
323 @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26
324 @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6
327 eor $t0,$t0,$t3,lsl#13	@ lo: hi<<13 (ROTR 19 spill)
328 eor $t1,$t1,$t2,lsl#13	@ hi: lo<<13 (ROTR 19 spill)
329 eor $t0,$t0,$t3,lsr#29	@ lo: hi>>29 (ROTR 61)
330 eor $t1,$t1,$t2,lsr#29	@ hi: lo>>29 (ROTR 61)
331 eor $t0,$t0,$t2,lsl#3	@ lo: lo<<3  (ROTR 61 spill)
332 eor $t1,$t1,$t3,lsl#3	@ hi: hi<<3  (ROTR 61 spill)
333 eor $t0,$t0,$t2,lsr#6	@ lo: lo>>6  (SHR 6)
334 eor $t1,$t1,$t3,lsr#6	@ hi: hi>>6  (SHR 6)
335 ldr $t2,[sp,#`$Xoff+8*(16-9)`+0] @ lo word of X[] stack slot 16-9 (interleaved load)
336 eor $t0,$t0,$t3,lsl#26	@ lo: hi<<26 (SHR 6 spill)
338 ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
340 ldr $t0,[sp,#`$Xoff+8*16`+0]
343 ldr $t1,[sp,#`$Xoff+8*16`+4]
351 ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
352 ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
356 ldr $Tlo,[sp,#$Boff+0]
357 ldr $Thi,[sp,#$Boff+4]
358 ldr $t0, [$ctx,#$Aoff+$lo]
359 ldr $t1, [$ctx,#$Aoff+$hi]
360 ldr $t2, [$ctx,#$Boff+$lo]
361 ldr $t3, [$ctx,#$Boff+$hi]
363 str $t0, [$ctx,#$Aoff+$lo]
365 str $t1, [$ctx,#$Aoff+$hi]
367 str $t2, [$ctx,#$Boff+$lo]
369 str $t3, [$ctx,#$Boff+$hi]
371 ldr $Alo,[sp,#$Coff+0]
372 ldr $Ahi,[sp,#$Coff+4]
373 ldr $Tlo,[sp,#$Doff+0]
374 ldr $Thi,[sp,#$Doff+4]
375 ldr $t0, [$ctx,#$Coff+$lo]
376 ldr $t1, [$ctx,#$Coff+$hi]
377 ldr $t2, [$ctx,#$Doff+$lo]
378 ldr $t3, [$ctx,#$Doff+$hi]
380 str $t0, [$ctx,#$Coff+$lo]
382 str $t1, [$ctx,#$Coff+$hi]
384 str $t2, [$ctx,#$Doff+$lo]
386 str $t3, [$ctx,#$Doff+$hi]
388 ldr $Tlo,[sp,#$Foff+0]
389 ldr $Thi,[sp,#$Foff+4]
390 ldr $t0, [$ctx,#$Eoff+$lo]
391 ldr $t1, [$ctx,#$Eoff+$hi]
392 ldr $t2, [$ctx,#$Foff+$lo]
393 ldr $t3, [$ctx,#$Foff+$hi]
395 str $Elo,[$ctx,#$Eoff+$lo]
397 str $Ehi,[$ctx,#$Eoff+$hi]
399 str $t2, [$ctx,#$Foff+$lo]
401 str $t3, [$ctx,#$Foff+$hi]
403 ldr $Alo,[sp,#$Goff+0]
404 ldr $Ahi,[sp,#$Goff+4]
405 ldr $Tlo,[sp,#$Hoff+0]
406 ldr $Thi,[sp,#$Hoff+4]
407 ldr $t0, [$ctx,#$Goff+$lo]
408 ldr $t1, [$ctx,#$Goff+$hi]
409 ldr $t2, [$ctx,#$Hoff+$lo]
410 ldr $t3, [$ctx,#$Hoff+$hi]
412 str $t0, [$ctx,#$Goff+$lo]
414 str $t1, [$ctx,#$Goff+$hi]
416 str $t2, [$ctx,#$Hoff+$lo]
418 str $t3, [$ctx,#$Hoff+$hi]
426 add sp,sp,#8*9 @ destroy frame
428 ldmia sp!,{r4-r12,pc}
430 ldmia sp!,{r4-r12,lr}
432 moveq pc,lr @ be binary compatible with V4, yet
433 bx lr @ interoperable with Thumb ISA:-)
438 my @Sigma0=(28,34,39);
439 my @Sigma1=(14,18,41);
440 my @sigma0=(1, 8, 7);
441 my @sigma1=(19,61,6);
444 my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
446 my @X=map("d$_",(0..15));
447 my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
451 my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
452 my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
454 $code.=<<___ if ($i<16 || $i&1);
455 vshr.u64 $t0,$e,#@Sigma1[0] @ $i
457 vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
459 vshr.u64 $t1,$e,#@Sigma1[1]
460 vshr.u64 $t2,$e,#@Sigma1[2]
463 vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
464 vsli.64 $t0,$e,#`64-@Sigma1[0]`
465 vsli.64 $t1,$e,#`64-@Sigma1[1]`
467 vsli.64 $t2,$e,#`64-@Sigma1[2]`
468 #if $i<16 && defined(__ARMEL__)
469 vrev64.8 @X[$i],@X[$i]
471 vbsl $Ch,$f,$g @ Ch(e,f,g)
473 vshr.u64 $t0,$a,#@Sigma0[0]
474 veor $t2,$t1 @ Sigma1(e)
475 vshr.u64 $t1,$a,#@Sigma0[1]
477 vshr.u64 $t2,$a,#@Sigma0[2]
479 vsli.64 $t0,$a,#`64-@Sigma0[0]`
480 vadd.i64 $T1,@X[$i%16]
481 vsli.64 $t1,$a,#`64-@Sigma0[1]`
483 vsli.64 $t2,$a,#`64-@Sigma0[2]`
486 veor $h,$t2 @ Sigma0(a)
487 vbsl $Maj,$c,$b @ Maj(a,b,c)
497 if ($i&1) { &NEON_00_15($i,@_); return; }
499 # 2x-vectorized, therefore runs every 2nd round
500 my @X=map("q$_",(0..7)); # view @X as 128-bit vector
501 my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
502 my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
503 my $e=@_[4]; # $e from NEON_00_15
506 vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
507 vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
508 vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
509 vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
510 vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
511 vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
513 vshr.u64 $t0,$s0,#@sigma0[0]
514 veor $s1,$t1 @ sigma1(X[i+14])
515 vshr.u64 $t1,$s0,#@sigma0[1]
516 vadd.i64 @X[$i%8],$s1
517 vshr.u64 $s1,$s0,#@sigma0[2]
518 vsli.64 $t0,$s0,#`64-@sigma0[0]`
519 vsli.64 $t1,$s0,#`64-@sigma0[1]`
520 vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
522 vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
523 vadd.i64 @X[$i%8],$s0
524 vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
525 veor $s1,$t1 @ sigma0(X[i+1])
526 vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
527 vadd.i64 @X[$i%8],$s1
529 &NEON_00_15(2*$i,@_);
538 dmb @ errata #451034 on early Cortex A8
539 vstmdb sp!,{d8-d15} @ ABI specification says so
540 sub $Ktbl,r3,#672 @ K512
541 vldmia $ctx,{$A-$H} @ load context
544 for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
550 for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
554 vldmia $ctx,{d24-d31} @ load context to temp
555 vadd.i64 q8,q12 @ vectorized accumulate
559 vstmia $ctx,{$A-$H} @ save context
561 sub $Ktbl,#640 @ rewind K512
564 vldmia sp!,{d8-d15} @ epilogue
570 .size sha512_block_data_order,.-sha512_block_data_order
571 .asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
573 .comm OPENSSL_armcap_P,4,4
576 $code =~ s/\`([^\`]*)\`/eval $1/gem;
577 $code =~ s/\bbx\s+lr\b/.word\t0xe12fff1e/gm; # make it possible to compile with -march=armv4
579 close STDOUT or die "error closing STDOUT: $!"; # enforce flush; buffered write errors surface at close