3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 block procedure for ARMv4. May 2007.
12 # Performance is ~2x better than gcc 3.4 generated code and in "abso-
13 # lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
14 # byte [on single-issue Xscale PXA250 core].
18 # Rescheduling for dual-issue pipeline resulted in 22% improvement on
19 # Cortex A8 core and ~20 cycles per processed byte.
23 # Profiler-assisted and platform-specific optimization resulted in 16%
24 # improvement on Cortex A8 core and ~15.4 cycles per processed byte.
28 # Add NEON implementation. On Cortex A8 it was measured to process one
29 # byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
30 # S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
31 # code (meaning that the latter performs sub-optimally, nothing was done
36 # Add ARMv8 code path performing at 2.0 cpb on Apple A7.
# Pop arguments until one looks like an output file name ("name.ext");
# redirect STDOUT there so all generated assembly lands in that file.
# 3-arg open avoids mode injection via the interpolated name, and the
# error check keeps a failed redirection from silently discarding output.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">",$output or die "can't open $output: $!";
53 @V=($A,$B,$C,$D,$E,$F,$G,$H);
63 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
65 $code.=<<___ if ($i<16);
67 @ ldr $t1,[$inp],#4 @ $i
69 str $inp,[sp,#17*4] @ make room for $t4
71 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
72 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
73 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
76 @ ldrb $t1,[$inp,#3] @ $i
77 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
82 orr $t1,$t1,$t0,lsl#16
84 str $inp,[sp,#17*4] @ make room for $t4
86 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
87 orr $t1,$t1,$t2,lsl#24
88 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
92 ldr $t2,[$Ktbl],#4 @ *K256++
93 add $h,$h,$t1 @ h+=X[i]
94 str $t1,[sp,#`$i%16`*4]
96 add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
98 add $h,$h,$t2 @ h+=K256[i]
99 eor $t1,$t1,$g @ Ch(e,f,g)
100 eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
101 add $h,$h,$t1 @ h+=Ch(e,f,g)
104 cmp $t2,#0xf2 @ done?
108 ldr $t1,[$inp],#4 @ prefetch
112 eor $t2,$a,$b @ a^b, b^c in next round
114 ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
115 eor $t2,$a,$b @ a^b, b^c in next round
116 ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
118 eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
119 and $t3,$t3,$t2 @ (b^c)&=(a^b)
121 eor $t3,$t3,$b @ Maj(a,b,c)
122 add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
123 @ add $h,$h,$t3 @ h+=Maj(a,b,c)
129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
132 @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
133 @ ldr $t4,[sp,#`($i+14)%16`*4]
134 mov $t0,$t1,ror#$sigma0[0]
135 add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
136 mov $t2,$t4,ror#$sigma1[0]
137 eor $t0,$t0,$t1,ror#$sigma0[1]
138 eor $t2,$t2,$t4,ror#$sigma1[1]
139 eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
140 ldr $t1,[sp,#`($i+0)%16`*4]
141 eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
142 ldr $t4,[sp,#`($i+9)%16`*4]
145 eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
147 eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
148 add $t1,$t1,$t4 @ X[i]
154 #include "arm_arch.h"
162 .word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
163 .word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
164 .word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
165 .word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
166 .word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
167 .word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
168 .word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
169 .word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
170 .word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
171 .word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
172 .word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
173 .word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
174 .word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
175 .word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
176 .word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
177 .word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
180 #if __ARM_MAX_ARCH__>=7
182 .word OPENSSL_armcap_P-sha256_block_data_order
186 .global sha256_block_data_order
187 .type sha256_block_data_order,%function
188 sha256_block_data_order:
189 sub r3,pc,#8 @ sha256_block_data_order
190 add $len,$inp,$len,lsl#6 @ len to point at the end of inp
191 #if __ARM_MAX_ARCH__>=7
192 ldr r12,.LOPENSSL_armcap
193 ldr r12,[r3,r12] @ OPENSSL_armcap_P
194 tst r12,#ARMV8_SHA256
199 stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
200 ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
201 sub $Ktbl,r3,#256+32 @ K256
202 sub sp,sp,#16*4 @ alloca(X[16])
209 eor $t3,$B,$C @ magic
# Emit rounds 0..15 fully unrolled; after each round the working-variable
# list is rotated so the next round's code lands in the right registers.
for($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
# Rounds 16..31 form the loop body at .Lrounds_16_xx; at runtime it is
# presumably re-entered until the K256 table terminator is seen (cf. the
# "cmp $t2,#0xf2 @ done?" check emitted by BODY_00_15) — verify in full file.
for (;$i<32;$i++) { &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
216 ldreq $t3,[sp,#16*4] @ pull ctx
219 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
234 ldr $inp,[sp,#17*4] @ pull inp
235 ldr $t2,[sp,#18*4] @ pull inp+len
238 stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
240 sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
243 add sp,sp,#`16+3`*4 @ destroy frame
245 ldmia sp!,{r4-r11,pc}
247 ldmia sp!,{r4-r11,lr}
249 moveq pc,lr @ be binary compatible with V4, yet
250 bx lr @ interoperable with Thumb ISA:-)
252 .size sha256_block_data_order,.-sha256_block_data_order
254 ######################################################################
my @X=map("q$_",(0..3));	# current 16-word message schedule, 4 words per q register
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");	# scratch (T4/T5 are 64-bit d halves)
263 sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
264 sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
266 sub AUTOLOAD() # thunk [simplified] x86-style perlasm
267 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
269 $arg = "#$arg" if ($arg*1 eq $arg);
270 $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
276 my @insns = (&$body,&$body,&$body,&$body);
277 my ($a,$b,$c,$d,$e,$f,$g,$h);
279 &vext_8 ($T0,@X[0],@X[1],4); # X[1..4]
283 &vext_8 ($T1,@X[2],@X[3],4); # X[9..12]
287 &vshr_u32 ($T2,$T0,$sigma0[0]);
290 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += X[9..12]
293 &vshr_u32 ($T1,$T0,$sigma0[2]);
296 &vsli_32 ($T2,$T0,32-$sigma0[0]);
299 &vshr_u32 ($T3,$T0,$sigma0[1]);
305 &vsli_32 ($T3,$T0,32-$sigma0[1]);
308 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
311 &veor ($T1,$T1,$T3); # sigma0(X[1..4])
314 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
317 &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
320 &vadd_i32 (@X[0],@X[0],$T1); # X[0..3] += sigma0(X[1..4])
326 &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
329 &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
332 &veor ($T5,$T5,$T4); # sigma1(X[14..15])
335 &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
338 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
341 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
344 &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
350 &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
353 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
356 &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
359 &veor ($T5,$T5,$T4); # sigma1(X[16..17])
362 &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
365 &vadd_i32 ($T0,$T0,@X[0]);
366 while($#insns>=2) { eval(shift(@insns)); }
367 &vst1_32 ("{$T0}","[$Xfer,:128]!");
371 push(@X,shift(@X)); # "rotate" X[]
377 my @insns = (&$body,&$body,&$body,&$body);
378 my ($a,$b,$c,$d,$e,$f,$g,$h);
384 &vld1_32 ("{$T0}","[$Ktbl,:128]!");
389 &vrev32_8 (@X[0],@X[0]);
394 &vadd_i32 ($T0,$T0,@X[0]);
395 foreach (@insns) { eval; } # remaining instructions
396 &vst1_32 ("{$T0}","[$Xfer,:128]!");
398 push(@X,shift(@X)); # "rotate" X[]
403 '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
404 '&add ($h,$h,$t1)', # h+=X[i]+K[i]
406 '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
407 '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
409 '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
410 '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
411 '&eor ($t1,$t1,$g)', # Ch(e,f,g)
412 '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
413 '&eor ($t2,$a,$b)', # a^b, b^c in next round
414 '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
415 '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
416 '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
417 '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
418 '&ldr ($t1,"[sp,#64]") if ($j==31)',
419 '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
420 '&add ($d,$d,$h)', # d+=h
421 '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
422 '&eor ($t3,$t3,$b)', # Maj(a,b,c)
423 '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
428 #if __ARM_MAX_ARCH__>=7
432 .type sha256_block_data_order_neon,%function
434 sha256_block_data_order_neon:
436 stmdb sp!,{r4-r12,lr}
439 sub sp,sp,#16*4+16 @ alloca
440 sub $Ktbl,r3,#256+32 @ K256
441 bic sp,sp,#15 @ align for 128-bit stores
443 vld1.8 {@X[0]},[$inp]!
444 vld1.8 {@X[1]},[$inp]!
445 vld1.8 {@X[2]},[$inp]!
446 vld1.8 {@X[3]},[$inp]!
447 vld1.32 {$T0},[$Ktbl,:128]!
448 vld1.32 {$T1},[$Ktbl,:128]!
449 vld1.32 {$T2},[$Ktbl,:128]!
450 vld1.32 {$T3},[$Ktbl,:128]!
451 vrev32.8 @X[0],@X[0] @ yes, even on
453 vrev32.8 @X[1],@X[1] @ big-endian
459 str $t2,[sp,#76] @ save original sp
460 vadd.i32 $T0,$T0,@X[0]
461 vadd.i32 $T1,$T1,@X[1]
462 vst1.32 {$T0},[$Xfer,:128]!
463 vadd.i32 $T2,$T2,@X[2]
464 vst1.32 {$T1},[$Xfer,:128]!
465 vadd.i32 $T3,$T3,@X[3]
466 vst1.32 {$T2},[$Xfer,:128]!
467 vst1.32 {$T3},[$Xfer,:128]!
479 &Xupdate(\&body_00_15);
480 &Xupdate(\&body_00_15);
481 &Xupdate(\&body_00_15);
482 &Xupdate(\&body_00_15);
484 teq $t1,#0 @ check for K256 terminator
491 sub $Ktbl,$Ktbl,#256 @ rewind $Ktbl
493 subeq $inp,$inp,#64 @ avoid SEGV
494 vld1.8 {@X[0]},[$inp]! @ load next input block
495 vld1.8 {@X[1]},[$inp]!
496 vld1.8 {@X[2]},[$inp]!
497 vld1.8 {@X[3]},[$inp]!
501 &Xpreload(\&body_00_15);
502 &Xpreload(\&body_00_15);
503 &Xpreload(\&body_00_15);
504 &Xpreload(\&body_00_15);
507 add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
511 add $A,$A,$t0 @ accumulate
532 ldreq sp,[sp,#76] @ restore original sp
536 ldmia sp!,{r4-r12,pc}
537 .size sha256_block_data_order_neon,.-sha256_block_data_order_neon
541 ######################################################################
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));	# hash state halves + temp copy for sha256h/sha256h2
my @MSG=map("q$_",(8..11));			# 16-word message block, 4 words per q register
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));	# round constants in flight + saved input state
551 #if __ARM_MAX_ARCH__>=7
552 .type sha256_block_data_order_armv8,%function
554 sha256_block_data_order_armv8:
556 vld1.32 {$ABCD,$EFGH},[$ctx]
557 sub $Ktbl,r3,#sha256_block_data_order-K256
560 vld1.8 {@MSG[0]-@MSG[1]},[$inp]!
561 vld1.8 {@MSG[2]-@MSG[3]},[$inp]!
562 vld1.32 {$W0},[$Ktbl]!
563 vrev32.8 @MSG[0],@MSG[0]
564 vrev32.8 @MSG[1],@MSG[1]
565 vrev32.8 @MSG[2],@MSG[2]
566 vrev32.8 @MSG[3],@MSG[3]
567 vmov $ABCD_SAVE,$ABCD @ offload
568 vmov $EFGH_SAVE,$EFGH
571 for($i=0;$i<12;$i++) {
573 vld1.32 {$W1},[$Ktbl]!
574 vadd.i32 $W0,$W0,@MSG[0]
575 sha256su0 @MSG[0],@MSG[1]
577 sha256h $ABCD,$EFGH,$W0
578 sha256h2 $EFGH,$abcd,$W0
579 sha256su1 @MSG[0],@MSG[2],@MSG[3]
581 ($W0,$W1)=($W1,$W0); push(@MSG,shift(@MSG));
584 vld1.32 {$W1},[$Ktbl]!
585 vadd.i32 $W0,$W0,@MSG[0]
587 sha256h $ABCD,$EFGH,$W0
588 sha256h2 $EFGH,$abcd,$W0
590 vld1.32 {$W0},[$Ktbl]!
591 vadd.i32 $W1,$W1,@MSG[1]
593 sha256h $ABCD,$EFGH,$W1
594 sha256h2 $EFGH,$abcd,$W1
596 vld1.32 {$W1},[$Ktbl]
597 vadd.i32 $W0,$W0,@MSG[2]
598 sub $Ktbl,$Ktbl,#256-16 @ rewind
600 sha256h $ABCD,$EFGH,$W0
601 sha256h2 $EFGH,$abcd,$W0
603 vadd.i32 $W1,$W1,@MSG[3]
605 sha256h $ABCD,$EFGH,$W1
606 sha256h2 $EFGH,$abcd,$W1
608 vadd.i32 $ABCD,$ABCD,$ABCD_SAVE
609 vadd.i32 $EFGH,$EFGH,$EFGH_SAVE
612 vst1.32 {$ABCD,$EFGH},[$ctx]
615 .size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
620 .asciz "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
622 #if __ARM_MAX_ARCH__>=7
623 .comm OPENSSL_armcap_P,4,4
628 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
629 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
632 my ($mnemonic,$arg)=@_;
634 if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
635 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
636 |(($2&7)<<17)|(($2&8)<<4)
637 |(($3&7)<<1) |(($3&8)<<2);
638 # since ARMv7 instructions are always encoded little-endian.
639 # correct solution is to use .inst directive, but older
640 # assemblers don't implement it:-(
641 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
642 $word&0xff,($word>>8)&0xff,
643 ($word>>16)&0xff,($word>>24)&0xff,
649 foreach (split($/,$code)) {
651 s/\`([^\`]*)\`/eval $1/geo;
653 s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;
655 s/\bret\b/bx lr/go or
656 s/\bbx\s+lr\b/.word\t0xe12fff1e/go; # make it possible to compile with -march=armv4
661 close STDOUT; # enforce flush