#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally; nothing was
# done about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
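
# For reference, the FIPS 180-4 round functions implemented below (all
# operations on 32-bit words):
#
#	Sigma0(x) = ROTR2(x)  ^ ROTR13(x) ^ ROTR22(x)
#	Sigma1(x) = ROTR6(x)  ^ ROTR11(x) ^ ROTR25(x)
#	sigma0(x) = ROTR7(x)  ^ ROTR18(x) ^ SHR3(x)
#	sigma1(x) = ROTR17(x) ^ ROTR19(x) ^ SHR10(x)
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	(a,b,c,d,e,f,g,h) = (T1+T2,a,b,c,d+T1,e,f,g)
#
# The code computes Ch(e,f,g) as ((f^g)&e)^g and Maj(a,b,c) as
# ((a^b)&(b^c))^b, which is why only eor/and sequences appear in the
# rounds below.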
$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
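
# Usage: pass a perlasm "flavour" and an output file name on the
# command line, e.g. (file names here are illustrative of the usual
# OpenSSL layout):
#
#	perl sha256-armv4.pl linux32 sha256-armv4.S
#
# A real flavour routes the generated code through arm-xlate.pl above;
# flavour "void" (or none) writes it out untranslated.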
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	($t2,$t3)=($t3,$t2);
}
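
# Note the "from the past" trick above: the h+=Maj(a,b,c) of round i
# is deferred into round i+1 (and into the final accumulation after
# the last round), which takes Maj off the critical path; the
# ($t2,$t3)=($t3,$t2) swap is what carries the deferred value across
# rounds.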
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}
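
# Rounds 16..63 extend the message schedule in the 16-word circular
# buffer X[] on the stack:
#
#	X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16]
#
# which is exactly what the (i+14), (i+9), (i+1) and (i+0) mod 16
# loads above implement before falling through to BODY_00_15.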
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif
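
@ K256 holds the 64 round constants: the first 32 bits of the
@ fractional parts of the cube roots of the first 64 primes
@ (FIPS 180-4), followed by a zero word which the NEON code path
@ uses as a terminator.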
.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
#endif
.align	5
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
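
# For example, &vshr_u32($T2,$T0,$sigma0[0]) thunks through AUTOLOAD
# and emits "vshr.u32	q10,q8,#7": underscores turn into dots and a
# numeric last argument is given a "#" prefix.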
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	  &veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}
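
# Each Xupdate() call computes four fresh schedule words in NEON
# (sigma0 on X[1..4], sigma1 done in two 64-bit halves), adds the next
# four K256 constants, and parks X[i]+K[i] at $Xfer for the scalar
# rounds, all while draining @insns -- four integer rounds' worth of
# fragments produced by body_00_15() below.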
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
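
# body_00_15() returns a list of Perl snippets, roughly one target
# instruction each, instead of emitting code directly; Xupdate() and
# Xpreload() eval them one at a time between NEON instructions so the
# integer rounds interleave with the schedule update. $j counts rounds
# and selects what to preload into $t1: the next X[i]+K[i], the next
# K256 word at j==15 (for the terminator test), or the saved ctx
# pointer at j==31.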
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
	ldmia		$ctx,{$A-$H}
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C		@ magic
	mov		$Xfer,sp
.L_00_48:
___
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
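
# K256 ends with a zero word, so the "teq $t1,#0" above falls out of
# .L_00_48 after exactly 48 schedule-update rounds. On the last block
# the "subeq $inp,$inp,#64" makes the speculative next-block load
# re-read the current block instead of running past the end of the
# input; the loaded data is simply never used.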
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C			@ magic
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}

######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";

$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
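	@ The eight state words stay packed in two q-registers (ABCD and
	@ EFGH). Each sha256h/sha256h2 pair retires four rounds using one
	@ 4-word W+K vector, while sha256su0/sha256su1 extend the message
	@ schedule entirely in NEON registers.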
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
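
# $W0/$W1 double-buffer the W+K vectors (one is loaded while the other
# is consumed) and @MSG rotates every pass, so the twelve iterations
# above retire rounds 0..47 with schedule extension; the four unrolled
# groups below finish rounds 48..63, which need no sha256su0/su1.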
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
#endif
___
# reproduce the comment header in the output, converting '#' to '@'
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;
692 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
693 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	# ARMv7 instructions are always encoded little-endian, hence
	# the explicit byte order below. The correct solution would be
	# the .inst directive, but older assemblers don't implement
	# it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
}
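
# $1/$2/$3 above are the Vd/Vn/Vm q-register numbers, each split into
# its low three bits and high bit and OR-ed into the register fields
# of the base opcode; INST() then emits the word as four .byte values
# (little-endian, the order ARMv7 instructions are stored in), with
# the Thumb2 variant of INST() reordering bytes to match the Thumb
# encoding.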
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}
close STDOUT;	# enforce flush