# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================
# SHA256 block procedure for ARMv4. May 2007.
#
# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that the latter performs sub-optimally, nothing was
# done about it).
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
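# For reference, below is a minimal plain-Perl model of the compression
# round that BODY_00_15 unrolls, assuming the standard FIPS 180-4
# rotation counts (Sigma0: 2,13,22; Sigma1: 6,11,25). It is a hedged
# illustration only and is never called by this generator.
sub ref_round_sketch {
my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;
my $ror  = sub { my ($x,$n) = @_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff };
my $Sig1 = $ror->($e,6) ^ $ror->($e,11) ^ $ror->($e,25);	# Sigma1(e)
my $Sig0 = $ror->($a,2) ^ $ror->($a,13) ^ $ror->($a,22);	# Sigma0(a)
my $ch   = ($e & $f) ^ (~$e & $g & 0xffffffff);			# Ch(e,f,g)
my $maj  = ($a & $b) ^ ($a & $c) ^ ($b & $c);			# Maj(a,b,c)
my $T1   = ($h + $Sig1 + $ch + $k + $w) & 0xffffffff;
my $T2   = ($Sig0 + $maj) & 0xffffffff;
return (($T1 + $T2) & 0xffffffff,				# new a
	$a, $b, $c,
	($d + $T1) & 0xffffffff,				# new e
	$e, $f, $g);
}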
if ($flavour=~/^\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {} }

if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour $output";
} else {
    open STDOUT,">$output";
}
@V=($A,$B,$C,$D,$E,$F,$G,$H);
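# @V is a rotating list rather than fixed register roles: every round
# ends with unshift(@V,pop(@V)) (see the round loops below), so the
# eight names cycle through the same eight ARM registers and the round
# body can always be emitted as if it were updating "h".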
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
	@ ldr	$t1,[$inp],#4			@ $i
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	orr	$t1,$t1,$t0,lsl#16
	str	$inp,[sp,#17*4]			@ make room for $t4
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
	cmp	$t2,#0xf2			@ done?
	ldr	$t1,[$inp],#4			@ prefetch
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
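@ Note: Maj(a,b,c) is computed as ((a^b)&(b^c))^b. The a^b value
@ doubles as b^c of the next round (registers rotate, so this round's
@ a,b become next round's b,c), t2 and t3 swap roles every round, and
@ the final "h+=Maj(a,b,c)" is deferred into the following round --
@ the "from the past" additions above.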
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
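@ In FIPS 180-4 terms the recurrence above is
@ X[i] = X[i-16] + sigma0(X[i-15]) + X[i-7] + sigma1(X[i-2]);
@ with the 16-word circular buffer on the stack, X[i-15], X[i-7] and
@ X[i-2] live in slots (i+1)%16, (i+9)%16 and (i+14)%16, and X[i-16]
@ is X[i]'s own slot before it is overwritten.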
# include "arm_arch.h"
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7

#if defined(__thumb2__)
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
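@ K256 above is the standard SHA-256 constant table: the first 32 bits
@ of the fractional parts of the cube roots of the first 64 primes
@ (FIPS 180-4).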
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
	tst	r12,#ARMV8_SHA256
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
	eor	$t3,$B,$C		@ magic
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
	ite	eq			@ Thumb2 thing, sanity check in ARM
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	add	sp,sp,#`16+3`*4		@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order

######################################################################

my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");

sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }
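# Dlo/Dhi map a NEON quad register to its two aliased doubleword
# halves, e.g. Dlo("q1") yields "d2" and Dhi("q1") yields "d3". This
# lets the schedule code below address the lower or upper 64 bits of
# the X[] vectors individually.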
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop();
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}
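# AUTOLOAD turns any otherwise-undefined sub call into an emitted
# instruction: underscores become dots and a bare numeric last argument
# gains a '#', so &vshr_u32($T2,$T0,$sigma0[0]) appends something like
# "vshr.u32	q10,q8,#7" to $code.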
sub Xupdate()
{ my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	&vshr_u32	($T2,$T0,$sigma0[0]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	&vshr_u32	($T1,$T0,$sigma0[2]);
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	&vshr_u32	($T3,$T0,$sigma0[1]);
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);	# X[0..1] += sigma1(X[14..15])
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);	# X[2..3] += sigma1(X[16..17])
	&vadd_i32	($T0,$T0,@X[0]);
	while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
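# Each Xupdate() call advances the schedule by one 128-bit vector (four
# X[] words) while interleaving the scalar round snippets from &$body
# between the NEON instructions. NEON has no vector rotate, so every
# 32-bit rotation is synthesized as a vshr.u32/vsli.32 pair: X>>n with
# X<<(32-n) inserted on top yields ror(X,n).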
sub Xpreload()
{ my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	&vrev32_8	(@X[0],@X[0]);
	&vadd_i32	($T0,$T0,@X[0]);
	foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}
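# body_00_15() returns the scalar round as a list of code snippets so
# Xupdate/Xpreload can eval them one at a time between NEON
# instructions. The $j counter selects the odd loads: the next X+K word
# from the stack transfer area, the K256 terminator at round 15, and
# the saved ctx pointer ([sp,#64]) at round 31.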
#if __ARM_MAX_ARCH__>=7

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
sha256_block_data_order_neon:
	stmdb	sp!,{r4-r12,lr}
	bic	$H,$H,#15		@ align for 128-bit stores
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	vrev32.8	@X[1],@X[1]		@ big-endian
	str	$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!
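@ The vector code precomputes K[i]+X[i] four words at a time and parks
@ the sums in a stack transfer area; the interleaved integer rounds
@ then consume them one word at a time from sp. This decouples the
@ NEON message schedule from the scalar round function.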
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	teq	$t1,#0			@ check for K256 terminator
	sub	$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	subeq	$inp,$inp,#64		@ avoid SEGV
	vld1.8	{@X[0]},[$inp]!		@ load next input block
	vld1.8	{@X[1]},[$inp]!
	vld1.8	{@X[2]},[$inp]!
	vld1.8	{@X[3]},[$inp]!
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	add	$A,$A,$t0		@ accumulate
	ldreq	sp,[sp,#76]		@ restore original sp
	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
######################################################################

my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif
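@ INST() lays down raw instruction bytes (see unsha256() in the Perl
@ footer). ARM mode stores the four bytes little-endian as given;
@ Thumb-2 stores a 32-bit instruction as two halfwords, hence the swap,
@ and OR-ing the top byte with 0xc turns the 0xf3 ARM-mode NEON prefix
@ into its 0xff Thumb-2 counterpart.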
.type	sha256_block_data_order_armv8,%function
sha256_block_data_order_armv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
# elif defined(__thumb2__)
	sub	$Ktbl,$Ktbl,#.LARMv8-K256

	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov	$ABCD_SAVE,$ABCD	@ offload
	vmov	$EFGH_SAVE,$EFGH
for($i=0;$i<12;$i++) {
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
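# One sha256h/sha256h2 pair retires four rounds: sha256h updates the
# ABCD half of the state and sha256h2 the EFGH half, both consuming the
# same vector of four W+K words (via the $abcd scratch copy of ABCD),
# while sha256su0/sha256su1 run the message-schedule recurrence ahead
# of them. Rotating @MSG keeps the schedule operands in FIFO order.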
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub	$Ktbl,$Ktbl,#256-16	@ rewind
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE

	vst1.32	{$ABCD,$EFGH},[$ctx]
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
.asciz	"SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm	OPENSSL_armcap_P,4,4
last if (!s/^#/@/ and !/^$/);
697 "sha256h" => 0xf3000c40, "sha256h2" => 0xf3100c40,
698 "sha256su0" => 0xf3ba03c0, "sha256su1" => 0xf3200c40 );
sub unsha256 {
    my ($mnemonic,$arg)=@_;

    if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
				     |(($2&7)<<17)|(($2&8)<<4)
				     |(($3&7)<<1) |(($3&8)<<2);
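	# Each Q register number is encoded as its double-register index
	# (2*q) split across the word: bits 2:0 of q land in the upper
	# three bits of the Vd/Vn/Vm fields (the low bit is zero for
	# quads) and bit 3 of q becomes the separate D/N/M bit -- hence
	# the &7/&8 masks and the unusual shift counts.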
	# ARMv7 instructions are always encoded little-endian, hence the
	# byte-wise emission below. The correct solution would be the
	# .inst directive, but older assemblers don't implement it:-(
	sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
    }
}
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT;	# enforce flush