3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 performance improvement over compiler generated code varies
11 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12 # build]. Just like in SHA1 module I aim to ensure scalability on
13 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
15 # SHA512 on pre-T1 UltraSPARC.
17 # Performance is >75% better than 64-bit code generated by Sun C and
18 # over 2x better than 32-bit code. X[16] resides on stack, but access to it
19 # is scheduled for L2 latency and staged through 32 least significant
20 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
21 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22 # good [optimal coefficient is 50%].
24 # SHA512 on UltraSPARC T1.
26 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
27 # because 64-bit code generator has the advantage of using 64-bit
28 # loads to access X[16], which I consciously traded for 32-/64-bit ABI
29 # duality [as per above]. But it surpasses 32-bit Sun C generated code
30 # by 60%, not to mention that it doesn't suffer from severe decay when
31 # running four times as many threads as physical cores and that it leaves gcc [3.4]
32 # behind by over 4x factor! If compared to SHA256, single thread
33 # performance is only 10% better, but overall throughput for maximum
34 # amount of threads for given CPU exceeds corresponding one of SHA256
35 # by 30% [again, optimal coefficient is 50%].
# Pick the ABI flavour from the flags passed on the command line: -m64 or
# -xarch=v9 selects the 64-bit build. The 64-bit build uses a larger frame
# and a bias of 2047, which is later added to every %sp-relative offset;
# the 32-bit build uses no bias and a smaller frame.
38 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
39 if ($bits==64) { $bias=2047; $frame=192; }
40 else { $bias=0; $frame=112; }
# Redirect the generated code to $output when a file name was supplied.
# Three-argument open keeps the name from being parsed as a mode, and a
# failed open now aborts instead of silently leaving STDOUT closed (which
# would discard all generated code). With no file name, STDOUT is left alone.
43 if (defined($output) && length($output)) { open STDOUT,">",$output or die "can't open $output: $!"; }
# An output file name containing "512" selects the SHA-512 flavour: the
# 64-bit load/store/shift mnemonics and the matching sigma amounts below.
45 if ($output =~ /512/) {
48 $LD="ldx"; # load from memory
49 $ST="stx"; # store to memory
50 $SLL="sllx"; # shift left logical
51 $SRL="srlx"; # shift right logical
# Shift amounts used by the schedule-update code; as the trailing notes
# say, the first entry of each list is the one applied as a plain right
# shift first.
54 @sigma0=( 7, 1, 8); # right shift first
55 @sigma1=( 6,19,61); # right shift first
# Extra stack space for the 16-entry schedule ($SZ is set outside this
# excerpt); it is added to the frame size by the function's `save`.
60 $locals=16*$SZ; # X[16]
# Working variables in a..h order. The round-emitting loops rotate this
# list by one position after every round.
70 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# 32-bit counterparts of the settings above. NOTE(review): presumably the
# else branch for SHA-256; the branch keyword is not part of this excerpt.
74 $LD="ld"; # load from memory
75 $ST="st"; # store to memory
76 $SLL="sll"; # shift left logical
77 $SRL="srl"; # shift right logical
80 @sigma0=( 3, 7,18); # right shift first
81 @sigma1=(10,17,19); # right shift first
# No stack space is needed here: the schedule lives in the eight registers
# listed in @X (the header notes describe X[16] packed into 8 registers).
86 $locals=0; # X[16] is register resident
87 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
97 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# Unpack the round number followed by the eight working-variable operands.
113 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
123 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
128 sllx @X[0],$tmp31,@X[0]
133 srlx @X[$j+1],$tmp32,$tmp1
134 sllx @X[$j+1],$tmp31,@X[$j+1]
135 or $tmp1,@X[$j],@X[$j]
# Two ways of forming T1 from h and the packed schedule register @X[$i/2].
# Which one is emitted is decided by a test outside this excerpt.
# Form 1: add the packed register to h as it is.
146 $code.="\tadd @X[$i/2],$h,$T1\n";
# Form 2: shift the packed register right by 32 first, then add h.
148 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
# Unpack the round number followed by the eight working-variable operands.
154 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# Names of three %l registers derived from the round number: %l(2i mod 8),
# the register after it, and %l(2(i+1) mod 8).
155 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
# Text emitted only for the very first round.
157 $code.=<<___ if ($i==0);
# Rounds 0..14: combine the register pair into one value, store it into the
# stack-resident schedule slot $i, and (for $i<12) issue the next input loads.
167 $code.=<<___ if ($i<15);
168 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
170 sllx @pair[0],$tmp0,$tmp1
171 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
172 srlx @pair[2],$tmp32,@pair[1]
174 or @pair[1],$tmp2,$tmp2
175 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
177 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
# Additional text emitted only for round 12.
179 $code.=<<___ if ($i==12);
# Round 15: same combine-and-store step, interleaved with loads of the
# schedule slots (i+1+1, i+1+9, i+1+14, i+1+0, all mod 16) from the stack
# frame into %l0-%l7.
183 $code.=<<___ if ($i==15);
184 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
185 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
187 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
188 sllx @pair[0],$tmp0,$tmp1
189 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
190 srlx @pair[2],$tmp32,@pair[1]
192 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
193 or @pair[1],$tmp2,$tmp2
194 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
196 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
197 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
198 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
199 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
# Emits the text of one round. Arguments are the round number and the eight
# working-variable operands in their current rotation. The template lines
# are annotated with the quantity each group produces: Sigma1(e), Ch(e,f,g),
# K[$i], Sigma0(a) and Maj(a,b,c).
205 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
# T1 += h. NOTE(review): this statement may sit under a condition that is
# not visible in this excerpt.
210 $code.="\tadd $h,$T1,$T1\n";
214 $SRL $e,@Sigma1[0],$h !! $i
216 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
218 $SRL $e,@Sigma1[1],$tmp0
220 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
222 $SRL $e,@Sigma1[2],$tmp0
224 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
226 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
227 xor $tmp1,$h,$tmp0 ! Sigma1(e)
229 $SRL $a,@Sigma0[0],$h
231 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
232 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
234 $SRL $a,@Sigma0[1],$tmp0
236 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
238 $SRL $a,@Sigma0[2],$tmp0
240 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
242 xor $tmp1,$h,$h ! Sigma0(a)
247 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
248 add $tmp2,$T1,$T1 ! +=K[$i]
# Operand for X[i+1]: either the upper 32 bits of its packed register,
# shifted down into $xi ...
263 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
# ... or the packed register itself. Which alternative applies is decided
# by a test outside this excerpt.
265 $xi=@X[(($i+1)/2)%8];
268 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
269 sll $xi,`32-@sigma0[2]`,$tmp1
270 srl $xi,@sigma0[1],$tmp0
272 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
274 srl $xi,@sigma0[2],$tmp0
# Same choice for X[i+14]: the packed register itself ...
278 $xi=@X[(($i+14)/2)%8];
# ... or its upper 32 bits shifted down into $xi.
281 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
284 srl $xi,@sigma1[0],$tmp2
285 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
286 sll $xi,`32-@sigma1[2]`,$tmp1
287 srl $xi,@sigma1[1],$tmp0
288 xor $tmp1,$tmp2,$tmp2
289 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
290 xor $tmp0,$tmp2,$tmp2
291 srl $xi,@sigma1[2],$tmp0
292 xor $tmp1,$tmp2,$tmp2
297 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
298 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
299 srl @X[($i/2)%8],0,$tmp0
300 add $xi,$T1,$T1 ! +=X[i]
301 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
306 or $T1,@X[($i/2)%8],@X[($i/2)%8]
# In this variant the X[i+9] addend is taken straight from its packed register.
309 $xi=@X[(($i+9)/2)%8];
311 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
312 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
313 srl @X[($i/2)%8],0,@X[($i/2)%8]
314 add $xi,$T1,$T1 ! +=X[i+9]
319 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
# Names of two %l registers derived from the round number: %l(2i mod 8) and
# the register after it. The template below computes sigma0(X[i+1]) and
# sigma1(X[i+14]), adds X[i+9] and X[i], and stores the result into schedule
# slot ($i mod 16) on the stack. In between it reloads %l0-%l7 from the
# stack as 32-bit halves.
328 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
331 sllx %l2,32,$tmp0 !! Xupdate($i)
334 srlx $tmp0,@sigma0[0],$T1
335 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
336 sllx $tmp0,`64-@sigma0[2]`,$tmp1
337 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
338 srlx $tmp0,@sigma0[1],$tmp0
340 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
342 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
345 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
348 srlx $tmp2,@sigma1[0],$tmp1
349 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
350 sllx $tmp2,`64-@sigma1[2]`,$tmp0
351 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
352 srlx $tmp2,@sigma1[1],$tmp2
353 xor $tmp0,$tmp1,$tmp1
354 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
355 xor $tmp2,$tmp1,$tmp1
356 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
357 xor $tmp0,$tmp1,$tmp1
359 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
360 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
362 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
366 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
368 add $tmp0,$T1,$T1 ! +=X[$i+9]
369 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
370 add $tmp2,$T1,$T1 ! +=X[$i]
371 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
# 64-bit builds only: emit .register directives marking %g2 and %g3 as
# scratch registers.
376 $code.=<<___ if ($bits==64);
377 .register %g2,#scratch
378 .register %g3,#scratch
381 .section ".text",#alloc,#execinstr
385 .type K${label},#object
389 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
390 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
391 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
392 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
393 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
394 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
395 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
396 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
397 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
398 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
399 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
400 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
401 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
402 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
403 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
404 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
408 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
409 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
410 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
411 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
412 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
413 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
414 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
415 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
416 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
417 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
418 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
419 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
420 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
421 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
422 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
423 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
424 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
425 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
426 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
427 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
428 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
429 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
430 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
431 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
432 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
433 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
434 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
435 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
436 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
437 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
438 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
439 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
440 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
441 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
442 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
443 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
444 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
445 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
446 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
447 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
451 .size K${label},.-K${label}
452 .globl sha${label}_block_data_order
453 sha${label}_block_data_order:
454 save %sp,`-$frame-$locals`,%sp
455 and $inp,`$align-1`,$tmp31
456 sllx $len,`log(16*$SZ)/log(2)`,$len
457 andn $inp,`$align-1`,$inp
# Extra setup text that is emitted only for the SHA-512 flavour ($SZ==8).
461 $code.=<<___ if ($SZ==8); # SHA512
463 sub $tmp32,$tmp31,$tmp32
467 sub %o7,.Lpic-K${label},$Ktbl
469 $LD [$ctx+`0*$SZ`],$A
470 $LD [$ctx+`1*$SZ`],$B
471 $LD [$ctx+`2*$SZ`],$C
472 $LD [$ctx+`3*$SZ`],$D
473 $LD [$ctx+`4*$SZ`],$E
474 $LD [$ctx+`5*$SZ`],$F
475 $LD [$ctx+`6*$SZ`],$G
476 $LD [$ctx+`7*$SZ`],$H
# Emit rounds 0..15. After each round @V is rotated right by one position,
# so the next round sees the working variables under shifted roles.
480 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
# Emit rounds 16..31 through the code reference $BODY_16_XX. $i deliberately
# carries over from the loop above, and @V is rotated in the same way.
482 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
484 and $tmp2,0xfff,$tmp2
487 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
# SHA-256 flavour ($SZ==4): load the eight context words into the @X
# registers, then store the working variables A..H back into the context.
490 $code.=<<___ if ($SZ==4); # SHA256
491 $LD [$ctx+`0*$SZ`],@X[0]
492 $LD [$ctx+`1*$SZ`],@X[1]
493 $LD [$ctx+`2*$SZ`],@X[2]
494 $LD [$ctx+`3*$SZ`],@X[3]
495 $LD [$ctx+`4*$SZ`],@X[4]
496 $LD [$ctx+`5*$SZ`],@X[5]
497 $LD [$ctx+`6*$SZ`],@X[6]
498 $LD [$ctx+`7*$SZ`],@X[7]
501 $ST $A,[$ctx+`0*$SZ`]
503 $ST $B,[$ctx+`1*$SZ`]
505 $ST $C,[$ctx+`2*$SZ`]
507 $ST $D,[$ctx+`3*$SZ`]
509 $ST $E,[$ctx+`4*$SZ`]
511 $ST $F,[$ctx+`5*$SZ`]
513 $ST $G,[$ctx+`6*$SZ`]
515 $ST $H,[$ctx+`7*$SZ`]
# SHA-512 flavour ($SZ==8): the context words are read as 32-bit halves into
# %l0-%l7, four words at a time, and A..D and then E..H are stored back into
# the context.
517 $code.=<<___ if ($SZ==8); # SHA512
518 ld [$ctx+`0*$SZ+0`],%l0
519 ld [$ctx+`0*$SZ+4`],%l1
520 ld [$ctx+`1*$SZ+0`],%l2
521 ld [$ctx+`1*$SZ+4`],%l3
522 ld [$ctx+`2*$SZ+0`],%l4
523 ld [$ctx+`2*$SZ+4`],%l5
524 ld [$ctx+`3*$SZ+0`],%l6
527 ld [$ctx+`3*$SZ+4`],%l7
533 $ST $A,[$ctx+`0*$SZ`]
535 $ST $B,[$ctx+`1*$SZ`]
540 $ST $C,[$ctx+`2*$SZ`]
542 $ST $D,[$ctx+`3*$SZ`]
544 ld [$ctx+`4*$SZ+0`],%l0
545 ld [$ctx+`4*$SZ+4`],%l1
546 ld [$ctx+`5*$SZ+0`],%l2
547 ld [$ctx+`5*$SZ+4`],%l3
548 ld [$ctx+`6*$SZ+0`],%l4
549 ld [$ctx+`6*$SZ+4`],%l5
550 ld [$ctx+`7*$SZ+0`],%l6
553 ld [$ctx+`7*$SZ+4`],%l7
559 $ST $E,[$ctx+`4*$SZ`]
561 $ST $F,[$ctx+`5*$SZ`]
566 $ST $G,[$ctx+`6*$SZ`]
568 $ST $H,[$ctx+`7*$SZ`]
571 add $inp,`16*$SZ`,$inp ! advance inp
573 bne `$bits==64?"%xcc":"%icc"`,.Lloop
574 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
578 .type sha${label}_block_data_order,#function
579 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
580 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the accumulated text: every backtick-quoted span is evaluated
# as a Perl expression and replaced by its result. This is how the offsets
# and conditional fragments written in backticks above get resolved.
583 $code =~ s/\`([^\`]*)\`/eval $1/gem;