3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # SHA256 performance improvement over compiler generated code varies
11 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
12 # build]. Just like in SHA1 module I aim to ensure scalability on
13 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
15 # SHA512 on pre-T1 UltraSPARC.
17 # Performance is >75% better than 64-bit code generated by Sun C and
18 # over 2x than 32-bit code. X[16] resides on stack, but access to it
19 # is scheduled for L2 latency and staged through 32 least significant
20 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit bit ABI
21 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
22 # good [optimal coefficient is 50%].
24 # SHA512 on UltraSPARC T1.
# Detect a 64-bit target from the compiler flags passed on the command
# line: either -m64 (gcc) or -xarch=v9 (Sun C) selects the 64-bit ABI.
# NOTE(review): $bits is presumably initialized to 32 in an unseen
# earlier line — confirm; otherwise it is undef here and the == 64
# test below still correctly selects the 32-bit branch.
29 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# SPARC V9 64-bit ABI: %sp is biased by 2047 and the minimum register
# save area is larger, hence the bigger stack frame.
30 if ($bits==64) { $bias=2047; $frame=192; }
# 32-bit ABI: no stack bias, smaller minimum frame.
31 else { $bias=0; $frame=112; }
# Redirect STDOUT to the requested output file so every subsequent
# print of $code lands there.  Use three-argument open with an explicit
# error check: the original two-argument ">$output" form silently
# ignored failure and allowed mode characters in $output to alter the
# open mode.
open STDOUT, ">", $output or die "can't open $output: $!";
36 if ($output =~ /512/) {
39 $LD="ldx"; # load from memory
40 $ST="stx"; # store to memory
41 $SLL="sllx"; # shift left logical
42 $SRL="srlx"; # shift right logical
45 @sigma0=( 7, 1, 8); # right shift first
46 @sigma1=( 6,19,61); # right shift first
51 $locals=16*$SZ; # X[16]
61 @V=($A,$B,$C,$D,$E,$F,$G,$H);
65 $LD="ld"; # load from memory
66 $ST="st"; # store to memory
67 $SLL="sll"; # shift left logical
68 $SRL="srl"; # shift right logical
71 @sigma0=( 3, 7,18); # right shift first
72 @sigma1=(10,17,19); # right shift first
77 $locals=0; # X[16] is register resident
78 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
88 @V=($A,$B,$C,$D,$E,$F,$G,$H);
104 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
114 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
119 sllx @X[0],$tmp31,@X[0]
124 srlx @X[$j+1],$tmp32,$tmp1
125 sllx @X[$j+1],$tmp31,@X[$j+1]
126 or $tmp1,@X[$j],@X[$j]
137 $code.="\tadd @X[$i/2],$h,$T1\n";
139 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
145 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
146 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
148 $code.=<<___ if ($i==0);
158 $code.=<<___ if ($i<15);
159 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
161 sllx @pair[0],$tmp0,$tmp1
162 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
163 srlx @pair[2],$tmp32,@pair[1]
165 or @pair[1],$tmp2,$tmp2
166 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
168 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
170 $code.=<<___ if ($i==12);
174 $code.=<<___ if ($i==15);
175 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
176 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
178 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
179 sllx @pair[0],$tmp0,$tmp1
180 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
181 srlx @pair[2],$tmp32,@pair[1]
183 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
184 or @pair[1],$tmp2,$tmp2
185 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
187 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
188 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
189 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
190 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
196 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
201 $code.="\tadd $h,$T1,$T1\n";
205 $SRL $e,@Sigma1[0],$h !! $i
207 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
209 $SRL $e,@Sigma1[1],$tmp0
211 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
213 $SRL $e,@Sigma1[2],$tmp0
215 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
217 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
218 xor $tmp1,$h,$tmp0 ! Sigma1(e)
220 $SRL $a,@Sigma0[0],$h
222 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
223 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
225 $SRL $a,@Sigma0[1],$tmp0
227 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
229 $SRL $a,@Sigma0[2],$tmp0
231 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
233 xor $tmp1,$h,$h ! Sigma0(a)
238 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
239 add $tmp2,$T1,$T1 ! +=K[$i]
254 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
256 $xi=@X[(($i+1)/2)%8];
259 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
260 sll $xi,`32-@sigma0[2]`,$tmp1
261 srl $xi,@sigma0[1],$tmp0
263 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
265 srl $xi,@sigma0[2],$tmp0
269 $xi=@X[(($i+14)/2)%8];
272 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
275 srl $xi,@sigma1[0],$tmp2
276 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
277 sll $xi,`32-@sigma1[2]`,$tmp1
278 srl $xi,@sigma1[1],$tmp0
279 xor $tmp1,$tmp2,$tmp2
280 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
281 xor $tmp0,$tmp2,$tmp2
282 srl $xi,@sigma1[2],$tmp0
283 xor $tmp1,$tmp2,$tmp2
288 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
289 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
290 srl @X[($i/2)%8],0,$tmp0
291 add $xi,$T1,$T1 ! +=X[i]
292 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
297 or $T1,@X[($i/2)%8],@X[($i/2)%8]
300 $xi=@X[(($i+9)/2)%8];
302 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
303 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
304 srl @X[($i/2)%8],0,@X[($i/2)%8]
305 add $xi,$T1,$T1 ! +=X[i+9]
310 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
319 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
322 sllx %l2,32,$tmp0 !! Xupdate($i)
325 srlx $tmp0,@sigma0[0],$T1
326 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
327 sllx $tmp0,`64-@sigma0[2]`,$tmp1
328 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
329 srlx $tmp0,@sigma0[1],$tmp0
331 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
333 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
336 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
339 srlx $tmp2,@sigma1[0],$tmp1
340 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
341 sllx $tmp2,`64-@sigma1[2]`,$tmp0
342 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
343 srlx $tmp2,@sigma1[1],$tmp2
344 xor $tmp0,$tmp1,$tmp1
345 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
346 xor $tmp2,$tmp1,$tmp1
347 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
348 xor $tmp0,$tmp1,$tmp1
350 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
351 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
353 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
357 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
359 add $tmp0,$T1,$T1 ! +=X[$i+9]
360 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
361 add $tmp2,$T1,$T1 ! +=X[$i]
362 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
367 $code.=<<___ if ($bits==64);
368 .register %g2,#scratch
369 .register %g3,#scratch
372 .section ".text",#alloc,#execinstr
376 .type K${label},#object
380 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
381 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
382 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
383 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
384 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
385 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
386 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
387 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
388 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
389 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
390 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
391 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
392 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
393 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
394 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
395 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
399 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
400 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
401 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
402 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
403 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
404 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
405 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
406 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
407 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
408 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
409 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
410 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
411 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
412 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
413 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
414 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
415 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
416 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
417 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
418 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
419 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
420 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
421 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
422 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
423 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
424 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
425 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
426 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
427 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
428 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
429 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
430 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
431 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
432 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
433 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
434 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
435 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
436 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
437 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
438 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
442 .size K${label},.-K${label}
443 .globl sha${label}_block_data_order
444 sha${label}_block_data_order:
445 save %sp,`-$frame-$locals`,%sp
446 and $inp,`$align-1`,$tmp31
447 sllx $len,`log(16*$SZ)/log(2)`,$len
448 andn $inp,`$align-1`,$inp
452 $code.=<<___ if ($SZ==8); # SHA512
454 sub $tmp32,$tmp31,$tmp32
458 sub %o7,.Lpic-K${label},$Ktbl
460 $LD [$ctx+`0*$SZ`],$A
461 $LD [$ctx+`1*$SZ`],$B
462 $LD [$ctx+`2*$SZ`],$C
463 $LD [$ctx+`3*$SZ`],$D
464 $LD [$ctx+`4*$SZ`],$E
465 $LD [$ctx+`5*$SZ`],$F
466 $LD [$ctx+`6*$SZ`],$G
467 $LD [$ctx+`7*$SZ`],$H
471 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
473 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
475 and $tmp2,0xfff,$tmp2
478 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
481 $code.=<<___ if ($SZ==4); # SHA256
482 $LD [$ctx+`0*$SZ`],@X[0]
483 $LD [$ctx+`1*$SZ`],@X[1]
484 $LD [$ctx+`2*$SZ`],@X[2]
485 $LD [$ctx+`3*$SZ`],@X[3]
486 $LD [$ctx+`4*$SZ`],@X[4]
487 $LD [$ctx+`5*$SZ`],@X[5]
488 $LD [$ctx+`6*$SZ`],@X[6]
489 $LD [$ctx+`7*$SZ`],@X[7]
492 $ST $A,[$ctx+`0*$SZ`]
494 $ST $B,[$ctx+`1*$SZ`]
496 $ST $C,[$ctx+`2*$SZ`]
498 $ST $D,[$ctx+`3*$SZ`]
500 $ST $E,[$ctx+`4*$SZ`]
502 $ST $F,[$ctx+`5*$SZ`]
504 $ST $G,[$ctx+`6*$SZ`]
506 $ST $H,[$ctx+`7*$SZ`]
508 $code.=<<___ if ($SZ==8); # SHA512
509 ld [$ctx+`0*$SZ+0`],%l0
510 ld [$ctx+`0*$SZ+4`],%l1
511 ld [$ctx+`1*$SZ+0`],%l2
512 ld [$ctx+`1*$SZ+4`],%l3
513 ld [$ctx+`2*$SZ+0`],%l4
514 ld [$ctx+`2*$SZ+4`],%l5
515 ld [$ctx+`3*$SZ+0`],%l6
518 ld [$ctx+`3*$SZ+4`],%l7
524 $ST $A,[$ctx+`0*$SZ`]
526 $ST $B,[$ctx+`1*$SZ`]
531 $ST $C,[$ctx+`2*$SZ`]
533 $ST $D,[$ctx+`3*$SZ`]
535 ld [$ctx+`4*$SZ+0`],%l0
536 ld [$ctx+`4*$SZ+4`],%l1
537 ld [$ctx+`5*$SZ+0`],%l2
538 ld [$ctx+`5*$SZ+4`],%l3
539 ld [$ctx+`6*$SZ+0`],%l4
540 ld [$ctx+`6*$SZ+4`],%l5
541 ld [$ctx+`7*$SZ+0`],%l6
544 ld [$ctx+`7*$SZ+4`],%l7
550 $ST $E,[$ctx+`4*$SZ`]
552 $ST $F,[$ctx+`5*$SZ`]
557 $ST $G,[$ctx+`6*$SZ`]
559 $ST $H,[$ctx+`7*$SZ`]
562 add $inp,`16*$SZ`,$inp ! advance inp
564 bne `$bits==64?"%xcc":"%icc"`,.Lloop
565 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
569 .type sha${label}_block_data_order,#function
570 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
571 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
574 $code =~ s/\`([^\`]*)\`/eval $1/gem;