3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
10 # ====================================================================
12 # SHA256 performance improvement over compiler generated code varies
13 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
14 # build]. Just like in SHA1 module I aim to ensure scalability on
15 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
17 # SHA512 on pre-T1 UltraSPARC.
19 # Performance is >75% better than 64-bit code generated by Sun C and
20 # over 2x than 32-bit code. X[16] resides on stack, but access to it
21 # is scheduled for L2 latency and staged through 32 least significant
22 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
23 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
24 # good [optimal coefficient is 50%].
26 # SHA512 on UltraSPARC T1.
28 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
29 # because 64-bit code generator has the advantage of using 64-bit
30 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
31 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
32 # code by 60%, not to mention that it doesn't suffer from severe decay
33 # when running 4 times physical cores threads and that it leaves gcc
34 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
35 # performance is only 10% better, but overall throughput for maximum
36 # amount of threads for given CPU exceeds corresponding one of SHA256
37 # by 30% [again, optimal coefficient is 50%].
39 # (*) Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
40 # in-order, i.e. a load instruction has to complete before the next
41 # instruction in the given thread is executed, even if the latter is
42 # not dependent on load result! This means that on T1 two 32-bit
43 # loads are always slower than one 64-bit load. Once again this
44 # is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
45 # 2x32-bit loads can be as fast as 1x64-bit ones.
47 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
48 # which is 9.3x/11.1x faster than software. Multi-process benchmark
49 # saturates at 11.5x single-process result on 8-core processor, or
50 # ~11/16GBps per 2.85GHz socket.
# Pick up the target word size from the compiler flags on the command
# line: -m64 (gcc) or -xarch=v9 (Sun C) both select a 64-bit build.
# NOTE(review): $bits appears to rely on being initialized (presumably
# to 32) on a line outside this view -- confirm before trusting the
# numeric test below, which warns under "use warnings" if undef.
54 for (@ARGV) { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
# SPARC V9 64-bit ABI addresses the frame at %sp+2047 (STACK_BIAS) and
# needs a larger minimum register-save area than the 32-bit V8 ABI.
55 if ($bits==64) { $bias=2047; $frame=192; }
56 else { $bias=0; $frame=112; }
# Redirect STDOUT to the requested output file.  Use the 3-argument
# form of open (the 2-arg form is open to mode injection through the
# filename) and die on failure instead of silently generating nothing.
59 open STDOUT,">",$output or die "can't open $output: $!";
# Choose SHA-512 or SHA-256 code-generation parameters from the output
# file name: "512" anywhere in $output selects the 64-bit variant.
61 if ($output =~ /512/) {
# 64-bit memory-access and shift mnemonics for SHA-512.
64 $LD="ldx"; # load from memory
65 $ST="stx"; # store to memory
66 $SLL="sllx"; # shift left logical
67 $SRL="srlx"; # shift right logical
# SHA-512 message-schedule rotation/shift constants sigma0/sigma1
# (the plain right-shift amount is listed first).
70 @sigma0=( 7, 1, 8); # right shift first
71 @sigma1=( 6,19,61); # right shift first
# SHA-512 keeps X[16] on the stack: 16 words of $SZ bytes of locals.
76 $locals=16*$SZ; # X[16]
# Working variables a..h; $A..$H are defined on lines outside this view.
86 @V=($A,$B,$C,$D,$E,$F,$G,$H);
# 32-bit mnemonics for the SHA-256 variant.
90 $LD="ld"; # load from memory
91 $ST="st"; # store to memory
92 $SLL="sll"; # shift left logical
93 $SRL="srl"; # shift right logical
# SHA-256 sigma0/sigma1 shift constants (right-shift amount first).
96 @sigma0=( 3, 7,18); # right shift first
97 @sigma1=(10,17,19); # right shift first
# SHA-256 packs X[16] into eight 64-bit registers, so no stack locals.
102 $locals=0; # X[16] is register resident
103 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
113 @V=($A,$B,$C,$D,$E,$F,$G,$H);
129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
139 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
144 sllx @X[0],$tmp31,@X[0]
149 srlx @X[$j+1],$tmp32,$tmp1
150 sllx @X[$j+1],$tmp31,@X[$j+1]
151 or $tmp1,@X[$j],@X[$j]
162 $code.="\tadd @X[$i/2],$h,$T1\n";
164 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
170 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
171 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
173 $code.=<<___ if ($i==0);
184 $code.=<<___ if ($i<15);
185 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
187 sllx @pair[0],$tmp0,$tmp1
188 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
189 srlx @pair[2],$tmp32,@pair[1]
191 or @pair[1],$tmp2,$tmp2
192 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
194 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
196 $code.=<<___ if ($i==12);
200 $code.=<<___ if ($i==15);
201 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
202 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
204 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
205 sllx @pair[0],$tmp0,$tmp1
206 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
207 srlx @pair[2],$tmp32,@pair[1]
209 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
210 or @pair[1],$tmp2,$tmp2
211 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
213 $ST $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
214 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
215 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
216 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
222 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
227 $code.="\tadd $h,$T1,$T1\n";
231 $SRL $e,@Sigma1[0],$h !! $i
233 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
235 $SRL $e,@Sigma1[1],$tmp0
237 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
239 $SRL $e,@Sigma1[2],$tmp0
241 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
243 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
244 xor $tmp1,$h,$tmp0 ! Sigma1(e)
246 $SRL $a,@Sigma0[0],$h
248 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
249 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
251 $SRL $a,@Sigma0[1],$tmp0
253 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
255 $SRL $a,@Sigma0[2],$tmp0
257 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
259 xor $tmp1,$h,$h ! Sigma0(a)
264 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
265 add $tmp2,$T1,$T1 ! +=K[$i]
280 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
282 $xi=@X[(($i+1)/2)%8];
285 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
286 sll $xi,`32-@sigma0[2]`,$tmp1
287 srl $xi,@sigma0[1],$tmp0
289 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
291 srl $xi,@sigma0[2],$tmp0
295 $xi=@X[(($i+14)/2)%8];
298 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
301 srl $xi,@sigma1[0],$tmp2
302 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
303 sll $xi,`32-@sigma1[2]`,$tmp1
304 srl $xi,@sigma1[1],$tmp0
305 xor $tmp1,$tmp2,$tmp2
306 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
307 xor $tmp0,$tmp2,$tmp2
308 srl $xi,@sigma1[2],$tmp0
309 xor $tmp1,$tmp2,$tmp2
314 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
315 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
316 srl @X[($i/2)%8],0,$tmp0
317 add $tmp2,$tmp1,$tmp1
318 add $xi,$T1,$T1 ! +=X[i]
319 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
323 or $T1,@X[($i/2)%8],@X[($i/2)%8]
326 $xi=@X[(($i+9)/2)%8];
328 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
329 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
330 add $xi,$T1,$T1 ! +=X[i+9]
331 add $tmp2,$tmp1,$tmp1
332 srl @X[($i/2)%8],0,@X[($i/2)%8]
336 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
345 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
348 sllx %l2,32,$tmp0 !! Xupdate($i)
351 srlx $tmp0,@sigma0[0],$T1
352 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
353 sllx $tmp0,`64-@sigma0[2]`,$tmp1
354 ld [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
355 srlx $tmp0,@sigma0[1],$tmp0
357 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
359 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
362 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
365 srlx $tmp2,@sigma1[0],$tmp1
366 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
367 sllx $tmp2,`64-@sigma1[2]`,$tmp0
368 ld [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
369 srlx $tmp2,@sigma1[1],$tmp2
370 xor $tmp0,$tmp1,$tmp1
371 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
372 xor $tmp2,$tmp1,$tmp1
373 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
374 xor $tmp0,$tmp1,$tmp1
376 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
377 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
379 ld [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
383 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
385 add $tmp0,$T1,$T1 ! +=X[$i+9]
386 ld [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
387 add $tmp2,$T1,$T1 ! +=X[$i]
388 $ST $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
393 $code.=<<___ if ($bits==64);
394 .register %g2,#scratch
395 .register %g3,#scratch
398 #include "sparc_arch.h"
400 .section ".text",#alloc,#execinstr
404 .type K${label},#object
408 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
409 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
410 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
411 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
412 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
413 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
414 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
415 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
416 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
417 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
418 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
419 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
420 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
421 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
422 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
423 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
427 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
428 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
429 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
430 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
431 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
432 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
433 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
434 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
435 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
436 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
437 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
438 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
439 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
440 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
441 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
442 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
443 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
444 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
445 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
446 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
447 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
448 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
449 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
450 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
451 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
452 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
453 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
454 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
455 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
456 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
457 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
458 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
459 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
460 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
461 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
462 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
463 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
464 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
465 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
466 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
470 .size K${label},.-K${label}
476 .globl sha${label}_block_data_order
478 sha${label}_block_data_order:
479 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
480 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
482 andcc %g1, CFR_SHA${label}, %g0
486 $code.=<<___ if ($SZ==8); # SHA512
487 ldd [%o0 + 0x00], %f0 ! load context
488 ldd [%o0 + 0x08], %f2
489 ldd [%o0 + 0x10], %f4
490 ldd [%o0 + 0x18], %f6
491 ldd [%o0 + 0x20], %f8
492 ldd [%o0 + 0x28], %f10
494 ldd [%o0 + 0x30], %f12
495 bne,pn %icc, .Lhwunaligned
496 ldd [%o0 + 0x38], %f14
499 ldd [%o1 + 0x00], %f16
500 ldd [%o1 + 0x08], %f18
501 ldd [%o1 + 0x10], %f20
502 ldd [%o1 + 0x18], %f22
503 ldd [%o1 + 0x20], %f24
504 ldd [%o1 + 0x28], %f26
505 ldd [%o1 + 0x30], %f28
506 ldd [%o1 + 0x38], %f30
507 ldd [%o1 + 0x40], %f32
508 ldd [%o1 + 0x48], %f34
509 ldd [%o1 + 0x50], %f36
510 ldd [%o1 + 0x58], %f38
511 ldd [%o1 + 0x60], %f40
512 ldd [%o1 + 0x68], %f42
513 ldd [%o1 + 0x70], %f44
514 subcc %o2, 1, %o2 ! done yet?
515 ldd [%o1 + 0x78], %f46
517 prefetch [%o1 + 63], 20
518 prefetch [%o1 + 64+63], 20
520 .word 0x81b02860 ! SHA512
522 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
526 std %f0, [%o0 + 0x00] ! store context
527 std %f2, [%o0 + 0x08]
528 std %f4, [%o0 + 0x10]
529 std %f6, [%o0 + 0x18]
530 std %f8, [%o0 + 0x20]
531 std %f10, [%o0 + 0x28]
532 std %f12, [%o0 + 0x30]
534 std %f14, [%o0 + 0x38]
538 alignaddr %o1, %g0, %o1
540 ldd [%o1 + 0x00], %f18
542 ldd [%o1 + 0x08], %f20
543 ldd [%o1 + 0x10], %f22
544 ldd [%o1 + 0x18], %f24
545 ldd [%o1 + 0x20], %f26
546 ldd [%o1 + 0x28], %f28
547 ldd [%o1 + 0x30], %f30
548 ldd [%o1 + 0x38], %f32
549 ldd [%o1 + 0x40], %f34
550 ldd [%o1 + 0x48], %f36
551 ldd [%o1 + 0x50], %f38
552 ldd [%o1 + 0x58], %f40
553 ldd [%o1 + 0x60], %f42
554 ldd [%o1 + 0x68], %f44
555 ldd [%o1 + 0x70], %f46
556 ldd [%o1 + 0x78], %f48
557 subcc %o2, 1, %o2 ! done yet?
558 ldd [%o1 + 0x80], %f50
560 prefetch [%o1 + 63], 20
561 prefetch [%o1 + 64+63], 20
563 faligndata %f18, %f20, %f16
564 faligndata %f20, %f22, %f18
565 faligndata %f22, %f24, %f20
566 faligndata %f24, %f26, %f22
567 faligndata %f26, %f28, %f24
568 faligndata %f28, %f30, %f26
569 faligndata %f30, %f32, %f28
570 faligndata %f32, %f34, %f30
571 faligndata %f34, %f36, %f32
572 faligndata %f36, %f38, %f34
573 faligndata %f38, %f40, %f36
574 faligndata %f40, %f42, %f38
575 faligndata %f42, %f44, %f40
576 faligndata %f44, %f46, %f42
577 faligndata %f46, %f48, %f44
578 faligndata %f48, %f50, %f46
580 .word 0x81b02860 ! SHA512
582 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
583 for %f50, %f50, %f18 ! %f18=%f50
588 $code.=<<___ if ($SZ==4); # SHA256
597 bne,pn %icc, .Lhwunaligned
601 ldd [%o1 + 0x00], %f8
602 ldd [%o1 + 0x08], %f10
603 ldd [%o1 + 0x10], %f12
604 ldd [%o1 + 0x18], %f14
605 ldd [%o1 + 0x20], %f16
606 ldd [%o1 + 0x28], %f18
607 ldd [%o1 + 0x30], %f20
608 subcc %o2, 1, %o2 ! done yet?
609 ldd [%o1 + 0x38], %f22
611 prefetch [%o1 + 63], 20
613 .word 0x81b02840 ! SHA256
615 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwloop
619 st %f0, [%o0 + 0x00] ! store context
631 alignaddr %o1, %g0, %o1
633 ldd [%o1 + 0x00], %f10
635 ldd [%o1 + 0x08], %f12
636 ldd [%o1 + 0x10], %f14
637 ldd [%o1 + 0x18], %f16
638 ldd [%o1 + 0x20], %f18
639 ldd [%o1 + 0x28], %f20
640 ldd [%o1 + 0x30], %f22
641 ldd [%o1 + 0x38], %f24
642 subcc %o2, 1, %o2 ! done yet?
643 ldd [%o1 + 0x40], %f26
645 prefetch [%o1 + 63], 20
647 faligndata %f10, %f12, %f8
648 faligndata %f12, %f14, %f10
649 faligndata %f14, %f16, %f12
650 faligndata %f16, %f18, %f14
651 faligndata %f18, %f20, %f16
652 faligndata %f20, %f22, %f18
653 faligndata %f22, %f24, %f20
654 faligndata %f24, %f26, %f22
656 .word 0x81b02840 ! SHA256
658 bne,pt `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
659 for %f26, %f26, %f10 ! %f10=%f26
667 save %sp,`-$frame-$locals`,%sp
668 and $inp,`$align-1`,$tmp31
669 sllx $len,`log(16*$SZ)/log(2)`,$len
670 andn $inp,`$align-1`,$inp
674 $code.=<<___ if ($SZ==8); # SHA512
676 sub $tmp32,$tmp31,$tmp32
680 add %o7,K${label}-.Lpic,$Ktbl
682 $LD [$ctx+`0*$SZ`],$A
683 $LD [$ctx+`1*$SZ`],$B
684 $LD [$ctx+`2*$SZ`],$C
685 $LD [$ctx+`3*$SZ`],$D
686 $LD [$ctx+`4*$SZ`],$E
687 $LD [$ctx+`5*$SZ`],$F
688 $LD [$ctx+`6*$SZ`],$G
689 $LD [$ctx+`7*$SZ`],$H
# Emit the 16 "load" rounds; after each round the working-variable
# tuple @V (a..h) is rotated one position, the usual trick in unrolled
# SHA implementations.
693 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
# Emit 16 message-schedule rounds per pass via $BODY_16_XX (the
# SHA-256/SHA-512-specific Xupdate body); the surrounding loop/Ktbl
# advance is on lines outside this view.
695 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
697 and $tmp2,0xfff,$tmp2
700 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
703 $code.=<<___ if ($SZ==4); # SHA256
704 $LD [$ctx+`0*$SZ`],@X[0]
705 $LD [$ctx+`1*$SZ`],@X[1]
706 $LD [$ctx+`2*$SZ`],@X[2]
707 $LD [$ctx+`3*$SZ`],@X[3]
708 $LD [$ctx+`4*$SZ`],@X[4]
709 $LD [$ctx+`5*$SZ`],@X[5]
710 $LD [$ctx+`6*$SZ`],@X[6]
711 $LD [$ctx+`7*$SZ`],@X[7]
714 $ST $A,[$ctx+`0*$SZ`]
716 $ST $B,[$ctx+`1*$SZ`]
718 $ST $C,[$ctx+`2*$SZ`]
720 $ST $D,[$ctx+`3*$SZ`]
722 $ST $E,[$ctx+`4*$SZ`]
724 $ST $F,[$ctx+`5*$SZ`]
726 $ST $G,[$ctx+`6*$SZ`]
728 $ST $H,[$ctx+`7*$SZ`]
730 $code.=<<___ if ($SZ==8); # SHA512
731 ld [$ctx+`0*$SZ+0`],%l0
732 ld [$ctx+`0*$SZ+4`],%l1
733 ld [$ctx+`1*$SZ+0`],%l2
734 ld [$ctx+`1*$SZ+4`],%l3
735 ld [$ctx+`2*$SZ+0`],%l4
736 ld [$ctx+`2*$SZ+4`],%l5
737 ld [$ctx+`3*$SZ+0`],%l6
740 ld [$ctx+`3*$SZ+4`],%l7
746 $ST $A,[$ctx+`0*$SZ`]
748 $ST $B,[$ctx+`1*$SZ`]
753 $ST $C,[$ctx+`2*$SZ`]
755 $ST $D,[$ctx+`3*$SZ`]
757 ld [$ctx+`4*$SZ+0`],%l0
758 ld [$ctx+`4*$SZ+4`],%l1
759 ld [$ctx+`5*$SZ+0`],%l2
760 ld [$ctx+`5*$SZ+4`],%l3
761 ld [$ctx+`6*$SZ+0`],%l4
762 ld [$ctx+`6*$SZ+4`],%l5
763 ld [$ctx+`7*$SZ+0`],%l6
766 ld [$ctx+`7*$SZ+4`],%l7
772 $ST $E,[$ctx+`4*$SZ`]
774 $ST $F,[$ctx+`5*$SZ`]
779 $ST $G,[$ctx+`6*$SZ`]
781 $ST $H,[$ctx+`7*$SZ`]
784 add $inp,`16*$SZ`,$inp ! advance inp
786 bne `$bits==64?"%xcc":"%icc"`,.Lloop
787 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
791 .type sha${label}_block_data_order,#function
792 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
793 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
797 # Purpose of these subroutines is to explicitly encode VIS instructions,
798 # so that one can compile the module without having to specify VIS
799 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
800 # Idea is to reserve for option to produce "universal" binary and let
801 # programmer detect if current CPU is VIS capable at run-time.
# Fragment of the VIS-instruction encoder (enclosing "sub" header is
# outside this view).  It hand-assembles a VIS op into a raw .word so
# the module builds even when the assembler lacks VIS support.
803 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Opcode-function (opf) field values per VIS mnemonic; the hash
# continues on lines not visible here.
805 my %visopf = ( "faligndata" => 0x048,
# Fallback text: emit the instruction verbatim if we cannot encode it.
808 $ref = "$mnemonic\t$rs1,$rs2,$rd";
810 if ($opf=$visopf{$mnemonic}) {
# Reduce each %fN operand to its register number; bail out to the
# textual form on anything that is not a floating-point register.
811 foreach ($rs1,$rs2,$rd) {
812 return $ref if (!/%f([0-9]{1,2})/);
# Odd register numbers cannot name a double -- give up and emit text.
815 return $ref if ($1&1);
816 # re-encode for upper double register addressing
# Pack rd/rs1/opf/rs2 into the SPARC instruction word (0x81b00000 base).
821 return sprintf ".word\t0x%08x !%s",
822 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# Fragment of the alignaddr encoder (enclosing "sub" header is outside
# this view).  Encodes alignaddr on integer registers as a raw .word.
829 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Register-file offsets for the four integer register groups:
# %g0-7 -> 0, %o0-7 -> 8, %l0-7 -> 16, %i0-7 -> 24.
830 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
# Fallback: emit the instruction textually if operands do not parse.
831 my $ref="$mnemonic\t$rs1,$rs2,$rd";
# Convert each %gN/%oN/%lN/%iN operand into its absolute register
# number; any non-matching operand means "emit as plain text".
833 foreach ($rs1,$rs2,$rd) {
834 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
835 else { return $ref; }
# Pack rd/rs1/rs2 into the instruction word (0x81b00300 = alignaddr).
837 return sprintf ".word\t0x%08x !%s",
838 0x81b00300|$rd<<25|$rs1<<14|$rs2,
842 foreach (split("\n",$code)) {
843 s/\`([^\`]*)\`/eval $1/ge;
845 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
848 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
849 &unalignaddr($1,$2,$3,$4)