2 # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
16 # Hardware SPARC T4 support by David S. Miller
17 # ====================================================================
19 # SHA256 performance improvement over compiler-generated code varies
20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
21 # build]. Just like in the SHA1 module, I aim to ensure scalability on
22 # UltraSPARC T1 by packing X[16] into 8 64-bit registers.
24 # SHA512 on pre-T1 UltraSPARC.
26 # Performance is >75% better than 64-bit code generated by Sun C and
27 # over 2x better than 32-bit code. X[16] resides on the stack, but
28 # access to it is scheduled for L2 latency and staged through the 32
29 # least significant bits of %l0-%l7. The latter is done to achieve
30 # 32-/64-bit ABI duality. Nevertheless it's ~40% faster than SHA256,
31 # which is pretty good [optimal coefficient is 50%].
33 # SHA512 on UltraSPARC T1.
35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
36 # because 64-bit code generator has the advantage of using 64-bit
37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
39 # code by 60%, not to mention that it doesn't suffer from severe decay
40 # when running four threads per physical core, and that it leaves gcc
41 # [3.4] behind by a factor of over 4x! Compared to SHA256, single-thread
42 # performance is only 10% better, but overall throughput for the maximum
43 # number of threads for a given CPU exceeds the corresponding SHA256
44 # figure by 30% [again, optimal coefficient is 50%].
46 # (*) Unlike on pre-T1 UltraSPARC, loads on T1 are executed strictly
47 # in order, i.e. a load instruction has to complete before the next
48 # instruction in the given thread is executed, even if the latter
49 # does not depend on the load result! This means that on T1 two
50 # 32-bit loads are always slower than one 64-bit load. Once again
51 # this is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
52 # 2x32-bit loads can be as fast as 1x64-bit ones.
54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
56 # saturates at 11.5x single-process result on 8-core processor, or
57 # ~11/16GBps per 2.85GHz socket.
59 # $output is the last argument if it looks like a file (it has an extension)
60 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # undef => keep the inherited STDOUT
62 if ($output) { open STDOUT,">",$output or die "can't open $output: $!"; } # 3-arg open; die instead of silently writing to the old STDOUT
64 if ($output =~ /512/) { # SHA-512 parameter set is chosen when the output file name contains "512"
67 $LD="ldx"; # load from memory
68 $ST="stx"; # store to memory
69 $SLL="sllx"; # shift left logical
70 $SRL="srlx"; # shift right logical
73 @sigma0=( 7, 1, 8); # right shift first
74 @sigma1=( 6,19,61); # right shift first
79 $locals=16*$SZ; # X[16]
89 @V=($A,$B,$C,$D,$E,$F,$G,$H); # working variables; the round loops rotate this list after every round
93 $LD="ld"; # load from memory
94 $ST="st"; # store to memory
95 $SLL="sll"; # shift left logical
96 $SRL="srl"; # shift right logical
99 @sigma0=( 3, 7,18); # right shift first
100 @sigma1=(10,17,19); # right shift first
105 $locals=0; # X[16] is register resident
106 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7"); # eight registers for X[16]; round code indexes them as @X[$i/2], i.e. two words per register
116 @V=($A,$B,$C,$D,$E,$F,$G,$H); # working variables; the round loops rotate this list after every round
132 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
142 subcc %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
147 sllx @X[0],$tmp31,@X[0]
152 srlx @X[$j+1],$tmp32,$tmp1
153 sllx @X[$j+1],$tmp31,@X[$j+1]
154 or $tmp1,@X[$j],@X[$j]
165 $code.="\tadd @X[$i/2],$h,$T1\n";
167 $code.="\tsrlx @X[$i/2],32,$T1\n\tadd $h,$T1,$T1\n";
173 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
174 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
176 $code.=<<___ if ($i==0);
187 $code.=<<___ if ($i<15);
188 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
190 sllx @pair[0],$tmp0,$tmp1
191 `"ld [$inp+".eval(32+0+$i*8)."],@pair[0]" if ($i<12)`
192 srlx @pair[2],$tmp32,@pair[1]
194 or @pair[1],$tmp2,$tmp2
195 `"ld [$inp+".eval(32+4+$i*8)."],@pair[1]" if ($i<12)`
197 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
199 $code.=<<___ if ($i==12);
203 $code.=<<___ if ($i==15);
204 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
205 sllx @pair[1],$tmp31,$tmp2 ! Xload($i)
207 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
208 sllx @pair[0],$tmp0,$tmp1
209 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
210 srlx @pair[2],$tmp32,@pair[1]
212 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
213 or @pair[1],$tmp2,$tmp2
214 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
216 $ST $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
217 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
218 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
219 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
225 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
230 $code.="\tadd $h,$T1,$T1\n";
234 $SRL $e,@Sigma1[0],$h !! $i
236 $SLL $e,`$SZ*8-@Sigma1[2]`,$tmp1
238 $SRL $e,@Sigma1[1],$tmp0
240 $SLL $e,`$SZ*8-@Sigma1[1]`,$tmp1
242 $SRL $e,@Sigma1[2],$tmp0
244 $SLL $e,`$SZ*8-@Sigma1[0]`,$tmp1
246 xor $g,$tmp2,$tmp2 ! Ch(e,f,g)
247 xor $tmp1,$h,$tmp0 ! Sigma1(e)
249 $SRL $a,@Sigma0[0],$h
251 $LD [$Ktbl+`$i*$SZ`],$tmp2 ! K[$i]
252 $SLL $a,`$SZ*8-@Sigma0[2]`,$tmp1
254 $SRL $a,@Sigma0[1],$tmp0
256 $SLL $a,`$SZ*8-@Sigma0[1]`,$tmp1
258 $SRL $a,@Sigma0[2],$tmp0
260 $SLL $a,`$SZ*8-@Sigma0[0]`,$tmp1
262 xor $tmp1,$h,$h ! Sigma0(a)
267 or $tmp0,$tmp1,$tmp1 ! Maj(a,b,c)
268 add $tmp2,$T1,$T1 ! +=K[$i]
283 $code.="\tsrlx @X[(($i+1)/2)%8],32,$xi\n";
285 $xi=@X[(($i+1)/2)%8];
288 srl $xi,@sigma0[0],$T1 !! Xupdate($i)
289 sll $xi,`32-@sigma0[2]`,$tmp1
290 srl $xi,@sigma0[1],$tmp0
292 sll $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
294 srl $xi,@sigma0[2],$tmp0
298 $xi=@X[(($i+14)/2)%8];
301 $code.="\tsrlx @X[(($i+14)/2)%8],32,$xi\n";
304 srl $xi,@sigma1[0],$tmp2
305 xor $tmp0,$T1,$T1 ! T1=sigma0(X[i+1])
306 sll $xi,`32-@sigma1[2]`,$tmp1
307 srl $xi,@sigma1[1],$tmp0
308 xor $tmp1,$tmp2,$tmp2
309 sll $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
310 xor $tmp0,$tmp2,$tmp2
311 srl $xi,@sigma1[2],$tmp0
312 xor $tmp1,$tmp2,$tmp2
317 srlx @X[(($i+9)/2)%8],32,$tmp1 ! X[i+9]
318 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
319 srl @X[($i/2)%8],0,$tmp0
320 add $tmp2,$tmp1,$tmp1
321 add $xi,$T1,$T1 ! +=X[i]
322 xor $tmp0,@X[($i/2)%8],@X[($i/2)%8]
326 or $T1,@X[($i/2)%8],@X[($i/2)%8]
329 $xi=@X[(($i+9)/2)%8];
331 srlx @X[($i/2)%8],32,$tmp1 ! X[i]
332 xor $tmp0,$tmp2,$tmp2 ! sigma1(X[i+14])
333 add $xi,$T1,$T1 ! +=X[i+9]
334 add $tmp2,$tmp1,$tmp1
335 srl @X[($i/2)%8],0,@X[($i/2)%8]
339 or $tmp0,@X[($i/2)%8],@X[($i/2)%8]
348 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
351 sllx %l2,32,$tmp0 !! Xupdate($i)
354 srlx $tmp0,@sigma0[0],$T1
355 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
356 sllx $tmp0,`64-@sigma0[2]`,$tmp1
357 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
358 srlx $tmp0,@sigma0[1],$tmp0
360 sllx $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
362 srlx $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
365 xor $tmp0,$T1,$T1 ! sigma0(X[$i+1])
368 srlx $tmp2,@sigma1[0],$tmp1
369 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
370 sllx $tmp2,`64-@sigma1[2]`,$tmp0
371 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
372 srlx $tmp2,@sigma1[1],$tmp2
373 xor $tmp0,$tmp1,$tmp1
374 sllx $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
375 xor $tmp2,$tmp1,$tmp1
376 srlx $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
377 xor $tmp0,$tmp1,$tmp1
379 xor $tmp2,$tmp1,$tmp1 ! sigma1(X[$i+14])
380 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
382 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
386 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
388 add $tmp0,$T1,$T1 ! +=X[$i+9]
389 ld [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
390 add $tmp2,$T1,$T1 ! +=X[$i]
391 $ST $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
397 #ifndef __ASSEMBLER__
398 # define __ASSEMBLER__ 1
400 #include "crypto/sparc_arch.h"
403 .register %g2,#scratch
404 .register %g3,#scratch
407 .section ".text",#alloc,#execinstr
411 .type K${label},#object
415 .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
416 .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
417 .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
418 .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
419 .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
420 .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
421 .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
422 .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
423 .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
424 .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
425 .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
426 .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
427 .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
428 .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
429 .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
430 .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
434 .long 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
435 .long 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
436 .long 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
437 .long 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
438 .long 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
439 .long 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
440 .long 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
441 .long 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
442 .long 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
443 .long 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
444 .long 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
445 .long 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
446 .long 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
447 .long 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
448 .long 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
449 .long 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
450 .long 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
451 .long 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
452 .long 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
453 .long 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
454 .long 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
455 .long 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
456 .long 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
457 .long 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
458 .long 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
459 .long 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
460 .long 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
461 .long 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
462 .long 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
463 .long 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
464 .long 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
465 .long 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
466 .long 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
467 .long 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
468 .long 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
469 .long 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
470 .long 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
471 .long 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
472 .long 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
473 .long 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
477 .size K${label},.-K${label}
483 .globl sha${label}_block_data_order
485 sha${label}_block_data_order:
486 SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
487 ld [%g1+4],%g1 ! OPENSSL_sparcv9cap_P[1]
489 andcc %g1, CFR_SHA${label}, %g0
493 $code.=<<___ if ($SZ==8); # SHA512
494 ldd [%o0 + 0x00], %f0 ! load context
495 ldd [%o0 + 0x08], %f2
496 ldd [%o0 + 0x10], %f4
497 ldd [%o0 + 0x18], %f6
498 ldd [%o0 + 0x20], %f8
499 ldd [%o0 + 0x28], %f10
501 ldd [%o0 + 0x30], %f12
502 bne,pn %icc, .Lhwunaligned
503 ldd [%o0 + 0x38], %f14
506 ldd [%o1 + 0x00], %f16
507 ldd [%o1 + 0x08], %f18
508 ldd [%o1 + 0x10], %f20
509 ldd [%o1 + 0x18], %f22
510 ldd [%o1 + 0x20], %f24
511 ldd [%o1 + 0x28], %f26
512 ldd [%o1 + 0x30], %f28
513 ldd [%o1 + 0x38], %f30
514 ldd [%o1 + 0x40], %f32
515 ldd [%o1 + 0x48], %f34
516 ldd [%o1 + 0x50], %f36
517 ldd [%o1 + 0x58], %f38
518 ldd [%o1 + 0x60], %f40
519 ldd [%o1 + 0x68], %f42
520 ldd [%o1 + 0x70], %f44
521 subcc %o2, 1, %o2 ! done yet?
522 ldd [%o1 + 0x78], %f46
524 prefetch [%o1 + 63], 20
525 prefetch [%o1 + 64+63], 20
527 .word 0x81b02860 ! SHA512
529 bne,pt SIZE_T_CC, .Lhwaligned_loop
533 std %f0, [%o0 + 0x00] ! store context
534 std %f2, [%o0 + 0x08]
535 std %f4, [%o0 + 0x10]
536 std %f6, [%o0 + 0x18]
537 std %f8, [%o0 + 0x20]
538 std %f10, [%o0 + 0x28]
539 std %f12, [%o0 + 0x30]
541 std %f14, [%o0 + 0x38]
545 alignaddr %o1, %g0, %o1
547 ldd [%o1 + 0x00], %f18
549 ldd [%o1 + 0x08], %f20
550 ldd [%o1 + 0x10], %f22
551 ldd [%o1 + 0x18], %f24
552 ldd [%o1 + 0x20], %f26
553 ldd [%o1 + 0x28], %f28
554 ldd [%o1 + 0x30], %f30
555 ldd [%o1 + 0x38], %f32
556 ldd [%o1 + 0x40], %f34
557 ldd [%o1 + 0x48], %f36
558 ldd [%o1 + 0x50], %f38
559 ldd [%o1 + 0x58], %f40
560 ldd [%o1 + 0x60], %f42
561 ldd [%o1 + 0x68], %f44
562 ldd [%o1 + 0x70], %f46
563 ldd [%o1 + 0x78], %f48
564 subcc %o2, 1, %o2 ! done yet?
565 ldd [%o1 + 0x80], %f50
567 prefetch [%o1 + 63], 20
568 prefetch [%o1 + 64+63], 20
570 faligndata %f18, %f20, %f16
571 faligndata %f20, %f22, %f18
572 faligndata %f22, %f24, %f20
573 faligndata %f24, %f26, %f22
574 faligndata %f26, %f28, %f24
575 faligndata %f28, %f30, %f26
576 faligndata %f30, %f32, %f28
577 faligndata %f32, %f34, %f30
578 faligndata %f34, %f36, %f32
579 faligndata %f36, %f38, %f34
580 faligndata %f38, %f40, %f36
581 faligndata %f40, %f42, %f38
582 faligndata %f42, %f44, %f40
583 faligndata %f44, %f46, %f42
584 faligndata %f46, %f48, %f44
585 faligndata %f48, %f50, %f46
587 .word 0x81b02860 ! SHA512
589 bne,pt SIZE_T_CC, .Lhwunaligned_loop
590 for %f50, %f50, %f18 ! %f18=%f50
595 $code.=<<___ if ($SZ==4); # SHA256
604 bne,pn %icc, .Lhwunaligned
608 ldd [%o1 + 0x00], %f8
609 ldd [%o1 + 0x08], %f10
610 ldd [%o1 + 0x10], %f12
611 ldd [%o1 + 0x18], %f14
612 ldd [%o1 + 0x20], %f16
613 ldd [%o1 + 0x28], %f18
614 ldd [%o1 + 0x30], %f20
615 subcc %o2, 1, %o2 ! done yet?
616 ldd [%o1 + 0x38], %f22
618 prefetch [%o1 + 63], 20
620 .word 0x81b02840 ! SHA256
622 bne,pt SIZE_T_CC, .Lhwloop
626 st %f0, [%o0 + 0x00] ! store context
638 alignaddr %o1, %g0, %o1
640 ldd [%o1 + 0x00], %f10
642 ldd [%o1 + 0x08], %f12
643 ldd [%o1 + 0x10], %f14
644 ldd [%o1 + 0x18], %f16
645 ldd [%o1 + 0x20], %f18
646 ldd [%o1 + 0x28], %f20
647 ldd [%o1 + 0x30], %f22
648 ldd [%o1 + 0x38], %f24
649 subcc %o2, 1, %o2 ! done yet?
650 ldd [%o1 + 0x40], %f26
652 prefetch [%o1 + 63], 20
654 faligndata %f10, %f12, %f8
655 faligndata %f12, %f14, %f10
656 faligndata %f14, %f16, %f12
657 faligndata %f16, %f18, %f14
658 faligndata %f18, %f20, %f16
659 faligndata %f20, %f22, %f18
660 faligndata %f22, %f24, %f20
661 faligndata %f24, %f26, %f22
663 .word 0x81b02840 ! SHA256
665 bne,pt SIZE_T_CC, .Lhwunaligned_loop
666 for %f26, %f26, %f10 ! %f10=%f26
674 save %sp,-STACK_FRAME-$locals,%sp
675 and $inp,`$align-1`,$tmp31
676 sllx $len,`log(16*$SZ)/log(2)`,$len
677 andn $inp,`$align-1`,$inp
681 $code.=<<___ if ($SZ==8); # SHA512
683 sub $tmp32,$tmp31,$tmp32
687 add %o7,K${label}-.Lpic,$Ktbl
689 $LD [$ctx+`0*$SZ`],$A
690 $LD [$ctx+`1*$SZ`],$B
691 $LD [$ctx+`2*$SZ`],$C
692 $LD [$ctx+`3*$SZ`],$D
693 $LD [$ctx+`4*$SZ`],$E
694 $LD [$ctx+`5*$SZ`],$F
695 $LD [$ctx+`6*$SZ`],$G
696 $LD [$ctx+`7*$SZ`],$H
700 for ($i=0;$i<16;$i++) { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
702 for (;$i<32;$i++) { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
704 and $tmp2,0xfff,$tmp2
707 add $Ktbl,`16*$SZ`,$Ktbl ! Ktbl+=16
710 $code.=<<___ if ($SZ==4); # SHA256
711 $LD [$ctx+`0*$SZ`],@X[0]
712 $LD [$ctx+`1*$SZ`],@X[1]
713 $LD [$ctx+`2*$SZ`],@X[2]
714 $LD [$ctx+`3*$SZ`],@X[3]
715 $LD [$ctx+`4*$SZ`],@X[4]
716 $LD [$ctx+`5*$SZ`],@X[5]
717 $LD [$ctx+`6*$SZ`],@X[6]
718 $LD [$ctx+`7*$SZ`],@X[7]
721 $ST $A,[$ctx+`0*$SZ`]
723 $ST $B,[$ctx+`1*$SZ`]
725 $ST $C,[$ctx+`2*$SZ`]
727 $ST $D,[$ctx+`3*$SZ`]
729 $ST $E,[$ctx+`4*$SZ`]
731 $ST $F,[$ctx+`5*$SZ`]
733 $ST $G,[$ctx+`6*$SZ`]
735 $ST $H,[$ctx+`7*$SZ`]
737 $code.=<<___ if ($SZ==8); # SHA512
738 ld [$ctx+`0*$SZ+0`],%l0
739 ld [$ctx+`0*$SZ+4`],%l1
740 ld [$ctx+`1*$SZ+0`],%l2
741 ld [$ctx+`1*$SZ+4`],%l3
742 ld [$ctx+`2*$SZ+0`],%l4
743 ld [$ctx+`2*$SZ+4`],%l5
744 ld [$ctx+`3*$SZ+0`],%l6
747 ld [$ctx+`3*$SZ+4`],%l7
753 $ST $A,[$ctx+`0*$SZ`]
755 $ST $B,[$ctx+`1*$SZ`]
760 $ST $C,[$ctx+`2*$SZ`]
762 $ST $D,[$ctx+`3*$SZ`]
764 ld [$ctx+`4*$SZ+0`],%l0
765 ld [$ctx+`4*$SZ+4`],%l1
766 ld [$ctx+`5*$SZ+0`],%l2
767 ld [$ctx+`5*$SZ+4`],%l3
768 ld [$ctx+`6*$SZ+0`],%l4
769 ld [$ctx+`6*$SZ+4`],%l5
770 ld [$ctx+`7*$SZ+0`],%l6
773 ld [$ctx+`7*$SZ+4`],%l7
779 $ST $E,[$ctx+`4*$SZ`]
781 $ST $F,[$ctx+`5*$SZ`]
786 $ST $G,[$ctx+`6*$SZ`]
788 $ST $H,[$ctx+`7*$SZ`]
791 add $inp,`16*$SZ`,$inp ! advance inp
794 sub $Ktbl,`($rounds-16)*$SZ`,$Ktbl ! rewind Ktbl
798 .type sha${label}_block_data_order,#function
799 .size sha${label}_block_data_order,(.-sha${label}_block_data_order)
800 .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
804 # The purpose of these subroutines is to explicitly encode VIS
805 # instructions, so that one can compile the module without specifying
806 # VIS extensions on the compiler command line, e.g. -xarch=v9 vs.
807 # -xarch=v9a. The idea is to keep the option of producing a "universal"
808 # binary and let the programmer detect VIS capability at run-time.
810 my ($mnemonic,$rs1,$rs2,$rd)=@_;
812 my %visopf = ( "faligndata" => 0x048,
815 $ref = "$mnemonic\t$rs1,$rs2,$rd";
817 if ($opf=$visopf{$mnemonic}) {
818 foreach ($rs1,$rs2,$rd) {
819 return $ref if (!/%f([0-9]{1,2})/);
822 return $ref if ($1&1);
823 # re-encode for upper double register addressing
828 return sprintf ".word\t0x%08x !%s",
829 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
836 my ($mnemonic,$rs1,$rs2,$rd)=@_;
837 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
838 my $ref="$mnemonic\t$rs1,$rs2,$rd";
840 foreach ($rs1,$rs2,$rd) {
841 if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
842 else { return $ref; }
844 return sprintf ".word\t0x%08x !%s",
845 0x81b00300|$rd<<25|$rs1<<14|$rs2,
849 foreach (split("\n",$code)) { # post-process the accumulated $code one line at a time
850 s/\`([^\`]*)\`/eval $1/ge; # replace each backquoted span with the result of evaluating its contents as Perl
852 s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
855 s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
856 &unalignaddr($1,$2,$3,$4)
862 close STDOUT or die "error closing STDOUT: $!"; # check close so buffered write errors are reported