2 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes of shared table]. Performance
22 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
23 # and are expressed in cycles per processed byte, less is better:
25 # gcc 3.3.x cc 5.2 this assembler
27 # 32-bit build 81.4 43.3 12.6 (+546%/+244%)
28 # 64-bit build 20.2 21.2 12.6 (+60%/+68%)
30 # Here is data collected on UltraSPARC T1 system running Linux:
32 # gcc 4.4.1 this assembler
34 # 32-bit build 566 50 (+1000%)
35 # 64-bit build 56 50 (+12%)
# I don't quite understand why the difference between 32-bit and 64-bit
38 # compiler-generated code is so big. Compilers *were* instructed to
39 # generate code for UltraSPARC and should have used 64-bit registers
40 # for Z vector (see C code) even in 32-bit build... Oh well, it only
41 # means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled with respect to
43 # references to input data and Z.hi updates to achieve 12 cycles
44 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
45 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
49 # Add VIS3 lookup-table-free implementation using polynomial
50 # multiplication xmulx[hi] and extended addition addxc[cc]
51 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
52 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
53 # saturates at ~15.5x single-process result on 8-core processor,
54 # or ~20.5GBps per 2.85GHz socket.
# Optional first command-line argument names the output file.  The
# generated assembler is printed to STDOUT, so redirect STDOUT there
# when an argument is given.  Use 3-argument open with an error check
# instead of the original unchecked 2-argument open (which reported
# nothing on failure and interpolated the filename into the mode).
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
# Perl variables naming the SPARC registers referenced by the 4-bit
# GHASH assembler templates below; the strings are interpolated
# directly into the emitted code.
$Zhi="%o0"; # 64-bit values
$nhi="%l0"; # small values and pointers
$Xi="%i0"; # input argument block
84 # define __ASSEMBLER__ 1
86 #include "crypto/sparc_arch.h"
89 .register %g2,#scratch
90 .register %g3,#scratch
93 .section ".text",#alloc,#execinstr
97 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
98 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
99 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
100 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
101 .type rem_4bit,#object
102 .size rem_4bit,(.-rem_4bit)
104 .globl gcm_ghash_4bit
115 add %o7,rem_4bit-1b,$rem_4bit
122 ldx [$Htblo+$nlo],$Zlo
123 ldx [$Htbl+$nlo],$Zhi
127 ldx [$Htblo+$nhi],$Tlo
129 ldx [$Htbl+$nhi],$Thi
131 ldx [$rem_4bit+$remi],$rem
147 ldx [$Htblo+$nlo],$Tlo
150 ldx [$Htbl+$nlo],$Thi
153 ldx [$rem_4bit+$remi],$rem
156 ldub [$inp+$cnt],$nlo
163 ldx [$Htblo+$nhi],$Tlo
166 ldx [$Htbl+$nhi],$Thi
168 ldx [$rem_4bit+$remi],$rem
181 ldx [$Htblo+$nlo],$Tlo
184 ldx [$Htbl+$nlo],$Thi
187 ldx [$rem_4bit+$remi],$rem
196 be,pn SIZE_T_CC,.Ldone
199 ldx [$Htblo+$nhi],$Tlo
202 ldx [$Htbl+$nhi],$Thi
204 ldx [$rem_4bit+$remi],$rem
220 ldx [$Htblo+$nhi],$Tlo
223 ldx [$Htbl+$nhi],$Thi
225 ldx [$rem_4bit+$remi],$rem
237 .type gcm_ghash_4bit,#function
238 .size gcm_ghash_4bit,(.-gcm_ghash_4bit)
245 .globl gcm_gmult_4bit
253 add %o7,rem_4bit-1b,$rem_4bit
258 ldx [$Htblo+$nlo],$Zlo
259 ldx [$Htbl+$nlo],$Zhi
263 ldx [$Htblo+$nhi],$Tlo
265 ldx [$Htbl+$nhi],$Thi
267 ldx [$rem_4bit+$remi],$rem
282 ldx [$Htblo+$nlo],$Tlo
285 ldx [$Htbl+$nlo],$Thi
288 ldx [$rem_4bit+$remi],$rem
297 ldx [$Htblo+$nhi],$Tlo
300 ldx [$Htbl+$nhi],$Thi
302 ldx [$rem_4bit+$remi],$rem
314 ldx [$Htblo+$nlo],$Tlo
317 ldx [$Htbl+$nlo],$Thi
320 ldx [$rem_4bit+$remi],$rem
328 ldx [$Htblo+$nhi],$Tlo
331 ldx [$Htbl+$nhi],$Thi
333 ldx [$rem_4bit+$remi],$rem
345 .type gcm_gmult_4bit,#function
346 .size gcm_gmult_4bit,(.-gcm_gmult_4bit)
350 # Straightforward 128x128-bit multiplication using Karatsuba algorithm
351 # followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed breaking the dependency between reductions and removing one
353 # multiplication from critical path]. While it might be suboptimal
354 # with regard to sheer number of multiplications, other methods [such
355 # as aggregate reduction] would require more 64-bit registers, which
356 # we don't have in 32-bit application context.
# Register assignments for the VIS3 code path.  Input arguments arrive
# in the %i registers; working values use %o0-%o5/%o7 and %g1-%g5
# (needed because the Karatsuba arrangement wants many 64-bit
# temporaries); $shl/$shr take local registers.  Note map over (0..7)
# yields eight names but only the first two are consumed here.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
($shl,$shr)=map("%l$_",(0..7));
365 # For details regarding "twisted H" see ghash-x86.pl.
377 srax $Hhi,63,$C0 ! broadcast carry
378 addcc $Hlo,$Hlo,$Hlo ! H<<=1
384 stx $Hlo,[%i0+8] ! save twisted H
387 sethi %hi(0xA0406080),$V
388 sethi %hi(0x20C0E000),%l0
389 or $V,%lo(0xA0406080),$V
390 or %l0,%lo(0x20C0E000),%l0
392 or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
397 .type gcm_init_vis3,#function
398 .size gcm_init_vis3,.-gcm_init_vis3
400 .globl gcm_gmult_vis3
405 ldx [$Xip+8],$Xlo ! load Xi
407 ldx [$Htable+8],$Hlo ! load twisted H
411 sllx %l7,57,$xE1 ! 57 is not a typo
412 ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
414 xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
416 xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
418 xmulxhi $Xlo,$Hlo,$Xlo
420 xmulxhi $Xhi,$Hhi,$C3
424 srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
426 sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
428 xor $C0,$C1,$C1 ! Karatsuba post-processing
430 xor $sqr,$Xlo,$Xlo ! real destination is $C1
436 xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
446 stx $C2,[$Xip+8] ! save Xi
451 .type gcm_gmult_vis3,#function
452 .size gcm_gmult_vis3,.-gcm_gmult_vis3
454 .globl gcm_ghash_vis3
459 srln $len,0,$len ! needed on v8+, "nop" on v9
461 ldx [$Xip+8],$C2 ! load Xi
463 ldx [$Htable+8],$Hlo ! load twisted H
467 sllx %l7,57,$xE1 ! 57 is not a typo
468 ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
473 prefetch [$inp+63], 20
476 xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
482 ldx [$inp+16],$C1 ! align data
494 prefetch [$inp+63], 20
497 xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
499 xmulxhi $Xlo,$Hlo,$Xlo
501 xmulxhi $Xhi,$Hhi,$C3
505 srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
507 sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
509 xor $C0,$C1,$C1 ! Karatsuba post-processing
511 xor $sqr,$Xlo,$Xlo ! real destination is $C1
517 xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
528 stx $C2,[$Xip+8] ! save Xi
533 .type gcm_ghash_vis3,#function
534 .size gcm_ghash_vis3,.-gcm_ghash_vis3
538 .asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
543 # Purpose of these subroutines is to explicitly encode VIS instructions,
544 # so that one can compile the module without having to specify VIS
545 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option of producing a "universal" binary and let
547 # programmer detect if current CPU is VIS capable at run-time.
549 my ($mnemonic,$rs1,$rs2,$rd)=@_;
550 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
552 my %visopf = ( "addxc" => 0x011,
555 "xmulxhi" => 0x116 );
557 $ref = "$mnemonic\t$rs1,$rs2,$rd";
559 if ($opf=$visopf{$mnemonic}) {
560 foreach ($rs1,$rs2,$rd) {
561 return $ref if (!/%([goli])([0-9])/);
565 return sprintf ".word\t0x%08x !%s",
566 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
573 foreach (split("\n",$code)) {
574 s/\`([^\`]*)\`/eval $1/ge;
576 s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
583 close STDOUT or die "error closing STDOUT: $!";