2 # Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
9 # $output is the last argument if it looks like a file (it has an extension)
10 # $flavour is the first argument if it doesn't look like a file
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Redirect STDOUT to the requested output file (if any).
# Use three-arg open so a filename beginning with '>', '|', etc. cannot
# change the open mode, and check the result instead of failing silently.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
16 my @regs = map("x$_",(0..31));
17 my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
24 @reglookup{@regs} = @regs;
25 @reglookup{@regaliases} = @regs;
27 # Takes a register name, possibly an alias, and converts it to a register index
31 if (!exists($reglookup{$reg})) {
32 die("Unknown register ".$reg);
34 my $regstr = $reglookup{$reg};
35 if (!($regstr =~ /^x([0-9]+)$/)) {
36 die("Could not process register ".$reg);
42 # Encoding for rev8 rd, rs instruction on RV64
43 # XXXXXXXXXXXXX_ rs _XXX_ rd _XXXXXXX
44 my $template = 0b011010111000_00000_101_00000_0010011;
45 my $rd = read_reg shift;
46 my $rs = read_reg shift;
48 return ".word ".($template | ($rs << 15) | ($rd << 7));
52 # Encoding for clmul rd, rs1, rs2 instruction on RV64
53 # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
54 my $template = 0b0000101_00000_00000_001_00000_0110011;
55 my $rd = read_reg shift;
56 my $rs1 = read_reg shift;
57 my $rs2 = read_reg shift;
59 return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
63 # Encoding for clmulh rd, rs1, rs2 instruction on RV64
64 # XXXXXXX_ rs2 _ rs1 _XXX_ rd _XXXXXXX
65 my $template = 0b0000101_00000_00000_011_00000_0110011;
66 my $rd = read_reg shift;
67 my $rs1 = read_reg shift;
68 my $rs2 = read_reg shift;
70 return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
73 ################################################################################
74 # gcm_init_clmul_rv64i_zbb_zbc(u128 Htable[16], const u64 Xi[2])
75 # Initialization function for clmul-based implementation of GMULT
76 # This function is used in tandem with gcm_gmult_clmul_rv64i_zbb_zbc
77 ################################################################################
79 my ($Haddr,$Xi,$TEMP) = ("a0","a1","a2");
84 .globl gcm_init_clmul_rv64i_zbb_zbc
85 .type gcm_init_clmul_rv64i_zbb_zbc,\@function
86 # Initialize clmul-based implementation of galois field multiplication routine.
87 # gcm_init_clmul_rv64i_zbb_zbc(ctx->Htable, ctx->H.u)
88 gcm_init_clmul_rv64i_zbb_zbc:
89 # argument 0 = ctx->Htable (store H here)
90 # argument 1 = H.u[] (2x 64-bit words) [H_high64, H_low64]
92 # Simply store [H_high64, H_low64] for later
104 ################################################################################
105 # gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
106 # Compute GMULT (X*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
107 # extensions, and the Modified Barrett Reduction technique
108 ################################################################################
110 my ($Xi,$Haddr,$A1,$A0,$B1,$B0,$C1,$C0,$D1,$D0,$E1,$E0,$TEMP,$TEMP2,$qp_low) =
111 ("a0","a1","a2","a3","a4","a5","a6","a7","t0","t1","t2","t3","t4","t5","t6");
116 .globl gcm_gmult_clmul_rv64i_zbb_zbc
117 .type gcm_gmult_clmul_rv64i_zbb_zbc,\@function
118 # static void gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
119 # Computes product of X*H mod f
120 gcm_gmult_clmul_rv64i_zbb_zbc:
122 # Load X and H (H is saved previously in gcm_init_clmul_rv64i_zbb_zbc)
129 li $qp_low,0xe100000000000000
131 # Perform Karatsuba Multiplication to generate a 255-bit intermediate
137 # [E1:E0] = (A0+A1)*(B0+B1)
139 # A*B = [C1:C0+C1+D1+E1:D1+C0+D0+E0:D0]
141 @{[rv64_rev8 $A1, $A1]}
142 @{[rv64_clmul $C0,$A1,$B1]}
143 @{[rv64_clmulh $C1,$A1,$B1]}
145 @{[rv64_rev8 $A0,$A0]}
146 @{[rv64_clmul $D0,$A0,$B0]}
147 @{[rv64_clmulh $D1,$A0,$B0]}
152 @{[rv64_clmul $E0,$TEMP,$TEMP2]}
153 @{[rv64_clmulh $E1,$TEMP,$TEMP2]}
155 # 0th term is just C1
157 # Construct term 1 in E1 (E1 only appears in dword 1)
164 # Construct term 2 in E0 (E0 only appears in dword 2)
171 # final term is just D0
173 # X*H is now stored in [C1,E1,E0,D0]
177 # Or in the high bit of E1
182 # Or in the high bit of E0
187 # Or in the high bit of D0
195 # We want the top 128 bits of the result of c*f
196 # We'll get this by computing the low-half (most significant 128 bits in
197 # the reflected domain) of clmul(c,fs)<<1 first, then
198 # xor in c to complete the calculation
200 # AA = [AA1:AA0] = [E0,D0] = c
201 # BB = [BB1:BB0] = [qp_low,0]
202 # [CC1:CC0] = AA1*BB1
203 # [DD1:DD0] = AA0*BB0
204 # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
206 # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
207 # We only need CC0,DD1,DD0,EE0 to compute the low 128 bits of c * qp_low
210 my ($CC0,$EE0,$AA1,$AA0,$BB1) = ($A0,$B1,$E0,$D0,$qp_low);
214 @{[rv64_clmul $CC0,$AA1,$BB1]}
215 #clmul DD0,AA0,BB0 # BB0 is 0, so DD0 = 0
216 #clmulh DD1,AA0,BB0 # BB0 is 0, so DD1 = 0
218 #xor TEMP2,BB0,BB1 # TEMP2 = BB1 = qp_low
219 @{[rv64_clmul $EE0,$TEMP,$BB1]}
221 # Result is [N/A:N/A:DD1+CC0+DD0+EE0:DD0]
222 # Simplifying: [CC0+EE0:0]
224 # Shift left by 1 to correct for bit reflection
227 # xor into c = [E0,D0]
228 # Note that only E0 is affected
233 # The final step is to compute clmul(q,[qp_low:0])<<1
234 # The leftmost 128 bits are the reduced result.
235 # Once again, we use Karatsuba multiplication, but many of the terms
236 # simplify or cancel out.
237 # AA = [AA1:AA0] = [E0,D0] = c
238 # BB = [BB1:BB0] = [qp_low,0]
239 # [CC1:CC0] = AA1*BB1
240 # [DD1:DD0] = AA0*BB0
241 # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
243 # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
244 # We need CC1,CC0,DD0,DD1,EE1,EE0 to compute the leftmost 128 bits of AA*BB
248 my ($AA1,$AA0,$BB1,$CC1,$CC0,$EE1,$EE0) = ($E0,$D0,$qp_low,$A0,$A1,$C0,$B0);
252 @{[rv64_clmul $CC0,$AA1,$BB1]}
253 @{[rv64_clmulh $CC1,$AA1,$BB1]}
255 #clmul DD0,AA0,BB0 # BB0 = 0 so DD0 = 0
256 #clmulh DD1,AA0,BB0 # BB0 = 0 so DD1 = 0
259 #xor TEMP2,BB0,BB1 # BB0 = 0 so TEMP2 == BB1 == qp_low
261 @{[rv64_clmul $EE0,$TEMP,$BB1]}
262 @{[rv64_clmulh $EE1,$TEMP,$BB1]}
264 # Need the DD1+CC0+DD0+EE0 term to shift its leftmost bit into the
265 # intermediate result.
266 # This is just CC0+EE0, store it in TEMP
269 # Result is [CC1:CC0+CC1+EE1:(a single bit)]<<1
270 # Combine into [CC1:CC0]
274 # Shift 128-bit quantity, xor in [C1,E1] and store
280 @{[rv64_rev8 $CC1,$CC1]}
287 @{[rv64_rev8 $CC0,$CC0]}
298 close STDOUT or die "error closing STDOUT: $!";