crypto/modes/asm/ghash-riscv64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 # $output is the last argument if it looks like a file (it has an extension)
  10 # $flavour is the first argument if it doesn't look like a file
  11 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  12 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  13
  14 $output and open STDOUT,">$output";
  15
  16 my @regs = map("x$_",(0..31));
  17 my @regaliases = ('zero','ra','sp','gp','tp','t0','t1','t2','s0','s1',
  18     map("a$_",(0..7)),
  19     map("s$_",(2..11)),
  20     map("t$_",(3..6))
  21 );
  22
  23 my %reglookup;
  24 @reglookup{@regs} = @regs;
  25 @reglookup{@regaliases} = @regs;
  26
  27 # Takes a register name, possibly an alias, and converts it to a register index
  28 # from 0 to 31
  29 sub read_reg {
  30     my $reg = lc shift;
  31     if (!exists($reglookup{$reg})) {
  32         die("Unknown register ".$reg);
  33     }
  34     my $regstr = $reglookup{$reg};
  35     if (!($regstr =~ /^x([0-9]+)$/)) {
  36         die("Could not process register ".$reg);
  37     }
  38     return $1;
  39 }
  40
  41 sub rv64_rev8 {
  42     # Encoding for rev8 rd, rs instruction on RV64
  43     #               XXXXXXXXXXXXX_ rs  _XXX_ rd  _XXXXXXX
  44     my $template = 0b011010111000_00000_101_00000_0010011;
  45     my $rd = read_reg shift;
  46     my $rs = read_reg shift;
  47
  48     return ".word ".($template | ($rs << 15) | ($rd << 7));
  49 }
  50
  51 sub rv64_clmul {
  52     # Encoding for clmul rd, rs1, rs2 instruction on RV64
  53     #                XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
  54     my $template = 0b0000101_00000_00000_001_00000_0110011;
  55     my $rd = read_reg shift;
  56     my $rs1 = read_reg shift;
  57     my $rs2 = read_reg shift;
  58
  59     return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
  60 }
  61
  62 sub rv64_clmulh {
  63     # Encoding for clmulh rd, rs1, rs2 instruction on RV64
  64     #                XXXXXXX_ rs2 _ rs1 _XXX_ rd  _XXXXXXX
  65     my $template = 0b0000101_00000_00000_011_00000_0110011;
  66     my $rd = read_reg shift;
  67     my $rs1 = read_reg shift;
  68     my $rs2 = read_reg shift;
  69
  70     return ".word ".($template | ($rs2 << 20) | ($rs1 << 15) | ($rd << 7));
  71 }
  72
  73 ################################################################################
  74 # gcm_init_clmul_rv64i_zbb_zbc(u128 Htable[16], const u64 Xi[2])
  75 # Initialization function for clmul-based implementation of GMULT
  76 # This function is used in tandem with gcm_gmult_clmul_rv64i_zbb_zbc
  77 ################################################################################
  78 {
  79 my ($Haddr,$Xi,$TEMP) = ("a0","a1","a2");
  80
  81 $code .= <<___;
  82 .text
  83 .balign 16
  84 .globl gcm_init_clmul_rv64i_zbb_zbc
  85 .type gcm_init_clmul_rv64i_zbb_zbc,\@function
  86 # Initialize clmul-based implementation of galois field multiplication routine.
  87 # gcm_init_clmul_rv64i_zbb_zbc(ctx->Htable, ctx->H.u)
  88 gcm_init_clmul_rv64i_zbb_zbc:
  89     # argument 0 = ctx->Htable (store H here)
  90     # argument 1 = H.u[] (2x 64-bit words) [H_high64, H_low64]
  91
  92     # Simply store [H_high64, H_low64] for later
  93     ld      $TEMP,0($Xi)
  94     sd      $TEMP,0($Haddr)
  95     ld      $TEMP,8($Xi)
  96     sd      $TEMP,8($Haddr)
  97
  98     ret
  99
 100 ___
 101
 102 }
 103
 104 ################################################################################
 105 # gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
 106 # Compute GMULT (X*H mod f) using the Zbc (clmul) and Zbb (basic bit manip)
 107 # extensions, and the Modified Barrett Reduction technique
 108 ################################################################################
 109 {
 110 my ($Xi,$Haddr,$A1,$A0,$B1,$B0,$C1,$C0,$D1,$D0,$E1,$E0,$TEMP,$TEMP2,$qp_low) =
 111  ("a0","a1","a2","a3","a4","a5","a6","a7","t0","t1","t2","t3","t4","t5","t6");
 112
 113 $code .= <<___;
 114 .text
 115 .balign 16
 116 .globl gcm_gmult_clmul_rv64i_zbb_zbc
 117 .type gcm_gmult_clmul_rv64i_zbb_zbc,\@function
 118 # static void gcm_gmult_clmul_rv64i_zbb_zbc(u64 Xi[2], const u128 Htable[16])
 119 # Computes product of X*H mod f
 120 gcm_gmult_clmul_rv64i_zbb_zbc:
 121
 122     # Load X and H (H is saved previously in gcm_init_clmul_rv64i_zbb_zbc)
 123     ld              $A1,0($Xi)
 124     ld              $A0,8($Xi)
 125
 126     ld              $B1,0($Haddr)
 127     ld              $B0,8($Haddr)
 128
 129     li              $qp_low,0xe100000000000000
 130
 131     # Perform Katratsuba Multiplication to generate a 255-bit intermediate
 132     # A = [A1:A0]
 133     # B = [B1:B0]
 134     # Let:
 135     # [C1:C0] = A1*B1
 136     # [D1:D0] = A0*B0
 137     # [E1:E0] = (A0+A1)*(B0+B1)
 138     # Then:
 139     # A*B = [C1:C0+C1+D1+E1:D1+C0+D0+E0:D0]
 140
 141     @{[rv64_rev8    $A1, $A1]}
 142     @{[rv64_clmul   $C0,$A1,$B1]}
 143     @{[rv64_clmulh  $C1,$A1,$B1]}
 144
 145     @{[rv64_rev8    $A0,$A0]}
 146     @{[rv64_clmul   $D0,$A0,$B0]}
 147     @{[rv64_clmulh  $D1,$A0,$B0]}
 148
 149     xor             $TEMP,$A0,$A1
 150     xor             $TEMP2,$B0,$B1
 151
 152     @{[rv64_clmul   $E0,$TEMP,$TEMP2]}
 153     @{[rv64_clmulh  $E1,$TEMP,$TEMP2]}
 154
 155     # 0th term is just C1
 156
 157     # Construct term 1 in E1 (E1 only appears in dword 1)
 158     xor             $E1,$E1,$D1
 159     xor             $E1,$E1,$C1
 160     xor             $E1,$E1,$C0
 161
 162     # Term 1 is E1
 163
 164     # Construct term 2 in E0 (E0 only appears in dword 2)
 165     xor             $E0,$E0,$D0
 166     xor             $E0,$E0,$C0
 167     xor             $E0,$E0,$D1
 168
 169     # Term 2 is E0
 170
 171     # final term is just D0
 172
 173     # X*H is now stored in [C1,E1,E0,D0]
 174
 175     # Left-justify
 176     slli            $C1,$C1,1
 177     # Or in the high bit of E1
 178     srli            $TEMP,$E1,63
 179     or              $C1,$C1,$TEMP
 180
 181     slli            $E1,$E1,1
 182     # Or in the high bit of E0
 183     srli            $TEMP2,$E0,63
 184     or              $E1,$E1,$TEMP2
 185
 186     slli            $E0,$E0,1
 187     # Or in the high bit of D0
 188     srli            $TEMP,$D0,63
 189     or              $E0,$E0,$TEMP
 190
 191     slli            $D0,$D0,1
 192
 193     # Barrett Reduction
 194     # c = [E0, D0]
 195     # We want the top 128 bits of the result of c*f
 196     # We'll get this by computing the low-half (most significant 128 bits in
 197     # the reflected domain) of clmul(c,fs)<<1 first, then
 198     # xor in c to complete the calculation
 199
 200     # AA = [AA1:AA0] = [E0,D0] = c
 201     # BB = [BB1:BB0] = [qp_low,0]
 202     # [CC1:CC0] = AA1*BB1
 203     # [DD1:DD0] = AA0*BB0
 204     # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
 205     # Then:
 206     # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
 207     # We only need CC0,DD1,DD0,EE0 to compute the low 128 bits of c * qp_low
 208 ___
 209
 210 my ($CC0,$EE0,$AA1,$AA0,$BB1) = ($A0,$B1,$E0,$D0,$qp_low);
 211
 212 $code .= <<___;
 213
 214     @{[rv64_clmul   $CC0,$AA1,$BB1]}
 215     #clmul          DD0,AA0,BB0     # BB0 is 0, so DD0 = 0
 216     #clmulh         DD1,AA0,BB0     # BB0 is 0, so DD1 = 0
 217     xor             $TEMP,$AA0,$AA1
 218     #xor            TEMP2,BB0,BB1   # TEMP2 = BB1 = qp_low
 219     @{[rv64_clmul   $EE0,$TEMP,$BB1]}
 220
 221     # Result is [N/A:N/A:DD1+CC0+DD0+EE0:DD0]
 222     # Simplifying: [CC0+EE0:0]
 223     xor             $TEMP2,$CC0,$EE0
 224     # Shift left by 1 to correct for bit reflection
 225     slli            $TEMP2,$TEMP2,1
 226
 227     # xor into c = [E0,D0]
 228     # Note that only E0 is affected
 229     xor             $E0,$E0,$TEMP2
 230
 231     # Now, q = [E0,D0]
 232
 233     # The final step is to compute clmul(q,[qp_low:0])<<1
 234     # The leftmost 128 bits are the reduced result.
 235     # Once again, we use Karatsuba multiplication, but many of the terms
 236     # simplify or cancel out.
 237     # AA = [AA1:AA0] = [E0,D0] = c
 238     # BB = [BB1:BB0] = [qp_low,0]
 239     # [CC1:CC0] = AA1*BB1
 240     # [DD1:DD0] = AA0*BB0
 241     # [EE1:EE0] = (AA0+AA1)*(BB0+BB1)
 242     # Then:
 243     # AA*BB = [CC1:CC0+CC1+DD1+EE1:DD1+CC0+DD0+EE0:DD0]
 244     # We need CC1,CC0,DD0,DD1,EE1,EE0 to compute the leftmost 128 bits of AA*BB
 245
 246 ___
 247
 248 my ($AA1,$AA0,$BB1,$CC1,$CC0,$EE1,$EE0) = ($E0,$D0,$qp_low,$A0,$A1,$C0,$B0);
 249
 250 $code .= <<___;
 251
 252     @{[rv64_clmul   $CC0,$AA1,$BB1]}
 253     @{[rv64_clmulh  $CC1,$AA1,$BB1]}
 254
 255     #clmul          DD0,AA0,BB0   # BB0 = 0 so DD0 = 0
 256     #clmulh         DD1,AA0,BB0   # BB0 = 0 so DD1 = 0
 257
 258     xor             $TEMP,$AA0,$AA1
 259     #xor            TEMP2,BB0,BB1 # BB0 = 0 to TEMP2 == BB1 == qp_low
 260
 261     @{[rv64_clmul   $EE0,$TEMP,$BB1]}
 262     @{[rv64_clmulh  $EE1,$TEMP,$BB1]}
 263
 264     # Need the DD1+CC0+DD0+EE0 term to shift its leftmost bit into the
 265     # intermediate result.
 266     # This is just CC0+EE0, store it in TEMP
 267     xor             $TEMP,$CC0,$EE0
 268
 269     # Result is [CC1:CC0+CC1+EE1:(a single bit)]<<1
 270     # Combine into [CC1:CC0]
 271     xor             $CC0,$CC0,$CC1
 272     xor             $CC0,$CC0,$EE1
 273
 274     # Shift 128-bit quantity, xor in [C1,E1] and store
 275     slli            $CC1,$CC1,1
 276     srli            $TEMP2,$CC0,63
 277     or              $CC1,$CC1,$TEMP2
 278     # xor in C1
 279     xor             $CC1,$CC1,$C1
 280     @{[rv64_rev8    $CC1,$CC1]}
 281
 282     slli            $CC0,$CC0,1
 283     srli            $TEMP,$TEMP,63
 284     or              $CC0,$CC0,$TEMP
 285     # xor in E1
 286     xor             $CC0,$CC0,$E1
 287     @{[rv64_rev8    $CC0,$CC0]}
 288     sd              $CC1,0(a0)
 289     sd              $CC0,8(a0)
 290
 291     ret
 292 ___
 293
 294 }
 295
 296 print $code;
 297
 298 close STDOUT or die "error closing STDOUT: $!";