crypto/modes/asm/ghash-riscv64.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 use strict;
  10 use warnings;
  11
  12 use FindBin qw($Bin);
  13 use lib "$Bin";
  14 use lib "$Bin/../../perlasm";
  15 use riscv;
  16
  17 # $output is the last argument if it looks like a file (it has an extension)
  18 # $flavour is the first argument if it doesn't look like a file
     # Each one is removed from @ARGV (pop/shift) when recognised and is left
     # undef otherwise.
  19 my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  20 my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  21
     # Redirect STDOUT to the requested file. The three-argument form of open
     # keeps the file name from ever being parsed as part of the mode, and the
     # result is checked so that a failed redirect aborts the run instead of
     # letting the script continue without a usable output handle.
  22 $output and (open(STDOUT, '>', $output) or die "can't open $output: $!");
  23
     # $code accumulates the generated assembly; it starts in the .text section.
  24 my $code=<<___;
  25 .text
  26 ___
  27
  28 ################################################################################
  29 # void gcm_init_rv64i_zbc(u128 Htable[16], const u64 H[2]);
  30 # void gcm_init_rv64i_zbc__zbb(u128 Htable[16], const u64 H[2]);
  31 # void gcm_init_rv64i_zbc__zbkb(u128 Htable[16], const u64 H[2]);
  32 #
  33 # input:  H: 128-bit H - secret parameter E(K, 0^128)
  34 # output: Htable: Preprocessed key data for gcm_gmult_rv64i_zbc* and
  35 #                 gcm_ghash_rv64i_zbc*
  36 #
  37 # All callers of this function reverse the byte-order unconditionally
  38 # on little-endian machines. So we need to reverse the byte-order back.
  39 # Additionally we reverse the bits of each byte.
  40
  41 {
     # Emits gcm_init_rv64i_zbc. The two 64-bit words of H (offsets 0 and 8)
     # are loaded, each is passed through brev8_rv64i, and each is then written
     # to the same offset of Htable through sd_rev8_rv64i. No other Htable
     # offset is touched by this routine.
     # NOTE(review): brev8_rv64i and sd_rev8_rv64i are defined in riscv.pm,
     # which is not visible here. Judging by their names and the header comment
     # they presumably reverse the bits within each byte and store the
     # doubleword byte-swapped without needing a bit-manipulation extension -
     # confirm against riscv.pm.
     #
     # Registers: a0 = Htable and a1 = H (argument order of the prototype),
     # a2/a3 = the words of H at offsets 0/8, t0-t2 = scratch registers handed
     # to the helpers.
  42 my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
  43
  44 $code .= <<___;
  45 .p2align 3
  46 .globl gcm_init_rv64i_zbc
  47 .type gcm_init_rv64i_zbc,\@function
  48 gcm_init_rv64i_zbc:
  49     ld      $VAL0,0($H)
  50     ld      $VAL1,8($H)
  51     @{[brev8_rv64i   $VAL0, $TMP0, $TMP1, $TMP2]}
  52     @{[brev8_rv64i   $VAL1, $TMP0, $TMP1, $TMP2]}
  53     @{[sd_rev8_rv64i $VAL0, $Htable, 0, $TMP0]}
  54     @{[sd_rev8_rv64i $VAL1, $Htable, 8, $TMP0]}
  55     ret
  56 .size gcm_init_rv64i_zbc,.-gcm_init_rv64i_zbc
  57 ___
  58 }
  59
  60 {
     # Emits gcm_init_rv64i_zbc__zbb. Same data flow as gcm_init_rv64i_zbc, but
     # the byte swap is done in registers with the rev8 helper and the result
     # is written with plain sd instructions to Htable offsets 0 and 8.
     # NOTE(review): brev8_rv64i and rev8 are defined in riscv.pm (not visible
     # here). Presumably rev8 emits the byte-reverse instruction that the
     # "__zbb" suffix refers to, while the per-byte bit reversal still uses the
     # multi-instruction brev8_rv64i sequence - confirm against riscv.pm.
     #
     # Registers: a0 = Htable and a1 = H (argument order of the prototype),
     # a2/a3 = the words of H at offsets 0/8, t0-t2 = scratch registers for
     # brev8_rv64i.
  61 my ($Htable,$H,$VAL0,$VAL1,$TMP0,$TMP1,$TMP2) = ("a0","a1","a2","a3","t0","t1","t2");
  62
  63 $code .= <<___;
  64 .p2align 3
  65 .globl gcm_init_rv64i_zbc__zbb
  66 .type gcm_init_rv64i_zbc__zbb,\@function
  67 gcm_init_rv64i_zbc__zbb:
  68     ld      $VAL0,0($H)
  69     ld      $VAL1,8($H)
  70     @{[brev8_rv64i $VAL0, $TMP0, $TMP1, $TMP2]}
  71     @{[brev8_rv64i $VAL1, $TMP0, $TMP1, $TMP2]}
  72     @{[rev8 $VAL0, $VAL0]}
  73     @{[rev8 $VAL1, $VAL1]}
  74     sd      $VAL0,0($Htable)
  75     sd      $VAL1,8($Htable)
  76     ret
  77 .size gcm_init_rv64i_zbc__zbb,.-gcm_init_rv64i_zbc__zbb
  78 ___
  79 }
  80
  81 {
     # Emits gcm_init_rv64i_zbc__zbkb. Each 64-bit word of H is loaded into a
     # temporary, transformed in place by brev8 and then rev8, and stored with
     # sd to the same offset (0 or 8) of Htable. No separate scratch registers
     # are needed in this variant.
     # NOTE(review): brev8 and rev8 are defined in riscv.pm (not visible here);
     # presumably each emits the single instruction of the same name (per-byte
     # bit reversal and byte-order reversal) - confirm against riscv.pm.
     #
     # Registers: a0 = Htable and a1 = H (argument order of the prototype),
     # t0/t1 = the words of H at offsets 0/8.
  82 my ($Htable,$H,$TMP0,$TMP1) = ("a0","a1","t0","t1");
  83
  84 $code .= <<___;
  85 .p2align 3
  86 .globl gcm_init_rv64i_zbc__zbkb
  87 .type gcm_init_rv64i_zbc__zbkb,\@function
  88 gcm_init_rv64i_zbc__zbkb:
  89     ld      $TMP0,0($H)
  90     ld      $TMP1,8($H)
  91     @{[brev8 $TMP0, $TMP0]}
  92     @{[brev8 $TMP1, $TMP1]}
  93     @{[rev8 $TMP0, $TMP0]}
  94     @{[rev8 $TMP1, $TMP1]}
  95     sd      $TMP0,0($Htable)
  96     sd      $TMP1,8($Htable)
  97     ret
  98 .size gcm_init_rv64i_zbc__zbkb,.-gcm_init_rv64i_zbc__zbkb
  99 ___
 100 }
 101
 102 ################################################################################
 103 # void gcm_gmult_rv64i_zbc(u64 Xi[2], const u128 Htable[16]);
 104 # void gcm_gmult_rv64i_zbc__zbkb(u64 Xi[2], const u128 Htable[16]);
 105 #
 106 # input:  Xi: current hash value
 107 #         Htable: copy of H
 108 # output: Xi: next hash value Xi
 109 #
 110 # Compute GMULT (Xi*H mod f) using the Zbc (clmul) extension; the __zbkb
 111 # variant also uses Zbkb (brev8). Multiplication is done without Karatsuba
 112 # and clmul is used for the final reduction, minimizing instruction count.
 113 # HW with clmul latencies higher than 2 cycles might observe a performance
 114 # improvement with Karatsuba. HW with clmul latencies higher than 6 cycles
 115 # might observe a performance improvement with additionally converting the
 116 # reduction to shift&xor. For a full discussion of these estimates see
 117 # https://github.com/riscv/riscv-crypto/blob/master/doc/supp/gcm-mode-cmul.adoc
 118 {
     # Emits gcm_gmult_rv64i_zbc: Xi is loaded, bit-reversed per byte with
     # brev8_rv64i, multiplied by the two words read from Htable, reduced with
     # the constant stored at Lpolymod, bit-reversed again and stored back.
     # NOTE(review): clmul, clmulh and brev8_rv64i are defined in riscv.pm (not
     # visible here). Presumably clmul/clmulh emit the Zbc instructions that
     # return the low/high 64 bits of a 64x64 carry-less product - confirm
     # against riscv.pm.
     #
     # Registers: a0 = Xi and a1 = Htable (argument order of the prototype),
     # a4/a5 = x0/x1 (words of Xi at offsets 0/8),
     # a6/a7 = y0/y1 (words of Htable at offsets 0/8).
 119 my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
     # t0-t3 = z0..z3, the four words of the 256-bit product (z0 is the lowest),
     # t4/t5 = temporaries for partial products, t6 = reduction constant.
     # z0-z2 are also passed to brev8_rv64i as scratch registers; this happens
     # only before the multiplication and after the reduction, i.e. when the
     # product words hold no live value.
 120 my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
 121
     # Multiplication: four clmul/clmulh pairs (x1*y1, x0*y1, x1*y0, x0*y0) are
     # xor-accumulated into z3:z2:z1:z0.
     # Reduction: z3*polymod is folded into z2:z1, then z2*polymod is folded
     # into z1:z0, and the 128-bit result is left in x1:x0.
 122 $code .= <<___;
 123 .p2align 3
 124 .globl gcm_gmult_rv64i_zbc
 125 .type gcm_gmult_rv64i_zbc,\@function
 126 gcm_gmult_rv64i_zbc:
 127     # Load Xi and bit-reverse it
 128     ld        $x0, 0($Xi)
 129     ld        $x1, 8($Xi)
 130     @{[brev8_rv64i $x0, $z0, $z1, $z2]}
 131     @{[brev8_rv64i $x1, $z0, $z1, $z2]}
 132
 133     # Load the key (already bit-reversed)
 134     ld        $y0, 0($Htable)
 135     ld        $y1, 8($Htable)
 136
 137     # Load the reduction constant
 138     la        $polymod, Lpolymod
 139     lbu       $polymod, 0($polymod)
 140
 141     # Multiplication (without Karatsuba)
 142     @{[clmulh $z3, $x1, $y1]}
 143     @{[clmul  $z2, $x1, $y1]}
 144     @{[clmulh $t1, $x0, $y1]}
 145     @{[clmul  $z1, $x0, $y1]}
 146     xor       $z2, $z2, $t1
 147     @{[clmulh $t1, $x1, $y0]}
 148     @{[clmul  $t0, $x1, $y0]}
 149     xor       $z2, $z2, $t1
 150     xor       $z1, $z1, $t0
 151     @{[clmulh $t1, $x0, $y0]}
 152     @{[clmul  $z0, $x0, $y0]}
 153     xor       $z1, $z1, $t1
 154
 155     # Reduction with clmul
 156     @{[clmulh $t1, $z3, $polymod]}
 157     @{[clmul  $t0, $z3, $polymod]}
 158     xor       $z2, $z2, $t1
 159     xor       $z1, $z1, $t0
 160     @{[clmulh $t1, $z2, $polymod]}
 161     @{[clmul  $t0, $z2, $polymod]}
 162     xor       $x1, $z1, $t1
 163     xor       $x0, $z0, $t0
 164
 165     # Bit-reverse Xi back and store it
 166     @{[brev8_rv64i $x0, $z0, $z1, $z2]}
 167     @{[brev8_rv64i $x1, $z0, $z1, $z2]}
 168     sd        $x0, 0($Xi)
 169     sd        $x1, 8($Xi)
 170     ret
 171 .size gcm_gmult_rv64i_zbc,.-gcm_gmult_rv64i_zbc
 172 ___
 173 }
 174
 175 {
     # Emits gcm_gmult_rv64i_zbc__zbkb. The multiplication and reduction
     # sequence is the same as in gcm_gmult_rv64i_zbc; the only difference is
     # that Xi is bit-reversed in place with the brev8 helper, so no scratch
     # registers are needed for that step.
     # NOTE(review): clmul, clmulh and brev8 are defined in riscv.pm (not
     # visible here). Presumably each emits the single instruction of the same
     # name (Zbc carry-less multiply low/high, Zbkb per-byte bit reversal) -
     # confirm against riscv.pm.
     #
     # Registers: a0 = Xi and a1 = Htable (argument order of the prototype),
     # a4/a5 = x0/x1 (words of Xi at offsets 0/8),
     # a6/a7 = y0/y1 (words of Htable at offsets 0/8).
 176 my ($Xi,$Htable,$x0,$x1,$y0,$y1) = ("a0","a1","a4","a5","a6","a7");
     # t0-t3 = z0..z3, the four words of the 256-bit product (z0 is the lowest),
     # t4/t5 = temporaries for partial products, t6 = reduction constant.
 177 my ($z0,$z1,$z2,$z3,$t0,$t1,$polymod) = ("t0","t1","t2","t3","t4","t5","t6");
 178
     # Multiplication: four clmul/clmulh pairs (x1*y1, x0*y1, x1*y0, x0*y0) are
     # xor-accumulated into z3:z2:z1:z0.
     # Reduction: z3*polymod is folded into z2:z1, then z2*polymod is folded
     # into z1:z0, and the 128-bit result is left in x1:x0.
 179 $code .= <<___;
 180 .p2align 3
 181 .globl gcm_gmult_rv64i_zbc__zbkb
 182 .type gcm_gmult_rv64i_zbc__zbkb,\@function
 183 gcm_gmult_rv64i_zbc__zbkb:
 184     # Load Xi and bit-reverse it
 185     ld        $x0, 0($Xi)
 186     ld        $x1, 8($Xi)
 187     @{[brev8  $x0, $x0]}
 188     @{[brev8  $x1, $x1]}
 189
 190     # Load the key (already bit-reversed)
 191     ld        $y0, 0($Htable)
 192     ld        $y1, 8($Htable)
 193
 194     # Load the reduction constant
 195     la        $polymod, Lpolymod
 196     lbu       $polymod, 0($polymod)
 197
 198     # Multiplication (without Karatsuba)
 199     @{[clmulh $z3, $x1, $y1]}
 200     @{[clmul  $z2, $x1, $y1]}
 201     @{[clmulh $t1, $x0, $y1]}
 202     @{[clmul  $z1, $x0, $y1]}
 203     xor       $z2, $z2, $t1
 204     @{[clmulh $t1, $x1, $y0]}
 205     @{[clmul  $t0, $x1, $y0]}
 206     xor       $z2, $z2, $t1
 207     xor       $z1, $z1, $t0
 208     @{[clmulh $t1, $x0, $y0]}
 209     @{[clmul  $z0, $x0, $y0]}
 210     xor       $z1, $z1, $t1
 211
 212     # Reduction with clmul
 213     @{[clmulh $t1, $z3, $polymod]}
 214     @{[clmul  $t0, $z3, $polymod]}
 215     xor       $z2, $z2, $t1
 216     xor       $z1, $z1, $t0
 217     @{[clmulh $t1, $z2, $polymod]}
 218     @{[clmul  $t0, $z2, $polymod]}
 219     xor       $x1, $z1, $t1
 220     xor       $x0, $z0, $t0
 221
 222     # Bit-reverse Xi back and store it
 223     @{[brev8  $x0, $x0]}
 224     @{[brev8  $x1, $x1]}
 225     sd        $x0, 0($Xi)
 226     sd        $x1, 8($Xi)
 227     ret
 228 .size gcm_gmult_rv64i_zbc__zbkb,.-gcm_gmult_rv64i_zbc__zbkb
 229 ___
 230 }
 231
     # Constant data. It is appended right after the functions, and the code
     # shown here issues no section directive other than the initial .text, so
     # the constants end up in the same section as the code.
     #
     # Lbrev8_const: three 64-bit masks selecting every other bit (0xAA..),
     # every other bit pair (0xCC..) and the high nibble of each byte (0xF0..).
     # NOTE(review): nothing visible in this file refers to Lbrev8_const by
     # name; presumably the brev8_rv64i helper in riscv.pm loads it - confirm.
     #
     # Lpolymod: the single byte 0x87 (bit pattern of x^7 + x^2 + x + 1), which
     # both gcm_gmult functions load with la/lbu as their reduction constant.
 232 $code .= <<___;
 233 .p2align 3
 234 Lbrev8_const:
 235     .dword  0xAAAAAAAAAAAAAAAA
 236     .dword  0xCCCCCCCCCCCCCCCC
 237     .dword  0xF0F0F0F0F0F0F0F0
 238 .size Lbrev8_const,.-Lbrev8_const
 239
 240 Lpolymod:
 241     .byte 0x87
 242 .size Lpolymod,.-Lpolymod
 243 ___
 244
     # Emit the accumulated assembly on STDOUT (redirected to the output file
     # when one was given on the command line).
 245 print $code;
 246
     # close reports write errors that were deferred by buffering, so a failure
     # here is treated as fatal.
 247 close STDOUT or die "error closing STDOUT: $!";