SM3 acceleration with SM3 hardware instruction on aarch64
author fangming.fang <fangming.fang@arm.com>
Fri, 24 Dec 2021 08:29:04 +0000 (08:29 +0000)
committer Tomas Mraz <tomas@openssl.org>
Fri, 14 Jan 2022 10:40:05 +0000 (11:40 +0100)
The SM3 hardware instructions are an optional feature of the crypto
extension for aarch64. This implementation accelerates SM3 via the SM3
instructions. On platforms that do not support the SM3 instructions, the
original C implementation is still used. Thanks to Alibaba for testing and
reporting the following perf numbers for Yitian710:

Benchmark on T-Head Yitian-710 2.75GHz:

Before:
type  16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes   16384 bytes
sm3   49297.82k   121062.63k   223106.05k   283371.52k   307574.10k   309400.92k

After (33% - 74% faster):
type  16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes   16384 bytes
sm3   65640.01k   179121.79k   359854.59k   481448.96k   534055.59k   538274.47k

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17454)

crypto/arm64cpuid.pl
crypto/arm_arch.h
crypto/armcap.c
crypto/sm3/asm/sm3-armv8.pl [new file with mode: 0644]
crypto/sm3/build.info
crypto/sm3/sm3_local.h

index a86fa6073a1fd58b8f4a0f742666445946bc01d3..b30f505339cddac70761f3cf3e248455a9c56c0a 100755 (executable)
@@ -96,6 +96,14 @@ _armv8_cpuid_probe:
        ret
 .size  _armv8_cpuid_probe,.-_armv8_cpuid_probe
 
+// Capability probe: executes a single SM3 instruction (hand-encoded with
+// .long) and returns.  OPENSSL_cpuid_setup calls it after sigsetjmp() and
+// sets ARMV8_SM3 only when the call returns normally.
+.globl _armv8_sm3_probe
+.type  _armv8_sm3_probe,%function
+_armv8_sm3_probe:
+       AARCH64_VALID_CALL_TARGET
+       .long   0xce63c004      // sm3partw1 v4.4s, v0.4s, v3.4s
+       ret
+.size  _armv8_sm3_probe,.-_armv8_sm3_probe
+
 .globl OPENSSL_cleanse
 .type  OPENSSL_cleanse,%function
 .align 5
index 848f06542c1c8a2e74eb7569156b42525c7069dc..77173cae42b347add592b3c82664fe8e58bf3474 100644 (file)
@@ -79,6 +79,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_SHA512    (1<<6)
 # define ARMV8_CPUID     (1<<7)
 # define ARMV8_RNG       (1<<8)
+# define ARMV8_SM3       (1<<9)   /* SM3 instructions usable */
 
 /*
  * MIDR_EL1 system register
index 72ed0a024acef93cd8f9a28ec74ecc534c902795..93003c91211d2077c7570042a5eecd958f9802ca 100644 (file)
@@ -53,6 +53,7 @@ void _armv8_sha1_probe(void);
 void _armv8_sha256_probe(void);
 void _armv8_pmull_probe(void);
 # ifdef __aarch64__
+void _armv8_sm3_probe(void);
 void _armv8_sha512_probe(void);
 unsigned int _armv8_cpuid_probe(void);
 void _armv8_rng_probe(void);
@@ -169,6 +170,7 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SHA1          (1 << 5)
 #  define HWCAP_CE_SHA256        (1 << 6)
 #  define HWCAP_CPUID            (1 << 11)
+#  define HWCAP_CE_SM3           (1 << 18)  /* hwcap bit -> ARMV8_SM3 */
 #  define HWCAP_CE_SHA512        (1 << 21)
                                   /* AT_HWCAP2 */
 #  define HWCAP2                 26
@@ -245,6 +247,9 @@ void OPENSSL_cpuid_setup(void)
 
         if (hwcap & HWCAP_CPUID)
             OPENSSL_armcap_P |= ARMV8_CPUID;
+
+        if (hwcap & HWCAP_CE_SM3)
+            OPENSSL_armcap_P |= ARMV8_SM3;
 #  endif
     }
 #  ifdef __aarch64__
@@ -292,6 +297,11 @@ void OPENSSL_cpuid_setup(void)
             _armv8_sha512_probe();
             OPENSSL_armcap_P |= ARMV8_SHA512;
         }
+
+        if (sigsetjmp(ill_jmp, 1) == 0) {
+            _armv8_sm3_probe();
+            OPENSSL_armcap_P |= ARMV8_SM3;
+        }
 #  endif
     }
 #  ifdef __aarch64__
diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
new file mode 100644 (file)
index 0000000..bb71b2e
--- /dev/null
@@ -0,0 +1,282 @@
+#! /usr/bin/env perl
+# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+#
+# This module implements support for Armv8 SM3 instructions
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+# Find arm-xlate.pl: first in this script's own directory, then in
+# ../../perlasm/ relative to it; give up if neither file exists.
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+# Everything printed to STDOUT from here on is piped through arm-xlate.pl,
+# which is run with the same perl interpreter ($^X) and given $flavour and
+# $output as its arguments.
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+# Message expansion: append the instructions that compute
+#      Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
+# Arguments are vector register names, in this order:
+#      s0 = w0  | w1  | w2  | w3
+#      s1 = w4  | w5  | w6  | w7
+#      s2 = w8  | w9  | w10 | w11
+#      s3 = w12 | w13 | w14 | w15
+#      s4 = output register for the expanded words
+#      vtmp1, vtmp2 = scratch registers, overwritten
+sub msg_exp () {
+my ($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2) = @_;
+$code.=<<___;
+       // s4 = w7  | w8  | w9  | w10
+       ext     $s4.16b, $s1.16b, $s2.16b, #12
+       // vtmp1 = w3  | w4  | w5  | w6
+       ext     $vtmp1.16b, $s0.16b, $s1.16b, #12
+       // vtmp2 = w10 | w11 | w12 | w13
+       ext     $vtmp2.16b, $s2.16b, $s3.16b, #8
+       sm3partw1       $s4.4s, $s0.4s, $s3.4s
+       sm3partw2       $s4.4s, $vtmp2.4s, $vtmp1.4s
+___
+}
+
+# One round of the compression function
+# Arguments, in order:
+#      ab - "a" or "b"; picks sm3tt1a/sm3tt2a or sm3tt1b/sm3tt2b
+#      vstate0, vstate1 - registers storing the digest state (A - H)
+#      vconst0, vconst1 - used alternately to store Tj <<< j: vconst0 is
+#              read by this round, vconst1 receives vconst0 rotated left
+#              by one bit (shl #1 followed by sri #31)
+#      vtmp - temporary register
+#      vw - for sm3tt1ab, vw = s0 eor s1
+#      s0 - for sm3tt2ab, just s0
+#      i - lane index choosing wj' from vw and wj from s0
+sub round () {
+my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp, $vw, $s0, $i) = @_;
+$code.=<<___;
+       sm3ss1  $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
+       shl     $vconst1.4s, $vconst0.4s, #1
+       sri     $vconst1.4s, $vconst0.4s, #31
+       sm3tt1$ab       $vstate0.4s, $vtmp.4s, $vw.4s[$i]
+       sm3tt2$ab       $vstate1.4s, $vtmp.4s, $s0.4s[$i]
+___
+}
+
+# Four consecutive rounds, one per 32-bit lane of s0.
+# When s4 is passed, the message expansion into s4 is emitted first (using
+# vtmp1/vtmp2 as scratch).  After that vtmp1 holds s0 eor s1 and vtmp2 is
+# the temporary handed to round().
+sub qround () {
+my ($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp1, $vtmp2,
+    $s0, $s1, $s2, $s3, $s4) = @_;
+
+       &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2) if $s4;
+
+$code.=<<___;
+       eor     $vtmp1.16b, $s0.16b, $s1.16b
+___
+       # The two constant registers trade places on every round: even
+       # lanes read vconst0 and write vconst1, odd lanes the reverse.
+       foreach my $lane (0 .. 3) {
+               my @consts = ($lane & 1) ? ($vconst1, $vconst0)
+                                        : ($vconst0, $vconst1);
+               &round($ab, $vstate0, $vstate1, @consts, $vtmp2,
+                       $vtmp1, $s0, $lane);
+       }
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch  armv8.2-a+sm4
+.text
+___
+
+# Generator for ossl_hwsm3_block_data_order: register assignments first,
+# then the assembly template interleaved with qround() calls.
+{{{
+# Function arguments: state pointer, input pointer, number of blocks.
+# NOTE(review): $num is the 32-bit register w2 although the C prototype
+# below declares size_t, and the loop decrements the counter before testing
+# it at the bottom, so it appears to expect num >= 1 - confirm with callers.
+my ($pstate,$pdata,$num)=("x0","x1","w2");
+# Running digest state.
+my ($state1,$state2)=("v5","v6");
+# The two words at .Tj are loaded through the scalar names and consumed
+# through the vector names.
+my ($sconst1, $sconst2)=("s16","s17");
+my ($vconst1, $vconst2)=("v16","v17");
+# Message registers v0..v4: four hold the current words, the fifth receives
+# the next expansion.
+my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
+# Copy of the state taken at the top of each block, XORed back in at the end.
+my ($bkstate1,$bkstate2)=("v18","v19");
+# Pair of registers alternated by round() for the per-round constant.
+my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
+my ($vtmp1,$vtmp2)=("v22","v23");
+my $constaddr="x8";
+# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
+$code.=<<___;
+.globl ossl_hwsm3_block_data_order
+.type  ossl_hwsm3_block_data_order,%function
+.align 5
+ossl_hwsm3_block_data_order:
+       AARCH64_VALID_CALL_TARGET
+       // load state
+       ld1     {$state1.4s-$state2.4s}, [$pstate]
+       rev64   $state1.4s, $state1.4s
+       rev64   $state2.4s, $state2.4s
+       ext     $state1.16b, $state1.16b, $state1.16b, #8
+       ext     $state2.16b, $state2.16b, $state2.16b, #8
+
+       adr     $constaddr, .Tj
+       ldp     $sconst1, $sconst2, [$constaddr]
+
+.Loop:
+       // load input
+       ld1     {$s0.16b-$s3.16b}, [$pdata], #64
+       sub     $num, $num, #1
+
+       mov     $bkstate1.16b, $state1.16b
+       mov     $bkstate2.16b, $state2.16b
+
+#ifndef __ARMEB__
+       rev32   $s0.16b, $s0.16b
+       rev32   $s1.16b, $s1.16b
+       rev32   $s2.16b, $s2.16b
+       rev32   $s3.16b, $s3.16b
+#endif
+
+       ext     $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
+___
+       # Rounds 0-15: four qround() calls with the "a" instruction
+       # variants.  Each call emits four rounds and expands the next
+       # four message words into the last register of its list; the
+       # list is rotated by one register from call to call.
+       &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s0,$s1,$s2,$s3,$s4);
+       &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s1,$s2,$s3,$s4,$s0);
+       &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s2,$s3,$s4,$s0,$s1);
+       &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s3,$s4,$s0,$s1,$s2);
+
+$code.=<<___;
+       ext     $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
+___
+
+       # Rounds 16-63: twelve qround() calls with the "b" instruction
+       # variants, the constant register re-seeded from $vconst2 above.
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s4,$s0,$s1,$s2,$s3);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s0,$s1,$s2,$s3,$s4);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s1,$s2,$s3,$s4,$s0);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s2,$s3,$s4,$s0,$s1);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s3,$s4,$s0,$s1,$s2);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s4,$s0,$s1,$s2,$s3);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s0,$s1,$s2,$s3,$s4);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s1,$s2,$s3,$s4,$s0);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s2,$s3,$s4,$s0,$s1);
+       # The last three calls pass only two message registers, so
+       # qround() emits no further message expansion for them.
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s3,$s4);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s4,$s0);
+       &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
+                $s0,$s1);
+
+$code.=<<___;
+       eor     $state1.16b, $state1.16b, $bkstate1.16b
+       eor     $state2.16b, $state2.16b, $bkstate2.16b
+
+       // any remained blocks?
+       cbnz    $num, .Loop
+
+       // save state
+       rev64   $state1.4s, $state1.4s
+       rev64   $state2.4s, $state2.4s
+       ext     $state1.16b, $state1.16b, $state1.16b, #8
+       ext     $state2.16b, $state2.16b, $state2.16b, #8
+       st1     {$state1.4s-$state2.4s}, [$pstate]
+       ret
+.size  ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
+
+.align 3
+.Tj:
+.word  0x79cc4519, 0x9d8a7a87
+___
+}}}
+
+#########################################
+# Base opcodes of the SM3 instructions.  The unsm3* helpers below OR the
+# register (and lane) fields into these values to build a raw .inst word.
+my %sm3partopcode = (
+       "sm3partw1"         =>   0xce60C000,
+        "sm3partw2"         =>   0xce60C400);
+
+# Keyed by the mnemonic that round() emits: "sm3ss1" ends in the digit one.
+my %sm3ss1opcode = (
+       "sm3ss1"            =>   0xce400000);
+
+my %sm3ttopcode = (
+       "sm3tt1a"           =>   0xce408000,
+       "sm3tt1b"           =>   0xce408400,
+       "sm3tt2a"           =>   0xce408800,
+       "sm3tt2b"           =>   0xce408C00);
+
+# Encode "sm3partw1/sm3partw2 Vd, Vn, Vm": Rd at bit 0, Rn at bit 5,
+# Rm at bit 16.  Returns the .inst line, with the original instruction
+# kept as a trailing comment.
+sub unsm3part {
+       my ($mnemonic,$arg)=@_;
+
+       $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                       $mnemonic,$arg;
+}
+
+# Encode "sm3ss1 Vd, Vn, Vm, Va": Rd at bit 0, Rn at bit 5, Rm at bit 16,
+# Ra at bit 10.
+# The pattern is deliberately kept on one line: it is compiled without /x,
+# so a line break inside it would have to match literally and the operand
+# list, which is always on a single line, would never be recognised.
+sub unsm3ss1 {
+       my ($mnemonic,$arg)=@_;
+
+       $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+                       $mnemonic,$arg;
+}
+
+# Encode "sm3tt1a/1b/2a/2b Vd, Vn, Vm.4s[i]": Rd at bit 0, Rn at bit 5,
+# Rm at bit 16, lane index i (0-3) at bit 12.
+sub unsm3tt {
+       my ($mnemonic,$arg)=@_;
+
+       $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+       &&
+       sprintf ".inst\t0x%08x\t//%s %s",
+                       $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+                       $mnemonic,$arg;
+}
+
+# Copy this script's leading comment block to the output, turning the "#"
+# prefix into "//"; the "#!" line is skipped and copying stops at the first
+# line that is neither a comment nor empty.
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+# Emit the generated code line by line, replacing every SM3 mnemonic with
+# its hand-encoded .inst word.
+foreach(split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/ge;
+
+       s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+       s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+       s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
index eca68216f27e2c2faa43006cc416bb0990ad85d2..2fa54a4a8bc5e0e64be30fbba368174f28a6f76c 100644 (file)
@@ -1,5 +1,22 @@
 LIBS=../../libcrypto
 
 IF[{- !$disabled{sm3} -}]
-  SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
-ENDIF
\ No newline at end of file
+  IF[{- !$disabled{asm} -}]
+    $SM3ASM_aarch64=sm3-armv8.S
+    $SM3DEF_aarch64=OPENSSL_SM3_ASM
+
+    # Now that we have defined all the arch specific variables, use the
+    # appropriate ones, and define the appropriate macros
+    IF[$SM3ASM_{- $target{asm_arch} -}]
+      $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
+      $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
+    ENDIF
+  ENDIF
+
+  SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
+  DEFINE[../../libcrypto]=$SM3DEF
+
+  GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
+  INCLUDE[sm3-armv8.o]=..
+ENDIF
+
index 6daeb878a8878645f8ca8545904150120a1d0e8f..ac8a2bf768e0d5fe9f07be09d0fe69f73ab87073 100644 (file)
         ll=(c)->G; (void)HOST_l2c(ll, (s)); \
         ll=(c)->H; (void)HOST_l2c(ll, (s)); \
       } while (0)
-#define HASH_BLOCK_DATA_ORDER   ossl_sm3_block_data_order
+
+/*
+ * When the assembly module is built in (OPENSSL_SM3_ASM) on aarch64,
+ * declare the hardware block routine together with HWSM3_CAPABLE, a test
+ * of the ARMV8_SM3 bit in OPENSSL_armcap_P.
+ */
+#if defined(OPENSSL_SM3_ASM)
+# if defined(__aarch64__)
+#  include "crypto/arm_arch.h"
+#  define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
+void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+# endif
+#endif
+
+/*
+ * With HWSM3_CAPABLE defined, HASH_BLOCK_DATA_ORDER tests the capability
+ * bit wherever it is expanded and yields the hardware routine when the bit
+ * is set, the C routine otherwise.  Without it, the macro is always the C
+ * routine.
+ */
+#if defined(HWSM3_CAPABLE)
+# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
+                                              : ossl_sm3_block_data_order)
+#else
+# define HASH_BLOCK_DATA_ORDER   ossl_sm3_block_data_order
+#endif
 
 void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
 void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);