SM4 optimization for ARM using HW instructions
author     Daniel Hu <Daniel.Hu@arm.com>
           Tue, 19 Oct 2021 21:49:05 +0000 (22:49 +0100)
committer  Tomas Mraz <tomas@openssl.org>
           Tue, 18 Jan 2022 10:52:14 +0000 (11:52 +0100)
This patch implements an SM4 optimization for ARM processors
using the SM4 HW instructions, an optional feature of the
Armv8 Cryptographic Extension for AArch64.
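For orientation, here is a minimal C sketch of the same single-block
flow using the ACLE intrinsics that map to the sm4e instruction. It is
illustrative only, not code from this patch: the function name and the
rk[] layout are assumptions, and it presumes a toolchain accepting
something like -march=armv8-a+sm4 (exact flag spelling varies):

    #include <arm_neon.h>
    #include <stdint.h>

    /* Illustrative sketch: one SM4 block via vsm4eq_u32 (the sm4e
     * instruction).  Assumes rk[0..7] hold the 32 round keys, four per
     * vector, as produced by the sm4ekey schedule. */
    void sm4_block_sketch(const uint8_t in[16], uint8_t out[16],
                          const uint32x4_t rk[8])
    {
        /* SM4 state words are big-endian, so byte-swap each 32-bit word
         * on little-endian cores (the rev32 in the assembly below). */
        uint32x4_t x = vreinterpretq_u32_u8(vrev32q_u8(vld1q_u8(in)));
        int i;

        for (i = 0; i < 8; i++)
            x = vsm4eq_u32(x, rk[i]);   /* each sm4e runs 4 of 32 rounds */

        /* The final state words come out in reverse order; the rev64 +
         * ext pair in the assembly swaps them back. */
        x = vrev64q_u32(x);
        x = vextq_u32(x, x, 2);
        vst1q_u8(out, vrev32q_u8(vreinterpretq_u8_u32(x)));
    }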

Tested on some modern ARM micro-architectures with SM4 support, the
performance uplift is roughly 8x~40x over the existing C
implementation in OpenSSL. Modes that can be parallelized (such as
CTR, ECB and CBC decryption) sit at the higher end, while CBC
encryption sits at the lower end because of its inter-block
dependency (illustrated in the sketch below).
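To make that dependency concrete, here is an illustrative C sketch of
CBC encryption (encrypt_block is a hypothetical one-block primitive,
not an API from this patch): block i is XORed with ciphertext i-1
before encryption, so blocks must be produced strictly in order,
whereas CTR keystream blocks are independent and can be computed four
or eight at a time, as the assembly below does:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Hypothetical single-block cipher primitive, for illustration. */
    void encrypt_block(const uint8_t in[16], uint8_t out[16],
                       const void *key);

    void cbc_encrypt_sketch(const uint8_t *in, uint8_t *out,
                            size_t nblocks, const void *key,
                            uint8_t iv[16])
    {
        size_t i;
        int j;

        for (i = 0; i < nblocks; i++) {
            uint8_t tmp[16];

            for (j = 0; j < 16; j++)
                tmp[j] = in[16 * i + j] ^ iv[j]; /* needs prior ciphertext */
            encrypt_block(tmp, out + 16 * i, key);
            memcpy(iv, out + 16 * i, 16);        /* block i+1 waits on this */
        }
    }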

Perf data on Yitian-710 2.75GHz hardware, before and after optimization:

Before:
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  105787.80k   107837.87k   108380.84k   108462.08k   108549.46k   108554.92k
  SM4-ECB  111924.58k   118173.76k   119776.00k   120093.70k   120264.02k   120274.94k
  SM4-CBC  106428.09k   109190.98k   109674.33k   109774.51k   109827.41k   109827.41k

After (7.4x - 36.6x faster):
  type      16 bytes     64 bytes    256 bytes    1024 bytes   8192 bytes  16384 bytes
  SM4-CTR  781979.02k  2432994.28k  3437753.86k  3834177.88k  3963715.58k  3974556.33k
  SM4-ECB  937590.69k  2941689.02k  3945751.81k  4328655.87k  4459181.40k  4468692.31k
  SM4-CBC  890639.88k  1027746.58k  1050621.78k  1056696.66k  1058613.93k  1058701.31k
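(The tables follow the output format of OpenSSL's built-in benchmark,
so comparable figures can presumably be reproduced with e.g.
"openssl speed -evp SM4-CTR"; each entry is throughput in thousands of
bytes per second at the given chunk size.)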

Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/17455)

crypto/arm64cpuid.pl
crypto/arm_arch.h
crypto/armcap.c
crypto/evp/e_sm4.c
crypto/sm4/asm/sm4-armv8.pl [new file with mode: 0755]
crypto/sm4/build.info
include/crypto/sm4_platform.h [new file with mode: 0644]
providers/implementations/ciphers/cipher_sm4.h
providers/implementations/ciphers/cipher_sm4_gcm_hw.c
providers/implementations/ciphers/cipher_sm4_hw.c

diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
index b30f505339cddac70761f3cf3e248455a9c56c0a..1841c0cc04685849ff79338a6a2e06f77c13f353 100755
@@ -80,6 +80,14 @@ _armv8_pmull_probe:
        ret
 .size  _armv8_pmull_probe,.-_armv8_pmull_probe
 
+.globl _armv8_sm4_probe
+.type  _armv8_sm4_probe,%function
+_armv8_sm4_probe:
+       AARCH64_VALID_CALL_TARGET
+       .long   0xcec08400      // sm4e v0.4s, v0.4s
+       ret
+.size  _armv8_sm4_probe,.-_armv8_sm4_probe
+
 .globl _armv8_sha512_probe
 .type  _armv8_sha512_probe,%function
 _armv8_sha512_probe:
diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
index 77173cae42b347add592b3c82664fe8e58bf3474..291620ebc92b6f33f2f655c898a30b23589971a3 100644
@@ -80,6 +80,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_CPUID     (1<<7)
 # define ARMV8_RNG       (1<<8)
 # define ARMV8_SM3       (1<<9)
+# define ARMV8_SM4       (1<<10)
 
 /*
  * MIDR_EL1 system register
diff --git a/crypto/armcap.c b/crypto/armcap.c
index 93003c91211d2077c7570042a5eecd958f9802ca..5016987eeb8e703360f8060d25bcc4fcf1dda231 100644
@@ -54,6 +54,7 @@ void _armv8_sha256_probe(void);
 void _armv8_pmull_probe(void);
 # ifdef __aarch64__
 void _armv8_sm3_probe(void);
+void _armv8_sm4_probe(void);
 void _armv8_sha512_probe(void);
 unsigned int _armv8_cpuid_probe(void);
 void _armv8_rng_probe(void);
@@ -171,6 +172,7 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SHA256        (1 << 6)
 #  define HWCAP_CPUID            (1 << 11)
 #  define HWCAP_CE_SM3           (1 << 18)
+#  define HWCAP_CE_SM4           (1 << 19)
 #  define HWCAP_CE_SHA512        (1 << 21)
                                   /* AT_HWCAP2 */
 #  define HWCAP2                 26
@@ -242,6 +244,9 @@ void OPENSSL_cpuid_setup(void)
             OPENSSL_armcap_P |= ARMV8_SHA256;
 
 #  ifdef __aarch64__
+        if (hwcap & HWCAP_CE_SM4)
+            OPENSSL_armcap_P |= ARMV8_SM4;
+
         if (hwcap & HWCAP_CE_SHA512)
             OPENSSL_armcap_P |= ARMV8_SHA512;
 
@@ -293,6 +298,11 @@ void OPENSSL_cpuid_setup(void)
             OPENSSL_armcap_P |= ARMV8_SHA256;
         }
 #  if defined(__aarch64__) && !defined(__APPLE__)
+        if (sigsetjmp(ill_jmp, 1) == 0) {
+            _armv8_sm4_probe();
+            OPENSSL_armcap_P |= ARMV8_SM4;
+        }
+
         if (sigsetjmp(ill_jmp, 1) == 0) {
             _armv8_sha512_probe();
             OPENSSL_armcap_P |= ARMV8_SHA512;
diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
index abd603015c714ce96251ce22e3c2f59d462d5907..bff79ff19774733f253514c86d5be4baac0ce4bb 100644
 # include <openssl/modes.h>
 # include "crypto/sm4.h"
 # include "crypto/evp.h"
+# include "crypto/sm4_platform.h"
 # include "evp_local.h"
 
 typedef struct {
-    SM4_KEY ks;
+    union {
+        OSSL_UNION_ALIGN;
+        SM4_KEY ks;
+    } ks;
+    block128_f block;
+    union {
+        ecb128_f ecb;
+        cbc128_f cbc;
+        ctr128_f ctr;
+    } stream;
 } EVP_SM4_KEY;
 
+# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
+static const EVP_CIPHER sm4_##mode = { \
+        nid##_##nmode,blocksize,128/8,ivlen, \
+        flags|EVP_CIPH_##MODE##_MODE,   \
+        EVP_ORIG_GLOBAL,                \
+        sm4_init_key,                   \
+        sm4_##mode##_cipher,            \
+        NULL,                           \
+        sizeof(EVP_SM4_KEY),            \
+        NULL,NULL,NULL,NULL }; \
+const EVP_CIPHER *EVP_sm4_##mode(void) \
+{ return &sm4_##mode; }
+
+#define DEFINE_BLOCK_CIPHERS(nid,flags)             \
+        BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)     \
+        BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)      \
+        BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1)   \
+        BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
+
 static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
                         const unsigned char *iv, int enc)
 {
-    ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+    int mode;
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    mode = EVP_CIPHER_CTX_get_mode(ctx);
+    if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
+        && !enc) {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_decrypt_key(key, &dat->ks.ks);
+            dat->block = (block128_f) HWSM4_decrypt;
+            dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+            if (mode == EVP_CIPH_CBC_MODE)
+                dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+# endif
+# ifdef HWSM4_ecb_encrypt
+            if (mode == EVP_CIPH_ECB_MODE)
+                dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+# endif
+        } else
+#endif
+        {
+            dat->block = (block128_f) ossl_sm4_decrypt;
+            ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+        }
+    } else
+#ifdef HWSM4_CAPABLE
+    if (HWSM4_CAPABLE) {
+        HWSM4_set_encrypt_key(key, &dat->ks.ks);
+        dat->block = (block128_f) HWSM4_encrypt;
+        dat->stream.cbc = NULL;
+# ifdef HWSM4_cbc_encrypt
+        if (mode == EVP_CIPH_CBC_MODE)
+            dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
+        else
+# endif
+# ifdef HWSM4_ecb_encrypt
+        if (mode == EVP_CIPH_ECB_MODE)
+            dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+        else
+# endif
+# ifdef HWSM4_ctr32_encrypt_blocks
+        if (mode == EVP_CIPH_CTR_MODE)
+            dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
+        else
+# endif
+            (void)0;            /* terminate potentially open 'else' */
+    } else
+#endif
+    {
+        dat->block = (block128_f) ossl_sm4_encrypt;
+        ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
+    }
     return 1;
 }
 
-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
-                            size_t len, const SM4_KEY *key,
-                            unsigned char *ivec, const int enc)
+static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    if (enc)
-        CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
-                              (block128_f)ossl_sm4_encrypt);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    if (dat->stream.cbc)
+        (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
+                            EVP_CIPHER_CTX_is_encrypting(ctx));
+    else if (EVP_CIPHER_CTX_is_encrypting(ctx))
+        CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
+                              dat->block);
     else
-        CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
-                              (block128_f)ossl_sm4_decrypt);
+        CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
+                              ctx->iv, dat->block);
+    return 1;
 }
 
-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
-                               size_t length, const SM4_KEY *key,
-                               unsigned char *ivec, int *num, const int enc)
+static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
-                          (block128_f)ossl_sm4_encrypt);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+    int num = EVP_CIPHER_CTX_get_num(ctx);
+
+    CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &num,
+                          EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
+    EVP_CIPHER_CTX_set_num(ctx, num);
+    return 1;
 }
 
-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
-                            const SM4_KEY *key, const int enc)
+static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    if (enc)
-        ossl_sm4_encrypt(in, out, key);
+    size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
+    size_t i;
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+    if (len < bl)
+        return 1;
+
+    if (dat->stream.ecb != NULL)
+        (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
+                            EVP_CIPHER_CTX_is_encrypting(ctx));
     else
-        ossl_sm4_decrypt(in, out, key);
+        for (i = 0, len -= bl; i <= len; i += bl)
+            (*dat->block) (in + i, out + i, &dat->ks);
+
+    return 1;
 }
 
-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
-                               size_t length, const SM4_KEY *key,
-                               unsigned char *ivec, int *num)
+static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+                          const unsigned char *in, size_t len)
 {
-    CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
-                          (block128_f)ossl_sm4_encrypt);
-}
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+    int num = EVP_CIPHER_CTX_get_num(ctx);
 
-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
-                       16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
-                       sm4_init_key, 0, 0, 0, 0)
+    CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
+                          ctx->iv, &num, dat->block);
+    EVP_CIPHER_CTX_set_num(ctx, num);
+    return 1;
+}
 
 static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                           const unsigned char *in, size_t len)
 {
     int n = EVP_CIPHER_CTX_get_num(ctx);
     unsigned int num;
-    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
+    EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
 
     if (n < 0)
         return 0;
     num = (unsigned int)n;
 
-    CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
-                          EVP_CIPHER_CTX_buf_noconst(ctx), &num,
-                          (block128_f)ossl_sm4_encrypt);
+    if (dat->stream.ctr)
+        CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
+                                    ctx->iv,
+                                    EVP_CIPHER_CTX_buf_noconst(ctx),
+                                    &num, dat->stream.ctr);
+    else
+        CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
+                              ctx->iv,
+                              EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+                              dat->block);
     EVP_CIPHER_CTX_set_num(ctx, num);
     return 1;
 }
 
-static const EVP_CIPHER sm4_ctr_mode = {
-    NID_sm4_ctr, 1, 16, 16,
-    EVP_CIPH_CTR_MODE,
-    EVP_ORIG_GLOBAL,
-    sm4_init_key,
-    sm4_ctr_cipher,
-    NULL,
-    sizeof(EVP_SM4_KEY),
-    NULL, NULL, NULL, NULL
-};
-
-const EVP_CIPHER *EVP_sm4_ctr(void)
-{
-    return &sm4_ctr_mode;
-}
-
+DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
 #endif
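The reworked EVP glue caches one single-block function (dat->block)
plus, where available, a multi-block "stream" kernel per mode, chosen
once in sm4_init_key.  For orientation, these are the standard OpenSSL
modes function-pointer shapes (as declared in the modes headers; shown
here as a reference sketch):

    /* One 16-byte block. */
    typedef void (*block128_f) (const unsigned char in[16],
                                unsigned char out[16], const void *key);

    /* Bulk CBC/ECB over len bytes. */
    typedef void (*cbc128_f) (const unsigned char *in, unsigned char *out,
                              size_t len, const void *key,
                              unsigned char ivec[16], int enc);
    typedef void (*ecb128_f) (const unsigned char *in, unsigned char *out,
                              size_t len, const void *key, int enc);

    /* CTR over whole blocks; only the low 32 bits of ivec are
     * incremented, hence "ctr32" and the use of
     * CRYPTO_ctr128_encrypt_ctr32 above to handle wraparound. */
    typedef void (*ctr128_f) (const unsigned char *in, unsigned char *out,
                              size_t blocks, const void *key,
                              const unsigned char ivec[16]);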
diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
new file mode 100755
index 0000000..7358a6e
--- /dev/null
@@ -0,0 +1,635 @@
+#! /usr/bin/env perl
+# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+# This module implements support for the SM4 HW instructions on aarch64
+# Oct 2021
+#
+
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+    or die "can't call $xlate: $!";
+*STDOUT=*OUT;
+
+$prefix="sm4_v8";
+my @rks=map("v$_",(0..7));
+
+sub rev32() {
+my $dst = shift;
+my $src = shift;
+$code.=<<___;
+#ifndef __ARMEB__
+       rev32   $dst.16b,$src.16b
+#endif
+___
+}
+
+sub enc_blk () {
+my $data = shift;
+$code.=<<___;
+       sm4e    $data.4s,@rks[0].4s
+       sm4e    $data.4s,@rks[1].4s
+       sm4e    $data.4s,@rks[2].4s
+       sm4e    $data.4s,@rks[3].4s
+       sm4e    $data.4s,@rks[4].4s
+       sm4e    $data.4s,@rks[5].4s
+       sm4e    $data.4s,@rks[6].4s
+       sm4e    $data.4s,@rks[7].4s
+       rev64   $data.4S,$data.4S
+       ext     $data.16b,$data.16b,$data.16b,#8
+___
+}
+
+sub enc_4blks () {
+my $data0 = shift;
+my $data1 = shift;
+my $data2 = shift;
+my $data3 = shift;
+$code.=<<___;
+       sm4e    $data0.4s,@rks[0].4s
+       sm4e    $data1.4s,@rks[0].4s
+       sm4e    $data2.4s,@rks[0].4s
+       sm4e    $data3.4s,@rks[0].4s
+
+       sm4e    $data0.4s,@rks[1].4s
+       sm4e    $data1.4s,@rks[1].4s
+       sm4e    $data2.4s,@rks[1].4s
+       sm4e    $data3.4s,@rks[1].4s
+
+       sm4e    $data0.4s,@rks[2].4s
+       sm4e    $data1.4s,@rks[2].4s
+       sm4e    $data2.4s,@rks[2].4s
+       sm4e    $data3.4s,@rks[2].4s
+
+       sm4e    $data0.4s,@rks[3].4s
+       sm4e    $data1.4s,@rks[3].4s
+       sm4e    $data2.4s,@rks[3].4s
+       sm4e    $data3.4s,@rks[3].4s
+
+       sm4e    $data0.4s,@rks[4].4s
+       sm4e    $data1.4s,@rks[4].4s
+       sm4e    $data2.4s,@rks[4].4s
+       sm4e    $data3.4s,@rks[4].4s
+
+       sm4e    $data0.4s,@rks[5].4s
+       sm4e    $data1.4s,@rks[5].4s
+       sm4e    $data2.4s,@rks[5].4s
+       sm4e    $data3.4s,@rks[5].4s
+
+       sm4e    $data0.4s,@rks[6].4s
+       sm4e    $data1.4s,@rks[6].4s
+       sm4e    $data2.4s,@rks[6].4s
+       sm4e    $data3.4s,@rks[6].4s
+
+       sm4e    $data0.4s,@rks[7].4s
+       rev64   $data0.4S,$data0.4S
+       sm4e    $data1.4s,@rks[7].4s
+       ext     $data0.16b,$data0.16b,$data0.16b,#8
+       rev64   $data1.4S,$data1.4S
+       sm4e    $data2.4s,@rks[7].4s
+       ext     $data1.16b,$data1.16b,$data1.16b,#8
+       rev64   $data2.4S,$data2.4S
+       sm4e    $data3.4s,@rks[7].4s
+       ext     $data2.16b,$data2.16b,$data2.16b,#8
+       rev64   $data3.4S,$data3.4S
+       ext     $data3.16b,$data3.16b,$data3.16b,#8
+___
+}
+
+$code=<<___;
+#include "arm_arch.h"
+.arch  armv8-a+crypto
+.text
+___
+
+{{{
+$code.=<<___;
+.align 6
+.Lck:
+       .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
+       .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
+       .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
+       .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
+       .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
+       .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
+       .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
+       .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
+.Lfk:
+       .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl ${prefix}_set_encrypt_key
+.type  ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+       AARCH64_VALID_CALL_TARGET
+       ld1     {$key0.4s},[$key]
+       adr     $tmp,.Lfk
+       ld1     {$fkconst.4s},[$tmp]
+       adr     $tmp,.Lck
+       ld1     {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+       &rev32($key0, $key0);
+$code.=<<___;
+       ld1     {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+       eor     $key0.16b,$key0.16b,$fkconst.16b;
+       sm4ekey $key0.4S,$key0.4S,$const0.4S
+       sm4ekey $key1.4S,$key0.4S,$const1.4S
+       sm4ekey $key2.4S,$key1.4S,$const2.4S
+       sm4ekey $key3.4S,$key2.4S,$const3.4S
+       sm4ekey $key4.4S,$key3.4S,$const4.4S
+       st1     {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
+       sm4ekey $key5.4S,$key4.4S,$const5.4S
+       sm4ekey $key6.4S,$key5.4S,$const6.4S
+       sm4ekey $key7.4S,$key6.4S,$const7.4S
+       st1     {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
+       ret
+.size  ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+___
+}}}
+
+{{{
+my ($key,$keys)=("x0","x1");
+my ($tmp)=("x2");
+my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
+my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
+my ($fkconst) = ("v24");
+$code.=<<___;
+.globl ${prefix}_set_decrypt_key
+.type  ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+       AARCH64_VALID_CALL_TARGET
+       ld1     {$key0.4s},[$key]
+       adr     $tmp,.Lfk
+       ld1     {$fkconst.4s},[$tmp]
+       adr     $tmp, .Lck
+       ld1     {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
+___
+       &rev32($key0, $key0);
+$code.=<<___;
+       ld1     {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
+       eor     $key0.16b, $key0.16b,$fkconst.16b;
+       sm4ekey $key0.4S,$key0.4S,$const0.4S
+       sm4ekey $key1.4S,$key0.4S,$const1.4S
+       sm4ekey $key2.4S,$key1.4S,$const2.4S
+       rev64   $key0.4s,$key0.4s
+       rev64   $key1.4s,$key1.4s
+       ext     $key0.16b,$key0.16b,$key0.16b,#8
+       ext     $key1.16b,$key1.16b,$key1.16b,#8
+       sm4ekey $key3.4S,$key2.4S,$const3.4S
+       sm4ekey $key4.4S,$key3.4S,$const4.4S
+       rev64   $key2.4s,$key2.4s
+       rev64   $key3.4s,$key3.4s
+       ext     $key2.16b,$key2.16b,$key2.16b,#8
+       ext     $key3.16b,$key3.16b,$key3.16b,#8
+       sm4ekey $key5.4S,$key4.4S,$const5.4S
+       sm4ekey $key6.4S,$key5.4S,$const6.4S
+       rev64   $key4.4s,$key4.4s
+       rev64   $key5.4s,$key5.4s
+       ext     $key4.16b,$key4.16b,$key4.16b,#8
+       ext     $key5.16b,$key5.16b,$key5.16b,#8
+       sm4ekey $key7.4S,$key6.4S,$const7.4S
+       rev64   $key6.4s, $key6.4s
+       rev64   $key7.4s, $key7.4s
+       ext     $key6.16b,$key6.16b,$key6.16b,#8
+       ext     $key7.16b,$key7.16b,$key7.16b,#8
+       st1     {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
+       st1     {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
+       ret
+.size  ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($inp,$out,$rk)=map("x$_",(0..2));
+my ($data)=("v16");
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type  ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+       AARCH64_VALID_CALL_TARGET
+       ld1     {$data.4s},[$inp]
+       ld1     {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+       ld1     {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+       &rev32($data,$data);
+       &enc_blk($data);
+       &rev32($data,$data);
+$code.=<<___;
+       st1     {$data.4s},[$out]
+       ret
+.size  ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+
+&gen_block("en");
+&gen_block("de");
+}}}
+
+{{{
+my ($inp,$out,$len,$rk)=map("x$_",(0..3));
+my ($enc) = ("w4");
+my @dat=map("v$_",(16..23));
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type  ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+       AARCH64_VALID_CALL_TARGET
+       ld1     {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+       ld1     {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+1:
+       cmp     $len,#64
+       b.lt    1f
+       ld1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+       cmp     $len,#128
+       b.lt    2f
+       ld1     {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
+       // 8 blocks
+___
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &rev32(@dat[4],@dat[4]);
+       &rev32(@dat[5],@dat[5]);
+       &rev32(@dat[6],@dat[6]);
+       &rev32(@dat[7],@dat[7]);
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &rev32(@dat[4],@dat[4]);
+       &rev32(@dat[5],@dat[5]);
+$code.=<<___;
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+___
+       &rev32(@dat[6],@dat[6]);
+       &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+       st1     {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+       subs    $len,$len,#128
+       b.gt    1b
+       ret
+       // 4 blocks
+2:
+___
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       subs    $len,$len,#64
+       b.gt    1b
+1:
+       subs    $len,$len,#16
+       b.lt    1f
+       ld1     {@dat[0].4s},[$inp],#16
+___
+       &rev32(@dat[0],@dat[0]);
+       &enc_blk(@dat[0]);
+       &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+       st1     {@dat[0].4s},[$out],#16
+       b.ne    1b
+1:
+       ret
+.size  ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($enc) = ("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec) = ("v8");
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type  ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+       AARCH64_VALID_CALL_TARGET
+       stp     d8,d9,[sp, #-16]!
+
+       ld1     {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
+       ld1     {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+       ld1     {$ivec.4s},[$ivp]
+       cmp     $enc,#0
+       b.eq    .Ldec
+1:
+       cmp     $len, #64
+       b.lt    1f
+       ld1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
+       eor     @dat[0].16b,@dat[0].16b,$ivec.16b
+___
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &enc_blk(@dat[0]);
+$code.=<<___;
+       eor     @dat[1].16b,@dat[1].16b,@dat[0].16b
+___
+       &enc_blk(@dat[1]);
+       &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+       eor     @dat[2].16b,@dat[2].16b,@dat[1].16b
+___
+       &enc_blk(@dat[2]);
+       &rev32(@dat[1],@dat[1]);
+$code.=<<___;
+       eor     @dat[3].16b,@dat[3].16b,@dat[2].16b
+___
+       &enc_blk(@dat[3]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+       mov     $ivec.16b,@dat[3].16b
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       subs    $len,$len,#64
+       b.ne    1b
+1:
+       subs    $len,$len,#16
+       b.lt    3f
+       ld1     {@dat[0].4s},[$inp],#16
+       eor     $ivec.16b,$ivec.16b,@dat[0].16b
+___
+       &rev32($ivec,$ivec);
+       &enc_blk($ivec);
+       &rev32($ivec,$ivec);
+$code.=<<___;
+       st1     {$ivec.16b},[$out],#16
+       b.ne    1b
+       b       3f
+.Ldec:
+1:
+       cmp     $len, #64
+       b.lt    1f
+       ld1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
+       ld1     {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+       cmp     $len,#128
+       b.lt    2f
+       // 8 blocks mode
+       ld1     {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
+       ld1     {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+___
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],$dat[3]);
+       &rev32(@dat[4],@dat[4]);
+       &rev32(@dat[5],@dat[5]);
+       &rev32(@dat[6],@dat[6]);
+       &rev32(@dat[7],$dat[7]);
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &rev32(@dat[4],@dat[4]);
+       &rev32(@dat[5],@dat[5]);
+       &rev32(@dat[6],@dat[6]);
+       &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+       eor     @dat[0].16b,@dat[0].16b,$ivec.16b
+       eor     @dat[1].16b,@dat[1].16b,@in[0].16b
+       eor     @dat[2].16b,@dat[2].16b,@in[1].16b
+       mov     $ivec.16b,@in[7].16b
+       eor     @dat[3].16b,$dat[3].16b,@in[2].16b
+       eor     @dat[4].16b,$dat[4].16b,@in[3].16b
+       eor     @dat[5].16b,$dat[5].16b,@in[4].16b
+       eor     @dat[6].16b,$dat[6].16b,@in[5].16b
+       eor     @dat[7].16b,$dat[7].16b,@in[6].16b
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       st1     {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+       subs    $len,$len,128
+       b.gt    1b
+       b       3f
+       // 4 blocks mode
+2:
+___
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],$dat[3]);
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+       eor     @dat[0].16b,@dat[0].16b,$ivec.16b
+       eor     @dat[1].16b,@dat[1].16b,@in[0].16b
+       mov     $ivec.16b,@in[3].16b
+       eor     @dat[2].16b,@dat[2].16b,@in[1].16b
+       eor     @dat[3].16b,$dat[3].16b,@in[2].16b
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       subs    $len,$len,#64
+       b.gt    1b
+1:
+       subs    $len,$len,#16
+       b.lt    3f
+       ld1     {@dat[0].4s},[$inp],#16
+       mov     @in[0].16b,@dat[0].16b
+___
+       &rev32(@dat[0],@dat[0]);
+       &enc_blk(@dat[0]);
+       &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+       eor     @dat[0].16b,@dat[0].16b,$ivec.16b
+       mov     $ivec.16b,@in[0].16b
+       st1     {@dat[0].16b},[$out],#16
+       b.ne    1b
+3:
+       // save back IV
+       st1     {$ivec.16b},[$ivp]
+       ldp     d8,d9,[sp],#16
+       ret
+.size  ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+
+{{{
+my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
+my ($ctr)=("w5");
+my @dat=map("v$_",(16..23));
+my @in=map("v$_",(24..31));
+my ($ivec)=("v8");
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type  ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+       AARCH64_VALID_CALL_TARGET
+       stp     d8,d9,[sp, #-16]!
+
+       ld1     {$ivec.4s},[$ivp]
+       ld1     {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
+       ld1     {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
+___
+       &rev32($ivec,$ivec);
+$code.=<<___;
+       mov     $ctr,$ivec.s[3]
+1:
+       cmp     $len,#4
+       b.lt    1f
+       ld1     {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
+       mov     @dat[0].16b,$ivec.16b
+       mov     @dat[1].16b,$ivec.16b
+       mov     @dat[2].16b,$ivec.16b
+       mov     @dat[3].16b,$ivec.16b
+       add     $ctr,$ctr,#1
+       mov     $dat[1].s[3],$ctr
+       add     $ctr,$ctr,#1
+       mov     @dat[2].s[3],$ctr
+       add     $ctr,$ctr,#1
+       mov     @dat[3].s[3],$ctr
+       cmp     $len,#8
+       b.lt    2f
+       ld1     {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
+       mov     @dat[4].16b,$ivec.16b
+       mov     @dat[5].16b,$ivec.16b
+       mov     @dat[6].16b,$ivec.16b
+       mov     @dat[7].16b,$ivec.16b
+       add     $ctr,$ctr,#1
+       mov     $dat[4].s[3],$ctr
+       add     $ctr,$ctr,#1
+       mov     @dat[5].s[3],$ctr
+       add     $ctr,$ctr,#1
+       mov     @dat[6].s[3],$ctr
+       add     $ctr,$ctr,#1
+       mov     @dat[7].s[3],$ctr
+___
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+       &rev32(@dat[4],@dat[4]);
+       &rev32(@dat[5],@dat[5]);
+       &rev32(@dat[6],@dat[6]);
+       &rev32(@dat[7],@dat[7]);
+$code.=<<___;
+       eor     @dat[0].16b,@dat[0].16b,@in[0].16b
+       eor     @dat[1].16b,@dat[1].16b,@in[1].16b
+       eor     @dat[2].16b,@dat[2].16b,@in[2].16b
+       eor     @dat[3].16b,@dat[3].16b,@in[3].16b
+       eor     @dat[4].16b,@dat[4].16b,@in[4].16b
+       eor     @dat[5].16b,@dat[5].16b,@in[5].16b
+       eor     @dat[6].16b,@dat[6].16b,@in[6].16b
+       eor     @dat[7].16b,@dat[7].16b,@in[7].16b
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       st1     {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
+       subs    $len,$len,#8
+       b.eq    3f
+       add     $ctr,$ctr,#1
+       mov     $ivec.s[3],$ctr
+       b       1b
+2:
+___
+       &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
+       &rev32(@dat[0],@dat[0]);
+       &rev32(@dat[1],@dat[1]);
+       &rev32(@dat[2],@dat[2]);
+       &rev32(@dat[3],@dat[3]);
+$code.=<<___;
+       eor     @dat[0].16b,@dat[0].16b,@in[0].16b
+       eor     @dat[1].16b,@dat[1].16b,@in[1].16b
+       eor     @dat[2].16b,@dat[2].16b,@in[2].16b
+       eor     @dat[3].16b,@dat[3].16b,@in[3].16b
+       st1     {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
+       subs    $len,$len,#4
+       b.eq    3f
+       add     $ctr,$ctr,#1
+       mov     $ivec.s[3],$ctr
+       b       1b
+1:
+       subs    $len,$len,#1
+       b.lt    3f
+       mov     $dat[0].16b,$ivec.16b
+       ld1     {@in[0].4s},[$inp],#16
+___
+       &enc_blk(@dat[0]);
+       &rev32(@dat[0],@dat[0]);
+$code.=<<___;
+       eor     $dat[0].16b,$dat[0].16b,@in[0].16b
+       st1     {$dat[0].4s},[$out],#16
+       b.eq    3f
+       add     $ctr,$ctr,#1
+       mov     $ivec.s[3],$ctr
+       b       1b
+3:
+       ldp     d8,d9,[sp],#16
+       ret
+.size  ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+########################################
+{   my  %opcode = (
+        "sm4e"          => 0xcec08400,
+        "sm4ekey"       => 0xce60c800);
+
+    sub unsm4 {
+        my ($mnemonic,$arg)=@_;
+
+        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
+        &&
+        sprintf ".inst\t0x%08x\t//%s %s",
+                        $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
+                        $mnemonic,$arg;
+    }
+}
+
+open SELF,$0;
+while(<SELF>) {
+        next if (/^#!/);
+        last if (!s/^#/\/\// and !/^$/);
+        print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+       s/\`([^\`]*)\`/eval($1)/ge;
+
+       s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
+       print $_,"\n";
+}
+
+close STDOUT or die "error closing STDOUT: $!";
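A note on the translator at the end of the module: sm4e/sm4ekey may be
unknown to older assemblers, so unsm4() rewrites each mnemonic into a
raw .inst word, packing the vector register numbers as
opcode | Rd | (Rn << 5) | (Rm << 16).  A tiny standalone C mirror of
that encoder (illustrative; the function names are invented here):

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t enc_sm4e(unsigned vd, unsigned vn)
    {
        return 0xcec08400u | vd | (vn << 5);    /* two-operand form */
    }

    static uint32_t enc_sm4ekey(unsigned vd, unsigned vn, unsigned vm)
    {
        return 0xce60c800u | vd | (vn << 5) | (vm << 16);
    }

    int main(void)
    {
        /* sm4e v0.4s, v0.4s == 0xcec08400, the exact word executed by
         * _armv8_sm4_probe in arm64cpuid.pl above. */
        printf("0x%08x\n", enc_sm4e(0, 0));
        printf("0x%08x\n", enc_sm4ekey(0, 0, 16));
        return 0;
    }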
diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
index b65a7d149e5860964c4bb7dc46547b3e0d25edff..e27aa49e67c5e97daf67f3ac9f54b1b60d4d094b 100644
@@ -1,4 +1,32 @@
 LIBS=../../libcrypto
-SOURCE[../../libcrypto]=\
-        sm4.c
 
+IF[{- !$disabled{asm} -}]
+  $SM4DEF_aarch64=SM4_ASM
+  $SM4ASM_aarch64=sm4-armv8.S
+
+  # Now that we have defined all the arch specific variables, use the
+  # appropriate one, and define the appropriate macros
+  IF[$SM4ASM_{- $target{asm_arch} -}]
+    $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
+    $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
+  ENDIF
+ENDIF
+
+SOURCE[../../libcrypto]= $SM4ASM sm4.c
+
+
+# Implementations are now spread across several libraries, so the defines
+# need to be applied to all affected libraries and modules.
+DEFINE[../../libcrypto]=$SM4DEF
+DEFINE[../../providers/libfips.a]=$SM4DEF
+DEFINE[../../providers/libdefault.a]=$SM4DEF
+# We only need to include the SM4DEF stuff in the legacy provider when it's a
+# separate module and it's dynamically linked with libcrypto.  Otherwise, it
+# already gets everything that the static libcrypto.a has, and doesn't need it
+# added again.
+IF[{- !$disabled{module} && !$disabled{shared} -}]
+  DEFINE[../providers/liblegacy.a]=$SM4DEF
+ENDIF
+
+GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+INCLUDE[sm4-armv8.o]=..
diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
new file mode 100644
index 0000000..42c8b44
--- /dev/null
@@ -0,0 +1,48 @@
+/*
+ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License 2.0 (the "License").  You may not use
+ * this file except in compliance with the License.  You can obtain a copy
+ * in the file LICENSE in the source distribution or at
+ * https://www.openssl.org/source/license.html
+ */
+
+#ifndef OSSL_SM4_PLATFORM_H
+# define OSSL_SM4_PLATFORM_H
+# pragma once
+
+# if defined(OPENSSL_CPUID_OBJ)
+#  if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+#   include "arm_arch.h"
+#   if __ARM_MAX_ARCH__>=8
+#    define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+#    define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+#    define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+#    define HWSM4_encrypt sm4_v8_encrypt
+#    define HWSM4_decrypt sm4_v8_decrypt
+#    define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
+#    define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
+#    define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
+#   endif
+#  endif
+# endif /* OPENSSL_CPUID_OBJ */
+
+# if defined(HWSM4_CAPABLE)
+int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
+                   const SM4_KEY *key);
+void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
+                   const SM4_KEY *key);
+void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const SM4_KEY *key,
+                       unsigned char *ivec, const int enc);
+void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+                       size_t length, const SM4_KEY *key,
+                       const int enc);
+void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                size_t len, const void *key,
+                                const unsigned char ivec[16]);
+# endif /* HWSM4_CAPABLE */
+
+#endif /* OSSL_SM4_PLATFORM_H */
diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h
index f7f833fcb4cf86bd607868970fd34757c8b3015f..01a031a74d799d509b5c84f7dbc09fb0d37a5787 100644
@@ -9,6 +9,7 @@
 
 #include "prov/ciphercommon.h"
 #include "crypto/sm4.h"
+#include "crypto/sm4_platform.h"
 
 typedef struct prov_cast_ctx_st {
     PROV_CIPHER_CTX base;      /* Must be first */
diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
index 6bcd1ec4061278a48e7ec461e0497884b950137d..c0c9b22bd3a84be53b011a37b58201a60a3eccd2 100644
@@ -12,6 +12,7 @@
  */
 
 #include "cipher_sm4_gcm.h"
+#include "crypto/sm4_platform.h"
 
 static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
                            size_t keylen)
@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
     SM4_KEY *ks = &actx->ks.ks;
 
     ctx->ks = ks;
-    ossl_sm4_set_key(key, ks);
-    CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
-    ctx->ctr = (ctr128_f)NULL;
+# ifdef HWSM4_CAPABLE
+    if (HWSM4_CAPABLE) {
+        HWSM4_set_encrypt_key(key, ks);
+        CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt);
+#  ifdef HWSM4_ctr32_encrypt_blocks
+        ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
+#  else /* HWSM4_ctr32_encrypt_blocks */
+        ctx->ctr = (ctr128_f)NULL;
+#  endif
+    } else
+# endif /* HWSM4_CAPABLE */
+    {
+        ossl_sm4_set_key(key, ks);
+        CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+        ctx->ctr = (ctr128_f)NULL;
+    }
     ctx->key_set = 1;
 
     return 1;
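(Because GCM's confidentiality layer is CTR mode, populating ctx->ctr
here lets the CRYPTO_gcm128 bulk path drive the vectorized ctr32
kernel as well; the GHASH side is untouched by this patch.)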
diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
index 0db04b1a743b3be8f0e1410369e025e829284ae7..4cd3d3d66931cf9ca8f96a049efb9e644cc12660 100644
@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
     PROV_SM4_CTX *sctx =  (PROV_SM4_CTX *)ctx;
     SM4_KEY *ks = &sctx->ks.ks;
 
-    ossl_sm4_set_key(key, ks);
     ctx->ks = ks;
     if (ctx->enc
             || (ctx->mode != EVP_CIPH_ECB_MODE
-                && ctx->mode != EVP_CIPH_CBC_MODE))
-        ctx->block = (block128_f)ossl_sm4_encrypt;
-    else
-        ctx->block = (block128_f)ossl_sm4_decrypt;
+                && ctx->mode != EVP_CIPH_CBC_MODE)) {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_encrypt_key(key, ks);
+            ctx->block = (block128_f)HWSM4_encrypt;
+            ctx->stream.cbc = NULL;
+#ifdef HWSM4_cbc_encrypt
+            if (ctx->mode == EVP_CIPH_CBC_MODE)
+                ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
+            else
+#endif
+#ifdef HWSM4_ecb_encrypt
+            if (ctx->mode == EVP_CIPH_ECB_MODE)
+                ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+            else
+#endif
+#ifdef HWSM4_ctr32_encrypt_blocks
+            if (ctx->mode == EVP_CIPH_CTR_MODE)
+                ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks;
+            else
+#endif
+            (void)0;            /* terminate potentially open 'else' */
+        } else
+#endif
+        {
+            ossl_sm4_set_key(key, ks);
+            ctx->block = (block128_f)ossl_sm4_encrypt;
+        }
+    } else {
+#ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {
+            HWSM4_set_decrypt_key(key, ks);
+            ctx->block = (block128_f)HWSM4_decrypt;
+            ctx->stream.cbc = NULL;
+#ifdef HWSM4_cbc_encrypt
+            if (ctx->mode == EVP_CIPH_CBC_MODE)
+                ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
+#endif
+#ifdef HWSM4_ecb_encrypt
+            if (ctx->mode == EVP_CIPH_ECB_MODE)
+                ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+#endif
+        } else
+#endif
+        {
+            ossl_sm4_set_key(key, ks);
+            ctx->block = (block128_f)ossl_sm4_decrypt;
+        }
+    }
+
     return 1;
 }
 
@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX)
 # define PROV_CIPHER_HW_sm4_mode(mode)                                         \
 static const PROV_CIPHER_HW sm4_##mode = {                                     \
     cipher_hw_sm4_initkey,                                                     \
-    ossl_cipher_hw_chunked_##mode,                                             \
+    ossl_cipher_hw_generic_##mode,                                             \
     cipher_hw_sm4_copyctx                                                      \
 };                                                                             \
 const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits)           \