Diffstat (limited to 'Backport-SM4-optimization-for-ARM-by-HW-instruction.patch')
 -rw-r--r--  Backport-SM4-optimization-for-ARM-by-HW-instruction.patch | 1228 +
 1 file changed, 1228 insertions(+), 0 deletions(-)
diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
new file mode 100644
index 0000000..c68f1a0
--- /dev/null
+++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
@@ -0,0 +1,1228 @@
+From 1cd480c10b8bbaa6f72d503494ff2973672ec0e4 Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Tue, 19 Oct 2021 22:49:05 +0100
+Subject: [PATCH 05/13] SM4 optimization for ARM by HW instruction
+
+This patch implements SM4 optimization for ARM processors using
+the SM4 HW instructions, an optional feature of the Crypto
+Extension for AArch64 (Armv8).
+
+Tested on some modern ARM micro-architectures with SM4 support, a
+performance uplift of around 8x~40x over the existing C
+implementation in OpenSSL can be observed. Modes that can be
+parallelized (such as CTR, ECB and CBC decryption) sit at the
+higher end; modes like CBC encryption sit at the lower end, due to
+their inter-block dependency.
+
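+The gap between the modes comes from loop structure. A minimal C
+sketch of the two loops (sm4_block() here is a hypothetical
+stand-in for one hardware-accelerated block encryption, not a
+function added by this patch):
+
+    #include <stddef.h>
+    #include <string.h>
+
+    typedef unsigned char u8;
+
+    void sm4_block(u8 out[16], const u8 in[16], const void *key);
+
+    static void xor16(u8 *r, const u8 *a, const u8 *b)
+    {
+        for (size_t i = 0; i < 16; i++)
+            r[i] = a[i] ^ b[i];
+    }
+
+    /* CBC encryption: block i consumes the ciphertext of block i-1,
+     * so iterations cannot overlap. */
+    void cbc_encrypt(u8 *out, const u8 *in, size_t nblk,
+                     const void *key, u8 iv[16])
+    {
+        u8 tmp[16];
+
+        for (size_t i = 0; i < nblk; i++, in += 16, out += 16) {
+            xor16(tmp, in, iv);
+            sm4_block(out, tmp, key);
+            memcpy(iv, out, 16);
+        }
+    }
+
+    /* CTR: iteration i needs only counter+i, so several blocks can
+     * be kept in flight at once by the hardware. */
+    void ctr_encrypt(u8 *out, const u8 *in, size_t nblk,
+                     const void *key, u8 ctr[16])
+    {
+        u8 ks[16];
+
+        for (size_t i = 0; i < nblk; i++, in += 16, out += 16) {
+            sm4_block(ks, ctr, key);
+            xor16(out, in, ks);
+            for (int j = 15; j >= 0 && ++ctr[j] == 0; j--)
+                ;                    /* big-endian counter increment */
+        }
+    }
+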
+Perf data on Yitian-710 2.75GHz hardware, before and after optimization:
+
+Before:
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k
+ SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k
+ SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k
+
+After (7.4x - 36.6x faster):
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k
+ SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k
+ SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17455)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 1 +
+ crypto/armcap.c | 10 +
+ crypto/evp/e_sm4.c | 193 ++++--
+ crypto/sm4/asm/sm4-armv8.pl | 635 ++++++++++++++++++
+ crypto/sm4/build.info | 32 +-
+ include/crypto/sm4_platform.h | 48 ++
+ .../implementations/ciphers/cipher_sm4.h | 1 +
+ .../ciphers/cipher_sm4_gcm_hw.c | 20 +-
+ .../implementations/ciphers/cipher_sm4_hw.c | 57 +-
+ 10 files changed, 945 insertions(+), 60 deletions(-)
+ create mode 100755 crypto/sm4/asm/sm4-armv8.pl
+ create mode 100644 include/crypto/sm4_platform.h
+
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 10d267b7ad..36af3e075b 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -80,6 +80,14 @@ _armv8_pmull_probe:
+ ret
+ .size _armv8_pmull_probe,.-_armv8_pmull_probe
+
++.globl _armv8_sm4_probe
++.type _armv8_sm4_probe,%function
++_armv8_sm4_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xcec08400 // sm4e v0.4s, v0.4s
++ ret
++.size _armv8_sm4_probe,.-_armv8_sm4_probe
++
+ .globl _armv8_sha512_probe
+ .type _armv8_sha512_probe,%function
+ _armv8_sha512_probe:
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index c8b501f34c..5b5af31d92 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -85,6 +85,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_CPUID (1<<7)
+ # define ARMV8_RNG (1<<8)
+ # define ARMV8_SM3 (1<<9)
++# define ARMV8_SM4 (1<<10)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index 365a48df45..c5aa062767 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -53,6 +53,7 @@ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
+ void _armv8_sm3_probe(void);
++void _armv8_sm4_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -139,6 +140,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
+ # define HWCAP_CE_SM3 (1 << 18)
++# define HWCAP_CE_SM4 (1 << 19)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -207,6 +209,9 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+
+ # ifdef __aarch64__
++ if (hwcap & HWCAP_CE_SM4)
++ OPENSSL_armcap_P |= ARMV8_SM4;
++
+ if (hwcap & HWCAP_CE_SHA512)
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+
+@@ -254,6 +259,11 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ # if defined(__aarch64__) && !defined(__APPLE__)
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm4_probe();
++ OPENSSL_armcap_P |= ARMV8_SM4;
++ }
++
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
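+The runtime detection above follows the usual two-path scheme in
+armcap.c: prefer the HWCAP bit from getauxval() where available,
+otherwise execute the instruction once and trap SIGILL. A reduced,
+standalone sketch of the trap-based probe (the real code shares
+ill_jmp and the handler with the other probes):
+
+    #include <setjmp.h>
+    #include <signal.h>
+    #include <string.h>
+
+    static sigjmp_buf ill_jmp;
+
+    static void ill_handler(int sig)
+    {
+        siglongjmp(ill_jmp, sig);
+    }
+
+    void _armv8_sm4_probe(void);    /* executes one sm4e instruction */
+
+    static int sm4_supported(void)
+    {
+        struct sigaction sa, old;
+        int ok = 0;
+
+        memset(&sa, 0, sizeof(sa));
+        sa.sa_handler = ill_handler;
+        if (sigaction(SIGILL, &sa, &old) != 0)
+            return 0;
+        if (sigsetjmp(ill_jmp, 1) == 0) {
+            _armv8_sm4_probe();     /* raises SIGILL if SM4 is absent */
+            ok = 1;                 /* instruction retired: SM4 present */
+        }
+        sigaction(SIGILL, &old, NULL);
+        return ok;
+    }
+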
+diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
+index abd603015c..bff79ff197 100644
+--- a/crypto/evp/e_sm4.c
++++ b/crypto/evp/e_sm4.c
+@@ -17,92 +17,187 @@
+ # include <openssl/modes.h>
+ # include "crypto/sm4.h"
+ # include "crypto/evp.h"
++# include "crypto/sm4_platform.h"
+ # include "evp_local.h"
+
+ typedef struct {
+- SM4_KEY ks;
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks;
++ block128_f block;
++ union {
++ ecb128_f ecb;
++ cbc128_f cbc;
++ ctr128_f ctr;
++ } stream;
+ } EVP_SM4_KEY;
+
++# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
++static const EVP_CIPHER sm4_##mode = { \
++ nid##_##nmode,blocksize,128/8,ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ EVP_ORIG_GLOBAL, \
++ sm4_init_key, \
++ sm4_##mode##_cipher, \
++ NULL, \
++ sizeof(EVP_SM4_KEY), \
++ NULL,NULL,NULL,NULL }; \
++const EVP_CIPHER *EVP_sm4_##mode(void) \
++{ return &sm4_##mode; }
++
++#define DEFINE_BLOCK_CIPHERS(nid,flags) \
++ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
++
+ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+- ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ int mode;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ mode = EVP_CIPHER_CTX_get_mode(ctx);
++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
++ && !enc) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_decrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++# endif
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_decrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
++ } else
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_encrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++ else
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++ else
++# endif
++# ifdef HWSM4_ctr32_encrypt_blocks
++ if (mode == EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++ else
++# endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_encrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
+ return 1;
+ }
+
+-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+- size_t len, const SM4_KEY *key,
+- unsigned char *ivec, const int enc)
++static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (dat->stream.cbc)
++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
++ else if (EVP_CIPHER_CTX_is_encrypting(ctx))
++ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
++ dat->block);
+ else
+- CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_decrypt);
++ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
++ ctx->iv, dat->block);
++ return 1;
+ }
+
+-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num, const int enc)
++static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
++
++ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num,
++ EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
+ }
+
+-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+- const SM4_KEY *key, const int enc)
++static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- ossl_sm4_encrypt(in, out, key);
++ size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
++ size_t i;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (len < bl)
++ return 1;
++
++ if (dat->stream.ecb != NULL)
++ (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
+ else
+- ossl_sm4_decrypt(in, out, key);
++ for (i = 0, len -= bl; i <= len; i += bl)
++ (*dat->block) (in + i, out + i, &dat->ks);
++
++ return 1;
+ }
+
+-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num)
++static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
+- (block128_f)ossl_sm4_encrypt);
+-}
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
+
+-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
+- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
+- sm4_init_key, 0, 0, 0, 0)
++ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num, dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
++}
+
+ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ int n = EVP_CIPHER_CTX_get_num(ctx);
+ unsigned int num;
+- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ if (n < 0)
+ return 0;
+ num = (unsigned int)n;
+
+- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
+- EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+- (block128_f)ossl_sm4_encrypt);
++ if (dat->stream.ctr)
++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx),
++ &num, dat->stream.ctr);
++ else
++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx), &num,
++ dat->block);
+ EVP_CIPHER_CTX_set_num(ctx, num);
+ return 1;
+ }
+
+-static const EVP_CIPHER sm4_ctr_mode = {
+- NID_sm4_ctr, 1, 16, 16,
+- EVP_CIPH_CTR_MODE,
+- EVP_ORIG_GLOBAL,
+- sm4_init_key,
+- sm4_ctr_cipher,
+- NULL,
+- sizeof(EVP_SM4_KEY),
+- NULL, NULL, NULL, NULL
+-};
+-
+-const EVP_CIPHER *EVP_sm4_ctr(void)
+-{
+- return &sm4_ctr_mode;
+-}
+-
++DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
+ #endif
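+The rewrite of e_sm4.c above selects an implementation once, at
+key-setup time, so the per-call handlers only chase function
+pointers. A compressed sketch of that pattern (the hw_*/sw_* names
+are illustrative stand-ins for the HWSM4_*/ossl_sm4_* routines, not
+symbols from this patch):
+
+    #include <stddef.h>
+
+    typedef void (*block_f)(const unsigned char *in, unsigned char *out,
+                            const void *key);
+    typedef void (*cbc_f)(const unsigned char *in, unsigned char *out,
+                          size_t len, const void *key,
+                          unsigned char *iv, int enc);
+
+    /* Illustrative externs, resolved elsewhere. */
+    int  hw_available(void);
+    void hw_setkey(const unsigned char *key, void *ks, int enc);
+    void sw_setkey(const unsigned char *key, void *ks, int enc);
+    void hw_block(const unsigned char *in, unsigned char *out,
+                  const void *ks);
+    void sw_block(const unsigned char *in, unsigned char *out,
+                  const void *ks);
+    void hw_cbc(const unsigned char *in, unsigned char *out, size_t len,
+                const void *ks, unsigned char *iv, int enc);
+    void generic_cbc(const unsigned char *in, unsigned char *out,
+                     size_t len, const void *ks, unsigned char *iv,
+                     int enc, block_f block);
+
+    struct sm4_ctx {
+        void *ks;
+        block_f block;  /* always set: single-block fallback            */
+        cbc_f cbc;      /* NULL unless a whole-buffer HW routine exists */
+    };
+
+    void sm4_init(struct sm4_ctx *c, const unsigned char *key, int enc)
+    {
+        if (hw_available()) {
+            hw_setkey(key, c->ks, enc);
+            c->block = hw_block;
+            c->cbc = hw_cbc;        /* many blocks per call */
+        } else {
+            sw_setkey(key, c->ks, enc);
+            c->block = sw_block;
+            c->cbc = NULL;          /* generic mode code uses c->block */
+        }
+    }
+
+    void sm4_cbc(struct sm4_ctx *c, const unsigned char *in,
+                 unsigned char *out, size_t len, unsigned char *iv, int enc)
+    {
+        if (c->cbc != NULL)
+            c->cbc(in, out, len, c->ks, iv, enc);
+        else
+            generic_cbc(in, out, len, c->ks, iv, enc, c->block);
+    }
+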
+diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
+new file mode 100755
+index 0000000000..7358a6e6a2
+--- /dev/null
++++ b/crypto/sm4/asm/sm4-armv8.pl
+@@ -0,0 +1,635 @@
++#! /usr/bin/env perl
++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 HW support on aarch64
++# Oct 2021
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="sm4_v8";
++my @rks=map("v$_",(0..7));
++
++sub rev32() {
++my $dst = shift;
++my $src = shift;
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$src.16b
++#endif
++___
++}
++
++sub enc_blk () {
++my $data = shift;
++$code.=<<___;
++ sm4e $data.4s,@rks[0].4s
++ sm4e $data.4s,@rks[1].4s
++ sm4e $data.4s,@rks[2].4s
++ sm4e $data.4s,@rks[3].4s
++ sm4e $data.4s,@rks[4].4s
++ sm4e $data.4s,@rks[5].4s
++ sm4e $data.4s,@rks[6].4s
++ sm4e $data.4s,@rks[7].4s
++ rev64 $data.4S,$data.4S
++ ext $data.16b,$data.16b,$data.16b,#8
++___
++}
++
++sub enc_4blks () {
++my $data0 = shift;
++my $data1 = shift;
++my $data2 = shift;
++my $data3 = shift;
++$code.=<<___;
++ sm4e $data0.4s,@rks[0].4s
++ sm4e $data1.4s,@rks[0].4s
++ sm4e $data2.4s,@rks[0].4s
++ sm4e $data3.4s,@rks[0].4s
++
++ sm4e $data0.4s,@rks[1].4s
++ sm4e $data1.4s,@rks[1].4s
++ sm4e $data2.4s,@rks[1].4s
++ sm4e $data3.4s,@rks[1].4s
++
++ sm4e $data0.4s,@rks[2].4s
++ sm4e $data1.4s,@rks[2].4s
++ sm4e $data2.4s,@rks[2].4s
++ sm4e $data3.4s,@rks[2].4s
++
++ sm4e $data0.4s,@rks[3].4s
++ sm4e $data1.4s,@rks[3].4s
++ sm4e $data2.4s,@rks[3].4s
++ sm4e $data3.4s,@rks[3].4s
++
++ sm4e $data0.4s,@rks[4].4s
++ sm4e $data1.4s,@rks[4].4s
++ sm4e $data2.4s,@rks[4].4s
++ sm4e $data3.4s,@rks[4].4s
++
++ sm4e $data0.4s,@rks[5].4s
++ sm4e $data1.4s,@rks[5].4s
++ sm4e $data2.4s,@rks[5].4s
++ sm4e $data3.4s,@rks[5].4s
++
++ sm4e $data0.4s,@rks[6].4s
++ sm4e $data1.4s,@rks[6].4s
++ sm4e $data2.4s,@rks[6].4s
++ sm4e $data3.4s,@rks[6].4s
++
++ sm4e $data0.4s,@rks[7].4s
++ rev64 $data0.4S,$data0.4S
++ sm4e $data1.4s,@rks[7].4s
++ ext $data0.16b,$data0.16b,$data0.16b,#8
++ rev64 $data1.4S,$data1.4S
++ sm4e $data2.4s,@rks[7].4s
++ ext $data1.16b,$data1.16b,$data1.16b,#8
++ rev64 $data2.4S,$data2.4S
++ sm4e $data3.4s,@rks[7].4s
++ ext $data2.16b,$data2.16b,$data2.16b,#8
++ rev64 $data3.4S,$data3.4S
++ ext $data3.16b,$data3.16b,$data3.16b,#8
++___
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a+crypto
++.text
++___
++
++{{{
++$code.=<<___;
++.align 6
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp,.Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b,$key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp, .Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b, $key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ rev64 $key0.4s,$key0.4s
++ rev64 $key1.4s,$key1.4s
++ ext $key0.16b,$key0.16b,$key0.16b,#8
++ ext $key1.16b,$key1.16b,$key1.16b,#8
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ rev64 $key2.4s,$key2.4s
++ rev64 $key3.4s,$key3.4s
++ ext $key2.16b,$key2.16b,$key2.16b,#8
++ ext $key3.16b,$key3.16b,$key3.16b,#8
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ rev64 $key4.4s,$key4.4s
++ rev64 $key5.4s,$key5.4s
++ ext $key4.16b,$key4.16b,$key4.16b,#8
++ ext $key5.16b,$key5.16b,$key5.16b,#8
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ rev64 $key6.4s, $key6.4s
++ rev64 $key7.4s, $key7.4s
++ ext $key6.16b,$key6.16b,$key6.16b,#8
++ ext $key7.16b,$key7.16b,$key7.16b,#8
++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++my $dir = shift;
++my ($inp,$out,$rk)=map("x$_",(0..2));
++my ($data)=("v16");
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$data.4s},[$inp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($data,$data);
++ &enc_blk($data);
++ &rev32($data,$data);
++$code.=<<___;
++ st1 {$data.4s},[$out]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++my ($inp,$out,$len,$rk)=map("x$_",(0..3));
++my ($enc) = ("w4");
++my @dat=map("v$_",(16..23));
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++1:
++ cmp $len,#64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
++ // 8 blocks
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++___
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#128
++ b.gt 1b
++ ret
++ // 4 blocks
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 1f
++ ld1 {@dat[0].4s},[$inp],#16
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ st1 {@dat[0].4s},[$out],#16
++ b.ne 1b
++1:
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($enc) = ("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec) = ("v8");
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++ ld1 {$ivec.4s},[$ivp]
++ cmp $enc,#0
++ b.eq .Ldec
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++___
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_blk(@dat[0]);
++$code.=<<___;
++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b
++___
++ &enc_blk(@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b
++___
++ &enc_blk(@dat[2]);
++ &rev32(@dat[1],@dat[1]);
++$code.=<<___;
++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b
++___
++ &enc_blk(@dat[3]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ mov $ivec.16b,@dat[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.ne 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ eor $ivec.16b,$ivec.16b,@dat[0].16b
++___
++ &rev32($ivec,$ivec);
++ &enc_blk($ivec);
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ st1 {$ivec.16b},[$out],#16
++ b.ne 1b
++ b 3f
++.Ldec:
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ // 8 blocks mode
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],$dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ mov $ivec.16b,@in[7].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ eor @dat[4].16b,$dat[4].16b,@in[3].16b
++ eor @dat[5].16b,$dat[5].16b,@in[4].16b
++ eor @dat[6].16b,$dat[6].16b,@in[5].16b
++ eor @dat[7].16b,$dat[7].16b,@in[6].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,128
++ b.gt 1b
++ b 3f
++ // 4 blocks mode
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ mov $ivec.16b,@in[3].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ mov @in[0].16b,@dat[0].16b
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ mov $ivec.16b,@in[0].16b
++ st1 {@dat[0].16b},[$out],#16
++ b.ne 1b
++3:
++ // save back IV
++ st1 {$ivec.16b},[$ivp]
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($ctr)=("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec)=("v8");
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {$ivec.4s},[$ivp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ mov $ctr,$ivec.s[3]
++1:
++ cmp $len,#4
++ b.lt 1f
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ mov @dat[0].16b,$ivec.16b
++ mov @dat[1].16b,$ivec.16b
++ mov @dat[2].16b,$ivec.16b
++ mov @dat[3].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[1].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[2].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[3].s[3],$ctr
++ cmp $len,#8
++ b.lt 2f
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++ mov @dat[4].16b,$ivec.16b
++ mov @dat[5].16b,$ivec.16b
++ mov @dat[6].16b,$ivec.16b
++ mov @dat[7].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[4].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[5].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[6].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[7].s[3],$ctr
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ eor @dat[4].16b,@dat[4].16b,@in[4].16b
++ eor @dat[5].16b,@dat[5].16b,@in[5].16b
++ eor @dat[6].16b,@dat[6].16b,@in[6].16b
++ eor @dat[7].16b,@dat[7].16b,@in[7].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#8
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++2:
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#4
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++1:
++ subs $len,$len,#1
++ b.lt 3f
++ mov $dat[0].16b,$ivec.16b
++ ld1 {@in[0].4s},[$inp],#16
++___
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor $dat[0].16b,$dat[0].16b,@in[0].16b
++ st1 {$dat[0].4s},[$out],#16
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++3:
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++########################################
++{ my %opcode = (
++ "sm4e" => 0xcec08400,
++ "sm4ekey" => 0xce60c800);
++
++ sub unsm4 {
++ my ($mnemonic,$arg)=@_;
++
++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
++ $mnemonic,$arg;
++ }
++}
++
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++
++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
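+The unsm4() helper at the end of the file lets this script build
+with assemblers that predate the SM4 extension: each mnemonic is
+re-emitted as a raw .inst word, Rd | Rn<<5 | Rm<<16 over the opcode
+base. The same encoding, worked through in C:
+
+    #include <stdio.h>
+    #include <stdint.h>
+
+    int main(void)
+    {
+        uint32_t sm4e    = 0xcec08400;  /* two operands: Rd, Rn       */
+        uint32_t sm4ekey = 0xce60c800;  /* three operands: Rd, Rn, Rm */
+
+        /* sm4e v0.4s, v1.4s */
+        printf(".inst 0x%08x\n", (unsigned)(sm4e | 0u | (1u << 5)));
+        /* sm4ekey v1.4s, v0.4s, v16.4s */
+        printf(".inst 0x%08x\n",
+               (unsigned)(sm4ekey | 1u | (0u << 5) | (16u << 16)));
+        return 0;
+    }
+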
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index b65a7d149e..e27aa49e67 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,4 +1,32 @@
+ LIBS=../../libcrypto
+-SOURCE[../../libcrypto]=\
+- sm4.c
+
++IF[{- !$disabled{asm} -}]
++ $SM4DEF_aarch64=SM4_ASM
++ $SM4ASM_aarch64=sm4-armv8.S
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate one, and define the appropriate macros
++ IF[$SM4ASM_{- $target{asm_arch} -}]
++ $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
++ $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
++ ENDIF
++ENDIF
++
++SOURCE[../../libcrypto]= $SM4ASM sm4.c
++
++
++# Implementations are now spread across several libraries, so the defines
++# need to be applied to all affected libraries and modules.
++DEFINE[../../libcrypto]=$SM4DEF
++DEFINE[../../providers/libfips.a]=$SM4DEF
++DEFINE[../../providers/libdefault.a]=$SM4DEF
++# We only need to include the SM4DEF stuff in the legacy provider when it's a
++# separate module and it's dynamically linked with libcrypto. Otherwise, it
++# already gets everything that the static libcrypto.a has, and doesn't need it
++# added again.
++IF[{- !$disabled{module} && !$disabled{shared} -}]
++ DEFINE[../providers/liblegacy.a]=$SM4DEF
++ENDIF
++
++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
++INCLUDE[sm4-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+new file mode 100644
+index 0000000000..42c8b44a43
+--- /dev/null
++++ b/include/crypto/sm4_platform.h
+@@ -0,0 +1,48 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#ifndef OSSL_SM4_PLATFORM_H
++# define OSSL_SM4_PLATFORM_H
++# pragma once
++
++# if defined(OPENSSL_CPUID_OBJ)
++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
++# include "arm_arch.h"
++# if __ARM_MAX_ARCH__>=8
++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
++# define HWSM4_encrypt sm4_v8_encrypt
++# define HWSM4_decrypt sm4_v8_decrypt
++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
++# endif
++# endif
++# endif /* OPENSSL_CPUID_OBJ */
++
++# if defined(HWSM4_CAPABLE)
++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++# endif /* HWSM4_CAPABLE */
++
++#endif /* OSSL_SM4_PLATFORM_H */
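+The header only maps the generic HWSM4_* names onto the sm4_v8_*
+assembly entry points when the platform qualifies, so callers guard
+both the compile-time macro and the runtime capability bit. A
+minimal usage sketch against the declarations above (the buffer
+handling is illustrative only):
+
+    #include <stddef.h>
+    #include "crypto/sm4.h"
+    #include "crypto/sm4_platform.h"
+
+    /* ECB-encrypt a whole buffer; len must be a multiple of 16. */
+    void sm4_ecb_demo(unsigned char *out, const unsigned char *in,
+                      size_t len, const unsigned char userkey[16])
+    {
+        SM4_KEY ks;
+
+    #ifdef HWSM4_CAPABLE
+        if (HWSM4_CAPABLE) {                  /* runtime SM4 bit set */
+            HWSM4_set_encrypt_key(userkey, &ks);
+            HWSM4_ecb_encrypt(in, out, len, &ks, 1 /* enc */);
+            return;
+        }
+    #endif
+        ossl_sm4_set_key(userkey, &ks);       /* portable C fallback */
+        for (; len >= 16; in += 16, out += 16, len -= 16)
+            ossl_sm4_encrypt(in, out, &ks);
+    }
+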
+diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h
+index f7f833fcb4..01a031a74d 100644
+--- a/providers/implementations/ciphers/cipher_sm4.h
++++ b/providers/implementations/ciphers/cipher_sm4.h
+@@ -9,6 +9,7 @@
+
+ #include "prov/ciphercommon.h"
+ #include "crypto/sm4.h"
++#include "crypto/sm4_platform.h"
+
+ typedef struct prov_cast_ctx_st {
+ PROV_CIPHER_CTX base; /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index 6bcd1ec406..c0c9b22bd3 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -12,6 +12,7 @@
+ */
+
+ #include "cipher_sm4_gcm.h"
++#include "crypto/sm4_platform.h"
+
+ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ size_t keylen)
+@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ SM4_KEY *ks = &actx->ks.ks;
+
+ ctx->ks = ks;
+- ossl_sm4_set_key(key, ks);
+- CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+- ctx->ctr = (ctr128_f)NULL;
++# ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt);
++# ifdef HWSM4_ctr32_encrypt_blocks
++ ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++# else /* HWSM4_ctr32_encrypt_blocks */
++ ctx->ctr = (ctr128_f)NULL;
++# endif
++ } else
++# endif /* HWSM4_CAPABLE */
++ {
++ ossl_sm4_set_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
++ ctx->ctr = (ctr128_f)NULL;
++ }
+ ctx->key_set = 1;
+
+ return 1;
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 0db04b1a74..4cd3d3d669 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ PROV_SM4_CTX *sctx = (PROV_SM4_CTX *)ctx;
+ SM4_KEY *ks = &sctx->ks.ks;
+
+- ossl_sm4_set_key(key, ks);
+ ctx->ks = ks;
+ if (ctx->enc
+ || (ctx->mode != EVP_CIPH_ECB_MODE
+- && ctx->mode != EVP_CIPH_CBC_MODE))
+- ctx->block = (block128_f)ossl_sm4_encrypt;
+- else
+- ctx->block = (block128_f)ossl_sm4_decrypt;
++ && ctx->mode != EVP_CIPH_CBC_MODE)) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_encrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++ else
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++ else
++#endif
++#ifdef HWSM4_ctr32_encrypt_blocks
++ if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks;
++ else
++#endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_encrypt;
++ }
++ } else {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_decrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++#endif
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_decrypt;
++ }
++ }
++
+ return 1;
+ }
+
+@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX)
+ # define PROV_CIPHER_HW_sm4_mode(mode) \
+ static const PROV_CIPHER_HW sm4_##mode = { \
+ cipher_hw_sm4_initkey, \
+- ossl_cipher_hw_chunked_##mode, \
++ ossl_cipher_hw_generic_##mode, \
+ cipher_hw_sm4_copyctx \
+ }; \
+ const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits) \
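+The "(void)0" statements in the hunk above (and in e_sm4.c) close a
+chain of conditionally compiled if/else branches: depending on which
+HWSM4_* macros exist, the chain can end in a dangling 'else' that
+still needs a statement. Reduced to its essentials (FEATURE_A,
+FEATURE_B and the use_*() calls are placeholders):
+
+    void use_a(void);
+    void use_b(void);
+
+    void pick(int mode)
+    {
+    #ifdef FEATURE_A
+        if (mode == 1)
+            use_a();
+        else
+    #endif
+    #ifdef FEATURE_B
+        if (mode == 2)
+            use_b();
+        else
+    #endif
+            (void)0;  /* body for whichever 'else' survived cpp */
+    }
+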
+--
+2.37.3.windows.1
+