author     CoprDistGit <infra@openeuler.org>  2023-10-02 03:32:16 +0000
committer  CoprDistGit <infra@openeuler.org>  2023-10-02 03:32:16 +0000
commit     e879981f405f8810d1b0d9c1c77aea3e8be6a469 (patch)
tree       8698c9791c9e77d3be587c5c7ad9d43dce7c6d30
parent     80d0cbc46bb935a925d434060b67c794844558d9 (diff)
-rw-r--r--  .gitignore  1
-rw-r--r--  Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch  74
-rw-r--r--  Backport-Fix-SM4-CBC-regression-on-Armv8.patch  60
-rw-r--r--  Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch  87
-rw-r--r--  Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch  207
-rw-r--r--  Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch  67
-rw-r--r--  Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch  73
-rw-r--r--  Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch  457
-rw-r--r--  Backport-SM4-AESE-optimization-for-ARMv8.patch  2322
-rw-r--r--  Backport-SM4-optimization-for-ARM-by-ASIMD.patch  1334
-rw-r--r--  Backport-SM4-optimization-for-ARM-by-HW-instruction.patch  1228
-rw-r--r--  Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch  1521
-rw-r--r--  Backport-providers-Add-SM4-GCM-implementation.patch  360
-rw-r--r--  Backport-providers-Add-SM4-XTS-implementation.patch  763
-rw-r--r--  Backport-support-decode-SM2-parameters.patch  175
-rw-r--r--  Feature-support-SM2-CMS-signature.patch  41
-rw-r--r--  Feature-use-default-id-if-SM2-id-is-not-set.patch  59
-rw-r--r--  Makefile.certificate  82
-rw-r--r--  backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch  36
-rw-r--r--  backport-Add-a-test-for-CVE-2023-3446.patch  63
-rw-r--r--  backport-Add-testcases-for-empty-associated-data-entries-with.patch  66
-rw-r--r--  backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch  61
-rw-r--r--  backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch  57
-rw-r--r--  backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch  74
-rw-r--r--  backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch  39
-rw-r--r--  backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch  53
-rw-r--r--  openssl-3.0-build.patch  38
-rw-r--r--  openssl.spec  94
-rw-r--r--  sources  1
29 files changed, 9493 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..40bd54c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/openssl-3.0.9.tar.gz
diff --git a/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
new file mode 100644
index 0000000..6536ed5
--- /dev/null
+++ b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
@@ -0,0 +1,74 @@
+From 06f13f85ee86cd7fbc546060fbe2d077176b0be4 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Mon, 31 Oct 2022 11:28:15 +0800
+Subject: [PATCH 11/13] Apply SM4 optimization patch to Kunpeng-920
+
+In the ideal scenario, performance can reach up to 2.2X.
+But in single block input or CFB/OFB mode, CBC encryption,
+performance could drop about 50%.
+
+Perf data on Kunpeng-920 2.6GHz hardware, before and after optimization:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-CTR 75318.96k 79089.62k 79736.15k 79934.12k 80325.44k 80068.61k
+SM4-ECB 80211.39k 84998.36k 86472.28k 87024.93k 87144.80k 86862.51k
+SM4-GCM 72156.19k 82012.08k 83848.02k 84322.65k 85103.65k 84896.43k
+SM4-CBC 77956.13k 80638.81k 81976.17k 81606.31k 82078.91k 81750.70k
+SM4-CFB 78078.20k 81054.87k 81841.07k 82396.38k 82203.99k 82236.76k
+SM4-OFB 78282.76k 82074.03k 82765.74k 82989.06k 83200.68k 83487.17k
+
+After:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-CTR 35678.07k 120687.25k 176632.27k 177192.62k 177586.18k 178295.18k
+SM4-ECB 35540.32k 122628.07k 175067.90k 178007.84k 178298.88k 178328.92k
+SM4-GCM 34215.75k 116720.50k 170275.16k 171770.88k 172714.21k 172272.30k
+SM4-CBC 35645.60k 36544.86k 36515.50k 36732.15k 36618.24k 36629.16k
+SM4-CFB 35528.14k 35690.99k 35954.86k 35843.42k 35809.18k 35809.96k
+SM4-OFB 35563.55k 35853.56k 35963.05k 36203.52k 36233.85k 36307.82k
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19547)
+---
+ crypto/arm_arch.h | 4 ++++
+ include/crypto/sm4_platform.h | 3 ++-
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index 5b5af31d92..c10748e5f8 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -98,9 +98,13 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ */
+
+ # define ARM_CPU_IMP_ARM 0x41
++# define HISI_CPU_IMP 0x48
+
+ # define ARM_CPU_PART_CORTEX_A72 0xD08
+ # define ARM_CPU_PART_N1 0xD0C
++# define ARM_CPU_PART_V1 0xD40
++# define ARM_CPU_PART_N2 0xD49
++# define HISI_CPU_PART_KP920 0xD01
+
+ # define MIDR_PARTNUM_SHIFT 4
+ # define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT)
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 11f9b9d88b..15d8abbcb1 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -20,7 +20,8 @@ static inline int vpsm4_capable(void)
+ {
+ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ }
+ # if defined(VPSM4_ASM)
+ # define VPSM4_CAPABLE vpsm4_capable()
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-CBC-regression-on-Armv8.patch b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch
new file mode 100644
index 0000000..2176932
--- /dev/null
+++ b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch
@@ -0,0 +1,60 @@
+From d7d5490d7201dcfb1f3811ad1bfc57ed9b2c0b77 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Thu, 8 Dec 2022 10:46:27 +0000
+Subject: [PATCH 09/13] Fix SM4-CBC regression on Armv8
+
+Fixes #19858
+
+During decryption, the last ciphertext is not fed to next block
+correctly when the number of input blocks is exactly 4. Fix this
+and add the corresponding test cases.
+
+Thanks xu-yi-zhou for reporting this issue and proposing the fix.
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19872)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 2 +-
+ test/recipes/30-test_evp_data/evpciph_sm4.txt | 12 ++++++++++++
+ 2 files changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index 095d9dae64..c842ef61d5 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -880,7 +880,7 @@ $code.=<<___;
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+- st1 {@vtmp[3].16b}, [$ivp]
++ st1 {@data[3].16b}, [$ivp]
+ b 100f
+ 1: // last block
+ subs $blocks,$blocks,#1
+diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+index 9fb16ca15c..e9a98c9898 100644
+--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt
++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+@@ -19,6 +19,18 @@ IV = 0123456789ABCDEFFEDCBA9876543210
+ Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
+ Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3B
+
++Cipher = SM4-CBC
++Key = 0123456789ABCDEFFEDCBA9876543210
++IV = 0123456789ABCDEFFEDCBA9876543210
++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
++Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3BFFF5A4F208092C0901BA02D5772977369915E3FA2356C9F4EB6460ECC457E7f8E3CFA3DEEBFE9883E3A48BCF7C4A11AA3EC9E0D317C5D319BE72A5CDDDEC640C
++
++Cipher = SM4-CBC
++Key = 0123456789ABCDEFFEDCBA9876543210
++IV = 0123456789ABCDEFFEDCBA9876543210
++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
++Ciphertext = 2677f46b09c122cc975533105bd4a22af6125f7275ce552c3a2bbcf533de8a3bfff5a4f208092c0901ba02d5772977369915e3fa2356c9f4eb6460ecc457e7f8e3cfa3deebfe9883e3a48bcf7c4a11aa3ec9e0d317c5d319be72a5cdddec640c6fc70bfa3ddaafffdd7c09b2774dcb2cec29f0c6f0b6773e985b3e395e924238505a8f120d9ca84de5c3cf7e45f097b14b3a46c5b1068669982a5c1f5f61be291b984f331d44ffb2758f771672448fc957fa1416c446427a41e25d5524a2418b9d96b2f17582f0f1aa9c204c6807f54f7b6833c5f00856659ddabc245936868c
++
+ Cipher = SM4-OFB
+ Key = 0123456789ABCDEFFEDCBA9876543210
+ IV = 0123456789ABCDEFFEDCBA9876543210
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
new file mode 100644
index 0000000..5bfd186
--- /dev/null
+++ b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
@@ -0,0 +1,87 @@
+From 6df7707fb22e8bd1c7d778a2041c1403f9852060 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Fri, 3 Feb 2023 15:59:59 +0800
+Subject: [PATCH 13/13] Fix SM4-XTS build failure on Mac mini M1
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Richard Levitte <levitte@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/20202)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 4 +++-
+ crypto/sm4/asm/vpsm4_ex-armv8.pl | 23 ++++++++++++++++-------
+ 2 files changed, 19 insertions(+), 8 deletions(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index e19de30901..d30e78f3ce 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -524,7 +524,7 @@ sub compute_tweak_vec() {
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+ $code.=<<___;
+- ldr @qtmp[0], =0x01010101010101010101010101010187
++ ldr @qtmp[0], .Lxts_magic
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+@@ -572,6 +572,8 @@ _vpsm4_consts:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+ .Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++.Lxts_magic:
++ .dword 0x0101010101010187,0x0101010101010101
+
+ .size _vpsm4_consts,.-_vpsm4_consts
+ ___
+diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+index 3d094aa535..f2d5b6debf 100644
+--- a/crypto/sm4/asm/vpsm4_ex-armv8.pl
++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+@@ -475,12 +475,12 @@ sub load_sbox () {
+ my $data = shift;
+
+ $code.=<<___;
+- ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00
+- ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00
+- ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923
+- ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300
+- ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b
+- ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
++ ldr $MaskQ, .Lsbox_magic
++ ldr $TAHMatQ, .Lsbox_magic+16
++ ldr $TALMatQ, .Lsbox_magic+32
++ ldr $ATAHMatQ, .Lsbox_magic+48
++ ldr $ATALMatQ, .Lsbox_magic+64
++ ldr $ANDMaskQ, .Lsbox_magic+80
+ ___
+ }
+
+@@ -525,7 +525,7 @@ sub compute_tweak_vec() {
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+ $code.=<<___;
+- ldr @qtmp[0], =0x01010101010101010101010101010187
++ ldr @qtmp[0], .Lxts_magic
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+@@ -556,6 +556,15 @@ _${prefix}_consts:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+ .Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++.Lxts_magic:
++ .dword 0x0101010101010187,0x0101010101010101
++.Lsbox_magic:
++ .dword 0x0b0e0104070a0d00,0x0306090c0f020508
++ .dword 0x62185a2042387a00,0x22581a6002783a40
++ .dword 0x15df62a89e54e923,0xc10bb67c4a803df7
++ .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
++ .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc
++ .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+
+ .size _${prefix}_consts,.-_${prefix}_consts
+ ___
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
new file mode 100644
index 0000000..485fd65
--- /dev/null
+++ b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
@@ -0,0 +1,207 @@
+From b8f24cb95dbe70cbeef08b41f35018141b6ce994 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Thu, 15 Dec 2022 10:21:07 +0800
+Subject: [PATCH 10/13] Fix SM4 test failures on big-endian ARM processors
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Paul Yang <kaishen.yy@antfin.com>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19910)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 52 +++++++++++++++++------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index c842ef61d5..73797af582 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -45,7 +45,7 @@ sub rev32() {
+
+ if ($src and ("$src" ne "$dst")) {
+ $code.=<<___;
+-#ifndef __ARMEB__
++#ifndef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+ #else
+ mov $dst.16b,$src.16b
+@@ -53,7 +53,7 @@ $code.=<<___;
+ ___
+ } else {
+ $code.=<<___;
+-#ifndef __ARMEB__
++#ifndef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+ #endif
+ ___
+@@ -428,10 +428,10 @@ sub load_sbox () {
+
+ $code.=<<___;
+ adr $ptr,.Lsbox
+- ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
+- ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
+- ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
+- ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
++ ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
++ ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
++ ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
++ ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
+ ___
+ }
+
+@@ -492,9 +492,9 @@ ___
+ &rev32($vkey,$vkey);
+ $code.=<<___;
+ adr $pointer,.Lshuffles
+- ld1 {$vmap.4s},[$pointer]
++ ld1 {$vmap.2d},[$pointer]
+ adr $pointer,.Lfk
+- ld1 {$vfk.4s},[$pointer]
++ ld1 {$vfk.2d},[$pointer]
+ eor $vkey.16b,$vkey.16b,$vfk.16b
+ mov $schedules,#32
+ adr $pointer,.Lck
+@@ -615,7 +615,7 @@ $code.=<<___;
+ .align 5
+ ${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ ___
+ &load_sbox();
+ &rev32(@data[0],@data[0]);
+@@ -624,7 +624,7 @@ $code.=<<___;
+ ___
+ &encrypt_1blk(@data[0]);
+ $code.=<<___;
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ ret
+ .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+ ___
+@@ -692,12 +692,12 @@ $code.=<<___;
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ ___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+ $code.=<<___;
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ b 100f
+ 1: // process last 2 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
+@@ -798,11 +798,11 @@ ___
+ &rev32($ivec0,$ivec0);
+ &encrypt_1blk($ivec0);
+ $code.=<<___;
+- st1 {$ivec0.16b},[$outp],#16
++ st1 {$ivec0.4s},[$outp],#16
+ b 1b
+ 2:
+ // save back IV
+- st1 {$ivec0.16b},[$ivp]
++ st1 {$ivec0.4s},[$ivp]
+ ret
+
+ .Ldec:
+@@ -834,7 +834,7 @@ ___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+ $code.=<<___;
+- ld1 {$ivec1.16b},[$ivp]
++ ld1 {$ivec1.4s},[$ivp]
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ // note ivec1 and vtmpx[3] are resuing the same register
+ // care needs to be taken to avoid conflict
+@@ -844,7 +844,7 @@ $code.=<<___;
+ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
+ // save back IV
+- st1 {$vtmpx[3].16b}, [$ivp]
++ st1 {$vtmpx[3].4s}, [$ivp]
+ eor @data[0].16b,@data[0].16b,$datax[3].16b
+ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
+ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
+@@ -855,7 +855,7 @@ $code.=<<___;
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+ 1:
+- ld1 {$ivec1.16b},[$ivp]
++ ld1 {$ivec1.4s},[$ivp]
+ .Lcbc_4_blocks_dec:
+ cmp $blocks,#4
+ b.lt 1f
+@@ -880,7 +880,7 @@ $code.=<<___;
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+- st1 {@data[3].16b}, [$ivp]
++ st1 {@data[3].4s}, [$ivp]
+ b 100f
+ 1: // last block
+ subs $blocks,$blocks,#1
+@@ -888,13 +888,13 @@ $code.=<<___;
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+ // save back IV
+- st1 {$data[0].16b}, [$ivp]
++ st1 {$data[0].4s}, [$ivp]
+ ___
+ &rev32(@datax[0],@data[0]);
+ &encrypt_1blk(@datax[0]);
+ $code.=<<___;
+ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
+- st1 {@datax[0].16b},[$outp],#16
++ st1 {@datax[0].4s},[$outp],#16
+ b 100f
+ 1: // last two blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
+@@ -917,7 +917,7 @@ $code.=<<___;
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save back IV
+- st1 {@data[1].16b}, [$ivp]
++ st1 {@data[1].4s}, [$ivp]
+ b 100f
+ 1: // last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
+@@ -937,7 +937,7 @@ $code.=<<___;
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save back IV
+- st1 {@data[2].16b}, [$ivp]
++ st1 {@data[2].4s}, [$ivp]
+ 100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+@@ -973,9 +973,9 @@ $code.=<<___;
+ ___
+ &encrypt_1blk($ivec);
+ $code.=<<___;
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ ret
+ 1:
+ AARCH64_SIGN_LINK_REGISTER
+@@ -1053,9 +1053,9 @@ $code.=<<___;
+ ___
+ &encrypt_1blk($ivec);
+ $code.=<<___;
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ b 100f
+ 1: // last 2 blocks processing
+ dup @data[0].4s,$word0
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
new file mode 100644
index 0000000..3ecb59c
--- /dev/null
+++ b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
@@ -0,0 +1,67 @@
+From 8746fff8f096fa35c7157199917100aa7b547d7a Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Tue, 18 Jan 2022 02:58:08 +0000
+Subject: [PATCH 03/13] Fix sm3ss1 translation issue in sm3-armv8.pl
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17542)
+---
+ crypto/sm3/asm/sm3-armv8.pl | 15 +++++++--------
+ 1 file changed, 7 insertions(+), 8 deletions(-)
+
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+index bb71b2eade..f0555fd3f2 100644
+--- a/crypto/sm3/asm/sm3-armv8.pl
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -109,7 +109,7 @@ ___
+
+ $code=<<___;
+ #include "arm_arch.h"
+-.arch armv8.2-a+sm4
++.arch armv8.2-a
+ .text
+ ___
+
+@@ -222,8 +222,8 @@ my %sm3partopcode = (
+ "sm3partw1" => 0xce60C000,
+ "sm3partw2" => 0xce60C400);
+
+-my %sm3sslopcode = (
+- "sm3ssl" => 0xce400000);
++my %sm3ss1opcode = (
++ "sm3ss1" => 0xce400000);
+
+ my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+@@ -241,14 +241,13 @@ sub unsm3part {
+ $mnemonic,$arg;
+ }
+
+-sub unsm3ssl {
++sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+- $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
+- \s*[qv](\d+)/o
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+- $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
++ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+ }
+
+@@ -274,7 +273,7 @@ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+- s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge;
++ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+ print $_,"\n";
+ }
+--
+2.37.3.windows.1
+
diff --git a/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
new file mode 100644
index 0000000..11129d9
--- /dev/null
+++ b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
@@ -0,0 +1,73 @@
+From 98da8a58f964e279decc1bbbe8f07d807de05f7f Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Wed, 2 Mar 2022 12:55:39 +0000
+Subject: [PATCH 06/13] Further acceleration for SM4-GCM on ARM
+
+This patch will allow the SM4-GCM function to leverage the SM4
+high-performance CTR crypto interface already implemented for ARM,
+which is faster than current single block cipher routine used
+for GCM
+
+It does not address the acceleration of GHASH function of GCM,
+which can be a future task, still we can see immediate uplift of
+performance (up to 4X)
+
+Before this patch:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-GCM 186432.92k 394234.05k 587916.46k 639365.12k 648486.91k 652924.25k
+
+After the patch:
+SM4-GCM 193924.87k 860940.35k 1696083.71k 2302548.31k 2580411.73k 2607398.91k
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17814)
+---
+ .../ciphers/cipher_sm4_gcm_hw.c | 25 ++++++++++++++++++-
+ 1 file changed, 24 insertions(+), 1 deletion(-)
+
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index c0c9b22bd3..b9633f83ed 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -42,11 +42,34 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ return 1;
+ }
+
++static int hw_gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in,
++ size_t len, unsigned char *out)
++{
++ if (ctx->enc) {
++ if (ctx->ctr != NULL) {
++ if (CRYPTO_gcm128_encrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr))
++ return 0;
++ } else {
++ if (CRYPTO_gcm128_encrypt(&ctx->gcm, in, out, len))
++ return 0;
++ }
++ } else {
++ if (ctx->ctr != NULL) {
++ if (CRYPTO_gcm128_decrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr))
++ return 0;
++ } else {
++ if (CRYPTO_gcm128_decrypt(&ctx->gcm, in, out, len))
++ return 0;
++ }
++ }
++ return 1;
++}
++
+ static const PROV_GCM_HW sm4_gcm = {
+ sm4_gcm_initkey,
+ ossl_gcm_setiv,
+ ossl_gcm_aad_update,
+- ossl_gcm_cipher_update,
++ hw_gcm_cipher_update,
+ ossl_gcm_cipher_final,
+ ossl_gcm_one_shot
+ };
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
new file mode 100644
index 0000000..0467d78
--- /dev/null
+++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
@@ -0,0 +1,457 @@
+From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Fri, 24 Dec 2021 08:29:04 +0000
+Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on
+ aarch64
+
+SM3 hardware instruction is optional feature of crypto extension for
+aarch64. This implementation accelerates SM3 via SM3 instructions. For
+the platform not supporting SM3 instruction, the original C
+implementation still works. Thanks to AliBaba for testing and reporting
+the following perf numbers for Yitian710:
+
+Benchmark on T-Head Yitian-710 2.75GHz:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k
+
+After (33% - 74% faster):
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17454)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 2 +
+ crypto/armcap.c | 10 ++
+ crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++
+ crypto/sm3/build.info | 21 ++-
+ crypto/sm3/sm3_local.h | 16 +-
+ 6 files changed, 336 insertions(+), 3 deletions(-)
+ create mode 100644 crypto/sm3/asm/sm3-armv8.pl
+
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 11f0e50279..10d267b7ad 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -96,6 +96,14 @@ _armv8_cpuid_probe:
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+
++.globl _armv8_sm3_probe
++.type _armv8_sm3_probe,%function
++_armv8_sm3_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
++ ret
++.size _armv8_sm3_probe,.-_armv8_sm3_probe
++
+ .globl OPENSSL_cleanse
+ .type OPENSSL_cleanse,%function
+ .align 5
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index a815a5c72b..c8b501f34c 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_PMULL (1<<5)
+ # define ARMV8_SHA512 (1<<6)
+ # define ARMV8_CPUID (1<<7)
++# define ARMV8_RNG (1<<8)
++# define ARMV8_SM3 (1<<9)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index c021330e32..365a48df45 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void);
+ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
++void _armv8_sm3_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA1 (1 << 5)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
++# define HWCAP_CE_SM3 (1 << 18)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void)
+
+ if (hwcap & HWCAP_CPUID)
+ OPENSSL_armcap_P |= ARMV8_CPUID;
++
++ if (hwcap & HWCAP_CE_SM3)
++ OPENSSL_armcap_P |= ARMV8_SM3;
+ # endif
+ }
+ # endif
+@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void)
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+ }
++
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm3_probe();
++ OPENSSL_armcap_P |= ARMV8_SM3;
++ }
+ # endif
+ }
+ # endif
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+new file mode 100644
+index 0000000000..bb71b2eade
+--- /dev/null
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -0,0 +1,282 @@
++#! /usr/bin/env perl
++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++#
++# This module implements support for Armv8 SM3 instructions
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++# Message expanding:
++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
++# Input: s0, s1, s2, s3
++# s0 = w0 | w1 | w2 | w3
++# s1 = w4 | w5 | w6 | w7
++# s2 = w8 | w9 | w10 | w11
++# s3 = w12 | w13 | w14 | w15
++# Output: s4
++sub msg_exp () {
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++$code.=<<___;
++ // s4 = w7 | w8 | w9 | w10
++ ext $s4.16b, $s1.16b, $s2.16b, #12
++ // vtmp1 = w3 | w4 | w5 | w6
++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
++ // vtmp2 = w10 | w11 | w12 | w13
++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
++ sm3partw1 $s4.4s, $s0.4s, $s3.4s
++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
++___
++}
++
++# A round of compresson function
++# Input:
++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
++# vstate0 - vstate1, store digest status(A - H)
++# vconst0 - vconst1, interleaved used to store Tj <<< j
++# vtmp - temporary register
++# vw - for sm3tt1ab, vw = s0 eor s1
++# s0 - for sm3tt2ab, just be s0
++# i, choose wj' or wj from vw
++sub round () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp = shift;
++my $vw = shift;
++my $s0 = shift;
++my $i = shift;
++$code.=<<___;
++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
++ shl $vconst1.4s, $vconst0.4s, #1
++ sri $vconst1.4s, $vconst0.4s, #31
++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
++___
++}
++
++sub qround () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++ if($s4) {
++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
++ }
++$code.=<<___;
++ eor $vtmp1.16b, $s0.16b, $s1.16b
++___
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 0);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 1);
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 2);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 3);
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8.2-a+sm4
++.text
++___
++
++{{{
++my ($pstate,$pdata,$num)=("x0","x1","w2");
++my ($state1,$state2)=("v5","v6");
++my ($sconst1, $sconst2)=("s16","s17");
++my ($vconst1, $vconst2)=("v16","v17");
++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
++my ($bkstate1,$bkstate2)=("v18","v19");
++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
++my ($vtmp1,$vtmp2)=("v22","v23");
++my $constaddr="x8";
++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
++$code.=<<___;
++.globl ossl_hwsm3_block_data_order
++.type ossl_hwsm3_block_data_order,%function
++.align 5
++ossl_hwsm3_block_data_order:
++ AARCH64_VALID_CALL_TARGET
++ // load state
++ ld1 {$state1.4s-$state2.4s}, [$pstate]
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++
++ adr $constaddr, .Tj
++ ldp $sconst1, $sconst2, [$constaddr]
++
++.Loop:
++ // load input
++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
++ sub $num, $num, #1
++
++ mov $bkstate1.16b, $state1.16b
++ mov $bkstate2.16b, $state2.16b
++
++#ifndef __ARMEB__
++ rev32 $s0.16b, $s0.16b
++ rev32 $s1.16b, $s1.16b
++ rev32 $s2.16b, $s2.16b
++ rev32 $s3.16b, $s3.16b
++#endif
++
++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
++___
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++
++$code.=<<___;
++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
++___
++
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1);
++
++$code.=<<___;
++ eor $state1.16b, $state1.16b, $bkstate1.16b
++ eor $state2.16b, $state2.16b, $bkstate2.16b
++
++ // any remained blocks?
++ cbnz $num, .Loop
++
++ // save state
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++ st1 {$state1.4s-$state2.4s}, [$pstate]
++ ret
++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
++
++.align 3
++.Tj:
++.word 0x79cc4519, 0x9d8a7a87
++___
++}}}
++
++#########################################
++my %sm3partopcode = (
++ "sm3partw1" => 0xce60C000,
++ "sm3partw2" => 0xce60C400);
++
++my %sm3sslopcode = (
++ "sm3ssl" => 0xce400000);
++
++my %sm3ttopcode = (
++ "sm3tt1a" => 0xce408000,
++ "sm3tt1b" => 0xce408400,
++ "sm3tt2a" => 0xce408800,
++ "sm3tt2b" => 0xce408C00);
++
++sub unsm3part {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
++ $mnemonic,$arg;
++}
++
++sub unsm3ssl {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
++ \s*[qv](\d+)/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
++ $mnemonic,$arg;
++}
++
++sub unsm3tt {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
++ $mnemonic,$arg;
++}
++
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++
++ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
++ s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge;
++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
+index eca68216f2..2fa54a4a8b 100644
+--- a/crypto/sm3/build.info
++++ b/crypto/sm3/build.info
+@@ -1,5 +1,22 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{sm3} -}]
+- SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
+-ENDIF
+\ No newline at end of file
++ IF[{- !$disabled{asm} -}]
++ $SM3ASM_aarch64=sm3-armv8.S
++ $SM3DEF_aarch64=OPENSSL_SM3_ASM
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate ones, and define the appropriate macros
++ IF[$SM3ASM_{- $target{asm_arch} -}]
++ $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
++ $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
++ ENDIF
++ ENDIF
++
++ SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
++ DEFINE[../../libcrypto]=$SM3DEF
++
++ GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
++ INCLUDE[sm3-armv8.o]=..
++ENDIF
++
+diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
+index 6daeb878a8..ac8a2bf768 100644
+--- a/crypto/sm3/sm3_local.h
++++ b/crypto/sm3/sm3_local.h
+@@ -32,7 +32,21 @@
+ ll=(c)->G; (void)HOST_l2c(ll, (s)); \
+ ll=(c)->H; (void)HOST_l2c(ll, (s)); \
+ } while (0)
+-#define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++
++#if defined(OPENSSL_SM3_ASM)
++# if defined(__aarch64__)
++# include "crypto/arm_arch.h"
++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
++# endif
++#endif
++
++#if defined(HWSM3_CAPABLE)
++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
++ : ossl_sm3_block_data_order)
++#else
++# define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++#endif
+
+ void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+ void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM4-AESE-optimization-for-ARMv8.patch b/Backport-SM4-AESE-optimization-for-ARMv8.patch
new file mode 100644
index 0000000..0866262
--- /dev/null
+++ b/Backport-SM4-AESE-optimization-for-ARMv8.patch
@@ -0,0 +1,2322 @@
+From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Wed, 18 Jan 2023 09:55:02 +0800
+Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19914)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++
+ crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++
+ crypto/sm4/build.info | 4 +-
+ include/crypto/sm4_platform.h | 41 +-
+ .../implementations/ciphers/cipher_sm4_hw.c | 26 +-
+ .../implementations/ciphers/cipher_sm4_xts.c | 4 +-
+ .../implementations/ciphers/cipher_sm4_xts.h | 2 +-
+ .../ciphers/cipher_sm4_xts_hw.c | 33 +-
+ 8 files changed, 2090 insertions(+), 22 deletions(-)
+ create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index 73797af582..e19de30901 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+
+ $prefix="vpsm4";
+ my @vtmp=map("v$_",(0..3));
++my @qtmp=map("q$_",(0..3));
+ my @data=map("v$_",(4..7));
+ my @datax=map("v$_",(8..11));
+ my ($rk0,$rk1)=("v12","v13");
+@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15));
+ my @sbox=map("v$_",(16..31));
+ my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
+ my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($xtmp1,$xtmp2)=("x8","x9");
+ my ($ptr,$counter)=("x10","w11");
+ my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
+
+@@ -60,6 +62,51 @@ ___
+ }
+ }
+
++sub rev32_armeb() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rbit() {
++ my $dst = shift;
++ my $src = shift;
++ my $std = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ } else {
++$code.=<<___;
++ mov $dst.16b,$src.16b
++___
++ }
++ } else {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ }
++ }
++}
++
+ sub transpose() {
+ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
+
+@@ -435,6 +482,58 @@ $code.=<<___;
+ ___
+ }
+
++
++sub mov_reg_to_vec() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $desv = shift;
++$code.=<<___;
++ mov $desv.d[0],$src0
++ mov $desv.d[1],$src1
++___
++ &rev32_armeb($desv,$desv);
++}
++
++sub mov_vec_to_reg() {
++ my $srcv = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $des0,$srcv.d[0]
++ mov $des1,$srcv.d[1]
++___
++}
++
++sub compute_tweak() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $wtmp0,0x87
++ extr $xtmp2,$src1,$src1,#32
++ extr $des1,$src1,$src0,#63
++ and $wtmp1,$wtmp0,$wtmp2,asr#31
++ eor $des0,$xtmp1,$src0,lsl#1
++___
++}
++
++sub compute_tweak_vec() {
++ my $src = shift;
++ my $des = shift;
++ my $std = shift;
++ &rbit(@vtmp[2],$src,$std);
++$code.=<<___;
++ ldr @qtmp[0], =0x01010101010101010101010101010187
++ shl $des.16b, @vtmp[2].16b, #1
++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
++ ushr @vtmp[1].16b, @vtmp[1].16b, #7
++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
++ eor $des.16b, $des.16b, @vtmp[1].16b
++___
++ &rbit($des,$des,$std);
++}
++
+ $code=<<___;
+ #include "arm_arch.h"
+ .arch armv8-a
+@@ -1101,6 +1200,365 @@ $code.=<<___;
+ .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+ ___
+ }}}
++
++{{{
++my ($blocks,$len)=("x2","x2");
++my $ivp=("x5");
++my @twx=map("x$_",(12..27));
++my ($rks1,$rks2)=("x26","x27");
++my $lastBlk=("x26");
++my $enc=("w28");
++my $remain=("x29");
++
++my @tweak=@datax;
++
++sub gen_xts_cipher() {
++ my $std = shift;
++$code.=<<___;
++.globl ${prefix}_xts_encrypt${std}
++.type ${prefix}_xts_encrypt${std},%function
++.align 5
++${prefix}_xts_encrypt${std}:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x15, x16, [sp, #-0x10]!
++ stp x17, x18, [sp, #-0x10]!
++ stp x19, x20, [sp, #-0x10]!
++ stp x21, x22, [sp, #-0x10]!
++ stp x23, x24, [sp, #-0x10]!
++ stp x25, x26, [sp, #-0x10]!
++ stp x27, x28, [sp, #-0x10]!
++ stp x29, x30, [sp, #-0x10]!
++ stp d8, d9, [sp, #-0x10]!
++ stp d10, d11, [sp, #-0x10]!
++ stp d12, d13, [sp, #-0x10]!
++ stp d14, d15, [sp, #-0x10]!
++ mov $rks1,x3
++ mov $rks2,x4
++ mov $enc,w6
++ ld1 {@tweak[0].4s}, [$ivp]
++ mov $rks,$rks2
++___
++ &load_sbox();
++ &rev32(@tweak[0],@tweak[0]);
++ &encrypt_1blk(@tweak[0]);
++$code.=<<___;
++ mov $rks,$rks1
++ and $remain,$len,#0x0F
++ // convert length into blocks
++ lsr $blocks,$len,4
++ cmp $blocks,#1
++ b.lt .return${std}
++
++ cmp $remain,0
++ // If the encryption/decryption Length is N times of 16,
++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ b.eq .xts_encrypt_blocks${std}
++
++ // If the encryption/decryption length is not N times of 16,
++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ subs $blocks,$blocks,#1
++ b.eq .only_2blks_tweak${std}
++.xts_encrypt_blocks${std}:
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++.Lxts_8_blocks_process${std}:
++ cmp $blocks,#8
++ b.lt .Lxts_4_blocks_process${std}
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
++$code.=<<___;
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@vtmp[0],@vtmp[0],$std);
++ &rbit(@vtmp[1],@vtmp[1],$std);
++ &rbit(@vtmp[2],@vtmp[2],$std);
++ &rbit(@vtmp[3],@vtmp[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @vtmp[0].16b
++ eor @data[1].16b, @data[1].16b, @vtmp[1].16b
++ eor @data[2].16b, @data[2].16b, @vtmp[2].16b
++ eor @data[3].16b, @data[3].16b, @vtmp[3].16b
++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rbit(@vtmpx[0],@vtmpx[0],$std);
++ &rbit(@vtmpx[1],@vtmpx[1],$std);
++ &rbit(@vtmpx[2],@vtmpx[2],$std);
++ &rbit(@vtmpx[3],@vtmpx[3],$std);
++$code.=<<___;
++ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
++ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
++ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
++ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++ &transpose(@data,@vtmp);
++ &transpose(@datax,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++
++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++
++ // save the last tweak
++ st1 {@tweak[3].4s},[$ivp]
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lxts_8_blocks_process${std}
++ b 100f
++.Lxts_4_blocks_process${std}:
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
++$code.=<<___;
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++___
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
++$code.=<<___;
++ // save the last tweak
++ st1 {@tweak[3].4s},[$ivp]
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ st1 {@data[0].4s},[$outp],#16
++ // save the last tweak
++ st1 {@tweak[0].4s},[$ivp]
++ b 100f
++1: // process last 2 blocks
++ cmp $blocks,#2
++ b.gt 1f
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save the last tweak
++ st1 {@tweak[1].4s},[$ivp]
++ b 100f
++1: // process last 3 blocks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save the last tweak
++ st1 {@tweak[2].4s},[$ivp]
++100:
++ cmp $remain,0
++ b.eq .return${std}
++
++// This brance calculates the last two tweaks,
++// while the encryption/decryption length is larger than 32
++.last_2blks_tweak${std}:
++ ld1 {@tweak[0].4s},[$ivp]
++___
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &compute_tweak_vec(@tweak[0],@tweak[1],$std);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// This brance calculates the last two tweaks,
++// while the encryption/decryption length is equal to 32, who only need two tweaks
++.only_2blks_tweak${std}:
++ mov @tweak[1].16b,@tweak[0].16b
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${std}:
++ // encryption:1 decryption:0
++ cmp $enc,1
++ b.eq .prcess_last_2blks${std}
++ mov @vtmp[0].16B,@tweak[1].16b
++ mov @tweak[1].16B,@tweak[2].16b
++ mov @tweak[2].16B,@vtmp[0].16b
++
++.prcess_last_2blks${std}:
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp],#16
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++ st1 {@data[0].4s},[$outp],#16
++
++ sub $lastBlk,$outp,16
++ .loop${std}:
++ subs $remain,$remain,1
++ ldrb $wtmp0,[$lastBlk,$remain]
++ ldrb $wtmp1,[$inp,$remain]
++ strb $wtmp1,[$lastBlk,$remain]
++ strb $wtmp0,[$outp,$remain]
++ b.gt .loop${std}
++ ld1 {@data[0].4s}, [$lastBlk]
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++ st1 {@data[0].4s}, [$lastBlk]
++.return${std}:
++ ldp d14, d15, [sp], #0x10
++ ldp d12, d13, [sp], #0x10
++ ldp d10, d11, [sp], #0x10
++ ldp d8, d9, [sp], #0x10
++ ldp x29, x30, [sp], #0x10
++ ldp x27, x28, [sp], #0x10
++ ldp x25, x26, [sp], #0x10
++ ldp x23, x24, [sp], #0x10
++ ldp x21, x22, [sp], #0x10
++ ldp x19, x20, [sp], #0x10
++ ldp x17, x18, [sp], #0x10
++ ldp x15, x16, [sp], #0x10
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
++___
++} # end of gen_xts_cipher
++&gen_xts_cipher("_gb");
++&gen_xts_cipher("");
++}}}
+ ########################################
+ open SELF,$0;
+ while(<SELF>) {
+diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+new file mode 100644
+index 0000000000..3d094aa535
+--- /dev/null
++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+@@ -0,0 +1,1544 @@
++#! /usr/bin/env perl
++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 with ASIMD and AESE on AARCH64
++#
++# Dec 2022
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="vpsm4_ex";
++my @vtmp=map("v$_",(0..3));
++my @qtmp=map("q$_",(0..3));
++my @data=map("v$_",(4..7));
++my @datax=map("v$_",(8..11));
++my ($rk0,$rk1)=("v12","v13");
++my ($rka,$rkb)=("v14","v15");
++my @vtmpx=map("v$_",(12..15));
++my ($vtmp4,$vtmp5)=("v24","v25");
++my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
++my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
++
++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($xtmp1,$xtmp2)=("x8","x9");
++my ($ptr,$counter)=("x10","w11");
++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
++
++sub rev32() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifndef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifndef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rev32_armeb() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rbit() {
++ my $dst = shift;
++ my $src = shift;
++ my $std = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ } else {
++$code.=<<___;
++ mov $dst.16b,$src.16b
++___
++ }
++ } else {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ }
++ }
++}
++
++sub transpose() {
++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
++
++$code.=<<___;
++ zip1 $vt0.4s,$dat0.4s,$dat1.4s
++ zip2 $vt1.4s,$dat0.4s,$dat1.4s
++ zip1 $vt2.4s,$dat2.4s,$dat3.4s
++ zip2 $vt3.4s,$dat2.4s,$dat3.4s
++ zip1 $dat0.2d,$vt0.2d,$vt2.2d
++ zip2 $dat1.2d,$vt0.2d,$vt2.2d
++ zip1 $dat2.2d,$vt1.2d,$vt3.2d
++ zip2 $dat3.2d,$vt1.2d,$vt3.2d
++___
++}
++
++# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
++sub mul_matrix() {
++ my $x = shift;
++ my $higherMat = shift;
++ my $lowerMat = shift;
++ my $tmp = shift;
++$code.=<<___;
++ ushr $tmp.16b, $x.16b, 4
++ and $x.16b, $x.16b, $ANDMaskV.16b
++ tbl $x.16b, {$lowerMat.16b}, $x.16b
++ tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
++ eor $x.16b, $x.16b, $tmp.16b
++___
++}
++
++# sbox operations for 4-lane of words
++# sbox operation for 4-lane of words
++sub sbox() {
++ my $dat = shift;
++
++$code.=<<___;
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
++$code.=<<___;
++ mov $dat.16b,@vtmp[0].16b
++
++ // linear transformation
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ ushr @vtmp[1].4s,$dat.4s,32-10
++ ushr @vtmp[2].4s,$dat.4s,32-18
++ ushr @vtmp[3].4s,$dat.4s,32-24
++ sli @vtmp[0].4s,$dat.4s,2
++ sli @vtmp[1].4s,$dat.4s,10
++ sli @vtmp[2].4s,$dat.4s,18
++ sli @vtmp[3].4s,$dat.4s,24
++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
++ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $dat.16b,$dat.16b,$vtmp4.16b
++___
++}
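In the linear-transformation tail of sbox() above, each ushr #(32-n) / sli #n pair rotates every 32-bit lane left by n, so the sequence evaluates the SM4 linear transform L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24); the scalar path in sbox_1word() computes the same thing with eor ... ror #(32-n). A minimal standalone C sketch of that transform:

```c
#include <stdint.h>
#include <stdio.h>

/* rol32 is what one ushr #(32-n) / sli #n pair implements per 32-bit lane */
static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* the SM4 encryption linear transform L */
static uint32_t sm4_L(uint32_t b)
{
    return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
}

int main(void)
{
    printf("L(0x01234567) = 0x%08x\n", sm4_L(0x01234567u));
    return 0;
}
```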
++
++# sbox operation for 8-lane of words
++sub sbox_double() {
++ my $dat = shift;
++ my $datx = shift;
++
++$code.=<<___;
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
++ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
++ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
++$code.=<<___;
++ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
++ aese @vtmp[0].16b,$vtmp5.16b
++ aese @vtmp[1].16b,$vtmp5.16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
++ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
++$code.=<<___;
++ mov $dat.16b,@vtmp[0].16b
++ mov $datx.16b,@vtmp[1].16b
++
++ // linear transformation
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ ushr $vtmp5.4s,$datx.4s,32-2
++ ushr @vtmp[1].4s,$dat.4s,32-10
++ ushr @vtmp[2].4s,$dat.4s,32-18
++ ushr @vtmp[3].4s,$dat.4s,32-24
++ sli @vtmp[0].4s,$dat.4s,2
++ sli $vtmp5.4s,$datx.4s,2
++ sli @vtmp[1].4s,$dat.4s,10
++ sli @vtmp[2].4s,$dat.4s,18
++ sli @vtmp[3].4s,$dat.4s,24
++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $dat.16b,$dat.16b,$vtmp4.16b
++ ushr @vtmp[1].4s,$datx.4s,32-10
++ ushr @vtmp[2].4s,$datx.4s,32-18
++ ushr @vtmp[3].4s,$datx.4s,32-24
++ sli @vtmp[1].4s,$datx.4s,10
++ sli @vtmp[2].4s,$datx.4s,18
++ sli @vtmp[3].4s,$datx.4s,24
++ eor $vtmp4.16b,$vtmp5.16b,$datx.16b
++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $datx.16b,$datx.16b,$vtmp4.16b
++___
++}
++
++# sbox operation for one single word
++sub sbox_1word () {
++ my $word = shift;
++
++$code.=<<___;
++ mov @vtmp[3].s[0],$word
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
++$code.=<<___;
++
++ mov $wtmp0,@vtmp[0].s[0]
++ eor $word,$wtmp0,$wtmp0,ror #32-2
++ eor $word,$word,$wtmp0,ror #32-10
++ eor $word,$word,$wtmp0,ror #32-18
++ eor $word,$word,$wtmp0,ror #32-24
++___
++}
++
++# sm4 for one block of data, in scalar registers word0/word1/word2/word3
++sub sm4_1blk () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$wtmp0,$word1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word0,$word0,$tmpw
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$word0,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor $word1,$word1,$tmpw
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$wtmp0,$word3
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word2,$word2,$tmpw
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$word2,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word3,$word3,$tmpw
++___
++}
++
++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
++sub sm4_4blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rk0.16b,@data[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk1.16b
++
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rk0.16b,@data[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk1.16b
++___
++}
++
++# sm4 for 8 lanes of data, in neon registers
++# data0/data1/data2/data3 datax0/datax1/datax2/datax3
++sub sm4_8blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rkb.16b,@datax[2].16b,@datax[3].16b
++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++ eor @datax[0].16b,@datax[0].16b,$rk1.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rkb.16b,$rkb.16b,@datax[0].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk0.16b
++ eor @datax[1].16b,@datax[1].16b,$rk1.16b
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rkb.16b,@datax[0].16b,@datax[1].16b
++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++ eor @datax[2].16b,@datax[2].16b,$rk1.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rkb.16b,$rkb.16b,@datax[2].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk0.16b
++ eor @datax[3].16b,@datax[3].16b,$rk1.16b
++___
++}
++
++sub encrypt_1blk_norev() {
++ my $dat = shift;
++
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++ mov $word0,$dat.s[0]
++ mov $word1,$dat.s[1]
++ mov $word2,$dat.s[2]
++ mov $word3,$dat.s[3]
++10:
++___
++ &sm4_1blk($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++ mov $dat.s[0],$word3
++ mov $dat.s[1],$word2
++ mov $dat.s[2],$word1
++ mov $dat.s[3],$word0
++___
++}
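encrypt_1blk_norev() above runs sm4_1blk() eight times for the 32 SM4 rounds and then reverses the word order, which is the standard SM4 block structure. The C sketch below shows only that control flow; the S-box is replaced by an identity placeholder (hypothetical), so it illustrates the data flow rather than real SM4.

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* placeholder: NOT the real SM4 S-box, identity just to keep the sketch small */
static uint32_t placeholder_sbox(uint32_t x)
{
    return x;
}

/* round function T = L(S(x)) as used for encryption/decryption */
static uint32_t T(uint32_t x)
{
    x = placeholder_sbox(x);
    return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
}

static void sm4_like_encrypt_block(uint32_t b[4], const uint32_t rk[32])
{
    /* four sub-rounds per iteration, exactly as sm4_1blk() unrolls them */
    for (int i = 0; i < 32; i += 4) {
        b[0] ^= T(b[1] ^ b[2] ^ b[3] ^ rk[i + 0]);
        b[1] ^= T(b[0] ^ b[2] ^ b[3] ^ rk[i + 1]);
        b[2] ^= T(b[0] ^ b[1] ^ b[3] ^ rk[i + 2]);
        b[3] ^= T(b[0] ^ b[1] ^ b[2] ^ rk[i + 3]);
    }
    /* final word reversal, as done after the loop in encrypt_1blk_norev() */
    uint32_t t;
    t = b[0]; b[0] = b[3]; b[3] = t;
    t = b[1]; b[1] = b[2]; b[2] = t;
}

int main(void)
{
    uint32_t rk[32] = {0}, b[4] = {1, 2, 3, 4};

    sm4_like_encrypt_block(b, rk);
    printf("%08x %08x %08x %08x\n", b[0], b[1], b[2], b[3]);
    return 0;
}
```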
++
++sub encrypt_1blk() {
++ my $dat = shift;
++
++ &encrypt_1blk_norev($dat);
++ &rev32($dat,$dat);
++}
++
++sub encrypt_4blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_4blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++}
++
++sub encrypt_8blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_8blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++ &rev32(@data[3],@datax[0]);
++ &rev32(@data[2],@datax[1]);
++ &rev32(@data[1],@datax[2]);
++ &rev32(@data[0],@datax[3]);
++}
++
++sub load_sbox () {
++ my $data = shift;
++
++$code.=<<___;
++ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00
++ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00
++ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923
++ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300
++ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b
++ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
++___
++}
++
++sub mov_reg_to_vec() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $desv = shift;
++$code.=<<___;
++ mov $desv.d[0],$src0
++ mov $desv.d[1],$src1
++___
++ &rev32_armeb($desv,$desv);
++}
++
++sub mov_vec_to_reg() {
++ my $srcv = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $des0,$srcv.d[0]
++ mov $des1,$srcv.d[1]
++___
++}
++
++sub compute_tweak() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $wtmp0,0x87
++ extr $xtmp2,$src1,$src1,#32
++ extr $des1,$src1,$src0,#63
++ and $wtmp1,$wtmp0,$wtmp2,asr#31
++ eor $des0,$xtmp1,$src0,lsl#1
++___
++}
++
++sub compute_tweak_vec() {
++ my $src = shift;
++ my $des = shift;
++ my $std = shift;
++ &rbit(@vtmp[2],$src,$std);
++$code.=<<___;
++ ldr @qtmp[0], =0x01010101010101010101010101010187
++ shl $des.16b, @vtmp[2].16b, #1
++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
++ ushr @vtmp[1].16b, @vtmp[1].16b, #7
++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
++ eor $des.16b, $des.16b, @vtmp[1].16b
++___
++ &rbit($des,$des,$std);
++}
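compute_tweak() (scalar) and compute_tweak_vec() (vector) both advance the XTS tweak by multiplying it by x in GF(2^128) with the reduction constant 0x87; the _gb flavour additionally bit-reverses each byte with rbit because the GB/T XTS variant defines the multiplication on the opposite bit order. A minimal scalar C sketch of the update, assuming lo holds the low 64 bits of the 128-bit tweak:

```c
#include <stdint.h>
#include <stdio.h>

/* Multiply the 128-bit tweak by x in GF(2^128), reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 (constant 0x87), as compute_tweak() does. */
static void xts_next_tweak(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry = *hi >> 63;             /* bit shifted out of the top  */

    *hi = (*hi << 1) | (*lo >> 63);         /* 128-bit shift left by one   */
    *lo = (*lo << 1) ^ (carry ? 0x87 : 0);  /* conditional reduction       */
}

int main(void)
{
    uint64_t lo = 0x0123456789abcdefULL, hi = 0xfedcba9876543210ULL;

    xts_next_tweak(&lo, &hi);
    printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi,
           (unsigned long long)lo);
    return 0;
}
```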
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a+crypto
++.text
++
++.type _${prefix}_consts,%object
++.align 7
++_${prefix}_consts:
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
++.Lshuffles:
++ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++
++.size _${prefix}_consts,.-_${prefix}_consts
++___
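The .Lck table above holds the 32 SM4 key-schedule constants CK[i]; byte j of CK[i] is (4*i + j) * 7 mod 256, most significant byte first as listed. The short check below regenerates the table, and its first output line should read 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 exactly as above:

```c
#include <stdint.h>
#include <stdio.h>

/* Regenerate the SM4 CK constants: ck_{i,j} = (4*i + j) * 7 mod 256 */
int main(void)
{
    for (int i = 0; i < 32; i++) {
        uint32_t ck = 0;

        for (int j = 0; j < 4; j++)
            ck = (ck << 8) | (uint8_t)((4 * i + j) * 7);
        printf("0x%08X%s", ck, (i % 4 == 3) ? "\n" : ", ");
    }
    return 0;
}
```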
++
++{{{
++my ($key,$keys,$enc)=("x0","x1","w2");
++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
++my ($vkey,$vfk,$vmap)=("v5","v6","v7");
++$code.=<<___;
++.type _${prefix}_set_key,%function
++.align 4
++_${prefix}_set_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$vkey.4s},[$key]
++___
++ &load_sbox();
++ &rev32($vkey,$vkey);
++$code.=<<___;
++ adr $pointer,.Lshuffles
++ ld1 {$vmap.2d},[$pointer]
++ adr $pointer,.Lfk
++ ld1 {$vfk.2d},[$pointer]
++ eor $vkey.16b,$vkey.16b,$vfk.16b
++ mov $schedules,#32
++ adr $pointer,.Lck
++ movi @vtmp[0].16b,#64
++ cbnz $enc,1f
++ add $keys,$keys,124
++1:
++ mov $wtmp,$vkey.s[1]
++ ldr $roundkey,[$pointer],#4
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[2]
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[3]
++ eor $roundkey,$roundkey,$wtmp
++ // optimize sbox using AESE instruction
++ mov @data[0].s[0],$roundkey
++ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
++$code.=<<___;
++ mov $wtmp,@vtmp[0].s[0]
++ eor $roundkey,$wtmp,$wtmp,ror #19
++ eor $roundkey,$roundkey,$wtmp,ror #9
++ mov $wtmp,$vkey.s[0]
++ eor $roundkey,$roundkey,$wtmp
++ mov $vkey.s[0],$roundkey
++ cbz $enc,2f
++ str $roundkey,[$keys],#4
++ b 3f
++2:
++ str $roundkey,[$keys],#-4
++3:
++ tbl $vkey.16b,{$vkey.16b},$vmap.16b
++ subs $schedules,$schedules,#1
++ b.ne 1b
++ ret
++.size _${prefix}_set_key,.-_${prefix}_set_key
++___
++}}}
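_vpsm4_ex_set_key above follows the standard SM4 key schedule: the user key is XORed with the FK constants, and each round key is K_i ^ T'(K_{i+1} ^ K_{i+2} ^ K_{i+3} ^ CK_i), where T' applies the S-box followed by L'(B) = B ^ rol(B,13) ^ rol(B,23) (the ror #19 / ror #9 pair), while decryption keys are the same values stored in reverse order (the str ...,#-4 path starting at offset 124). The sketch below shows only that loop structure; the S-box is again a hypothetical identity placeholder.

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* placeholder: NOT the real SM4 S-box */
static uint32_t placeholder_sbox(uint32_t x)
{
    return x;
}

/* key-schedule round function T' = L'(S(x)), L'(B) = B ^ rol13 ^ rol23 */
static uint32_t T_prime(uint32_t x)
{
    x = placeholder_sbox(x);
    return x ^ rol32(x, 13) ^ rol32(x, 23);
}

static void sm4_like_key_schedule(uint32_t rk[32], const uint32_t mk[4],
                                  const uint32_t fk[4], const uint32_t ck[32],
                                  int enc)
{
    uint32_t k[4];

    for (int i = 0; i < 4; i++)
        k[i] = mk[i] ^ fk[i];
    for (int i = 0; i < 32; i++) {
        uint32_t t = k[0] ^ T_prime(k[1] ^ k[2] ^ k[3] ^ ck[i]);

        /* slide the 4-word window, what the .Lshuffles TBL does above */
        k[0] = k[1]; k[1] = k[2]; k[2] = k[3]; k[3] = t;
        rk[enc ? i : 31 - i] = t;   /* decryption keys written backwards */
    }
}

int main(void)
{
    uint32_t mk[4] = {1, 2, 3, 4}, fk[4] = {0}, ck[32] = {0}, rk[32];

    sm4_like_key_schedule(rk, mk, fk, ck, 1);
    printf("rk[0]=%08x rk[31]=%08x\n", rk[0], rk[31]);
    return 0;
}
```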
++
++
++{{{
++$code.=<<___;
++.type _${prefix}_enc_4blks,%function
++.align 4
++_${prefix}_enc_4blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_4blks();
++$code.=<<___;
++ ret
++.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
++___
++}}}
++
++{{{
++$code.=<<___;
++.type _${prefix}_enc_8blks,%function
++.align 4
++_${prefix}_enc_8blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_8blks();
++$code.=<<___;
++ ret
++.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
++___
++}}}
++
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,1
++ bl _${prefix}_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,0
++ bl _${prefix}_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++ my $dir = shift;
++ my ($inp,$outp,$rk)=map("x$_",(0..2));
++
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@data[0].4s},[$inp]
++___
++ &load_sbox();
++ &rev32(@data[0],@data[0]);
++$code.=<<___;
++ mov $rks,$rk
++___
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].4s},[$outp]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
++ // convert length into blocks
++ lsr x2,x2,4
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++___
++ &load_sbox();
++$code.=<<___;
++.Lecb_8_blocks_process:
++ cmp $blocks,#8
++ b.lt .Lecb_4_blocks_process
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lecb_8_blocks_process
++ b 100f
++.Lecb_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].4s},[$outp]
++ b 100f
++1: // process last 2 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
++ cmp $blocks,#2
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
++ b 100f
++1: // process last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($len,$ivp,$enc)=("x2","x4","w5");
++my $ivec0=("v3");
++my $ivec1=("v15");
++
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ lsr $len,$len,4
++___
++ &load_sbox();
++$code.=<<___;
++ cbz $enc,.Ldec
++ ld1 {$ivec0.4s},[$ivp]
++.Lcbc_4_blocks_enc:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ eor @data[0].16b,@data[0].16b,$ivec0.16b
++___
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &encrypt_1blk_norev(@data[0]);
++$code.=<<___;
++ eor @data[1].16b,@data[1].16b,@data[0].16b
++___
++ &encrypt_1blk_norev(@data[1]);
++ &rev32(@data[0],@data[0]);
++
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,@data[1].16b
++___
++ &encrypt_1blk_norev(@data[2]);
++ &rev32(@data[1],@data[1]);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,@data[2].16b
++___
++ &encrypt_1blk_norev(@data[3]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ orr $ivec0.16b,@data[3].16b,@data[3].16b
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lcbc_4_blocks_enc
++ b 2f
++1:
++ subs $blocks,$blocks,#1
++ b.lt 2f
++ ld1 {@data[0].4s},[$inp],#16
++ eor $ivec0.16b,$ivec0.16b,@data[0].16b
++___
++ &rev32($ivec0,$ivec0);
++ &encrypt_1blk($ivec0);
++$code.=<<___;
++ st1 {$ivec0.4s},[$outp],#16
++ b 1b
++2:
++ // save back IV
++ st1 {$ivec0.4s},[$ivp]
++ ret
++
++.Ldec:
++ // decryption mode starts
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++.Lcbc_8_blocks_dec:
++ cmp $blocks,#8
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++ add $ptr,$inp,#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],$datax[3]);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ ld1 {$ivec1.4s},[$ivp]
++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++	// note ivec1 and vtmpx[3] are reusing the same register
++ // care needs to be taken to avoid conflict
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
++ // save back IV
++ st1 {$vtmpx[3].4s}, [$ivp]
++ eor @data[0].16b,@data[0].16b,$datax[3].16b
++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lcbc_8_blocks_dec
++ b.eq 100f
++1:
++ ld1 {$ivec1.4s},[$ivp]
++.Lcbc_4_blocks_dec:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ orr $ivec1.16b,@data[3].16b,@data[3].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.gt .Lcbc_4_blocks_dec
++ // save back IV
++ st1 {@data[3].4s}, [$ivp]
++ b 100f
++1: // last block
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++ // save back IV
++ st1 {$data[0].4s}, [$ivp]
++___
++ &rev32(@datax[0],@data[0]);
++ &encrypt_1blk(@datax[0]);
++$code.=<<___;
++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
++ st1 {@datax[0].4s},[$outp],#16
++ b 100f
++1: // last two blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
++ add $ptr,$inp,#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
++ subs $blocks,$blocks,1
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save back IV
++ st1 {@data[1].4s}, [$ivp]
++ b 100f
++1: // last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save back IV
++ st1 {@data[2].4s}, [$ivp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($ivp)=("x4");
++my ($ctr)=("w5");
++my $ivec=("v3");
++
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$ivec.4s},[$ivp]
++___
++ &rev32($ivec,$ivec);
++ &load_sbox();
++$code.=<<___;
++ cmp $blocks,#1
++ b.ne 1f
++ // fast processing for one single block without
++ // context saving overhead
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].4s},[$outp]
++ ret
++1:
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++ mov $word0,$ivec.s[0]
++ mov $word1,$ivec.s[1]
++ mov $word2,$ivec.s[2]
++ mov $ctr,$ivec.s[3]
++.Lctr32_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $data[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ cmp $blocks,#8
++ b.ge .Lctr32_8_blocks_process
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++.Lctr32_8_blocks_process:
++ dup @datax[0].4s,$word0
++ dup @datax[1].4s,$word1
++ dup @datax[2].4s,$word2
++ mov @datax[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $datax[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ bl _${prefix}_enc_8blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ eor @data[0].16b,@data[0].16b,@datax[0].16b
++ eor @data[1].16b,@data[1].16b,@datax[1].16b
++ eor @data[2].16b,@data[2].16b,@datax[2].16b
++ eor @data[3].16b,@data[3].16b,@datax[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++1: // last block processing
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ mov $ivec.s[0],$word0
++ mov $ivec.s[1],$word1
++ mov $ivec.s[2],$word2
++ mov $ivec.s[3],$ctr
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].4s},[$outp]
++ b 100f
++1: // last 2 blocks processing
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[1],$ctr
++ subs $blocks,$blocks,#1
++ b.ne 1f
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ b 100f
++1: // last 3 blocks processing
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++
++
++{{{
++my ($blocks,$len)=("x2","x2");
++my $ivp=("x5");
++my @twx=map("x$_",(12..27));
++my ($rks1,$rks2)=("x26","x27");
++my $lastBlk=("x26");
++my $enc=("w28");
++my $remain=("x29");
++
++my @tweak=map("v$_",(16..23));
++my $lastTweak=("v25");
++
++sub gen_xts_cipher() {
++ my $std = shift;
++$code.=<<___;
++.globl ${prefix}_xts_encrypt${std}
++.type ${prefix}_xts_encrypt${std},%function
++.align 5
++${prefix}_xts_encrypt${std}:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x15, x16, [sp, #-0x10]!
++ stp x17, x18, [sp, #-0x10]!
++ stp x19, x20, [sp, #-0x10]!
++ stp x21, x22, [sp, #-0x10]!
++ stp x23, x24, [sp, #-0x10]!
++ stp x25, x26, [sp, #-0x10]!
++ stp x27, x28, [sp, #-0x10]!
++ stp x29, x30, [sp, #-0x10]!
++ stp d8, d9, [sp, #-0x10]!
++ stp d10, d11, [sp, #-0x10]!
++ stp d12, d13, [sp, #-0x10]!
++ stp d14, d15, [sp, #-0x10]!
++ mov $rks1,x3
++ mov $rks2,x4
++ mov $enc,w6
++ ld1 {@tweak[0].4s}, [$ivp]
++ mov $rks,$rks2
++___
++ &load_sbox();
++ &rev32(@tweak[0],@tweak[0]);
++ &encrypt_1blk(@tweak[0]);
++$code.=<<___;
++ mov $rks,$rks1
++ and $remain,$len,#0x0F
++ // convert length into blocks
++ lsr $blocks,$len,4
++ cmp $blocks,#1
++ b.lt .return${std}
++
++ cmp $remain,0
++	// If the encryption/decryption length is a multiple of 16,
++	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ b.eq .xts_encrypt_blocks${std}
++
++	// If the encryption/decryption length is not a multiple of 16,
++	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
++	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ subs $blocks,$blocks,#1
++ b.eq .only_2blks_tweak${std}
++.xts_encrypt_blocks${std}:
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++.Lxts_8_blocks_process${std}:
++ cmp $blocks,#8
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++ b.lt .Lxts_4_blocks_process${std}
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rbit(@tweak[4],@tweak[4],$std);
++ &rbit(@tweak[5],@tweak[5],$std);
++ &rbit(@tweak[6],@tweak[6],$std);
++ &rbit(@tweak[7],@tweak[7],$std);
++$code.=<<___;
++ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
++ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
++ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
++ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++ &transpose(@data,@vtmp);
++ &transpose(@datax,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ eor @data[0].16b, @data[0].16b, @tweak[4].16b
++ eor @data[1].16b, @data[1].16b, @tweak[5].16b
++ eor @data[2].16b, @data[2].16b, @tweak[6].16b
++ eor @data[3].16b, @data[3].16b, @tweak[7].16b
++
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[7].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lxts_8_blocks_process${std}
++ b 100f
++.Lxts_4_blocks_process${std}:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++ mov @tweak[0].16b,@tweak[4].16b
++ mov @tweak[1].16b,@tweak[5].16b
++ mov @tweak[2].16b,@tweak[6].16b
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[3].16b
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ st1 {@data[0].4s},[$outp],#16
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[0].16b
++ b 100f
++1: // process last 2 blocks
++ cmp $blocks,#2
++ b.gt 1f
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[1].16b
++ b 100f
++1: // process last 3 blocks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[2].16b
++100:
++ cmp $remain,0
++ b.eq .return${std}
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is larger than 32
++.last_2blks_tweak${std}:
++___
++ &rev32_armeb($lastTweak,$lastTweak);
++ &compute_tweak_vec($lastTweak,@tweak[1],$std);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is exactly 32, which only needs two tweaks
++.only_2blks_tweak${std}:
++ mov @tweak[1].16b,@tweak[0].16b
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${std}:
++ // encryption:1 decryption:0
++ cmp $enc,1
++ b.eq .prcess_last_2blks${std}
++ mov @vtmp[0].16B,@tweak[1].16b
++ mov @tweak[1].16B,@tweak[2].16b
++ mov @tweak[2].16B,@vtmp[0].16b
++
++.prcess_last_2blks${std}:
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp],#16
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++ st1 {@data[0].4s},[$outp],#16
++
++ sub $lastBlk,$outp,16
++ .loop${std}:
++ subs $remain,$remain,1
++ ldrb $wtmp0,[$lastBlk,$remain]
++ ldrb $wtmp1,[$inp,$remain]
++ strb $wtmp1,[$lastBlk,$remain]
++ strb $wtmp0,[$outp,$remain]
++ b.gt .loop${std}
++ ld1 {@data[0].4s}, [$lastBlk]
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++ st1 {@data[0].4s}, [$lastBlk]
++.return${std}:
++ ldp d14, d15, [sp], #0x10
++ ldp d12, d13, [sp], #0x10
++ ldp d10, d11, [sp], #0x10
++ ldp d8, d9, [sp], #0x10
++ ldp x29, x30, [sp], #0x10
++ ldp x27, x28, [sp], #0x10
++ ldp x25, x26, [sp], #0x10
++ ldp x23, x24, [sp], #0x10
++ ldp x21, x22, [sp], #0x10
++ ldp x19, x20, [sp], #0x10
++ ldp x17, x18, [sp], #0x10
++ ldp x15, x16, [sp], #0x10
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
++___
++} # end of gen_xts_cipher
++&gen_xts_cipher("_gb");
++&gen_xts_cipher("");
++}}}
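The tail code reached via label 100, .last_2blks_tweak / .only_2blks_tweak and .prcess_last_2blks implements standard XTS ciphertext stealing: the last full block is processed with the second-to-last tweak, its leading 'remain' bytes become the final partial ciphertext, the partial plaintext is grafted into that block, and the result is processed again with the last tweak (the two tweaks are swapped for decryption in .check_dec). Below is a rough, self-contained C sketch of just that tail; enc_block() is a hypothetical byte-wise placeholder standing in for the SM4 block operation.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical placeholder permutation, NOT SM4 */
static void enc_block(uint8_t b[16])
{
    for (int i = 0; i < 16; i++)
        b[i] = (uint8_t)(b[i] * 5 + 1);
}

/* in/out point at the last full block followed by the 'remain'-byte tail;
 * t1 is the tweak for the last full block, t2 for the stolen block. */
static void xts_tail(uint8_t *out, const uint8_t *in, size_t remain,
                     const uint8_t t1[16], const uint8_t t2[16])
{
    uint8_t blk[16];
    size_t i;

    for (i = 0; i < 16; i++)
        blk[i] = in[i] ^ t1[i];
    enc_block(blk);
    for (i = 0; i < 16; i++)
        blk[i] ^= t1[i];

    memcpy(out + 16, blk, remain);      /* final partial ciphertext       */
    memcpy(blk, in + 16, remain);       /* steal: overlay plaintext tail  */

    for (i = 0; i < 16; i++)
        blk[i] ^= t2[i];
    enc_block(blk);
    for (i = 0; i < 16; i++)
        blk[i] ^= t2[i];
    memcpy(out, blk, 16);               /* second-to-last ciphertext      */
}

int main(void)
{
    uint8_t in[23] = "example partial input.";
    uint8_t out[32] = {0};
    uint8_t t1[16] = {1}, t2[16] = {2};
    size_t remain = sizeof(in) - 16;

    xts_tail(out, in, remain, t1, t2);
    for (size_t i = 0; i < 16 + remain; i++)
        printf("%02x", out[i]);
    puts("");
    return 0;
}
```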
++
++########################################
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index 75a215ab80..73ffe5ea09 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -2,7 +2,7 @@ LIBS=../../libcrypto
+
+ IF[{- !$disabled{asm} -}]
+ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM
+- $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate one, and define the appropriate macros
+@@ -30,5 +30,7 @@ ENDIF
+
+ GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+ GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
++GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl
+ INCLUDE[sm4-armv8.o]=..
+ INCLUDE[vpsm4-armv8.o]=..
++INCLUDE[vpsm4_ex-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 15d8abbcb1..8b9cd10f97 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -20,11 +20,16 @@ static inline int vpsm4_capable(void)
+ {
+ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++}
++static inline int vpsm4_ex_capable(void)
++{
++ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ }
+ # if defined(VPSM4_ASM)
+ # define VPSM4_CAPABLE vpsm4_capable()
++# define VPSM4_EX_CAPABLE vpsm4_ex_capable()
+ # endif
+ # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+ # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+@@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ const unsigned char ivec[16]);
+ # endif /* HWSM4_CAPABLE */
+
+-#ifdef VPSM4_CAPABLE
++# ifdef VPSM4_CAPABLE
+ int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
+@@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const unsigned char ivec[16]);
++void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
++void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
+ # endif /* VPSM4_CAPABLE */
+
++# ifdef VPSM4_EX_CAPABLE
++int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1,
++ const SM4_KEY *key2, const unsigned char ivec[16],
++ const int enc);
++# endif /* VPSM4_EX_CAPABLE */
+
+ #endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 9a2e99f67c..8cabd78266 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -42,6 +42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ (void)0; /* terminate potentially open 'else' */
+ } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ vpsm4_ex_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_ex_encrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++ else if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks;
++ } else
++#endif
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, ks);
+@@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ #endif
+ } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ vpsm4_ex_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_ex_decrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++ } else
++#endif
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_decrypt_key(key, ks);
+@@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
+- else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
+ } else
+ #endif
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
+index 3c568d4d18..037055fce8 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts.c
+@@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
+ if (ctx->xts_standard) {
+ if (ctx->stream != NULL)
+ (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+- ctx->base.iv);
++ ctx->base.iv, ctx->base.enc);
+ else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
+ ctx->base.enc))
+ return 0;
+ } else {
+ if (ctx->stream_gb != NULL)
+ (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+- ctx->base.iv);
++ ctx->base.iv, ctx->base.enc);
+ else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
+ inl, ctx->base.enc))
+ return 0;
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
+index 4c369183e2..cfca596979 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.h
++++ b/providers/implementations/ciphers/cipher_sm4_xts.h
+@@ -14,7 +14,7 @@
+ PROV_CIPHER_FUNC(void, xts_stream,
+ (const unsigned char *in, unsigned char *out, size_t len,
+ const SM4_KEY *key1, const SM4_KEY *key2,
+- const unsigned char iv[16]));
++ const unsigned char iv[16], const int enc));
+
+ typedef struct prov_sm4_xts_ctx_st {
+ /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+index 403eb879b1..67a9923d94 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+@@ -11,8 +11,7 @@
+
+ #define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
+ fn_block_enc, fn_block_dec, \
+- fn_stream_enc, fn_stream_dec, \
+- fn_stream_gb_enc, fn_stream_gb_dec) { \
++ fn_stream, fn_stream_gb) { \
+ size_t bytes = keylen / 2; \
+ \
+ if (ctx->enc) { \
+@@ -26,8 +25,8 @@
+ xctx->xts.block2 = (block128_f)fn_block_enc; \
+ xctx->xts.key1 = &xctx->ks1; \
+ xctx->xts.key2 = &xctx->ks2; \
+- xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
+- xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
++ xctx->stream = fn_stream; \
++ xctx->stream_gb = fn_stream_gb; \
+ }
+
+ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+@@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+ size_t keylen)
+ {
+ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
+- OSSL_xts_stream_fn stream_enc = NULL;
+- OSSL_xts_stream_fn stream_dec = NULL;
+- OSSL_xts_stream_fn stream_gb_enc = NULL;
+- OSSL_xts_stream_fn stream_gb_dec = NULL;
++ OSSL_xts_stream_fn stream = NULL;
++ OSSL_xts_stream_fn stream_gb = NULL;
+ #ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
+- HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
+- stream_gb_enc, stream_gb_dec);
++ HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb);
+ return 1;
+ } else
+ #endif /* HWSM4_CAPABLE */
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ stream = vpsm4_ex_xts_encrypt;
++ stream_gb = vpsm4_ex_xts_encrypt_gb;
++ XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key,
++ vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb);
++ return 1;
++ } else
++#endif /* VPSM4_EX_CAPABLE */
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
++ stream = vpsm4_xts_encrypt;
++ stream_gb = vpsm4_xts_encrypt_gb;
+ XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
+- vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
+- stream_gb_enc, stream_gb_dec);
++ vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb);
+ return 1;
+ } else
+ #endif /* VPSM4_CAPABLE */
+@@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+ }
+ {
+ XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
+- ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
+- stream_gb_dec);
++ ossl_sm4_decrypt, stream, stream_gb);
+ }
+ return 1;
+ }
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM4-optimization-for-ARM-by-ASIMD.patch b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch
new file mode 100644
index 0000000..5d58d16
--- /dev/null
+++ b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch
@@ -0,0 +1,1334 @@
+From ca0b08e39bb619b6e62ef58c80edc784e8f20966 Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Mon, 14 Feb 2022 14:36:34 +0000
+Subject: [PATCH 07/13] SM4 optimization for ARM by ASIMD
+
+This patch optimizes SM4 for ARM processor using ASIMD instruction
+
+It will improve performance if both of the following conditions are met:
+1) Input data is equal to or more than 4 blocks
+2) The cipher mode allows parallelism, including ECB, CTR, GCM, or CBC decryption
+
+This patch implements SM4 SBOX lookup in vector registers, with the
+benefit of constant processing time over existing C implementation.
+
+It is only enabled for micro-architecture N1/V1. In the ideal scenario,
+performance can reach up to 2.7X
+
+When either of the above two conditions is not met, e.g. single-block input
+or CFB/OFB mode or CBC encryption, performance could drop by about 50%.
+
+The assembly code has been reviewed internally by ARM engineer
+Fangming.Fang@arm.com
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17951)
+---
+ crypto/evp/e_sm4.c | 24 +
+ crypto/sm4/asm/vpsm4-armv8.pl | 1118 +++++++++++++++++
+ crypto/sm4/build.info | 6 +-
+ include/crypto/sm4_platform.h | 29 +
+ .../ciphers/cipher_sm4_gcm_hw.c | 7 +
+ .../implementations/ciphers/cipher_sm4_hw.c | 24 +
+ 6 files changed, 1206 insertions(+), 2 deletions(-)
+ create mode 100755 crypto/sm4/asm/vpsm4-armv8.pl
+
+diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
+index bff79ff197..c8e8cfe9c9 100644
+--- a/crypto/evp/e_sm4.c
++++ b/crypto/evp/e_sm4.c
+@@ -76,6 +76,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+ # endif
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_decrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) vpsm4_decrypt;
++ dat->stream.cbc = NULL;
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
++ else if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
++ } else
+ #endif
+ {
+ dat->block = (block128_f) ossl_sm4_decrypt;
+@@ -104,6 +115,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ # endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) vpsm4_encrypt;
++ dat->stream.cbc = NULL;
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
++ else if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
++ else if (mode == EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
++ } else
+ #endif
+ {
+ dat->block = (block128_f) ossl_sm4_encrypt;
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+new file mode 100755
+index 0000000000..095d9dae64
+--- /dev/null
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -0,0 +1,1118 @@
++#! /usr/bin/env perl
++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 with ASIMD on aarch64
++#
++# Feb 2022
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="vpsm4";
++my @vtmp=map("v$_",(0..3));
++my @data=map("v$_",(4..7));
++my @datax=map("v$_",(8..11));
++my ($rk0,$rk1)=("v12","v13");
++my ($rka,$rkb)=("v14","v15");
++my @vtmpx=map("v$_",(12..15));
++my @sbox=map("v$_",(16..31));
++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($ptr,$counter)=("x10","w11");
++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
++
++sub rev32() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub transpose() {
++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
++
++$code.=<<___;
++ zip1 $vt0.4s,$dat0.4s,$dat1.4s
++ zip2 $vt1.4s,$dat0.4s,$dat1.4s
++ zip1 $vt2.4s,$dat2.4s,$dat3.4s
++ zip2 $vt3.4s,$dat2.4s,$dat3.4s
++ zip1 $dat0.2d,$vt0.2d,$vt2.2d
++ zip2 $dat1.2d,$vt0.2d,$vt2.2d
++ zip1 $dat2.2d,$vt1.2d,$vt3.2d
++ zip2 $dat3.2d,$vt1.2d,$vt3.2d
++___
++}
++
++# sbox operations for 4-lane of words
++sub sbox() {
++ my $dat = shift;
++
++$code.=<<___;
++ movi @vtmp[0].16b,#64
++ movi @vtmp[1].16b,#128
++ movi @vtmp[2].16b,#192
++ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
++ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
++ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
++ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
++ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
++
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ sli @vtmp[0].4s,$dat.4s,2
++ ushr @vtmp[2].4s,$dat.4s,32-10
++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
++ sli @vtmp[2].4s,$dat.4s,10
++ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
++ ushr @vtmp[0].4s,$dat.4s,32-18
++ sli @vtmp[0].4s,$dat.4s,18
++ ushr @vtmp[2].4s,$dat.4s,32-24
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$dat.4s,24
++ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
++___
++}
++
++# sbox operation for 8-lane of words
++sub sbox_double() {
++ my $dat = shift;
++ my $datx = shift;
++
++$code.=<<___;
++ movi @vtmp[3].16b,#64
++ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
++ add $dat.2d,@vtmp[2].2d,$dat.2d
++ add $dat.2d,@vtmp[1].2d,$dat.2d
++
++ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
++ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
++ add $datx.2d,@vtmp[2].2d,$datx.2d
++ add $datx.2d,@vtmp[1].2d,$datx.2d
++
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ sli @vtmp[0].4s,$dat.4s,2
++ ushr @vtmp[2].4s,$datx.4s,32-2
++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
++ sli @vtmp[2].4s,$datx.4s,2
++
++ ushr @vtmp[0].4s,$dat.4s,32-10
++ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
++ sli @vtmp[0].4s,$dat.4s,10
++ ushr @vtmp[2].4s,$datx.4s,32-10
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,10
++
++ ushr @vtmp[0].4s,$dat.4s,32-18
++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
++ sli @vtmp[0].4s,$dat.4s,18
++ ushr @vtmp[2].4s,$datx.4s,32-18
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,18
++
++ ushr @vtmp[0].4s,$dat.4s,32-24
++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
++ sli @vtmp[0].4s,$dat.4s,24
++ ushr @vtmp[2].4s,$datx.4s,32-24
++ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,24
++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
++___
++}
++
++# sbox operation for one single word
++sub sbox_1word () {
++ my $word = shift;
++
++$code.=<<___;
++ movi @vtmp[1].16b,#64
++ movi @vtmp[2].16b,#128
++ movi @vtmp[3].16b,#192
++ mov @vtmp[0].s[0],$word
++
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
++ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
++ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
++
++ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
++ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
++
++ mov $word,@vtmp[0].s[0]
++ mov $wtmp0,@vtmp[1].s[0]
++ mov $wtmp2,@vtmp[2].s[0]
++ add $wtmp0,$word,$wtmp0
++ mov $word,@vtmp[3].s[0]
++ add $wtmp0,$wtmp0,$wtmp2
++ add $wtmp0,$wtmp0,$word
++
++ eor $word,$wtmp0,$wtmp0,ror #32-2
++ eor $word,$word,$wtmp0,ror #32-10
++ eor $word,$word,$wtmp0,ror #32-18
++ eor $word,$word,$wtmp0,ror #32-24
++___
++}
++
++# sm4 for one block of data, in scalar registers word0/word1/word2/word3
++sub sm4_1blk () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$wtmp0,$word1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word0,$word0,$tmpw
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$word0,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor $word1,$word1,$tmpw
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$wtmp0,$word3
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word2,$word2,$tmpw
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$word2,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word3,$word3,$tmpw
++___
++}
++
++# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
++sub sm4_4blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rk0.16b,@data[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk1.16b
++
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rk0.16b,@data[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk1.16b
++___
++}
++
++# sm4 for 8 lanes of data, in neon registers
++# data0/data1/data2/data3 datax0/datax1/datax2/datax3
++sub sm4_8blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rkb.16b,@datax[2].16b,@datax[3].16b
++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++ eor @datax[0].16b,@datax[0].16b,$rk1.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rkb.16b,$rkb.16b,@datax[0].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk0.16b
++ eor @datax[1].16b,@datax[1].16b,$rk1.16b
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rkb.16b,@datax[0].16b,@datax[1].16b
++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++ eor @datax[2].16b,@datax[2].16b,$rk1.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rkb.16b,$rkb.16b,@datax[2].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk0.16b
++ eor @datax[3].16b,@datax[3].16b,$rk1.16b
++___
++}
++
++sub encrypt_1blk_norev() {
++ my $dat = shift;
++
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++ mov $word0,$dat.s[0]
++ mov $word1,$dat.s[1]
++ mov $word2,$dat.s[2]
++ mov $word3,$dat.s[3]
++10:
++___
++ &sm4_1blk($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++ mov $dat.s[0],$word3
++ mov $dat.s[1],$word2
++ mov $dat.s[2],$word1
++ mov $dat.s[3],$word0
++___
++}
++
++sub encrypt_1blk() {
++ my $dat = shift;
++
++ &encrypt_1blk_norev($dat);
++ &rev32($dat,$dat);
++}
++
++sub encrypt_4blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_4blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++}
++
++sub encrypt_8blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_8blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++ &rev32(@data[3],@datax[0]);
++ &rev32(@data[2],@datax[1]);
++ &rev32(@data[1],@datax[2]);
++ &rev32(@data[0],@datax[3]);
++}
++
++sub load_sbox () {
++ my $data = shift;
++
++$code.=<<___;
++ adr $ptr,.Lsbox
++ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
++ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
++ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
++ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
++___
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a
++.text
++
++.type _vpsm4_consts,%object
++.align 7
++_vpsm4_consts:
++.Lsbox:
++ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
++ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
++ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
++ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
++ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
++ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
++ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
++ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
++ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
++ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
++ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
++ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
++ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
++ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
++ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
++ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
++.Lshuffles:
++ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++
++.size _vpsm4_consts,.-_vpsm4_consts
++___
++
++{{{
++my ($key,$keys,$enc)=("x0","x1","w2");
++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
++my ($vkey,$vfk,$vmap)=("v5","v6","v7");
++$code.=<<___;
++.type _vpsm4_set_key,%function
++.align 4
++_vpsm4_set_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$vkey.4s},[$key]
++___
++ &load_sbox();
++ &rev32($vkey,$vkey);
++$code.=<<___;
++ adr $pointer,.Lshuffles
++ ld1 {$vmap.4s},[$pointer]
++ adr $pointer,.Lfk
++ ld1 {$vfk.4s},[$pointer]
++ eor $vkey.16b,$vkey.16b,$vfk.16b
++ mov $schedules,#32
++ adr $pointer,.Lck
++ movi @vtmp[0].16b,#64
++ cbnz $enc,1f
++ add $keys,$keys,124
++1:
++ mov $wtmp,$vkey.s[1]
++ ldr $roundkey,[$pointer],#4
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[2]
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[3]
++ eor $roundkey,$roundkey,$wtmp
++ // sbox lookup
++ mov @data[0].s[0],$roundkey
++ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
++ mov $wtmp,@vtmp[1].s[0]
++ eor $roundkey,$wtmp,$wtmp,ror #19
++ eor $roundkey,$roundkey,$wtmp,ror #9
++ mov $wtmp,$vkey.s[0]
++ eor $roundkey,$roundkey,$wtmp
++ mov $vkey.s[0],$roundkey
++ cbz $enc,2f
++ str $roundkey,[$keys],#4
++ b 3f
++2:
++ str $roundkey,[$keys],#-4
++3:
++ tbl $vkey.16b,{$vkey.16b},$vmap.16b
++ subs $schedules,$schedules,#1
++ b.ne 1b
++ ret
++.size _vpsm4_set_key,.-_vpsm4_set_key
++___
++}}}
++
++
++{{{
++$code.=<<___;
++.type _vpsm4_enc_4blks,%function
++.align 4
++_vpsm4_enc_4blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_4blks();
++$code.=<<___;
++ ret
++.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
++___
++}}}
++
++{{{
++$code.=<<___;
++.type _vpsm4_enc_8blks,%function
++.align 4
++_vpsm4_enc_8blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_8blks();
++$code.=<<___;
++ ret
++.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
++___
++}}}
++
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,1
++ bl _vpsm4_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,0
++ bl _vpsm4_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++ my $dir = shift;
++ my ($inp,$outp,$rk)=map("x$_",(0..2));
++
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@data[0].16b},[$inp]
++___
++ &load_sbox();
++ &rev32(@data[0],@data[0]);
++$code.=<<___;
++ mov $rks,x2
++___
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].16b},[$outp]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++my ($enc) = ("w4");
++my @dat=map("v$_",(16..23));
++
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
++ // convert length into blocks
++ lsr x2,x2,4
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++___
++ &load_sbox();
++$code.=<<___;
++.Lecb_8_blocks_process:
++ cmp $blocks,#8
++ b.lt .Lecb_4_blocks_process
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++$code.=<<___;
++ bl _vpsm4_enc_8blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lecb_8_blocks_process
++ b 100f
++.Lecb_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].16b},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].16b},[$outp]
++ b 100f
++1: // process last 2 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
++ cmp $blocks,#2
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
++ b 100f
++1: // process last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($len,$ivp,$enc)=("x2","x4","w5");
++my $ivec0=("v3");
++my $ivec1=("v15");
++
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ lsr $len,$len,4
++___
++ &load_sbox();
++$code.=<<___;
++ cbz $enc,.Ldec
++ ld1 {$ivec0.4s},[$ivp]
++.Lcbc_4_blocks_enc:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ eor @data[0].16b,@data[0].16b,$ivec0.16b
++___
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &encrypt_1blk_norev(@data[0]);
++$code.=<<___;
++ eor @data[1].16b,@data[1].16b,@data[0].16b
++___
++ &encrypt_1blk_norev(@data[1]);
++ &rev32(@data[0],@data[0]);
++
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,@data[1].16b
++___
++ &encrypt_1blk_norev(@data[2]);
++ &rev32(@data[1],@data[1]);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,@data[2].16b
++___
++ &encrypt_1blk_norev(@data[3]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ orr $ivec0.16b,@data[3].16b,@data[3].16b
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lcbc_4_blocks_enc
++ b 2f
++1:
++ subs $blocks,$blocks,#1
++ b.lt 2f
++ ld1 {@data[0].4s},[$inp],#16
++ eor $ivec0.16b,$ivec0.16b,@data[0].16b
++___
++ &rev32($ivec0,$ivec0);
++ &encrypt_1blk($ivec0);
++$code.=<<___;
++ st1 {$ivec0.16b},[$outp],#16
++ b 1b
++2:
++ // save back IV
++ st1 {$ivec0.16b},[$ivp]
++ ret
++
++.Ldec:
++ // decryption mode starts
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++.Lcbc_8_blocks_dec:
++ cmp $blocks,#8
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++ add $ptr,$inp,#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],$datax[3]);
++$code.=<<___;
++ bl _vpsm4_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ ld1 {$ivec1.16b},[$ivp]
++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++	// note ivec1 and vtmpx[3] are reusing the same register
++ // care needs to be taken to avoid conflict
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
++ // save back IV
++ st1 {$vtmpx[3].16b}, [$ivp]
++ eor @data[0].16b,@data[0].16b,$datax[3].16b
++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lcbc_8_blocks_dec
++ b.eq 100f
++1:
++ ld1 {$ivec1.16b},[$ivp]
++.Lcbc_4_blocks_dec:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ orr $ivec1.16b,@data[3].16b,@data[3].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.gt .Lcbc_4_blocks_dec
++ // save back IV
++ st1 {@vtmp[3].16b}, [$ivp]
++ b 100f
++1: // last block
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++ // save back IV
++ st1 {$data[0].16b}, [$ivp]
++___
++ &rev32(@datax[0],@data[0]);
++ &encrypt_1blk(@datax[0]);
++$code.=<<___;
++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
++ st1 {@datax[0].16b},[$outp],#16
++ b 100f
++1: // last two blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
++ add $ptr,$inp,#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
++ subs $blocks,$blocks,1
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save back IV
++ st1 {@data[1].16b}, [$ivp]
++ b 100f
++1: // last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save back IV
++ st1 {@data[2].16b}, [$ivp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($ivp)=("x4");
++my ($ctr)=("w5");
++my $ivec=("v3");
++
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$ivec.4s},[$ivp]
++___
++ &rev32($ivec,$ivec);
++ &load_sbox();
++$code.=<<___;
++ cmp $blocks,#1
++ b.ne 1f
++ // fast processing for one single block without
++ // context saving overhead
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].16b},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].16b},[$outp]
++ ret
++1:
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++ mov $word0,$ivec.s[0]
++ mov $word1,$ivec.s[1]
++ mov $word2,$ivec.s[2]
++ mov $ctr,$ivec.s[3]
++.Lctr32_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $data[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ cmp $blocks,#8
++ b.ge .Lctr32_8_blocks_process
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++.Lctr32_8_blocks_process:
++ dup @datax[0].4s,$word0
++ dup @datax[1].4s,$word1
++ dup @datax[2].4s,$word2
++ mov @datax[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $datax[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ bl _vpsm4_enc_8blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ eor @data[0].16b,@data[0].16b,@datax[0].16b
++ eor @data[1].16b,@data[1].16b,@datax[1].16b
++ eor @data[2].16b,@data[2].16b,@datax[2].16b
++ eor @data[3].16b,@data[3].16b,@datax[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++1: // last block processing
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ mov $ivec.s[0],$word0
++ mov $ivec.s[1],$word1
++ mov $ivec.s[2],$word2
++ mov $ivec.s[3],$ctr
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].16b},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].16b},[$outp]
++ b 100f
++1: // last 2 blocks processing
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[1],$ctr
++ subs $blocks,$blocks,#1
++ b.ne 1f
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ b 100f
++1: // last 3 blocks processing
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++########################################
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index e27aa49e67..75a215ab80 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,8 +1,8 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{asm} -}]
+- $SM4DEF_aarch64=SM4_ASM
+- $SM4ASM_aarch64=sm4-armv8.S
++ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM
++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate one, and define the appropriate macros
+@@ -29,4 +29,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
+ ENDIF
+
+ GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
++GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
+ INCLUDE[sm4-armv8.o]=..
++INCLUDE[vpsm4-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 42c8b44a43..11f9b9d88b 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -15,6 +15,16 @@
+ # if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+ # include "arm_arch.h"
+ # if __ARM_MAX_ARCH__>=8
++extern unsigned int OPENSSL_arm_midr;
++static inline int vpsm4_capable(void)
++{
++ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++}
++# if defined(VPSM4_ASM)
++# define VPSM4_CAPABLE vpsm4_capable()
++# endif
+ # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+ # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+ # define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+@@ -45,4 +55,23 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ const unsigned char ivec[16]);
+ # endif /* HWSM4_CAPABLE */
+
++#ifdef VPSM4_CAPABLE
++int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++# endif /* VPSM4_CAPABLE */
++
++
+ #endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index b9633f83ed..db7fe0fe2f 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -32,6 +32,13 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ # endif
+ } else
+ # endif /* HWSM4_CAPABLE */
++# ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt);
++ ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
++ } else
++# endif /* VPSM4_CAPABLE */
+ {
+ ossl_sm4_set_key(key, ks);
+ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 4cd3d3d669..9a2e99f67c 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -41,6 +41,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ #endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_encrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
++ else if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks;
++ } else
+ #endif
+ {
+ ossl_sm4_set_key(key, ks);
+@@ -61,6 +74,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+ #endif
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_decrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
++ } else
+ #endif
+ {
+ ossl_sm4_set_key(key, ks);
+--
+2.37.3.windows.1
+
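For reference, the round structure that the vpsm4 code above spells out in its comments (B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) in sm4_1blk, and the shift-insert sequence by 2, 10, 18 and 24 bits in sbox_1word) is the standard SM4 round function. A minimal plain-C sketch of one round, with SM4_SBOX as a stand-in name for the 256-byte table that the .Lsbox data above encodes:

    #include <stdint.h>

    extern const uint8_t SM4_SBOX[256];   /* stand-in for the .Lsbox table above */

    static uint32_t rotl32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* tau: S-box applied to each byte of a word (what sbox_1word does with tbl) */
    static uint32_t sm4_tau(uint32_t x)
    {
        return ((uint32_t)SM4_SBOX[(x >> 24) & 0xff] << 24) |
               ((uint32_t)SM4_SBOX[(x >> 16) & 0xff] << 16) |
               ((uint32_t)SM4_SBOX[(x >>  8) & 0xff] <<  8) |
                (uint32_t)SM4_SBOX[x & 0xff];
    }

    /* One round: B0 ^= L(tau(B1 ^ B2 ^ B3 ^ rk)), with
     * L(x) = x ^ rotl(x,2) ^ rotl(x,10) ^ rotl(x,18) ^ rotl(x,24),
     * the same rotations the assembly builds from ushr/sli pairs. */
    static void sm4_round(uint32_t b[4], uint32_t rk)
    {
        uint32_t x = sm4_tau(b[1] ^ b[2] ^ b[3] ^ rk);

        b[0] ^= x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24);
    }

Running this 32 times while rotating the roles of the four words, then emitting them in reverse order, is what encrypt_1blk_norev does with eight iterations of sm4_1blk (four rounds and four round keys per iteration).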
diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
new file mode 100644
index 0000000..c68f1a0
--- /dev/null
+++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
@@ -0,0 +1,1228 @@
+From 1cd480c10b8bbaa6f72d503494ff2973672ec0e4 Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Tue, 19 Oct 2021 22:49:05 +0100
+Subject: [PATCH 05/13] SM4 optimization for ARM by HW instruction
+
+This patch implements the SM4 optimization for ARM processors,
+using the SM4 HW instructions, which are an optional feature of
+the crypto extension for aarch64 v8.
+
+Tested on some modern ARM micro-architectures with SM4 support, the
+performance uplift can be around 8X~40X over the existing
+C implementation in OpenSSL. Algorithms that can be parallelized
+(like CTR, ECB and CBC decryption) are at the higher end, with
+algorithms like CBC encryption at the lower end (due to the
+inter-block dependency)
+
+Perf data on Yitian-710 2.75GHz hardware, before and after optimization:
+
+Before:
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k
+ SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k
+ SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k
+
+After (7.4x - 36.6x faster):
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k
+ SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k
+ SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17455)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 1 +
+ crypto/armcap.c | 10 +
+ crypto/evp/e_sm4.c | 193 ++++--
+ crypto/sm4/asm/sm4-armv8.pl | 635 ++++++++++++++++++
+ crypto/sm4/build.info | 32 +-
+ include/crypto/sm4_platform.h | 48 ++
+ .../implementations/ciphers/cipher_sm4.h | 1 +
+ .../ciphers/cipher_sm4_gcm_hw.c | 20 +-
+ .../implementations/ciphers/cipher_sm4_hw.c | 57 +-
+ 10 files changed, 945 insertions(+), 60 deletions(-)
+ create mode 100755 crypto/sm4/asm/sm4-armv8.pl
+ create mode 100644 include/crypto/sm4_platform.h
+
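The remark about CBC encryption sitting at the low end is a property of the mode rather than of the SM4 instructions: CBC feeds each ciphertext block into the next plaintext block, so blocks must be produced one after another, while CTR keystream blocks depend only on the counter and can be computed several at a time. A rough serial C sketch of both data flows, with sm4_encrypt_block as a placeholder for the accelerated one-block routine this patch adds, shows the difference:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* placeholder one-block primitive (e.g. the new sm4_v8_encrypt) */
    void sm4_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

    /* CBC encryption: block i needs ciphertext block i-1, so the loop is
     * inherently serial; this is the inter-block dependency noted above. */
    void cbc_encrypt(const uint8_t *in, uint8_t *out, size_t nblocks,
                     const void *key, uint8_t iv[16])
    {
        uint8_t buf[16];

        for (size_t i = 0; i < nblocks; i++) {
            for (int j = 0; j < 16; j++)
                buf[j] = in[16 * i + j] ^ iv[j];
            sm4_encrypt_block(buf, out + 16 * i, key);
            memcpy(iv, out + 16 * i, 16);   /* next block depends on this result */
        }
    }

    /* CTR: each keystream block depends only on the counter, so 4 or 8 blocks
     * can be encrypted at once, which is what the assembly below does by
     * keeping several vector registers in flight. */
    void ctr_encrypt(const uint8_t *in, uint8_t *out, size_t nblocks,
                     const void *key, uint8_t ctr[16])
    {
        uint8_t ks[16];

        for (size_t i = 0; i < nblocks; i++) {
            sm4_encrypt_block(ctr, ks, key);   /* independent of the data */
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = in[16 * i + j] ^ ks[j];
            for (int j = 15; j >= 12; j--)     /* bump the low 32-bit counter */
                if (++ctr[j] != 0)
                    break;
        }
    }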
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 10d267b7ad..36af3e075b 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -80,6 +80,14 @@ _armv8_pmull_probe:
+ ret
+ .size _armv8_pmull_probe,.-_armv8_pmull_probe
+
++.globl _armv8_sm4_probe
++.type _armv8_sm4_probe,%function
++_armv8_sm4_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xcec08400 // sm4e v0.4s, v0.4s
++ ret
++.size _armv8_sm4_probe,.-_armv8_sm4_probe
++
+ .globl _armv8_sha512_probe
+ .type _armv8_sha512_probe,%function
+ _armv8_sha512_probe:
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index c8b501f34c..5b5af31d92 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -85,6 +85,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_CPUID (1<<7)
+ # define ARMV8_RNG (1<<8)
+ # define ARMV8_SM3 (1<<9)
++# define ARMV8_SM4 (1<<10)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index 365a48df45..c5aa062767 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -53,6 +53,7 @@ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
+ void _armv8_sm3_probe(void);
++void _armv8_sm4_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -139,6 +140,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
+ # define HWCAP_CE_SM3 (1 << 18)
++# define HWCAP_CE_SM4 (1 << 19)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -207,6 +209,9 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+
+ # ifdef __aarch64__
++ if (hwcap & HWCAP_CE_SM4)
++ OPENSSL_armcap_P |= ARMV8_SM4;
++
+ if (hwcap & HWCAP_CE_SHA512)
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+
+@@ -254,6 +259,11 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ # if defined(__aarch64__) && !defined(__APPLE__)
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm4_probe();
++ OPENSSL_armcap_P |= ARMV8_SM4;
++ }
++
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
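Everything in this patch keys off the single ARMV8_SM4 capability bit that the armcap.c hunk above sets, either from the HWCAP_CE_SM4 auxv bit or from the SIGILL probe. Outside of OpenSSL's armcap machinery, the same runtime check can be made on Linux/aarch64 straight from the auxiliary vector; a minimal sketch using the same (1 << 19) bit value:

    #include <stdio.h>
    #include <sys/auxv.h>              /* getauxval(), AT_HWCAP (Linux-specific) */

    #define HWCAP_CE_SM4 (1UL << 19)   /* same bit value as in the hunk above */

    int main(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);

        if (hwcap & HWCAP_CE_SM4)
            printf("SM4 instructions present: the sm4e/sm4ekey path is usable\n");
        else
            printf("SM4 instructions absent: fall back to the C implementation\n");
        return 0;
    }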
+diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
+index abd603015c..bff79ff197 100644
+--- a/crypto/evp/e_sm4.c
++++ b/crypto/evp/e_sm4.c
+@@ -17,92 +17,187 @@
+ # include <openssl/modes.h>
+ # include "crypto/sm4.h"
+ # include "crypto/evp.h"
++# include "crypto/sm4_platform.h"
+ # include "evp_local.h"
+
+ typedef struct {
+- SM4_KEY ks;
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks;
++ block128_f block;
++ union {
++ ecb128_f ecb;
++ cbc128_f cbc;
++ ctr128_f ctr;
++ } stream;
+ } EVP_SM4_KEY;
+
++# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
++static const EVP_CIPHER sm4_##mode = { \
++ nid##_##nmode,blocksize,128/8,ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ EVP_ORIG_GLOBAL, \
++ sm4_init_key, \
++ sm4_##mode##_cipher, \
++ NULL, \
++ sizeof(EVP_SM4_KEY), \
++ NULL,NULL,NULL,NULL }; \
++const EVP_CIPHER *EVP_sm4_##mode(void) \
++{ return &sm4_##mode; }
++
++#define DEFINE_BLOCK_CIPHERS(nid,flags) \
++ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
++
+ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+- ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ int mode;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ mode = EVP_CIPHER_CTX_get_mode(ctx);
++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
++ && !enc) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_decrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++# endif
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_decrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
++ } else
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_encrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++ else
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++ else
++# endif
++# ifdef HWSM4_ctr32_encrypt_blocks
++ if (mode == EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++ else
++# endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_encrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
+ return 1;
+ }
+
+-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+- size_t len, const SM4_KEY *key,
+- unsigned char *ivec, const int enc)
++static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (dat->stream.cbc)
++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
++ else if (EVP_CIPHER_CTX_is_encrypting(ctx))
++ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
++ dat->block);
+ else
+- CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_decrypt);
++ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
++ ctx->iv, dat->block);
++ return 1;
+ }
+
+-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num, const int enc)
++static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
++
++ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num,
++ EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
+ }
+
+-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+- const SM4_KEY *key, const int enc)
++static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- ossl_sm4_encrypt(in, out, key);
++ size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
++ size_t i;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (len < bl)
++ return 1;
++
++ if (dat->stream.ecb != NULL)
++ (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
+ else
+- ossl_sm4_decrypt(in, out, key);
++ for (i = 0, len -= bl; i <= len; i += bl)
++ (*dat->block) (in + i, out + i, &dat->ks);
++
++ return 1;
+ }
+
+-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num)
++static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
+- (block128_f)ossl_sm4_encrypt);
+-}
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
+
+-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
+- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
+- sm4_init_key, 0, 0, 0, 0)
++ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num, dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
++}
+
+ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ int n = EVP_CIPHER_CTX_get_num(ctx);
+ unsigned int num;
+- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ if (n < 0)
+ return 0;
+ num = (unsigned int)n;
+
+- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
+- EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+- (block128_f)ossl_sm4_encrypt);
++ if (dat->stream.ctr)
++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx),
++ &num, dat->stream.ctr);
++ else
++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx), &num,
++ dat->block);
+ EVP_CIPHER_CTX_set_num(ctx, num);
+ return 1;
+ }
+
+-static const EVP_CIPHER sm4_ctr_mode = {
+- NID_sm4_ctr, 1, 16, 16,
+- EVP_CIPH_CTR_MODE,
+- EVP_ORIG_GLOBAL,
+- sm4_init_key,
+- sm4_ctr_cipher,
+- NULL,
+- sizeof(EVP_SM4_KEY),
+- NULL, NULL, NULL, NULL
+-};
+-
+-const EVP_CIPHER *EVP_sm4_ctr(void)
+-{
+- return &sm4_ctr_mode;
+-}
+-
++DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
+ #endif
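The reworked e_sm4.c above chooses an implementation once, at key-setup time, and records it as function pointers inside EVP_SM4_KEY, so the per-call cipher routines never re-test the CPU capability. A stripped-down sketch of that dispatch pattern, with stub backends standing in for HWSM4_encrypt and ossl_sm4_encrypt:

    #include <stdint.h>
    #include <stdio.h>

    /* one-block primitive type, mirroring block128_f above */
    typedef void (*block_fn)(const uint8_t in[16], uint8_t out[16]);

    /* stub backends; the real ones are the HW and C SM4 implementations */
    static void hw_block(const uint8_t in[16], uint8_t out[16]) { (void)in; (void)out; }
    static void sw_block(const uint8_t in[16], uint8_t out[16]) { (void)in; (void)out; }

    struct sm4_ctx {
        block_fn block;     /* chosen once; mode code only ever calls this */
    };

    static void sm4_init(struct sm4_ctx *c, int hw_capable)
    {
        c->block = hw_capable ? hw_block : sw_block;   /* decided at key setup */
    }

    int main(void)
    {
        struct sm4_ctx c;
        uint8_t in[16] = {0}, out[16];

        sm4_init(&c, 0);    /* pretend the SM4 extension is absent */
        c.block(in, out);   /* no capability check on the hot path */
        printf("dispatched via the %s backend\n", c.block == hw_block ? "hw" : "sw");
        return 0;
    }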
+diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
+new file mode 100755
+index 0000000000..7358a6e6a2
+--- /dev/null
++++ b/crypto/sm4/asm/sm4-armv8.pl
+@@ -0,0 +1,635 @@
++#! /usr/bin/env perl
++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 hw support on aarch64
++# Oct 2021
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="sm4_v8";
++my @rks=map("v$_",(0..7));
++
++sub rev32() {
++my $dst = shift;
++my $src = shift;
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$src.16b
++#endif
++___
++}
++
++sub enc_blk () {
++my $data = shift;
++$code.=<<___;
++ sm4e $data.4s,@rks[0].4s
++ sm4e $data.4s,@rks[1].4s
++ sm4e $data.4s,@rks[2].4s
++ sm4e $data.4s,@rks[3].4s
++ sm4e $data.4s,@rks[4].4s
++ sm4e $data.4s,@rks[5].4s
++ sm4e $data.4s,@rks[6].4s
++ sm4e $data.4s,@rks[7].4s
++ rev64 $data.4S,$data.4S
++ ext $data.16b,$data.16b,$data.16b,#8
++___
++}
++
++sub enc_4blks () {
++my $data0 = shift;
++my $data1 = shift;
++my $data2 = shift;
++my $data3 = shift;
++$code.=<<___;
++ sm4e $data0.4s,@rks[0].4s
++ sm4e $data1.4s,@rks[0].4s
++ sm4e $data2.4s,@rks[0].4s
++ sm4e $data3.4s,@rks[0].4s
++
++ sm4e $data0.4s,@rks[1].4s
++ sm4e $data1.4s,@rks[1].4s
++ sm4e $data2.4s,@rks[1].4s
++ sm4e $data3.4s,@rks[1].4s
++
++ sm4e $data0.4s,@rks[2].4s
++ sm4e $data1.4s,@rks[2].4s
++ sm4e $data2.4s,@rks[2].4s
++ sm4e $data3.4s,@rks[2].4s
++
++ sm4e $data0.4s,@rks[3].4s
++ sm4e $data1.4s,@rks[3].4s
++ sm4e $data2.4s,@rks[3].4s
++ sm4e $data3.4s,@rks[3].4s
++
++ sm4e $data0.4s,@rks[4].4s
++ sm4e $data1.4s,@rks[4].4s
++ sm4e $data2.4s,@rks[4].4s
++ sm4e $data3.4s,@rks[4].4s
++
++ sm4e $data0.4s,@rks[5].4s
++ sm4e $data1.4s,@rks[5].4s
++ sm4e $data2.4s,@rks[5].4s
++ sm4e $data3.4s,@rks[5].4s
++
++ sm4e $data0.4s,@rks[6].4s
++ sm4e $data1.4s,@rks[6].4s
++ sm4e $data2.4s,@rks[6].4s
++ sm4e $data3.4s,@rks[6].4s
++
++ sm4e $data0.4s,@rks[7].4s
++ rev64 $data0.4S,$data0.4S
++ sm4e $data1.4s,@rks[7].4s
++ ext $data0.16b,$data0.16b,$data0.16b,#8
++ rev64 $data1.4S,$data1.4S
++ sm4e $data2.4s,@rks[7].4s
++ ext $data1.16b,$data1.16b,$data1.16b,#8
++ rev64 $data2.4S,$data2.4S
++ sm4e $data3.4s,@rks[7].4s
++ ext $data2.16b,$data2.16b,$data2.16b,#8
++ rev64 $data3.4S,$data3.4S
++ ext $data3.16b,$data3.16b,$data3.16b,#8
++___
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a+crypto
++.text
++___
++
++{{{
++$code.=<<___;
++.align 6
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp,.Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b,$key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp, .Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b, $key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ rev64 $key0.4s,$key0.4s
++ rev64 $key1.4s,$key1.4s
++ ext $key0.16b,$key0.16b,$key0.16b,#8
++ ext $key1.16b,$key1.16b,$key1.16b,#8
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ rev64 $key2.4s,$key2.4s
++ rev64 $key3.4s,$key3.4s
++ ext $key2.16b,$key2.16b,$key2.16b,#8
++ ext $key3.16b,$key3.16b,$key3.16b,#8
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ rev64 $key4.4s,$key4.4s
++ rev64 $key5.4s,$key5.4s
++ ext $key4.16b,$key4.16b,$key4.16b,#8
++ ext $key5.16b,$key5.16b,$key5.16b,#8
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ rev64 $key6.4s, $key6.4s
++ rev64 $key7.4s, $key7.4s
++ ext $key6.16b,$key6.16b,$key6.16b,#8
++ ext $key7.16b,$key7.16b,$key7.16b,#8
++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++my $dir = shift;
++my ($inp,$out,$rk)=map("x$_",(0..2));
++my ($data)=("v16");
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$data.4s},[$inp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($data,$data);
++ &enc_blk($data);
++ &rev32($data,$data);
++$code.=<<___;
++ st1 {$data.4s},[$out]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++my ($inp,$out,$len,$rk)=map("x$_",(0..3));
++my ($enc) = ("w4");
++my @dat=map("v$_",(16..23));
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++1:
++ cmp $len,#64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
++ // 8 blocks
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++___
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#128
++ b.gt 1b
++ ret
++ // 4 blocks
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 1f
++ ld1 {@dat[0].4s},[$inp],#16
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ st1 {@dat[0].4s},[$out],#16
++ b.ne 1b
++1:
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($enc) = ("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec) = ("v8");
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++ ld1 {$ivec.4s},[$ivp]
++ cmp $enc,#0
++ b.eq .Ldec
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++___
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_blk(@dat[0]);
++$code.=<<___;
++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b
++___
++ &enc_blk(@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b
++___
++ &enc_blk(@dat[2]);
++ &rev32(@dat[1],@dat[1]);
++$code.=<<___;
++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b
++___
++ &enc_blk(@dat[3]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ mov $ivec.16b,@dat[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.ne 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ eor $ivec.16b,$ivec.16b,@dat[0].16b
++___
++ &rev32($ivec,$ivec);
++ &enc_blk($ivec);
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ st1 {$ivec.16b},[$out],#16
++ b.ne 1b
++ b 3f
++.Ldec:
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ // 8 blocks mode
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],$dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ mov $ivec.16b,@in[7].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ eor @dat[4].16b,$dat[4].16b,@in[3].16b
++ eor @dat[5].16b,$dat[5].16b,@in[4].16b
++ eor @dat[6].16b,$dat[6].16b,@in[5].16b
++ eor @dat[7].16b,$dat[7].16b,@in[6].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,128
++ b.gt 1b
++ b 3f
++ // 4 blocks mode
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ mov $ivec.16b,@in[3].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ mov @in[0].16b,@dat[0].16b
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ mov $ivec.16b,@in[0].16b
++ st1 {@dat[0].16b},[$out],#16
++ b.ne 1b
++3:
++ // save back IV
++ st1 {$ivec.16b},[$ivp]
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($ctr)=("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec)=("v8");
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {$ivec.4s},[$ivp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ mov $ctr,$ivec.s[3]
++1:
++ cmp $len,#4
++ b.lt 1f
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ mov @dat[0].16b,$ivec.16b
++ mov @dat[1].16b,$ivec.16b
++ mov @dat[2].16b,$ivec.16b
++ mov @dat[3].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[1].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[2].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[3].s[3],$ctr
++ cmp $len,#8
++ b.lt 2f
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++ mov @dat[4].16b,$ivec.16b
++ mov @dat[5].16b,$ivec.16b
++ mov @dat[6].16b,$ivec.16b
++ mov @dat[7].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[4].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[5].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[6].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[7].s[3],$ctr
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ eor @dat[4].16b,@dat[4].16b,@in[4].16b
++ eor @dat[5].16b,@dat[5].16b,@in[5].16b
++ eor @dat[6].16b,@dat[6].16b,@in[6].16b
++ eor @dat[7].16b,@dat[7].16b,@in[7].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#8
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++2:
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#4
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++1:
++ subs $len,$len,#1
++ b.lt 3f
++ mov $dat[0].16b,$ivec.16b
++ ld1 {@in[0].4s},[$inp],#16
++___
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor $dat[0].16b,$dat[0].16b,@in[0].16b
++ st1 {$dat[0].4s},[$out],#16
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++3:
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++########################################
++{ my %opcode = (
++ "sm4e" => 0xcec08400,
++ "sm4ekey" => 0xce60c800);
++
++ sub unsm4 {
++ my ($mnemonic,$arg)=@_;
++
++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
++ $mnemonic,$arg;
++ }
++}
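For reference, the opcode table and unsm4() helper above let the script emit the SM4 hardware instructions even with assemblers that do not know the sm4e/sm4ekey mnemonics: the register numbers are extracted from the operands and OR-ed into the base opcode. As a worked example under this encoding scheme, "sm4e v0.4s, v16.4s" gives 0xcec08400 | 0 | (16 << 5) = 0xcec08600 and would be emitted as something like ".inst 0xcec08600 // sm4e v0.4s, v16.4s".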
++
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++
++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index b65a7d149e..e27aa49e67 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,4 +1,32 @@
+ LIBS=../../libcrypto
+-SOURCE[../../libcrypto]=\
+- sm4.c
+
++IF[{- !$disabled{asm} -}]
++ $SM4DEF_aarch64=SM4_ASM
++ $SM4ASM_aarch64=sm4-armv8.S
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate one, and define the appropriate macros
++ IF[$SM4ASM_{- $target{asm_arch} -}]
++ $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
++ $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
++ ENDIF
++ENDIF
++
++SOURCE[../../libcrypto]= $SM4ASM sm4.c
++
++
++# Implementations are now spread across several libraries, so the defines
++# need to be applied to all affected libraries and modules.
++DEFINE[../../libcrypto]=$SM4DEF
++DEFINE[../../providers/libfips.a]=$SM4DEF
++DEFINE[../../providers/libdefault.a]=$SM4DEF
++# We only need to include the SM4DEF stuff in the legacy provider when it's a
++# separate module and it's dynamically linked with libcrypto. Otherwise, it
++# already gets everything that the static libcrypto.a has, and doesn't need it
++# added again.
++IF[{- !$disabled{module} && !$disabled{shared} -}]
++ DEFINE[../providers/liblegacy.a]=$SM4DEF
++ENDIF
++
++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
++INCLUDE[sm4-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+new file mode 100644
+index 0000000000..42c8b44a43
+--- /dev/null
++++ b/include/crypto/sm4_platform.h
+@@ -0,0 +1,48 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#ifndef OSSL_SM4_PLATFORM_H
++# define OSSL_SM4_PLATFORM_H
++# pragma once
++
++# if defined(OPENSSL_CPUID_OBJ)
++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
++# include "arm_arch.h"
++# if __ARM_MAX_ARCH__>=8
++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
++# define HWSM4_encrypt sm4_v8_encrypt
++# define HWSM4_decrypt sm4_v8_decrypt
++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
++# endif
++# endif
++# endif /* OPENSSL_CPUID_OBJ */
++
++# if defined(HWSM4_CAPABLE)
++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++# endif /* HWSM4_CAPABLE */
++
++#endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h
+index f7f833fcb4..01a031a74d 100644
+--- a/providers/implementations/ciphers/cipher_sm4.h
++++ b/providers/implementations/ciphers/cipher_sm4.h
+@@ -9,6 +9,7 @@
+
+ #include "prov/ciphercommon.h"
+ #include "crypto/sm4.h"
++#include "crypto/sm4_platform.h"
+
+ typedef struct prov_cast_ctx_st {
+ PROV_CIPHER_CTX base; /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index 6bcd1ec406..c0c9b22bd3 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -12,6 +12,7 @@
+ */
+
+ #include "cipher_sm4_gcm.h"
++#include "crypto/sm4_platform.h"
+
+ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ size_t keylen)
+@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ SM4_KEY *ks = &actx->ks.ks;
+
+ ctx->ks = ks;
+- ossl_sm4_set_key(key, ks);
+- CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+- ctx->ctr = (ctr128_f)NULL;
++# ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt);
++# ifdef HWSM4_ctr32_encrypt_blocks
++ ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++# else /* HWSM4_ctr32_encrypt_blocks */
++ ctx->ctr = (ctr128_f)NULL;
++# endif
++ } else
++# endif /* HWSM4_CAPABLE */
++ {
++ ossl_sm4_set_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
++ ctx->ctr = (ctr128_f)NULL;
++ }
+ ctx->key_set = 1;
+
+ return 1;
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 0db04b1a74..4cd3d3d669 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ PROV_SM4_CTX *sctx = (PROV_SM4_CTX *)ctx;
+ SM4_KEY *ks = &sctx->ks.ks;
+
+- ossl_sm4_set_key(key, ks);
+ ctx->ks = ks;
+ if (ctx->enc
+ || (ctx->mode != EVP_CIPH_ECB_MODE
+- && ctx->mode != EVP_CIPH_CBC_MODE))
+- ctx->block = (block128_f)ossl_sm4_encrypt;
+- else
+- ctx->block = (block128_f)ossl_sm4_decrypt;
++ && ctx->mode != EVP_CIPH_CBC_MODE)) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_encrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++ else
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++ else
++#endif
++#ifdef HWSM4_ctr32_encrypt_blocks
++ if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks;
++ else
++#endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_encrypt;
++ }
++ } else {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_decrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++#endif
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_decrypt;
++ }
++ }
++
+ return 1;
+ }
+
+@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX)
+ # define PROV_CIPHER_HW_sm4_mode(mode) \
+ static const PROV_CIPHER_HW sm4_##mode = { \
+ cipher_hw_sm4_initkey, \
+- ossl_cipher_hw_chunked_##mode, \
++ ossl_cipher_hw_generic_##mode, \
+ cipher_hw_sm4_copyctx \
+ }; \
+ const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits) \
+--
+2.37.3.windows.1
+
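Taken together, the sm4_platform.h header and the provider changes in the patch above implement one runtime-dispatch pattern: probe OPENSSL_armcap_P once, then route through the generated sm4_v8_* assembly entry points or fall back to the portable C implementation. The sketch below distills that pattern; it is illustrative only (cipher_sm4_hw.c in the patch is the authoritative version) and omits error handling.

#include "crypto/sm4.h"
#include "crypto/sm4_platform.h"

typedef void (*sm4_block_fn)(const unsigned char *in, unsigned char *out,
                             const SM4_KEY *key);

/* Pick an SM4 block-encrypt primitive at run time, mirroring the
 * HWSM4_CAPABLE branches added to cipher_sm4_hw.c above. */
static sm4_block_fn sm4_pick_encrypt(const unsigned char *user_key, SM4_KEY *ks)
{
#ifdef HWSM4_CAPABLE
    if (HWSM4_CAPABLE) {                 /* OPENSSL_armcap_P & ARMV8_SM4 */
        HWSM4_set_encrypt_key(user_key, ks);
        return HWSM4_encrypt;            /* resolves to sm4_v8_encrypt */
    }
#endif
    ossl_sm4_set_key(user_key, ks);      /* portable table-based fallback */
    return ossl_sm4_encrypt;
}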
diff --git a/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
new file mode 100644
index 0000000..31852cb
--- /dev/null
+++ b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
@@ -0,0 +1,1521 @@
+From 44b6e3d07ae5b09255710986e61035c862ec68aa Mon Sep 17 00:00:00 2001
+From: Russ Butler <russ.butler@arm.com>
+Date: Sat, 28 Aug 2021 13:57:09 -0500
+Subject: [PATCH 01/13] aarch64: support BTI and pointer authentication in
+ assembly
+
+This change adds optional support for
+- Armv8.3-A Pointer Authentication (PAuth) and
+- Armv8.5-A Branch Target Identification (BTI)
+features to the perl scripts.
+
+Both features can be enabled with additional compiler flags.
+Unless any of these are enabled explicitly there is no code change at
+all.
+
+The extensions are briefly described below. Please read the appropriate
+chapters of the Arm Architecture Reference Manual for the complete
+specification.
+
+Scope
+-----
+
+This change only affects generated assembly code.
+
+Armv8.3-A Pointer Authentication
+--------------------------------
+
+Pointer Authentication extension supports the authentication of the
+contents of registers before they are used for indirect branching
+or load.
+
+PAuth provides a probabilistic method to detect corruption of register
+values. PAuth signing instructions generate a Pointer Authentication
+Code (PAC) based on the value of a register, a seed and a key.
+The generated PAC is inserted into the original value in the register.
+A PAuth authentication instruction recomputes the PAC, and if it matches
+the PAC in the register, restores its original value. In case of a
+mismatch, an architecturally unmapped address is generated instead.
+
+With PAuth, mitigation against ROP (Return-oriented Programming) attacks
+can be implemented. This is achieved by signing the contents of the
+link-register (LR) before it is pushed to stack. Once LR is popped,
+it is authenticated. This way a stack corruption which overwrites the
+LR on the stack is detectable.
+
+The PAuth extension adds several new instructions, some of which are not
+recognized by older hardware. To support a single codebase for both pre
+Armv8.3-A targets and newer ones, only NOP-space instructions are added
+by this patch. These instructions are treated as NOPs on hardware
+which does not support Armv8.3-A. Furthermore, this patch only considers
+cases where LR is saved to the stack and then restored before branching
+to its content. There are cases in the code where LR is pushed to stack
+but it is not used later. We do not address these cases as they are not
+affected by PAuth.
+
+There are two keys available to sign an instruction address: A and B.
+PACIASP and PACIBSP only differ in the used keys: A and B, respectively.
+The keys are typically managed by the operating system.
+
+To enable generating code for PAuth compile with
+-mbranch-protection=<mode>:
+
+- standard or pac-ret: add PACIASP and AUTIASP, also enables BTI
+ (read below)
+- pac-ret+b-key: add PACIBSP and AUTIBSP
+
+Armv8.5-A Branch Target Identification
+--------------------------------------
+
+Branch Target Identification features some new instructions which
+protect the execution of instructions on guarded pages which are not
+intended branch targets.
+
+If Armv8.5-A is supported by the hardware, execution of an instruction
+changes the value of PSTATE.BTYPE field. If an indirect branch
+lands on a guarded page the target instruction must be one of the
+BTI <jc> flavors, or in case of a direct call or jump it can be any
+other instruction. If the target instruction is not compatible with the
+value of PSTATE.BTYPE a Branch Target Exception is generated.
+
+In short, indirect jumps are compatible with BTI <j> and <jc> while
+indirect calls are compatible with BTI <c> and <jc>. Please refer to the
+specification for the details.
+
+Armv8.3-A PACIASP and PACIBSP are implicit branch target
+identification instructions which are equivalent to BTI c or BTI jc
+depending on system register configuration.
+
+BTI is used to mitigate JOP (Jump-oriented Programming) attacks by
+limiting the set of instructions which can be jumped to.
+
+BTI requires active linker support to mark the pages with BTI-enabled
+code as guarded. For ELF64 files BTI compatibility is recorded in the
+.note.gnu.property section. For a shared object or static binary it is
+required that all linked units support BTI. This means that even a
+single assembly file without the required note section turns off BTI
+for the whole binary or shared object.
+
+The new BTI instructions are treated as NOPs on hardware which does
+not support Armv8.5-A or on pages which are not guarded.
+
+To insert this new and optional instruction compile with
+-mbranch-protection=standard (also enables PAuth) or +bti.
+
+When targeting a guarded page from a non-guarded page, weaker
+compatibility restrictions apply to maintain compatibility between
+legacy and new code. For detailed rules please refer to the Arm ARM.
+
+Compiler support
+----------------
+
+Compiler support requires understanding '-mbranch-protection=<mode>'
+and emitting the appropriate feature macros (__ARM_FEATURE_BTI_DEFAULT
+and __ARM_FEATURE_PAC_DEFAULT). The current state is the following:
+
+-------------------------------------------------------
+| Compiler | -mbranch-protection | Feature macros     |
++----------+---------------------+--------------------+
+| clang    | 9.0.0               | 11.0.0             |
++----------+---------------------+--------------------+
+| gcc      | 9                   | expected in 10.1+  |
+-------------------------------------------------------
+
+Available Platforms
+------------------
+
+Arm Fast Model and QEMU support both extensions.
+
+https://developer.arm.com/tools-and-software/simulation-models/fast-models
+https://www.qemu.org/
+
+Implementation Notes
+--------------------
+
+This change adds BTI landing pads even to assembly functions which are
+likely to be directly called only. In these cases, landing pads might
+be superfluous depending on what code the linker generates.
+Code size and performance impact for these cases would be negligible.
+
+Interaction with C code
+-----------------------
+
+Pointer Authentication is a per-frame protection while Branch Target
+Identification can be turned on and off only for all code pages of a
+whole shared object or static binary. Because of these properties if
+C/C++ code is compiled without any of the above features but assembly
+files support any of them unconditionally there is no incompatibility
+between the two.
+
+Useful Links
+------------
+
+To fully understand the details of both PAuth and BTI it is advised to
+read the related chapters of the Arm Architecture Reference Manual
+(Arm ARM):
+https://developer.arm.com/documentation/ddi0487/latest/
+
+Additional materials:
+
+"Providing protection for complex software"
+https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+
+Arm Compiler Reference Guide Version 6.14: -mbranch-protection
+https://developer.arm.com/documentation/101754/0614/armclang-Reference/armclang-Command-line-Options/-mbranch-protection?lang=en
+
+Arm C Language Extensions (ACLE)
+https://developer.arm.com/docs/101028/latest
+
+Additional Notes
+----------------
+
+This patch is a copy of the work done by Tamas Petz in boringssl. It
+contains the changes from the following commits:
+
+aarch64: support BTI and pointer authentication in assembly
+ Change-Id: I4335f92e2ccc8e209c7d68a0a79f1acdf3aeb791
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/42084
+aarch64: Improve conditional compilation
+ Change-Id: I14902a64e5f403c2b6a117bc9f5fb1a4f4611ebf
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/43524
+aarch64: Fix name of gnu property note section
+ Change-Id: I6c432d1c852129e9c273f6469a8b60e3983671ec
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/44024
+
+Change-Id: I2d95ebc5e4aeb5610d3b226f9754ee80cf74a9af
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/16674)
+---
+ crypto/aes/asm/aesv8-armx.pl | 18 +++++++-
+ crypto/aes/asm/vpaes-armv8.pl | 39 ++++++++--------
+ crypto/aes/build.info | 1 +
+ crypto/arm64cpuid.pl | 10 +++++
+ crypto/arm_arch.h | 58 ++++++++++++++++++++++++
+ crypto/bn/asm/armv8-mont.pl | 19 +++++---
+ crypto/chacha/asm/chacha-armv8.pl | 18 ++++----
+ crypto/ec/asm/ecp_nistz256-armv8.pl | 64 ++++++++++++++++-----------
+ crypto/modes/asm/aes-gcm-armv8_64.pl | 6 +++
+ crypto/modes/asm/ghashv8-armx.pl | 11 +++++
+ crypto/poly1305/asm/poly1305-armv8.pl | 17 ++++++-
+ crypto/sha/asm/keccak1600-armv8.pl | 30 +++++++------
+ crypto/sha/asm/sha1-armv8.pl | 5 ++-
+ crypto/sha/asm/sha512-armv8.pl | 11 +++--
+ crypto/sha/build.info | 1 +
+ 15 files changed, 228 insertions(+), 80 deletions(-)
+
+diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
+index 6a7bf05d1b..ed5ae4207c 100755
+--- a/crypto/aes/asm/aesv8-armx.pl
++++ b/crypto/aes/asm/aesv8-armx.pl
+@@ -120,6 +120,8 @@ ${prefix}_set_encrypt_key:
+ .Lenc_key:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -295,7 +297,7 @@ $code.=<<___;
+ ${prefix}_set_decrypt_key:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -339,7 +341,7 @@ $code.=<<___ if ($flavour !~ /64/);
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ ___
+ $code.=<<___;
+@@ -359,6 +361,11 @@ $code.=<<___;
+ .type ${prefix}_${dir}crypt,%function
+ .align 5
+ ${prefix}_${dir}crypt:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+@@ -442,6 +449,7 @@ $code.=<<___;
+ ${prefix}_ecb_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ subs $len,$len,#16
+ // Original input data size bigger than 16, jump to big size processing.
+ b.ne .Lecb_big_size
+@@ -1236,6 +1244,8 @@ $code.=<<___;
+ ${prefix}_cbc_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -1764,6 +1774,8 @@ $code.=<<___;
+ ${prefix}_ctr32_encrypt_blocks:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -2256,6 +2268,7 @@ $code.=<<___ if ($flavour =~ /64/);
+ ${prefix}_xts_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ cmp $len,#16
+ // Original input data size bigger than 16, jump to big size processing.
+ b.ne .Lxts_enc_big_size
+@@ -2930,6 +2943,7 @@ $code.=<<___ if ($flavour =~ /64/);
+ .type ${prefix}_xts_decrypt,%function
+ .align 5
+ ${prefix}_xts_decrypt:
++ AARCH64_VALID_CALL_TARGET
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#16
+diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl
+index dcd5065e68..49988e9c2b 100755
+--- a/crypto/aes/asm/vpaes-armv8.pl
++++ b/crypto/aes/asm/vpaes-armv8.pl
+@@ -53,6 +53,8 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ *STDOUT=*OUT;
+
+ $code.=<<___;
++#include "arm_arch.h"
++
+ .text
+
+ .type _vpaes_consts,%object
+@@ -259,7 +261,7 @@ _vpaes_encrypt_core:
+ .type vpaes_encrypt,%function
+ .align 4
+ vpaes_encrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -269,7 +271,7 @@ vpaes_encrypt:
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_encrypt,.-vpaes_encrypt
+
+@@ -492,7 +494,7 @@ _vpaes_decrypt_core:
+ .type vpaes_decrypt,%function
+ .align 4
+ vpaes_decrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -502,7 +504,7 @@ vpaes_decrypt:
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_decrypt,.-vpaes_decrypt
+
+@@ -673,7 +675,7 @@ _vpaes_key_preheat:
+ .type _vpaes_schedule_core,%function
+ .align 4
+ _vpaes_schedule_core:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+@@ -838,7 +840,7 @@ _vpaes_schedule_core:
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@ -1051,7 +1053,7 @@ _vpaes_schedule_mangle:
+ .type vpaes_set_encrypt_key,%function
+ .align 4
+ vpaes_set_encrypt_key:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1067,7 +1069,7 @@ vpaes_set_encrypt_key:
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+@@ -1075,7 +1077,7 @@ vpaes_set_encrypt_key:
+ .type vpaes_set_decrypt_key,%function
+ .align 4
+ vpaes_set_decrypt_key:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1095,7 +1097,7 @@ vpaes_set_decrypt_key:
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+ ___
+@@ -1108,11 +1110,11 @@ $code.=<<___;
+ .type vpaes_cbc_encrypt,%function
+ .align 4
+ vpaes_cbc_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
+ cbz $len, .Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1135,15 +1137,16 @@ vpaes_cbc_encrypt:
+ st1 {v0.16b}, [$ivec] // write ivec
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
+ .Lcbc_abort:
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+ .type vpaes_cbc_decrypt,%function
+ .align 4
+ vpaes_cbc_decrypt:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
++ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1185,7 +1188,7 @@ vpaes_cbc_decrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+ ___
+@@ -1195,7 +1198,7 @@ $code.=<<___;
+ .type vpaes_ecb_encrypt,%function
+ .align 4
+ vpaes_ecb_encrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1229,7 +1232,7 @@ vpaes_ecb_encrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+@@ -1237,7 +1240,7 @@ vpaes_ecb_encrypt:
+ .type vpaes_ecb_decrypt,%function
+ .align 4
+ vpaes_ecb_decrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1271,7 +1274,7 @@ vpaes_ecb_decrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+ ___
+diff --git a/crypto/aes/build.info b/crypto/aes/build.info
+index b250903fa6..47f99fdf33 100644
+--- a/crypto/aes/build.info
++++ b/crypto/aes/build.info
+@@ -116,6 +116,7 @@ INCLUDE[aes-mips.o]=..
+ GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl
+ INCLUDE[aesv8-armx.o]=..
+ GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl
++INCLUDE[vpaes-armv8.o]=..
+
+ GENERATE[aes-armv4.S]=asm/aes-armv4.pl
+ INCLUDE[aes-armv4.o]=..
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index ac76dd449f..11f0e50279 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -31,6 +31,7 @@ $code.=<<___;
+ .globl _armv7_neon_probe
+ .type _armv7_neon_probe,%function
+ _armv7_neon_probe:
++ AARCH64_VALID_CALL_TARGET
+ orr v15.16b, v15.16b, v15.16b
+ ret
+ .size _armv7_neon_probe,.-_armv7_neon_probe
+@@ -38,6 +39,7 @@ _armv7_neon_probe:
+ .globl _armv7_tick
+ .type _armv7_tick,%function
+ _armv7_tick:
++ AARCH64_VALID_CALL_TARGET
+ #ifdef __APPLE__
+ mrs x0, CNTPCT_EL0
+ #else
+@@ -49,6 +51,7 @@ _armv7_tick:
+ .globl _armv8_aes_probe
+ .type _armv8_aes_probe,%function
+ _armv8_aes_probe:
++ AARCH64_VALID_CALL_TARGET
+ aese v0.16b, v0.16b
+ ret
+ .size _armv8_aes_probe,.-_armv8_aes_probe
+@@ -56,6 +59,7 @@ _armv8_aes_probe:
+ .globl _armv8_sha1_probe
+ .type _armv8_sha1_probe,%function
+ _armv8_sha1_probe:
++ AARCH64_VALID_CALL_TARGET
+ sha1h s0, s0
+ ret
+ .size _armv8_sha1_probe,.-_armv8_sha1_probe
+@@ -63,6 +67,7 @@ _armv8_sha1_probe:
+ .globl _armv8_sha256_probe
+ .type _armv8_sha256_probe,%function
+ _armv8_sha256_probe:
++ AARCH64_VALID_CALL_TARGET
+ sha256su0 v0.4s, v0.4s
+ ret
+ .size _armv8_sha256_probe,.-_armv8_sha256_probe
+@@ -70,6 +75,7 @@ _armv8_sha256_probe:
+ .globl _armv8_pmull_probe
+ .type _armv8_pmull_probe,%function
+ _armv8_pmull_probe:
++ AARCH64_VALID_CALL_TARGET
+ pmull v0.1q, v0.1d, v0.1d
+ ret
+ .size _armv8_pmull_probe,.-_armv8_pmull_probe
+@@ -77,6 +83,7 @@ _armv8_pmull_probe:
+ .globl _armv8_sha512_probe
+ .type _armv8_sha512_probe,%function
+ _armv8_sha512_probe:
++ AARCH64_VALID_CALL_TARGET
+ .long 0xcec08000 // sha512su0 v0.2d,v0.2d
+ ret
+ .size _armv8_sha512_probe,.-_armv8_sha512_probe
+@@ -84,6 +91,7 @@ _armv8_sha512_probe:
+ .globl _armv8_cpuid_probe
+ .type _armv8_cpuid_probe,%function
+ _armv8_cpuid_probe:
++ AARCH64_VALID_CALL_TARGET
+ mrs x0, midr_el1
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+@@ -92,6 +100,7 @@ _armv8_cpuid_probe:
+ .type OPENSSL_cleanse,%function
+ .align 5
+ OPENSSL_cleanse:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1,.Lret // len==0?
+ cmp x1,#15
+ b.hi .Lot // len>15
+@@ -123,6 +132,7 @@ OPENSSL_cleanse:
+ .type CRYPTO_memcmp,%function
+ .align 4
+ CRYPTO_memcmp:
++ AARCH64_VALID_CALL_TARGET
+ eor w3,w3,w3
+ cbz x2,.Lno_data // len==0?
+ cmp x2,#16
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index 45d7e15564..a815a5c72b 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -126,4 +126,62 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+
+ # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
++
++#if defined(__ASSEMBLER__)
++
++ /*
++ * Support macros for
++ * - Armv8.3-A Pointer Authentication and
++ * - Armv8.5-A Branch Target Identification
++ * features which require emitting a .note.gnu.property section with the
++ * appropriate architecture-dependent feature bits set.
++ * Read more: "ELF for the Arm® 64-bit Architecture"
++ */
++
++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
++# else
++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
++# define AARCH64_VALID_CALL_TARGET
++# endif
++
++# if defined(__ARM_FEATURE_PAC_DEFAULT) && \
++ (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
++ (1 << 1) /* Has Pointer Authentication */
++# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */
++# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */
++# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
++ (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
++ (1 << 1) /* Has Pointer Authentication */
++# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */
++# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */
++# else
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */
++# if GNU_PROPERTY_AARCH64_BTI != 0
++# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
++# else
++# define AARCH64_SIGN_LINK_REGISTER
++# endif
++# define AARCH64_VALIDATE_LINK_REGISTER
++# endif
++
++# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
++ .pushsection .note.gnu.property, "a";
++ .balign 8;
++ .long 4;
++ .long 0x10;
++ .long 0x5;
++ .asciz "GNU";
++ .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
++ .long 4;
++ .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
++ .long 0;
++ .popsection;
++# endif
++
++# endif /* defined __ASSEMBLER__ */
++
+ #endif
+diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
+index 54d2e8245f..21ab12bdf0 100755
+--- a/crypto/bn/asm/armv8-mont.pl
++++ b/crypto/bn/asm/armv8-mont.pl
+@@ -67,8 +67,8 @@ $n0="x4"; # const BN_ULONG *n0,
+ $num="x5"; # int num);
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armv8_rsa_neonized
+ .hidden OPENSSL_armv8_rsa_neonized
+ #endif
+@@ -78,6 +78,7 @@ $code.=<<___;
+ .type bn_mul_mont,%function
+ .align 5
+ bn_mul_mont:
++ AARCH64_SIGN_LINK_REGISTER
+ .Lbn_mul_mont:
+ tst $num,#3
+ b.ne .Lmul_mont
+@@ -288,6 +289,7 @@ bn_mul_mont:
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size bn_mul_mont,.-bn_mul_mont
+ ___
+@@ -309,6 +311,8 @@ $code.=<<___;
+ .type bn_mul8x_mont_neon,%function
+ .align 5
+ bn_mul8x_mont_neon:
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
++ // only from bn_mul_mont which has already signed the return address.
+ stp x29,x30,[sp,#-80]!
+ mov x16,sp
+ stp d8,d9,[sp,#16]
+@@ -649,6 +653,7 @@ $code.=<<___;
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ ldr x29,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret // bx lr
+
+ .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+@@ -671,7 +676,8 @@ __bn_sqr8x_mont:
+ cmp $ap,$bp
+ b.ne __bn_mul4x_mont
+ .Lsqr8x_mont:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
++ // only from bn_mul_mont which has already signed the return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1425,7 +1431,8 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+- .inst 0xd50323bf // autiasp
++ // x30 is loaded earlier
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+ ___
+@@ -1449,7 +1456,8 @@ $code.=<<___;
+ .type __bn_mul4x_mont,%function
+ .align 5
+ __bn_mul4x_mont:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
++ // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1883,7 +1891,8 @@ __bn_mul4x_mont:
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+- .inst 0xd50323bf // autiasp
++ // x30 loaded earlier
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size __bn_mul4x_mont,.-__bn_mul4x_mont
+ ___
+diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
+index dcdc4a04e3..e1a8b81594 100755
+--- a/crypto/chacha/asm/chacha-armv8.pl
++++ b/crypto/chacha/asm/chacha-armv8.pl
+@@ -132,8 +132,8 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -153,6 +153,7 @@ $code.=<<___;
+ .type ChaCha20_ctr32,%function
+ .align 5
+ ChaCha20_ctr32:
++ AARCH64_SIGN_LINK_REGISTER
+ cbz $len,.Labort
+ cmp $len,#192
+ b.lo .Lshort
+@@ -165,7 +166,6 @@ ChaCha20_ctr32:
+ #endif
+
+ .Lshort:
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -285,8 +285,8 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
+ .Labort:
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+ .align 4
+@@ -342,7 +342,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_ctr32,.-ChaCha20_ctr32
+ ___
+@@ -432,8 +432,8 @@ $code.=<<___;
+ .type ChaCha20_neon,%function
+ .align 5
+ ChaCha20_neon:
++ AARCH64_SIGN_LINK_REGISTER
+ .LChaCha20_neon:
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -667,7 +667,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+ .align 4
+@@ -799,7 +799,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_neon,.-ChaCha20_neon
+ ___
+@@ -844,7 +844,7 @@ $code.=<<___;
+ .type ChaCha20_512_neon,%function
+ .align 5
+ ChaCha20_512_neon:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -1268,7 +1268,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_512_neon,.-ChaCha20_512_neon
+ ___
+diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl
+index 81ee3947d7..6c5d0e8b3c 100644
+--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
++++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
+@@ -122,7 +122,7 @@ $code.=<<___;
+ .type ecp_nistz256_to_mont,%function
+ .align 6
+ ecp_nistz256_to_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -138,7 +138,7 @@ ecp_nistz256_to_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+@@ -147,7 +147,7 @@ ecp_nistz256_to_mont:
+ .type ecp_nistz256_from_mont,%function
+ .align 4
+ ecp_nistz256_from_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -163,7 +163,7 @@ ecp_nistz256_from_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+
+@@ -173,7 +173,7 @@ ecp_nistz256_from_mont:
+ .type ecp_nistz256_mul_mont,%function
+ .align 4
+ ecp_nistz256_mul_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -188,7 +188,7 @@ ecp_nistz256_mul_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+@@ -197,7 +197,7 @@ ecp_nistz256_mul_mont:
+ .type ecp_nistz256_sqr_mont,%function
+ .align 4
+ ecp_nistz256_sqr_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -211,7 +211,7 @@ ecp_nistz256_sqr_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+@@ -221,7 +221,7 @@ ecp_nistz256_sqr_mont:
+ .type ecp_nistz256_add,%function
+ .align 4
+ ecp_nistz256_add:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -235,7 +235,7 @@ ecp_nistz256_add:
+ bl __ecp_nistz256_add
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_add,.-ecp_nistz256_add
+
+@@ -244,7 +244,7 @@ ecp_nistz256_add:
+ .type ecp_nistz256_div_by_2,%function
+ .align 4
+ ecp_nistz256_div_by_2:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -256,7 +256,7 @@ ecp_nistz256_div_by_2:
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+@@ -265,7 +265,7 @@ ecp_nistz256_div_by_2:
+ .type ecp_nistz256_mul_by_2,%function
+ .align 4
+ ecp_nistz256_mul_by_2:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -281,7 +281,7 @@ ecp_nistz256_mul_by_2:
+ bl __ecp_nistz256_add // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+@@ -290,7 +290,7 @@ ecp_nistz256_mul_by_2:
+ .type ecp_nistz256_mul_by_3,%function
+ .align 4
+ ecp_nistz256_mul_by_3:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -317,7 +317,7 @@ ecp_nistz256_mul_by_3:
+ bl __ecp_nistz256_add // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+@@ -327,7 +327,7 @@ ecp_nistz256_mul_by_3:
+ .type ecp_nistz256_sub,%function
+ .align 4
+ ecp_nistz256_sub:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -339,7 +339,7 @@ ecp_nistz256_sub:
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+@@ -348,7 +348,7 @@ ecp_nistz256_sub:
+ .type ecp_nistz256_neg,%function
+ .align 4
+ ecp_nistz256_neg:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -363,7 +363,7 @@ ecp_nistz256_neg:
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+@@ -724,7 +724,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_double,%function
+ .align 5
+ ecp_nistz256_point_double:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -859,7 +859,7 @@ ecp_nistz256_point_double:
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+ ___
+@@ -882,7 +882,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_add,%function
+ .align 5
+ ecp_nistz256_point_add:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1117,7 +1117,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+ ___
+@@ -1139,7 +1139,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_add_affine,%function
+ .align 5
+ ecp_nistz256_point_add_affine:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1328,7 +1328,7 @@ $code.=<<___;
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+ ___
+@@ -1346,6 +1346,8 @@ $code.=<<___;
+ .type ecp_nistz256_ord_mul_mont,%function
+ .align 4
+ ecp_nistz256_ord_mul_mont:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1487,6 +1489,8 @@ $code.=<<___;
+ .type ecp_nistz256_ord_sqr_mont,%function
+ .align 4
+ ecp_nistz256_ord_sqr_mont:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1641,6 +1645,8 @@ $code.=<<___;
+ .type ecp_nistz256_scatter_w5,%function
+ .align 4
+ ecp_nistz256_scatter_w5:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1703,6 +1709,8 @@ ecp_nistz256_scatter_w5:
+ .type ecp_nistz256_gather_w5,%function
+ .align 4
+ ecp_nistz256_gather_w5:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1780,6 +1788,8 @@ ecp_nistz256_gather_w5:
+ .type ecp_nistz256_scatter_w7,%function
+ .align 4
+ ecp_nistz256_scatter_w7:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1824,6 +1834,8 @@ ecp_nistz256_scatter_w7:
+ .type ecp_nistz256_gather_w7,%function
+ .align 4
+ ecp_nistz256_gather_w7:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl
+index 3b9d5b6511..ff5809ec22 100755
+--- a/crypto/modes/asm/aes-gcm-armv8_64.pl
++++ b/crypto/modes/asm/aes-gcm-armv8_64.pl
+@@ -256,6 +256,7 @@ $code.=<<___;
+ .type aes_gcm_enc_128_kernel,%function
+ .align 4
+ aes_gcm_enc_128_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L128_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -1089,6 +1090,7 @@ $code.=<<___;
+ .type aes_gcm_dec_128_kernel,%function
+ .align 4
+ aes_gcm_dec_128_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L128_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -1973,6 +1975,7 @@ $code.=<<___;
+ .type aes_gcm_enc_192_kernel,%function
+ .align 4
+ aes_gcm_enc_192_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -2858,6 +2861,7 @@ $code.=<<___;
+ .type aes_gcm_dec_192_kernel,%function
+ .align 4
+ aes_gcm_dec_192_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -3797,6 +3801,7 @@ $code.=<<___;
+ .type aes_gcm_enc_256_kernel,%function
+ .align 4
+ aes_gcm_enc_256_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L256_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -4729,6 +4734,7 @@ $code.=<<___;
+ .type aes_gcm_dec_256_kernel,%function
+ .align 4
+ aes_gcm_dec_256_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L256_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
+index b1d35d25b5..57f893e77c 100644
+--- a/crypto/modes/asm/ghashv8-armx.pl
++++ b/crypto/modes/asm/ghashv8-armx.pl
+@@ -107,6 +107,11 @@ $code.=<<___;
+ .type gcm_init_v8,%function
+ .align 4
+ gcm_init_v8:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ vld1.64 {$t1},[x1] @ load input H
+ vmov.i8 $xC2,#0xe1
+ vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
+@@ -214,6 +219,11 @@ $code.=<<___;
+ .type gcm_gmult_v8,%function
+ .align 4
+ gcm_gmult_v8:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
+@@ -268,6 +278,7 @@ $code.=<<___;
+ gcm_ghash_v8:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ cmp $len,#64
+ b.hs .Lgcm_ghash_v8_4x
+ ___
+diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
+index 113a2151b6..20816c4283 100755
+--- a/crypto/poly1305/asm/poly1305-armv8.pl
++++ b/crypto/poly1305/asm/poly1305-armv8.pl
+@@ -72,6 +72,7 @@ $code.=<<___;
+ .type poly1305_init,%function
+ .align 5
+ poly1305_init:
++ AARCH64_VALID_CALL_TARGET
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+@@ -119,6 +120,9 @@ poly1305_init:
+ .align 5
+ poly1305_blocks:
+ .Lpoly1305_blocks:
++ // The symbol .Lpoly1305_blocks is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+@@ -184,6 +188,9 @@ poly1305_blocks:
+ .align 5
+ poly1305_emit:
+ .Lpoly1305_emit:
++ // The symbol .poly1305_emit is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldr $h2,[$ctx,#16]
+ ldp $t0,$t1,[$nonce] // load nonce
+@@ -291,13 +298,16 @@ poly1305_splat:
+ .align 5
+ poly1305_blocks_neon:
+ .Lpoly1305_blocks_neon:
++ // The symbol .Lpoly1305_blocks_neon is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.hs .Lblocks_neon
+ cbz $is_base2_26,.Lpoly1305_blocks
+
+ .Lblocks_neon:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+@@ -867,7 +877,7 @@ poly1305_blocks_neon:
+
+ .Lno_data_neon:
+ ldr x29,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+@@ -875,6 +885,9 @@ poly1305_blocks_neon:
+ .align 5
+ poly1305_emit_neon:
+ .Lpoly1305_emit_neon:
++ // The symbol .Lpoly1305_emit_neon is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldr $is_base2_26,[$ctx,#24]
+ cbz $is_base2_26,poly1305_emit
+
+diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
+index 65102e7c29..cf54b62c63 100755
+--- a/crypto/sha/asm/keccak1600-armv8.pl
++++ b/crypto/sha/asm/keccak1600-armv8.pl
+@@ -80,6 +80,8 @@ my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+ $code.=<<___;
++#include "arm_arch.h"
++
+ .text
+
+ .align 8 // strategic alignment and padding that allows to use
+@@ -125,7 +127,7 @@ $code.=<<___;
+ .align 5
+ KeccakF1600_int:
+ adr $C[2],iotas
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
+ b .Loop
+ .align 4
+@@ -297,14 +299,14 @@ $code.=<<___;
+ bne .Loop
+
+ ldr x30,[sp,#24]
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600_int,.-KeccakF1600_int
+
+ .type KeccakF1600,%function
+ .align 5
+ KeccakF1600:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -354,7 +356,7 @@ KeccakF1600:
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600,.-KeccakF1600
+
+@@ -362,7 +364,7 @@ KeccakF1600:
+ .type SHA3_absorb,%function
+ .align 5
+ SHA3_absorb:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -460,7 +462,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_absorb,.-SHA3_absorb
+ ___
+@@ -471,7 +473,7 @@ $code.=<<___;
+ .type SHA3_squeeze,%function
+ .align 5
+ SHA3_squeeze:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-48]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -534,7 +536,7 @@ SHA3_squeeze:
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x29,x30,[sp],#48
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_squeeze,.-SHA3_squeeze
+ ___
+@@ -653,7 +655,7 @@ KeccakF1600_ce:
+ .type KeccakF1600_cext,%function
+ .align 5
+ KeccakF1600_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+@@ -686,7 +688,7 @@ $code.=<<___;
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldr x29,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600_cext,.-KeccakF1600_cext
+ ___
+@@ -699,7 +701,7 @@ $code.=<<___;
+ .type SHA3_absorb_cext,%function
+ .align 5
+ SHA3_absorb_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+@@ -771,7 +773,7 @@ $code.=<<___;
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldp x29,x30,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_absorb_cext,.-SHA3_absorb_cext
+ ___
+@@ -783,7 +785,7 @@ $code.=<<___;
+ .type SHA3_squeeze_cext,%function
+ .align 5
+ SHA3_squeeze_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x9,$ctx
+@@ -839,7 +841,7 @@ SHA3_squeeze_cext:
+
+ .Lsqueeze_done_ce:
+ ldr x29,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
+ ___
+diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
+index cdea8845af..5f23a20c1a 100644
+--- a/crypto/sha/asm/sha1-armv8.pl
++++ b/crypto/sha/asm/sha1-armv8.pl
+@@ -175,8 +175,8 @@ ___
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -187,11 +187,13 @@ $code.=<<___;
+ .type sha1_block_data_order,%function
+ .align 6
+ sha1_block_data_order:
++ AARCH64_VALID_CALL_TARGET
+ adrp x16,OPENSSL_armcap_P
+ ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
+ tst w16,#ARMV8_SHA1
+ b.ne .Lv8_entry
+
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -253,6 +255,7 @@ $code.=<<___;
+ .align 6
+ sha1_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
+index 6bcff0b7d3..f900882fee 100644
+--- a/crypto/sha/asm/sha512-armv8.pl
++++ b/crypto/sha/asm/sha512-armv8.pl
+@@ -190,8 +190,8 @@ ___
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -202,6 +202,7 @@ $code.=<<___;
+ .type $func,%function
+ .align 6
+ $func:
++ AARCH64_VALID_CALL_TARGET
+ #ifndef __KERNEL__
+ adrp x16,OPENSSL_armcap_P
+ ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
+@@ -218,7 +219,7 @@ $code.=<<___ if ($SZ==8);
+ ___
+ $code.=<<___;
+ #endif
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+@@ -280,7 +281,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size $func,.-$func
+
+@@ -370,6 +371,7 @@ $code.=<<___;
+ .align 6
+ sha256_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -632,7 +634,9 @@ $code.=<<___;
+ .type sha256_block_neon,%function
+ .align 4
+ sha256_block_neon:
++ AARCH64_VALID_CALL_TARGET
+ .Lneon_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+@@ -743,6 +747,7 @@ $code.=<<___;
+ .align 6
+ sha512_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/sha/build.info b/crypto/sha/build.info
+index d61f7de9b6..556a658d8b 100644
+--- a/crypto/sha/build.info
++++ b/crypto/sha/build.info
+@@ -153,6 +153,7 @@ INCLUDE[sha256-armv8.o]=..
+ GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl
+ INCLUDE[sha512-armv8.o]=..
+ GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl
++INCLUDE[keccak1600-armv8.o]=..
+
+ GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl
+ INCLUDE[sha1-s390x.o]=..
+--
+2.37.3.windows.1
+
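The compiler-support table in the commit message above comes down to two ACLE feature macros. When in doubt about what a given toolchain and -mbranch-protection setting actually define, a small probe program (a sketch, not part of the patch) makes it visible:

#include <stdio.h>

/* Print the branch-protection feature macros this translation unit was
 * built with, e.g. after: gcc -mbranch-protection=standard probe.c */
int main(void)
{
#ifdef __ARM_FEATURE_BTI_DEFAULT
    printf("__ARM_FEATURE_BTI_DEFAULT = %d\n", __ARM_FEATURE_BTI_DEFAULT);
#else
    printf("__ARM_FEATURE_BTI_DEFAULT not defined\n");
#endif
#ifdef __ARM_FEATURE_PAC_DEFAULT
    printf("__ARM_FEATURE_PAC_DEFAULT = 0x%x\n", __ARM_FEATURE_PAC_DEFAULT);
#else
    printf("__ARM_FEATURE_PAC_DEFAULT not defined\n");
#endif
    return 0;
}

With both macros defined, the arm_arch.h additions in the patch resolve AARCH64_SIGN_LINK_REGISTER, AARCH64_VALIDATE_LINK_REGISTER and AARCH64_VALID_CALL_TARGET to real hint instructions and emit the .note.gnu.property section; otherwise they expand to nothing and the generated assembly is unchanged.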
diff --git a/Backport-providers-Add-SM4-GCM-implementation.patch b/Backport-providers-Add-SM4-GCM-implementation.patch
new file mode 100644
index 0000000..3e2ee23
--- /dev/null
+++ b/Backport-providers-Add-SM4-GCM-implementation.patch
@@ -0,0 +1,360 @@
+From 2f1c0b5f1b585a307f21a70ef3ae652643c25f6d Mon Sep 17 00:00:00 2001
+From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+Date: Wed, 1 Sep 2021 16:54:15 +0800
+Subject: [PATCH 04/13] providers: Add SM4 GCM implementation
+
+The GCM mode of the SM4 algorithm is specified by RFC 8998.
+
+Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+Reviewed-by: Paul Yang <kaishen.yy@antfin.com>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/16491)
+---
+ providers/defltprov.c | 2 +
+ providers/implementations/ciphers/build.info | 4 +-
+ .../implementations/ciphers/cipher_sm4_ccm.c | 39 +++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_ccm.h | 22 ++++++++++
+ .../ciphers/cipher_sm4_ccm_hw.c | 41 ++++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_gcm.c | 40 +++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_gcm.h | 22 ++++++++++
+ .../ciphers/cipher_sm4_gcm_hw.c | 43 +++++++++++++++++++
+ .../include/prov/implementations.h | 2 +
+ .../implementations/include/prov/names.h | 2 +
+ test/recipes/30-test_evp_data/evpciph_sm4.txt | 20 +++++++++
+ 11 files changed, 236 insertions(+), 1 deletion(-)
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+
+diff --git a/providers/defltprov.c b/providers/defltprov.c
+index ed3f4799e7..cc0b0c3b62 100644
+--- a/providers/defltprov.c
++++ b/providers/defltprov.c
+@@ -289,6 +289,8 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
+ ALG(PROV_NAMES_DES_EDE_CFB, ossl_tdes_ede2_cfb_functions),
+ #endif /* OPENSSL_NO_DES */
+ #ifndef OPENSSL_NO_SM4
++ ALG(PROV_NAMES_SM4_GCM, ossl_sm4128gcm_functions),
++ ALG(PROV_NAMES_SM4_CCM, ossl_sm4128ccm_functions),
+ ALG(PROV_NAMES_SM4_ECB, ossl_sm4128ecb_functions),
+ ALG(PROV_NAMES_SM4_CBC, ossl_sm4128cbc_functions),
+ ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions),
+diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info
+index e4c5f4f051..b5d9d4f6c1 100644
+--- a/providers/implementations/ciphers/build.info
++++ b/providers/implementations/ciphers/build.info
+@@ -105,7 +105,9 @@ ENDIF
+
+ IF[{- !$disabled{sm4} -}]
+ SOURCE[$SM4_GOAL]=\
+- cipher_sm4.c cipher_sm4_hw.c
++ cipher_sm4.c cipher_sm4_hw.c \
++ cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \
++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c
+ ENDIF
+
+ IF[{- !$disabled{ocb} -}]
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.c b/providers/implementations/ciphers/cipher_sm4_ccm.c
+new file mode 100644
+index 0000000000..f0295a5ca2
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm.c
+@@ -0,0 +1,39 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 CCM mode */
++
++#include "cipher_sm4_ccm.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++static OSSL_FUNC_cipher_freectx_fn sm4_ccm_freectx;
++
++static void *sm4_ccm_newctx(void *provctx, size_t keybits)
++{
++ PROV_SM4_CCM_CTX *ctx;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ ctx = OPENSSL_zalloc(sizeof(*ctx));
++ if (ctx != NULL)
++ ossl_ccm_initctx(&ctx->base, keybits, ossl_prov_sm4_hw_ccm(keybits));
++ return ctx;
++}
++
++static void sm4_ccm_freectx(void *vctx)
++{
++ PROV_SM4_CCM_CTX *ctx = (PROV_SM4_CCM_CTX *)vctx;
++
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++/* sm4128ccm functions */
++IMPLEMENT_aead_cipher(sm4, ccm, CCM, AEAD_FLAGS, 128, 8, 96);
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.h b/providers/implementations/ciphers/cipher_sm4_ccm.h
+new file mode 100644
+index 0000000000..189e71e9e4
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm.h
+@@ -0,0 +1,22 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "crypto/sm4.h"
++#include "prov/ciphercommon.h"
++#include "prov/ciphercommon_ccm.h"
++
++typedef struct prov_sm4_ccm_ctx_st {
++ PROV_CCM_CTX base; /* Must be first */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks; /* SM4 key schedule to use */
++} PROV_SM4_CCM_CTX;
++
++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keylen);
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm_hw.c b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+new file mode 100644
+index 0000000000..791daf3e46
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+@@ -0,0 +1,41 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/*-
++ * Generic support for SM4 CCM.
++ */
++
++#include "cipher_sm4_ccm.h"
++
++static int ccm_sm4_initkey(PROV_CCM_CTX *ctx,
++ const unsigned char *key, size_t keylen)
++{
++ PROV_SM4_CCM_CTX *actx = (PROV_SM4_CCM_CTX *)ctx;
++
++ ossl_sm4_set_key(key, &actx->ks.ks);
++ CRYPTO_ccm128_init(&ctx->ccm_ctx, ctx->m, ctx->l, &actx->ks.ks,
++ (block128_f)ossl_sm4_encrypt);
++ ctx->str = NULL;
++ ctx->key_set = 1;
++ return 1;
++}
++
++static const PROV_CCM_HW ccm_sm4 = {
++ ccm_sm4_initkey,
++ ossl_ccm_generic_setiv,
++ ossl_ccm_generic_setaad,
++ ossl_ccm_generic_auth_encrypt,
++ ossl_ccm_generic_auth_decrypt,
++ ossl_ccm_generic_gettag
++};
++
++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keybits)
++{
++ return &ccm_sm4;
++}
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.c b/providers/implementations/ciphers/cipher_sm4_gcm.c
+new file mode 100644
+index 0000000000..7a936f00ee
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm.c
+@@ -0,0 +1,40 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 GCM mode */
++
++#include "cipher_sm4_gcm.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++static OSSL_FUNC_cipher_freectx_fn sm4_gcm_freectx;
++
++static void *sm4_gcm_newctx(void *provctx, size_t keybits)
++{
++ PROV_SM4_GCM_CTX *ctx;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ ctx = OPENSSL_zalloc(sizeof(*ctx));
++ if (ctx != NULL)
++ ossl_gcm_initctx(provctx, &ctx->base, keybits,
++ ossl_prov_sm4_hw_gcm(keybits));
++ return ctx;
++}
++
++static void sm4_gcm_freectx(void *vctx)
++{
++ PROV_SM4_GCM_CTX *ctx = (PROV_SM4_GCM_CTX *)vctx;
++
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++/* ossl_sm4128gcm_functions */
++IMPLEMENT_aead_cipher(sm4, gcm, GCM, AEAD_FLAGS, 128, 8, 96);
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.h b/providers/implementations/ciphers/cipher_sm4_gcm.h
+new file mode 100644
+index 0000000000..2b6b5f3ece
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm.h
+@@ -0,0 +1,22 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "crypto/sm4.h"
++#include "prov/ciphercommon.h"
++#include "prov/ciphercommon_gcm.h"
++
++typedef struct prov_sm4_gcm_ctx_st {
++ PROV_GCM_CTX base; /* must be first entry in struct */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks;
++} PROV_SM4_GCM_CTX;
++
++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits);
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+new file mode 100644
+index 0000000000..6bcd1ec406
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -0,0 +1,43 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/*-
++ * Generic support for SM4 GCM.
++ */
++
++#include "cipher_sm4_gcm.h"
++
++static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
++ size_t keylen)
++{
++ PROV_SM4_GCM_CTX *actx = (PROV_SM4_GCM_CTX *)ctx;
++ SM4_KEY *ks = &actx->ks.ks;
++
++ ctx->ks = ks;
++ ossl_sm4_set_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
++ ctx->ctr = (ctr128_f)NULL;
++ ctx->key_set = 1;
++
++ return 1;
++}
++
++static const PROV_GCM_HW sm4_gcm = {
++ sm4_gcm_initkey,
++ ossl_gcm_setiv,
++ ossl_gcm_aad_update,
++ ossl_gcm_cipher_update,
++ ossl_gcm_cipher_final,
++ ossl_gcm_one_shot
++};
++
++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits)
++{
++ return &sm4_gcm;
++}
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 3f6dd7ee16..498eab4ad4 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -174,6 +174,8 @@ extern const OSSL_DISPATCH ossl_seed128ofb128_functions[];
+ extern const OSSL_DISPATCH ossl_seed128cfb128_functions[];
+ #endif /* OPENSSL_NO_SEED */
+ #ifndef OPENSSL_NO_SM4
++extern const OSSL_DISPATCH ossl_sm4128gcm_functions[];
++extern const OSSL_DISPATCH ossl_sm4128ccm_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ecb_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128cbc_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ctr_functions[];
+diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h
+index e0dbb69a9d..0fac23a850 100644
+--- a/providers/implementations/include/prov/names.h
++++ b/providers/implementations/include/prov/names.h
+@@ -162,6 +162,8 @@
+ #define PROV_NAMES_SM4_CTR "SM4-CTR:1.2.156.10197.1.104.7"
+ #define PROV_NAMES_SM4_OFB "SM4-OFB:SM4-OFB128:1.2.156.10197.1.104.3"
+ #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4"
++#define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8"
++#define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9"
+ #define PROV_NAMES_ChaCha20 "ChaCha20"
+ #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305"
+ #define PROV_NAMES_CAST5_ECB "CAST5-ECB"
+diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+index ec8a45bd3f..9fb16ca15c 100644
+--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt
++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+@@ -36,3 +36,23 @@ Key = 0123456789ABCDEFFEDCBA9876543210
+ IV = 0123456789ABCDEFFEDCBA9876543210
+ Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA
+ Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D
++
++Title = SM4 GCM test vectors from RFC8998
++
++Cipher = SM4-GCM
++Key = 0123456789abcdeffedcba9876543210
++IV = 00001234567800000000abcd
++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2
++Tag = 83de3541e4c2b58177e065a9bf7b62ec
++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa
++Ciphertext = 17f399f08c67d5ee19d0dc9969c4bb7d5fd46fd3756489069157b282bb200735d82710ca5c22f0ccfa7cbf93d496ac15a56834cbcf98c397b4024a2691233b8d
++
++Title = SM4 CCM test vectors from RFC8998
++
++Cipher = SM4-CCM
++Key = 0123456789abcdeffedcba9876543210
++IV = 00001234567800000000abcd
++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2
++Tag = 16842d4fa186f56ab33256971fa110f4
++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa
++Ciphertext = 48af93501fa62adbcd414cce6034d895dda1bf8f132f042098661572e7483094fd12e518ce062c98acee28d95df4416bed31a2f04476c18bb40c84a74b97dc5b
+--
+2.37.3.windows.1
+
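Editor's note: the SM4-GCM and SM4-CCM algorithms registered by the patch above are reached through the ordinary EVP AEAD interface. The following is a minimal sketch, not part of the patch; error handling is deliberately thin and the key/IV/AAD lengths match the RFC 8998 vectors quoted in the test data.

#include <openssl/evp.h>

/* Hedged sketch: one-shot SM4-GCM encryption via the EVP AEAD API.
 * Returns 1 on success. key is 16 bytes, iv is the 12-byte GCM nonce. */
static int sm4_gcm_encrypt(const unsigned char key[16],
                           const unsigned char iv[12],
                           const unsigned char *aad, size_t aadlen,
                           const unsigned char *pt, size_t ptlen,
                           unsigned char *ct, unsigned char tag[16])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "SM4-GCM", NULL);
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int len, ok = 0;

    if (cipher == NULL || ctx == NULL)
        goto done;
    if (EVP_EncryptInit_ex(ctx, cipher, NULL, key, iv) != 1)
        goto done;
    /* Associated data is fed with a NULL output buffer. */
    if (aadlen > 0 && EVP_EncryptUpdate(ctx, NULL, &len, aad, (int)aadlen) != 1)
        goto done;
    if (EVP_EncryptUpdate(ctx, ct, &len, pt, (int)ptlen) != 1)
        goto done;
    if (EVP_EncryptFinal_ex(ctx, ct + len, &len) != 1)
        goto done;
    if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, 16, tag) != 1)
        goto done;
    ok = 1;
done:
    EVP_CIPHER_CTX_free(ctx);
    EVP_CIPHER_free(cipher);
    return ok;
}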
diff --git a/Backport-providers-Add-SM4-XTS-implementation.patch b/Backport-providers-Add-SM4-XTS-implementation.patch
new file mode 100644
index 0000000..5136236
--- /dev/null
+++ b/Backport-providers-Add-SM4-XTS-implementation.patch
@@ -0,0 +1,763 @@
+From 57c854480481bd6b0900984d17db17426c44aa40 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Fri, 25 Nov 2022 13:52:49 +0800
+Subject: [PATCH 08/13] providers: Add SM4 XTS implementation
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19619)
+---
+ crypto/modes/build.info | 2 +-
+ crypto/modes/xts128gb.c | 199 +++++++++++++
+ include/crypto/modes.h | 6 +
+ include/openssl/core_names.h | 1 +
+ providers/defltprov.c | 1 +
+ providers/implementations/ciphers/build.info | 4 +-
+ .../implementations/ciphers/cipher_sm4_xts.c | 281 ++++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_xts.h | 46 +++
+ .../ciphers/cipher_sm4_xts_hw.c | 89 ++++++
+ .../include/prov/implementations.h | 1 +
+ .../implementations/include/prov/names.h | 1 +
+ 11 files changed, 629 insertions(+), 2 deletions(-)
+ create mode 100644 crypto/modes/xts128gb.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts_hw.c
+
+diff --git a/crypto/modes/build.info b/crypto/modes/build.info
+index f3558fa1a4..0ee297ced8 100644
+--- a/crypto/modes/build.info
++++ b/crypto/modes/build.info
+@@ -49,7 +49,7 @@ IF[{- !$disabled{asm} -}]
+ ENDIF
+
+ $COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \
+- wrap128.c $MODESASM
++ wrap128.c xts128gb.c $MODESASM
+ SOURCE[../../libcrypto]=$COMMON \
+ cts128.c ocb128.c siv128.c
+ SOURCE[../../providers/libfips.a]=$COMMON
+diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c
+new file mode 100644
+index 0000000000..021c0597e4
+--- /dev/null
++++ b/crypto/modes/xts128gb.c
+@@ -0,0 +1,199 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include <string.h>
++#include <openssl/crypto.h>
++#include "internal/endian.h"
++#include "crypto/modes.h"
++
++#ifndef STRICT_ALIGNMENT
++# ifdef __GNUC__
++typedef u64 u64_a1 __attribute((__aligned__(1)));
++# else
++typedef u64 u64_a1;
++# endif
++#endif
++
++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
++ const unsigned char iv[16],
++ const unsigned char *inp, unsigned char *out,
++ size_t len, int enc)
++{
++ DECLARE_IS_ENDIAN;
++ union {
++ u64 u[2];
++ u32 d[4];
++ u8 c[16];
++ } tweak, scratch;
++ unsigned int i;
++
++ if (len < 16)
++ return -1;
++
++ memcpy(tweak.c, iv, 16);
++
++ (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
++
++ if (!enc && (len % 16))
++ len -= 16;
++
++ while (len >= 16) {
++#if defined(STRICT_ALIGNMENT)
++ memcpy(scratch.c, inp, 16);
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++#else
++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
++#endif
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++#if defined(STRICT_ALIGNMENT)
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out, scratch.c, 16);
++#else
++ ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
++ ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
++#endif
++ inp += 16;
++ out += 16;
++ len -= 16;
++
++ if (len == 0)
++ return 0;
++
++ if (IS_LITTLE_ENDIAN) {
++ u8 res;
++ u64 hi, lo;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ u8 *p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ res = (u8)lo & 1;
++ tweak.u[0] = (lo >> 1) | (hi << 63);
++ tweak.u[1] = hi >> 1;
++ if (res)
++ tweak.c[15] ^= 0xe1;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ tweak.u[0] = lo;
++ tweak.u[1] = hi;
++ } else {
++ u8 carry, res;
++ carry = 0;
++ for (i = 0; i < 16; ++i) {
++ res = (tweak.c[i] << 7) & 0x80;
++ tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
++ carry = res;
++ }
++ if (res)
++ tweak.c[0] ^= 0xe1;
++ }
++ }
++ if (enc) {
++ for (i = 0; i < len; ++i) {
++ u8 c = inp[i];
++ out[i] = scratch.c[i];
++ scratch.c[i] = c;
++ }
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out - 16, scratch.c, 16);
++ } else {
++ union {
++ u64 u[2];
++ u8 c[16];
++ } tweak1;
++
++ if (IS_LITTLE_ENDIAN) {
++ u8 res;
++ u64 hi, lo;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ u8 *p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ res = (u8)lo & 1;
++ tweak1.u[0] = (lo >> 1) | (hi << 63);
++ tweak1.u[1] = hi >> 1;
++ if (res)
++ tweak1.c[15] ^= 0xe1;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak1.u[0]);
++ lo = BSWAP8(tweak1.u[1]);
++#else
++ p = tweak1.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ tweak1.u[0] = lo;
++ tweak1.u[1] = hi;
++ } else {
++ u8 carry, res;
++ carry = 0;
++ for (i = 0; i < 16; ++i) {
++ res = (tweak.c[i] << 7) & 0x80;
++ tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
++ carry = res;
++ }
++ if (res)
++ tweak1.c[0] ^= 0xe1;
++ }
++#if defined(STRICT_ALIGNMENT)
++ memcpy(scratch.c, inp, 16);
++ scratch.u[0] ^= tweak1.u[0];
++ scratch.u[1] ^= tweak1.u[1];
++#else
++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
++#endif
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++ scratch.u[0] ^= tweak1.u[0];
++ scratch.u[1] ^= tweak1.u[1];
++
++ for (i = 0; i < len; ++i) {
++ u8 c = inp[16 + i];
++ out[16 + i] = scratch.c[i];
++ scratch.c[i] = c;
++ }
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++#if defined(STRICT_ALIGNMENT)
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out, scratch.c, 16);
++#else
++ ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
++ ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
++#endif
++ }
++
++ return 0;
++}
+diff --git a/include/crypto/modes.h b/include/crypto/modes.h
+index 19f9d85959..475b77f925 100644
+--- a/include/crypto/modes.h
++++ b/include/crypto/modes.h
+@@ -148,6 +148,12 @@ struct xts128_context {
+ block128_f block1, block2;
+ };
+
++/* XTS mode for SM4 algorithm specified by GB/T 17964-2021 */
++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
++ const unsigned char iv[16],
++ const unsigned char *inp, unsigned char *out,
++ size_t len, int enc);
++
+ struct ccm128_context {
+ union {
+ u64 u[2];
+diff --git a/include/openssl/core_names.h b/include/openssl/core_names.h
+index 6bed5a8a67..a90971099d 100644
+--- a/include/openssl/core_names.h
++++ b/include/openssl/core_names.h
+@@ -97,6 +97,7 @@ extern "C" {
+ #define OSSL_CIPHER_PARAM_CTS_MODE "cts_mode" /* utf8_string */
+ /* For passing the AlgorithmIdentifier parameter in DER form */
+ #define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS "alg_id_param" /* octet_string */
++#define OSSL_CIPHER_PARAM_XTS_STANDARD "xts_standard" /* utf8_string */
+
+ #define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT \
+ "tls1multi_maxsndfrag" /* uint */
+diff --git a/providers/defltprov.c b/providers/defltprov.c
+index cc0b0c3b62..ab898d3f44 100644
+--- a/providers/defltprov.c
++++ b/providers/defltprov.c
+@@ -296,6 +296,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
+ ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions),
+ ALG(PROV_NAMES_SM4_OFB, ossl_sm4128ofb128_functions),
+ ALG(PROV_NAMES_SM4_CFB, ossl_sm4128cfb128_functions),
++ ALG(PROV_NAMES_SM4_XTS, ossl_sm4128xts_functions),
+ #endif /* OPENSSL_NO_SM4 */
+ #ifndef OPENSSL_NO_CHACHA
+ ALG(PROV_NAMES_ChaCha20, ossl_chacha20_functions),
+diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info
+index b5d9d4f6c1..9f6eacf5e3 100644
+--- a/providers/implementations/ciphers/build.info
++++ b/providers/implementations/ciphers/build.info
+@@ -107,7 +107,9 @@ IF[{- !$disabled{sm4} -}]
+ SOURCE[$SM4_GOAL]=\
+ cipher_sm4.c cipher_sm4_hw.c \
+ cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \
+- cipher_sm4_ccm.c cipher_sm4_ccm_hw.c
++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c \
++ cipher_sm4_xts.c cipher_sm4_xts_hw.c
++
+ ENDIF
+
+ IF[{- !$disabled{ocb} -}]
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
+new file mode 100644
+index 0000000000..3c568d4d18
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts.c
+@@ -0,0 +1,281 @@
++
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 XTS mode */
++
++#include <openssl/proverr.h>
++#include "cipher_sm4_xts.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++#define SM4_XTS_FLAGS PROV_CIPHER_FLAG_CUSTOM_IV
++#define SM4_XTS_IV_BITS 128
++#define SM4_XTS_BLOCK_BITS 8
++
++/* forward declarations */
++static OSSL_FUNC_cipher_encrypt_init_fn sm4_xts_einit;
++static OSSL_FUNC_cipher_decrypt_init_fn sm4_xts_dinit;
++static OSSL_FUNC_cipher_update_fn sm4_xts_stream_update;
++static OSSL_FUNC_cipher_final_fn sm4_xts_stream_final;
++static OSSL_FUNC_cipher_cipher_fn sm4_xts_cipher;
++static OSSL_FUNC_cipher_freectx_fn sm4_xts_freectx;
++static OSSL_FUNC_cipher_dupctx_fn sm4_xts_dupctx;
++static OSSL_FUNC_cipher_set_ctx_params_fn sm4_xts_set_ctx_params;
++static OSSL_FUNC_cipher_settable_ctx_params_fn sm4_xts_settable_ctx_params;
++
++/*-
++ * Provider dispatch functions
++ */
++static int sm4_xts_init(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[], int enc)
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vctx;
++ PROV_CIPHER_CTX *ctx = &xctx->base;
++
++ if (!ossl_prov_is_running())
++ return 0;
++
++ ctx->enc = enc;
++
++ if (iv != NULL) {
++ if (!ossl_cipher_generic_initiv(vctx, iv, ivlen))
++ return 0;
++ }
++ if (key != NULL) {
++ if (keylen != ctx->keylen) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH);
++ return 0;
++ }
++ if (!ctx->hw->init(ctx, key, keylen))
++ return 0;
++ }
++ return sm4_xts_set_ctx_params(xctx, params);
++}
++
++static int sm4_xts_einit(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[])
++{
++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 1);
++}
++
++static int sm4_xts_dinit(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[])
++{
++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 0);
++}
++
++static void *sm4_xts_newctx(void *provctx, unsigned int mode, uint64_t flags,
++ size_t kbits, size_t blkbits, size_t ivbits)
++{
++ PROV_SM4_XTS_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx));
++
++ if (ctx != NULL) {
++ ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits, mode,
++ flags, ossl_prov_cipher_hw_sm4_xts(kbits),
++ NULL);
++ }
++ return ctx;
++}
++
++static void sm4_xts_freectx(void *vctx)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx);
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++static void *sm4_xts_dupctx(void *vctx)
++{
++ PROV_SM4_XTS_CTX *in = (PROV_SM4_XTS_CTX *)vctx;
++ PROV_SM4_XTS_CTX *ret = NULL;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ if (in->xts.key1 != NULL) {
++ if (in->xts.key1 != &in->ks1)
++ return NULL;
++ }
++ if (in->xts.key2 != NULL) {
++ if (in->xts.key2 != &in->ks2)
++ return NULL;
++ }
++ ret = OPENSSL_malloc(sizeof(*ret));
++ if (ret == NULL)
++ return NULL;
++ in->base.hw->copyctx(&ret->base, &in->base);
++ return ret;
++}
++
++static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize, const unsigned char *in, size_t inl)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ if (!ossl_prov_is_running()
++ || ctx->xts.key1 == NULL
++ || ctx->xts.key2 == NULL
++ || !ctx->base.iv_set
++ || out == NULL
++ || in == NULL
++ || inl < SM4_BLOCK_SIZE)
++ return 0;
++
++ /*
++ * Impose a limit of 2^20 blocks per data unit as specified by
++ * IEEE Std 1619-2018. The earlier and obsolete IEEE Std 1619-2007
++ * indicated that this was a SHOULD NOT rather than a MUST NOT.
++ * NIST SP 800-38E mandates the same limit.
++ */
++ if (inl > XTS_MAX_BLOCKS_PER_DATA_UNIT * SM4_BLOCK_SIZE) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_XTS_DATA_UNIT_IS_TOO_LARGE);
++ return 0;
++ }
++ if (ctx->xts_standard) {
++ if (ctx->stream != NULL)
++ (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
++ ctx->base.iv);
++ else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
++ ctx->base.enc))
++ return 0;
++ } else {
++ if (ctx->stream_gb != NULL)
++ (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
++ ctx->base.iv);
++ else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
++ inl, ctx->base.enc))
++ return 0;
++ }
++ *outl = inl;
++ return 1;
++}
++
++static int sm4_xts_stream_update(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize, const unsigned char *in,
++ size_t inl)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ if (outsize < inl) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
++ return 0;
++ }
++
++ if (!sm4_xts_cipher(ctx, out, outl, outsize, in, inl)) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_CIPHER_OPERATION_FAILED);
++ return 0;
++ }
++
++ return 1;
++}
++
++static int sm4_xts_stream_final(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize)
++{
++ if (!ossl_prov_is_running())
++ return 0;
++ *outl = 0;
++ return 1;
++}
++
++static const OSSL_PARAM sm4_xts_known_settable_ctx_params[] = {
++ OSSL_PARAM_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD, NULL, 0),
++ OSSL_PARAM_END
++};
++
++static const OSSL_PARAM *sm4_xts_settable_ctx_params(ossl_unused void *cctx,
++ ossl_unused void *provctx)
++{
++ return sm4_xts_known_settable_ctx_params;
++}
++
++static int sm4_xts_set_ctx_params(void *vxctx, const OSSL_PARAM params[])
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vxctx;
++ const OSSL_PARAM *p;
++
++ if (params == NULL)
++ return 1;
++
++ /*-
++ * Sets the XTS standard to use with SM4-XTS algorithm.
++ *
++ * Must be utf8 string "GB" or "IEEE",
++ * "GB" means the GB/T 17964-2021 standard
++ * "IEEE" means the IEEE Std 1619-2007 standard
++ */
++ p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_XTS_STANDARD);
++
++ if (p != NULL) {
++ const char *xts_standard = NULL;
++
++ if (p->data_type != OSSL_PARAM_UTF8_STRING)
++ return 0;
++
++ if (!OSSL_PARAM_get_utf8_string_ptr(p, &xts_standard)) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER);
++ return 0;
++ }
++ if (OPENSSL_strcasecmp(xts_standard, "GB") == 0) {
++ xctx->xts_standard = 0;
++ } else if (OPENSSL_strcasecmp(xts_standard, "IEEE") == 0) {
++ xctx->xts_standard = 1;
++ } else {
++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
++ return 0;
++ }
++ }
++
++ return 1;
++}
++
++#define IMPLEMENT_cipher(lcmode, UCMODE, kbits, flags) \
++static OSSL_FUNC_cipher_get_params_fn sm4_##kbits##_##lcmode##_get_params; \
++static int sm4_##kbits##_##lcmode##_get_params(OSSL_PARAM params[]) \
++{ \
++ return ossl_cipher_generic_get_params(params, EVP_CIPH_##UCMODE##_MODE, \
++ flags, 2 * kbits, SM4_XTS_BLOCK_BITS,\
++ SM4_XTS_IV_BITS); \
++} \
++static OSSL_FUNC_cipher_newctx_fn sm4_##kbits##_xts_newctx; \
++static void *sm4_##kbits##_xts_newctx(void *provctx) \
++{ \
++ return sm4_xts_newctx(provctx, EVP_CIPH_##UCMODE##_MODE, flags, 2 * kbits, \
++ SM4_XTS_BLOCK_BITS, SM4_XTS_IV_BITS); \
++} \
++const OSSL_DISPATCH ossl_sm4##kbits##xts_functions[] = { \
++ { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))sm4_##kbits##_xts_newctx }, \
++ { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))sm4_xts_einit }, \
++ { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))sm4_xts_dinit }, \
++ { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))sm4_xts_stream_update }, \
++ { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))sm4_xts_stream_final }, \
++ { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))sm4_xts_cipher }, \
++ { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))sm4_xts_freectx }, \
++ { OSSL_FUNC_CIPHER_DUPCTX, (void (*)(void))sm4_xts_dupctx }, \
++ { OSSL_FUNC_CIPHER_GET_PARAMS, \
++ (void (*)(void))sm4_##kbits##_##lcmode##_get_params }, \
++ { OSSL_FUNC_CIPHER_GETTABLE_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_gettable_params }, \
++ { OSSL_FUNC_CIPHER_GET_CTX_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_get_ctx_params }, \
++ { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_gettable_ctx_params }, \
++ { OSSL_FUNC_CIPHER_SET_CTX_PARAMS, \
++ (void (*)(void))sm4_xts_set_ctx_params }, \
++ { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS, \
++ (void (*)(void))sm4_xts_settable_ctx_params }, \
++ { 0, NULL } \
++}
++/* ossl_sm4128xts_functions */
++IMPLEMENT_cipher(xts, XTS, 128, SM4_XTS_FLAGS);
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
+new file mode 100644
+index 0000000000..4c369183e2
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts.h
+@@ -0,0 +1,46 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include <crypto/sm4.h>
++#include "prov/ciphercommon.h"
++#include "crypto/sm4_platform.h"
++
++PROV_CIPHER_FUNC(void, xts_stream,
++ (const unsigned char *in, unsigned char *out, size_t len,
++ const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char iv[16]));
++
++typedef struct prov_sm4_xts_ctx_st {
++ /* Must be first */
++ PROV_CIPHER_CTX base;
++
++ /* SM4 key schedules to use */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks1, ks2;
++
++ /*-
++ * XTS standard to use with SM4-XTS algorithm
++ *
++ * Must be 0 or 1,
++ * 0 for XTS mode specified by GB/T 17964-2021
++ * 1 for XTS mode specified by IEEE Std 1619-2007
++ */
++ int xts_standard;
++
++ XTS128_CONTEXT xts;
++
++ /* Stream function for XTS mode specified by GB/T 17964-2021 */
++ OSSL_xts_stream_fn stream_gb;
++ /* Stream function for XTS mode specified by IEEE Std 1619-2007 */
++ OSSL_xts_stream_fn stream;
++} PROV_SM4_XTS_CTX;
++
++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits);
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+new file mode 100644
+index 0000000000..403eb879b1
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+@@ -0,0 +1,89 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "cipher_sm4_xts.h"
++
++#define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
++ fn_block_enc, fn_block_dec, \
++ fn_stream_enc, fn_stream_dec, \
++ fn_stream_gb_enc, fn_stream_gb_dec) { \
++ size_t bytes = keylen / 2; \
++ \
++ if (ctx->enc) { \
++ fn_set_enc_key(key, &xctx->ks1.ks); \
++ xctx->xts.block1 = (block128_f)fn_block_enc; \
++ } else { \
++ fn_set_dec_key(key, &xctx->ks1.ks); \
++ xctx->xts.block1 = (block128_f)fn_block_dec; \
++ } \
++ fn_set_enc_key(key + bytes, &xctx->ks2.ks); \
++ xctx->xts.block2 = (block128_f)fn_block_enc; \
++ xctx->xts.key1 = &xctx->ks1; \
++ xctx->xts.key2 = &xctx->ks2; \
++ xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
++ xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
++}
++
++static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
++ const unsigned char *key,
++ size_t keylen)
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
++ OSSL_xts_stream_fn stream_enc = NULL;
++ OSSL_xts_stream_fn stream_dec = NULL;
++ OSSL_xts_stream_fn stream_gb_enc = NULL;
++ OSSL_xts_stream_fn stream_gb_dec = NULL;
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
++ HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
++ stream_gb_enc, stream_gb_dec);
++ return 1;
++ } else
++#endif /* HWSM4_CAPABLE */
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
++ vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
++ stream_gb_enc, stream_gb_dec);
++ return 1;
++ } else
++#endif /* VPSM4_CAPABLE */
++ {
++ (void)0;
++ }
++ {
++ XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
++ ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
++ stream_gb_dec);
++ }
++ return 1;
++}
++
++static void cipher_hw_sm4_xts_copyctx(PROV_CIPHER_CTX *dst,
++ const PROV_CIPHER_CTX *src)
++{
++ PROV_SM4_XTS_CTX *sctx = (PROV_SM4_XTS_CTX *)src;
++ PROV_SM4_XTS_CTX *dctx = (PROV_SM4_XTS_CTX *)dst;
++
++ *dctx = *sctx;
++ dctx->xts.key1 = &dctx->ks1.ks;
++ dctx->xts.key2 = &dctx->ks2.ks;
++}
++
++
++static const PROV_CIPHER_HW sm4_generic_xts = {
++ cipher_hw_sm4_xts_generic_initkey,
++ NULL,
++ cipher_hw_sm4_xts_copyctx
++};
++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits)
++{
++ return &sm4_generic_xts;
++}
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 498eab4ad4..cfa32ea3ca 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -181,6 +181,7 @@ extern const OSSL_DISPATCH ossl_sm4128cbc_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ctr_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ofb128_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128cfb128_functions[];
++extern const OSSL_DISPATCH ossl_sm4128xts_functions[];
+ #endif /* OPENSSL_NO_SM4 */
+ #ifndef OPENSSL_NO_RC5
+ extern const OSSL_DISPATCH ossl_rc5128ecb_functions[];
+diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h
+index 0fac23a850..5192f4f471 100644
+--- a/providers/implementations/include/prov/names.h
++++ b/providers/implementations/include/prov/names.h
+@@ -164,6 +164,7 @@
+ #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4"
+ #define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8"
+ #define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9"
++#define PROV_NAMES_SM4_XTS "SM4-XTS:1.2.156.10197.1.104.10"
+ #define PROV_NAMES_ChaCha20 "ChaCha20"
+ #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305"
+ #define PROV_NAMES_CAST5_ECB "CAST5-ECB"
+--
+2.37.3.windows.1
+
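Editor's note: the "xts_standard" context parameter introduced above (OSSL_CIPHER_PARAM_XTS_STANDARD, values "GB" or "IEEE") selects between the GB/T 17964-2021 and IEEE Std 1619-2007 tweak schedules. A hedged sketch of initialising an SM4-XTS context with the GB variant follows; everything except the parameter name and values defined in the patch is illustrative.

#include <openssl/evp.h>
#include <openssl/params.h>
#include <openssl/core_names.h>

/* Hedged sketch: SM4-XTS with the GB/T 17964-2021 tweak schedule.
 * key is 32 bytes (two 128-bit SM4 keys), iv is the 16-byte tweak. */
static int sm4_xts_gb_init(EVP_CIPHER_CTX *ctx,
                           const unsigned char key[32],
                           const unsigned char iv[16])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "SM4-XTS", NULL);
    char standard[] = "GB";            /* "IEEE" selects IEEE Std 1619-2007 */
    OSSL_PARAM params[2];
    int ok = 0;

    if (cipher == NULL)
        return 0;
    params[0] = OSSL_PARAM_construct_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD,
                                                 standard, 0);
    params[1] = OSSL_PARAM_construct_end();
    ok = EVP_EncryptInit_ex2(ctx, cipher, key, iv, params);
    EVP_CIPHER_free(cipher);
    return ok;
}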
diff --git a/Backport-support-decode-SM2-parameters.patch b/Backport-support-decode-SM2-parameters.patch
new file mode 100644
index 0000000..7f4ea20
--- /dev/null
+++ b/Backport-support-decode-SM2-parameters.patch
@@ -0,0 +1,175 @@
+From 08ae9fa627e858b9f8e96e0c6d3cf84422a11d75 Mon Sep 17 00:00:00 2001
+From: K1 <dongbeiouba@gmail.com>
+Date: Tue, 19 Jul 2022 01:18:12 +0800
+Subject: [PATCH] Support decode SM2 parameters
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/18819)
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ apps/ecparam.c | 12 ++++++++++--
+ include/openssl/pem.h | 1 +
+ providers/decoders.inc | 1 +
+ .../implementations/encode_decode/decode_der2key.c | 1 +
+ .../implementations/encode_decode/decode_pem2der.c | 1 +
+ .../implementations/encode_decode/encode_key2text.c | 8 +++++---
+ .../implementations/include/prov/implementations.h | 1 +
+ test/recipes/15-test_ecparam.t | 4 ++++
+ .../15-test_ecparam_data/valid/sm2-explicit.pem | 7 +++++++
+ .../recipes/15-test_ecparam_data/valid/sm2-named.pem | 3 +++
+ 10 files changed, 34 insertions(+), 5 deletions(-)
+ create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+ create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+
+diff --git a/apps/ecparam.c b/apps/ecparam.c
+index 5d66b65569..71f93c4ca5 100644
+--- a/apps/ecparam.c
++++ b/apps/ecparam.c
+@@ -242,9 +242,17 @@ int ecparam_main(int argc, char **argv)
+ goto end;
+ }
+ } else {
+- params_key = load_keyparams(infile, informat, 1, "EC", "EC parameters");
+- if (params_key == NULL || !EVP_PKEY_is_a(params_key, "EC"))
++ params_key = load_keyparams_suppress(infile, informat, 1, "EC",
++ "EC parameters", 1);
++ if (params_key == NULL)
++ params_key = load_keyparams_suppress(infile, informat, 1, "SM2",
++ "SM2 parameters", 1);
++
++ if (params_key == NULL) {
++ BIO_printf(bio_err, "Unable to load parameters from %s\n", infile);
+ goto end;
++ }
++
+ if (point_format
+ && !EVP_PKEY_set_utf8_string_param(
+ params_key, OSSL_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT,
+diff --git a/include/openssl/pem.h b/include/openssl/pem.h
+index ed50f081fa..0446c77019 100644
+--- a/include/openssl/pem.h
++++ b/include/openssl/pem.h
+@@ -57,6 +57,7 @@ extern "C" {
+ # define PEM_STRING_ECPRIVATEKEY "EC PRIVATE KEY"
+ # define PEM_STRING_PARAMETERS "PARAMETERS"
+ # define PEM_STRING_CMS "CMS"
++# define PEM_STRING_SM2PARAMETERS "SM2 PARAMETERS"
+
+ # define PEM_TYPE_ENCRYPTED 10
+ # define PEM_TYPE_MIC_ONLY 20
+diff --git a/providers/decoders.inc b/providers/decoders.inc
+index 2772aad05d..edca39ea36 100644
+--- a/providers/decoders.inc
++++ b/providers/decoders.inc
+@@ -69,6 +69,7 @@ DECODER_w_structure("X448", der, SubjectPublicKeyInfo, x448, yes),
+ # ifndef OPENSSL_NO_SM2
+ DECODER_w_structure("SM2", der, PrivateKeyInfo, sm2, no),
+ DECODER_w_structure("SM2", der, SubjectPublicKeyInfo, sm2, no),
++DECODER_w_structure("SM2", der, type_specific_no_pub, sm2, no),
+ # endif
+ #endif
+ DECODER_w_structure("RSA", der, PrivateKeyInfo, rsa, yes),
+diff --git a/providers/implementations/encode_decode/decode_der2key.c b/providers/implementations/encode_decode/decode_der2key.c
+index ebc2d24833..d4d3731460 100644
+--- a/providers/implementations/encode_decode/decode_der2key.c
++++ b/providers/implementations/encode_decode/decode_der2key.c
+@@ -783,6 +783,7 @@ MAKE_DECODER("ED448", ed448, ecx, SubjectPublicKeyInfo);
+ # ifndef OPENSSL_NO_SM2
+ MAKE_DECODER("SM2", sm2, ec, PrivateKeyInfo);
+ MAKE_DECODER("SM2", sm2, ec, SubjectPublicKeyInfo);
++MAKE_DECODER("SM2", sm2, sm2, type_specific_no_pub);
+ # endif
+ #endif
+ MAKE_DECODER("RSA", rsa, rsa, PrivateKeyInfo);
+diff --git a/providers/implementations/encode_decode/decode_pem2der.c b/providers/implementations/encode_decode/decode_pem2der.c
+index bc937ffb9d..648ecd4584 100644
+--- a/providers/implementations/encode_decode/decode_pem2der.c
++++ b/providers/implementations/encode_decode/decode_pem2der.c
+@@ -119,6 +119,7 @@ static int pem2der_decode(void *vctx, OSSL_CORE_BIO *cin, int selection,
+ { PEM_STRING_DSAPARAMS, OSSL_OBJECT_PKEY, "DSA", "type-specific" },
+ { PEM_STRING_ECPRIVATEKEY, OSSL_OBJECT_PKEY, "EC", "type-specific" },
+ { PEM_STRING_ECPARAMETERS, OSSL_OBJECT_PKEY, "EC", "type-specific" },
++ { PEM_STRING_SM2PARAMETERS, OSSL_OBJECT_PKEY, "SM2", "type-specific" },
+ { PEM_STRING_RSA, OSSL_OBJECT_PKEY, "RSA", "type-specific" },
+ { PEM_STRING_RSA_PUBLIC, OSSL_OBJECT_PKEY, "RSA", "type-specific" },
+
+diff --git a/providers/implementations/encode_decode/encode_key2text.c b/providers/implementations/encode_decode/encode_key2text.c
+index 7d983f5e51..a92e04a89d 100644
+--- a/providers/implementations/encode_decode/encode_key2text.c
++++ b/providers/implementations/encode_decode/encode_key2text.c
+@@ -512,7 +512,8 @@ static int ec_to_text(BIO *out, const void *key, int selection)
+ else if ((selection & OSSL_KEYMGMT_SELECT_PUBLIC_KEY) != 0)
+ type_label = "Public-Key";
+ else if ((selection & OSSL_KEYMGMT_SELECT_DOMAIN_PARAMETERS) != 0)
+- type_label = "EC-Parameters";
++ if (EC_GROUP_get_curve_name(group) != NID_sm2)
++ type_label = "EC-Parameters";
+
+ if ((selection & OSSL_KEYMGMT_SELECT_PRIVATE_KEY) != 0) {
+ const BIGNUM *priv_key = EC_KEY_get0_private_key(ec);
+@@ -538,8 +539,9 @@ static int ec_to_text(BIO *out, const void *key, int selection)
+ goto err;
+ }
+
+- if (BIO_printf(out, "%s: (%d bit)\n", type_label,
+- EC_GROUP_order_bits(group)) <= 0)
++ if (type_label != NULL
++ && BIO_printf(out, "%s: (%d bit)\n", type_label,
++ EC_GROUP_order_bits(group)) <= 0)
+ goto err;
+ if (priv != NULL
+ && !print_labeled_buf(out, "priv:", priv, priv_len))
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 03ce43719e..288808bb6f 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -508,6 +508,7 @@ extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_ed448_decoder_functi
+ #ifndef OPENSSL_NO_SM2
+ extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_sm2_decoder_functions[];
+ extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_sm2_decoder_functions[];
++extern const OSSL_DISPATCH ossl_type_specific_no_pub_der_to_sm2_decoder_functions[];
+ #endif
+
+ extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_rsa_decoder_functions[];
+diff --git a/test/recipes/15-test_ecparam.t b/test/recipes/15-test_ecparam.t
+index 37bf620f35..5dba866378 100644
+--- a/test/recipes/15-test_ecparam.t
++++ b/test/recipes/15-test_ecparam.t
+@@ -25,6 +25,10 @@ my @valid = glob(data_file("valid", "*.pem"));
+ my @noncanon = glob(data_file("noncanon", "*.pem"));
+ my @invalid = glob(data_file("invalid", "*.pem"));
+
++if (disabled("sm2")) {
++ @valid = grep { !/sm2-.*\.pem/} @valid;
++}
++
+ plan tests => 12;
+
+ sub checkload {
+diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+new file mode 100644
+index 0000000000..bd07654ea4
+--- /dev/null
++++ b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+@@ -0,0 +1,7 @@
++-----BEGIN SM2 PARAMETERS-----
++MIHgAgEBMCwGByqGSM49AQECIQD////+/////////////////////wAAAAD/////
++/////zBEBCD////+/////////////////////wAAAAD//////////AQgKOn6np2f
++XjRNWp5Lz2UJp/OXifUVq4+S3by9QU2UDpMEQQQyxK4sHxmBGV+ZBEZqOcmUj+ML
++v/JmC+FxWkWJM0x0x7w3NqL09necWb3O42tpIVPQqYd8xipHQALfMuUhOfCgAiEA
++/////v///////////////3ID32shxgUrU7v0CTnVQSMCAQE=
++-----END SM2 PARAMETERS-----
+diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-named.pem b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+new file mode 100644
+index 0000000000..d6e280f6c2
+--- /dev/null
++++ b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+@@ -0,0 +1,3 @@
++-----BEGIN SM2 PARAMETERS-----
++BggqgRzPVQGCLQ==
++-----END SM2 PARAMETERS-----
+--
+2.33.0
+
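Editor's note: with the "SM2 PARAMETERS" PEM handler and the type_specific_no_pub DER decoder added above, SM2 domain parameters can be loaded through the generic decoder API (or via the modified openssl ecparam command). A hedged sketch follows; the file name is purely illustrative.

#include <openssl/bio.h>
#include <openssl/decoder.h>
#include <openssl/evp.h>

/* Hedged sketch: read SM2 domain parameters from a PEM file such as the
 * sm2-named.pem test data added above. */
static EVP_PKEY *load_sm2_params(const char *path)
{
    EVP_PKEY *pkey = NULL;
    BIO *in = BIO_new_file(path, "r");
    OSSL_DECODER_CTX *dctx =
        OSSL_DECODER_CTX_new_for_pkey(&pkey, "PEM", NULL, "SM2",
                                      EVP_PKEY_KEY_PARAMETERS, NULL, NULL);

    if (in == NULL || dctx == NULL || !OSSL_DECODER_from_bio(dctx, in)) {
        EVP_PKEY_free(pkey);
        pkey = NULL;
    }
    OSSL_DECODER_CTX_free(dctx);
    BIO_free(in);
    return pkey;
}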
diff --git a/Feature-support-SM2-CMS-signature.patch b/Feature-support-SM2-CMS-signature.patch
new file mode 100644
index 0000000..b579537
--- /dev/null
+++ b/Feature-support-SM2-CMS-signature.patch
@@ -0,0 +1,41 @@
+From e7f35b6f10599a574acb3bcca40845eeccfdc63b Mon Sep 17 00:00:00 2001
+From: Huaxin Lu <luhuaxin1@huawei.com>
+Date: Fri, 1 Sep 2023 20:08:46 +0800
+Subject: [PATCH] Support SM2 CMS signature
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ crypto/cms/cms_sd.c | 2 +-
+ crypto/evp/p_lib.c | 3 +++
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/cms/cms_sd.c b/crypto/cms/cms_sd.c
+index 34c021b..093b41c 100644
+--- a/crypto/cms/cms_sd.c
++++ b/crypto/cms/cms_sd.c
+@@ -232,7 +232,7 @@ static int cms_sd_asn1_ctrl(CMS_SignerInfo *si, int cmd)
+ EVP_PKEY *pkey = si->pkey;
+ int i;
+
+- if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC"))
++ if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC") || EVP_PKEY_is_a(pkey, "SM2"))
+ return ossl_cms_ecdsa_dsa_sign(si, cmd);
+ else if (EVP_PKEY_is_a(pkey, "RSA") || EVP_PKEY_is_a(pkey, "RSA-PSS"))
+ return ossl_cms_rsa_sign(si, cmd);
+diff --git a/crypto/evp/p_lib.c b/crypto/evp/p_lib.c
+index f6acb5b..9567bb0 100644
+--- a/crypto/evp/p_lib.c
++++ b/crypto/evp/p_lib.c
+@@ -982,6 +982,9 @@ int EVP_PKEY_type(int type)
+
+ int EVP_PKEY_get_id(const EVP_PKEY *pkey)
+ {
++ if (EVP_PKEY_is_a(pkey, "SM2")) {
++ return EVP_PKEY_SM2;
++ }
+ return pkey->type;
+ }
+
+--
+2.33.0
+
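Editor's note: once SM2 keys are routed to ossl_cms_ecdsa_dsa_sign() by the patch above, a CMS signature can be produced with the usual CMS_sign() call. The sketch below is a hedged illustration only: certificate and key loading are omitted, and it assumes the SM2 key's default digest (SM3) is acceptable for the signer.

#include <openssl/bio.h>
#include <openssl/x509.h>
#include <openssl/cms.h>

/* Hedged sketch: produce a CMS signature over the data in 'in'.
 * signcert must carry an SM2 certificate and pkey the matching private key. */
static int sm2_cms_sign(X509 *signcert, EVP_PKEY *pkey, BIO *in, BIO *out)
{
    CMS_ContentInfo *cms = CMS_sign(signcert, pkey, NULL, in, CMS_BINARY);
    int ok = 0;

    if (cms != NULL && SMIME_write_CMS(out, cms, in, CMS_BINARY))
        ok = 1;
    CMS_ContentInfo_free(cms);
    return ok;
}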
diff --git a/Feature-use-default-id-if-SM2-id-is-not-set.patch b/Feature-use-default-id-if-SM2-id-is-not-set.patch
new file mode 100644
index 0000000..ee6f2d8
--- /dev/null
+++ b/Feature-use-default-id-if-SM2-id-is-not-set.patch
@@ -0,0 +1,59 @@
+From 12f6ee3806c1f04a682b4c31aeb510a2dca602ef Mon Sep 17 00:00:00 2001
+From: Huaxin Lu <luhuaxin1@huawei.com>
+Date: Fri, 1 Sep 2023 20:27:45 +0800
+Subject: [PATCH] use default id if SM2 id is not set
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ crypto/sm2/sm2_sign.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+diff --git a/crypto/sm2/sm2_sign.c b/crypto/sm2/sm2_sign.c
+index ff5be9b..33d3a73 100644
+--- a/crypto/sm2/sm2_sign.c
++++ b/crypto/sm2/sm2_sign.c
+@@ -42,6 +42,8 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ uint8_t *buf = NULL;
+ uint16_t entl = 0;
+ uint8_t e_byte = 0;
++ const uint8_t *f_id = id;
++ size_t f_id_len = id_len;
+
+ hash = EVP_MD_CTX_new();
+ ctx = BN_CTX_new_ex(ossl_ec_key_get_libctx(key));
+@@ -68,15 +70,21 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ goto done;
+ }
+
++ /* if id is not set, use default id */
++ if (f_id == NULL || f_id_len == 0) {
++ f_id = (const uint8_t *)SM2_DEFAULT_USERID;
++ f_id_len = strlen(SM2_DEFAULT_USERID);
++ }
++
+ /* Z = h(ENTL || ID || a || b || xG || yG || xA || yA) */
+
+- if (id_len >= (UINT16_MAX / 8)) {
++ if (f_id_len >= (UINT16_MAX / 8)) {
+ /* too large */
+ ERR_raise(ERR_LIB_SM2, SM2_R_ID_TOO_LARGE);
+ goto done;
+ }
+
+- entl = (uint16_t)(8 * id_len);
++ entl = (uint16_t)(8 * f_id_len);
+
+ e_byte = entl >> 8;
+ if (!EVP_DigestUpdate(hash, &e_byte, 1)) {
+@@ -89,7 +97,7 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ goto done;
+ }
+
+- if (id_len > 0 && !EVP_DigestUpdate(hash, id, id_len)) {
++ if (f_id_len > 0 && !EVP_DigestUpdate(hash, f_id, f_id_len)) {
+ ERR_raise(ERR_LIB_SM2, ERR_R_EVP_LIB);
+ goto done;
+ }
+--
+2.33.0
+
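Editor's note: the patch above makes ossl_sm2_compute_z_digest() fall back to the library default distinguishing ID when the caller never sets one. Callers that want a specific ID still attach it with EVP_PKEY_CTX_set1_id() before the digest-sign operation, as in this hedged sketch (the ID value shown is only an example).

#include <string.h>
#include <openssl/evp.h>

/* Hedged sketch: SM2 signing with an explicitly supplied distinguishing ID.
 * With the patch above, omitting EVP_PKEY_CTX_set1_id() no longer matters:
 * the default user ID is used for the Z digest instead. */
static int sm2_sign_with_id(EVP_PKEY *sm2_key, const char *id,
                            const unsigned char *msg, size_t msglen,
                            unsigned char *sig, size_t *siglen)
{
    EVP_MD_CTX *mctx = EVP_MD_CTX_new();
    EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_from_pkey(NULL, sm2_key, NULL);
    int ok = 0;

    if (mctx == NULL || pctx == NULL)
        goto done;
    if (EVP_PKEY_CTX_set1_id(pctx, id, strlen(id)) != 1)
        goto done;
    EVP_MD_CTX_set_pkey_ctx(mctx, pctx);
    if (EVP_DigestSignInit(mctx, NULL, EVP_sm3(), NULL, sm2_key) != 1)
        goto done;
    ok = EVP_DigestSign(mctx, sig, siglen, msg, msglen) == 1;
done:
    EVP_MD_CTX_free(mctx);
    EVP_PKEY_CTX_free(pctx);
    return ok;
}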
diff --git a/Makefile.certificate b/Makefile.certificate
new file mode 100644
index 0000000..cc88c52
--- /dev/null
+++ b/Makefile.certificate
@@ -0,0 +1,82 @@
+UTF8 := $(shell locale -c LC_CTYPE -k | grep -q charmap.*UTF-8 && echo -utf8)
+DAYS=365
+KEYLEN=2048
+TYPE=rsa:$(KEYLEN)
+EXTRA_FLAGS=
+ifdef SERIAL
+ EXTRA_FLAGS+=-set_serial $(SERIAL)
+endif
+
+.PHONY: usage
+.SUFFIXES: .key .csr .crt .pem
+.PRECIOUS: %.key %.csr %.crt %.pem
+
+usage:
+ @echo "This makefile allows you to create:"
+ @echo " o public/private key pairs"
+ @echo " o SSL certificate signing requests (CSRs)"
+ @echo " o self-signed SSL test certificates"
+ @echo
+ @echo "To create a key pair, run \"make SOMETHING.key\"."
+ @echo "To create a CSR, run \"make SOMETHING.csr\"."
+ @echo "To create a test certificate, run \"make SOMETHING.crt\"."
+ @echo "To create a key and a test certificate in one file, run \"make SOMETHING.pem\"."
+ @echo
+ @echo "To create a key for use with Apache, run \"make genkey\"."
+ @echo "To create a CSR for use with Apache, run \"make certreq\"."
+ @echo "To create a test certificate for use with Apache, run \"make testcert\"."
+ @echo
+ @echo "To create a test certificate with serial number other than random, add SERIAL=num"
+ @echo "You can also specify key length with KEYLEN=n and expiration in days with DAYS=n"
+ @echo "Any additional options can be passed to openssl req via EXTRA_FLAGS"
+ @echo
+ @echo Examples:
+ @echo " make server.key"
+ @echo " make server.csr"
+ @echo " make server.crt"
+ @echo " make stunnel.pem"
+ @echo " make genkey"
+ @echo " make certreq"
+ @echo " make testcert"
+ @echo " make server.crt SERIAL=1"
+ @echo " make stunnel.pem EXTRA_FLAGS=-sha384"
+ @echo " make testcert DAYS=600"
+
+%.pem:
+ umask 77 ; \
+ PEM1=`/bin/mktemp /tmp/openssl.XXXXXX` ; \
+ PEM2=`/bin/mktemp /tmp/openssl.XXXXXX` ; \
+ /usr/bin/openssl req $(UTF8) -newkey $(TYPE) -keyout $$PEM1 -nodes -x509 -days $(DAYS) -out $$PEM2 $(EXTRA_FLAGS) ; \
+ cat $$PEM1 > $@ ; \
+ echo "" >> $@ ; \
+ cat $$PEM2 >> $@ ; \
+ $(RM) $$PEM1 $$PEM2
+
+%.key:
+ umask 77 ; \
+ /usr/bin/openssl genrsa -aes128 $(KEYLEN) > $@
+
+%.csr: %.key
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $^ -out $@
+
+%.crt: %.key
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $^ -x509 -days $(DAYS) -out $@ $(EXTRA_FLAGS)
+
+TLSROOT=/etc/pki/tls
+KEY=$(TLSROOT)/private/localhost.key
+CSR=$(TLSROOT)/certs/localhost.csr
+CRT=$(TLSROOT)/certs/localhost.crt
+
+genkey: $(KEY)
+certreq: $(CSR)
+testcert: $(CRT)
+
+$(CSR): $(KEY)
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $(KEY) -out $(CSR)
+
+$(CRT): $(KEY)
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $(KEY) -x509 -days $(DAYS) -out $(CRT) $(EXTRA_FLAGS)
diff --git a/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
new file mode 100644
index 0000000..afd87ba
--- /dev/null
+++ b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
@@ -0,0 +1,36 @@
+From a8da305fa3dd6e34ba5aab3978281f652fd12883 Mon Sep 17 00:00:00 2001
+From: yangyangtiantianlonglong <yangtianlong1224@163.com>
+Date: Mon, 31 Jul 2023 07:04:41 -0700
+Subject: [PATCH] A null pointer dereference occurs when memory allocation
+ fails
+
+Fixes #21605
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Matthias St. Pierre <Matthias.St.Pierre@ncp-e.com>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21606)
+---
+ ssl/ssl_sess.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/ssl/ssl_sess.c b/ssl/ssl_sess.c
+index cda6b7cc5b..2a5d21be79 100644
+--- a/ssl/ssl_sess.c
++++ b/ssl/ssl_sess.c
+@@ -139,8 +139,11 @@ SSL_SESSION *ssl_session_dup(SSL_SESSION *src, int ticket)
+ dest->references = 1;
+
+ dest->lock = CRYPTO_THREAD_lock_new();
+- if (dest->lock == NULL)
++ if (dest->lock == NULL) {
++ OPENSSL_free(dest);
++ dest = NULL;
+ goto err;
++ }
+
+ if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_SSL_SESSION, dest, &dest->ex_data))
+ goto err;
+--
+2.27.0
+
diff --git a/backport-Add-a-test-for-CVE-2023-3446.patch b/backport-Add-a-test-for-CVE-2023-3446.patch
new file mode 100644
index 0000000..6c5f734
--- /dev/null
+++ b/backport-Add-a-test-for-CVE-2023-3446.patch
@@ -0,0 +1,63 @@
+From 8a62fd996cb1c22383ec75b4155d54dec4a1b0ee Mon Sep 17 00:00:00 2001
+From: Matt Caswell <matt@openssl.org>
+Date: Fri, 7 Jul 2023 14:39:48 +0100
+Subject: [PATCH] Add a test for CVE-2023-3446
+
+Confirm that the only errors DH_check() finds with DH parameters with an
+excessively long modulus is that the modulus is too large. We should not
+be performing time consuming checks using that modulus.
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21451)
+
+(cherry picked from commit ede782b4c8868d1f09c9cd237f82b6f35b7dba8b)
+---
+ test/dhtest.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/test/dhtest.c b/test/dhtest.c
+index 7b587f3cfa..f8dd8f3aa7 100644
+--- a/test/dhtest.c
++++ b/test/dhtest.c
+@@ -73,7 +73,7 @@ static int dh_test(void)
+ goto err1;
+
+ /* check fails, because p is way too small */
+- if (!DH_check(dh, &i))
++ if (!TEST_true(DH_check(dh, &i)))
+ goto err2;
+ i ^= DH_MODULUS_TOO_SMALL;
+ if (!TEST_false(i & DH_CHECK_P_NOT_PRIME)
+@@ -124,6 +124,17 @@ static int dh_test(void)
+ /* We'll have a stale error on the queue from the above test so clear it */
+ ERR_clear_error();
+
++ /* Modulus of size: dh check max modulus bits + 1 */
++ if (!TEST_true(BN_set_word(p, 1))
++ || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)))
++ goto err3;
++
++ /*
++ * We expect no checks at all for an excessively large modulus
++ */
++ if (!TEST_false(DH_check(dh, &i)))
++ goto err3;
++
+ /*
+ * II) key generation
+ */
+@@ -138,7 +149,7 @@ static int dh_test(void)
+ goto err3;
+
+ /* ... and check whether it is valid */
+- if (!DH_check(a, &i))
++ if (!TEST_true(DH_check(a, &i)))
+ goto err3;
+ if (!TEST_false(i & DH_CHECK_P_NOT_PRIME)
+ || !TEST_false(i & DH_CHECK_P_NOT_SAFE_PRIME)
+--
+2.27.0
+
diff --git a/backport-Add-testcases-for-empty-associated-data-entries-with.patch b/backport-Add-testcases-for-empty-associated-data-entries-with.patch
new file mode 100644
index 0000000..74126e7
--- /dev/null
+++ b/backport-Add-testcases-for-empty-associated-data-entries-with.patch
@@ -0,0 +1,66 @@
+From 96318a8d21bed334d78797eca5b32790775d5f05 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 4 Jul 2023 17:50:37 +0200
+Subject: [PATCH] Add testcases for empty associated data entries with AES-SIV
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21384)
+
+(cherry picked from commit 3993bb0c0c87e3ed0ab4274e4688aa814e164cfc)
+---
+ .../30-test_evp_data/evpciph_aes_siv.txt | 31 +++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+diff --git a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
+index a78a49158d..e434f13f41 100644
+--- a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
++++ b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
+@@ -20,6 +20,19 @@ Tag = 85632d07c6e8f37f950acd320a2ecc93
+ Plaintext = 112233445566778899aabbccddee
+ Ciphertext = 40c02b9690c4dc04daef7f6afe5c
+
++Cipher = aes-128-siv
++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
++Tag = f1c5fdeac1f15a26779c1501f9fb7588
++Plaintext = 112233445566778899aabbccddee
++Ciphertext = 27e946c669088ab06da58c5c831c
++
++Cipher = aes-128-siv
++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
++AAD =
++Tag = d1022f5b3664e5a4dfaf90f85be6f28a
++Plaintext = 112233445566778899aabbccddee
++Ciphertext = b66cff6b8eca0b79f083b39a0901
++
+ Cipher = aes-128-siv
+ Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
+ AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
+@@ -29,6 +42,24 @@ Tag = 7bdb6e3b432667eb06f4d14bff2fbd0f
+ Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
+ Ciphertext = cb900f2fddbe404326601965c889bf17dba77ceb094fa663b7a3f748ba8af829ea64ad544a272e9c485b62a3fd5c0d
+
++Cipher = aes-128-siv
++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
++AAD =
++AAD = 09f911029d74e35bd84156c5635688c0
++Tag = 83ce6593a8fa67eb6fcd2819cedfc011
++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
++Ciphertext = 30d937b42f71f71f93fc2d8d702d3eac8dc7651eefcd81120081ff29d626f97f3de17f2969b691c91b69b652bf3a6d
++
++Cipher = aes-128-siv
++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
++AAD =
++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
++AAD = 09f911029d74e35bd84156c5635688c0
++Tag = 77dd4a44f5a6b41302121ee7f378de25
++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
++Ciphertext = 0fcd664c922464c88939d71fad7aefb864e501b0848a07d39201c1067a7288f3dadf0131a823a0bc3d588e8564a5fe
++
+ Cipher = aes-192-siv
+ Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfefffffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ AAD = 101112131415161718191a1b1c1d1e1f2021222324252627
+--
+2.27.0
+
diff --git a/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
new file mode 100644
index 0000000..13ad1a2
--- /dev/null
+++ b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
@@ -0,0 +1,61 @@
+From 9002fd07327a91f35ba6c1307e71fa6fd4409b7f Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 25 Jul 2023 15:22:48 +0200
+Subject: [PATCH] DH_check(): Do not try checking q properties if it is
+ obviously invalid
+
+If |q| >= |p| then the q value is obviously wrong, as q is supposed
+to be a prime divisor of p-1.
+
+Because p is already checked against an upper size limit, this added
+test also ensures that q is not overly large when the subsequent
+tests use that q value.
+
+Otherwise, if q were too large, the additional checks on it, such as
+the primality test, could trigger a DoS through overly long
+computations.
+
+Fixes CVE-2023-3817
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Todd Short <todd.short@me.com>
+(Merged from https://github.com/openssl/openssl/pull/21550)
+
+(cherry picked from commit 1c16253f3c3a8d1e25918c3f404aae6a5b0893de)
+(cherry picked from commit 6a1eb62c29db6cb5eec707f9338aee00f44e26f5)
+---
+ crypto/dh/dh_check.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index aef6f9b1b7..fbe2797569 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -143,7 +143,7 @@ int DH_check(const DH *dh, int *ret)
+ #ifdef FIPS_MODULE
+ return DH_check_params(dh, ret);
+ #else
+- int ok = 0, r;
++ int ok = 0, r, q_good = 0;
+ BN_CTX *ctx = NULL;
+ BIGNUM *t1 = NULL, *t2 = NULL;
+ int nid = DH_get_nid((DH *)dh);
+@@ -172,6 +172,13 @@ int DH_check(const DH *dh, int *ret)
+ goto err;
+
+ if (dh->params.q != NULL) {
++ if (BN_ucmp(dh->params.p, dh->params.q) > 0)
++ q_good = 1;
++ else
++ *ret |= DH_CHECK_INVALID_Q_VALUE;
++ }
++
++ if (q_good) {
+ if (BN_cmp(dh->params.g, BN_value_one()) <= 0)
+ *ret |= DH_NOT_SUITABLE_GENERATOR;
+ else if (BN_cmp(dh->params.g, dh->params.p) >= 0)
+--
+2.27.0
+
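The hardened check above can be exercised from application code much like the dhtest.c change further below does. The following is a minimal sketch, not part of the backport, that uses the deprecated low-level DH API (hence OPENSSL_SUPPRESS_DEPRECATED); the 512-bit parameter size and the compressed error handling are illustrative only.

    #define OPENSSL_SUPPRESS_DEPRECATED
    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/dh.h>

    int main(void)
    {
        DH *dh = DH_new();
        const BIGNUM *p = NULL;
        BIGNUM *q = NULL;
        int flags = 0, rv;

        /* Generate ordinary p/g parameters, then install a bogus q = p + 1 */
        if (dh == NULL || !DH_generate_parameters_ex(dh, 512, DH_GENERATOR_2, NULL))
            return 1;
        DH_get0_pqg(dh, &p, NULL, NULL);
        q = BN_dup(p);
        if (q == NULL || !BN_add_word(q, 1)
                || !DH_set0_pqg(dh, NULL, q, NULL))   /* dh takes ownership of q */
            return 1;

        /* q >= p is reported via the flag word and the costly q checks are skipped */
        rv = DH_check(dh, &flags);
        printf("DH_check() = %d, DH_CHECK_INVALID_Q_VALUE %s\n",
               rv, (flags & DH_CHECK_INVALID_Q_VALUE) ? "set" : "clear");

        DH_free(dh);
        return 0;
    }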
diff --git a/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
new file mode 100644
index 0000000..98b1a0b
--- /dev/null
+++ b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
@@ -0,0 +1,57 @@
+From 00e2f5eea29994d19293ec4e8c8775ba73678598 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 4 Jul 2023 17:30:35 +0200
+Subject: [PATCH] Do not ignore empty associated data with AES-SIV mode
+
+The AES-SIV mode allows multiple associated data items, each
+authenticated separately, and any of them may be of zero length.
+
+The provided implementation ignores such empty associated data,
+which is incorrect with regard to RFC 5297 and is also a security
+issue, because the empty items then remain unauthenticated even
+though an application expects them to be authenticated.
+
+Fixes CVE-2023-2975
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21384)
+
+(cherry picked from commit c426c281cfc23ab182f7d7d7a35229e7db1494d9)
+---
+ .../implementations/ciphers/cipher_aes_siv.c | 18 +++++++++++-------
+ 1 file changed, 11 insertions(+), 7 deletions(-)
+
+diff --git a/providers/implementations/ciphers/cipher_aes_siv.c b/providers/implementations/ciphers/cipher_aes_siv.c
+index 45010b90db..b396c8651a 100644
+--- a/providers/implementations/ciphers/cipher_aes_siv.c
++++ b/providers/implementations/ciphers/cipher_aes_siv.c
+@@ -120,14 +120,18 @@ static int siv_cipher(void *vctx, unsigned char *out, size_t *outl,
+ if (!ossl_prov_is_running())
+ return 0;
+
+- if (inl == 0) {
+- *outl = 0;
+- return 1;
+- }
++ /* Ignore just empty encryption/decryption call and not AAD. */
++ if (out != NULL) {
++ if (inl == 0) {
++ if (outl != NULL)
++ *outl = 0;
++ return 1;
++ }
+
+- if (outsize < inl) {
+- ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
+- return 0;
++ if (outsize < inl) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
++ return 0;
++ }
+ }
+
+ if (ctx->hw->cipher(ctx, out, in, inl) <= 0)
+--
+2.27.0
+
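For context, the way an application feeds several associated data items, including an empty one, into AES-SIV through the EVP interface is sketched below. This is not part of the patch; the key and data values are placeholders and error handling is compressed into one chain. With the fix, the zero-length item contributes to the S2V computation, and therefore to the tag, instead of being skipped.

    #include <stdio.h>
    #include <openssl/evp.h>

    int main(void)
    {
        unsigned char key[32] = {0};                /* AES-128-SIV takes a 32-byte key */
        unsigned char aad[] = "header", empty[1] = {0};
        unsigned char pt[] = "attack at dawn";
        unsigned char ct[sizeof(pt)], tag[16];
        int len = 0, ok = 0;
        EVP_CIPHER *siv = EVP_CIPHER_fetch(NULL, "AES-128-SIV", NULL);
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

        if (siv != NULL && ctx != NULL
                && EVP_EncryptInit_ex(ctx, siv, NULL, key, NULL)
                /* every out == NULL update supplies one separate AAD item */
                && EVP_EncryptUpdate(ctx, NULL, &len, aad, sizeof(aad) - 1)
                && EVP_EncryptUpdate(ctx, NULL, &len, empty, 0)   /* empty AAD item */
                && EVP_EncryptUpdate(ctx, ct, &len, pt, sizeof(pt) - 1)
                && EVP_EncryptFinal_ex(ctx, ct + len, &len)
                && EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, sizeof(tag), tag))
            ok = 1;

        printf("AES-SIV encryption %s\n", ok ? "succeeded" : "failed");
        EVP_CIPHER_CTX_free(ctx);
        EVP_CIPHER_free(siv);
        return ok ? 0 : 1;
    }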
diff --git a/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
new file mode 100644
index 0000000..53ddf3b
--- /dev/null
+++ b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
@@ -0,0 +1,74 @@
+From 1fa20cf2f506113c761777127a38bce5068740eb Mon Sep 17 00:00:00 2001
+From: Matt Caswell <matt@openssl.org>
+Date: Thu, 6 Jul 2023 16:36:35 +0100
+Subject: [PATCH] Fix DH_check() excessive time with over sized modulus
+
+The DH_check() function checks numerous aspects of the key or parameters
+that have been supplied. Some of those checks use the supplied modulus
+value even if it is excessively large.
+
+There is already a maximum DH modulus size (10,000 bits) over which
+OpenSSL will not generate or derive keys. DH_check() will, however, still
+perform various validity tests on such a large modulus. We introduce a
+new maximum (32,768 bits) over which DH_check() simply fails.
+
+An application that calls DH_check() and supplies a key or parameters
+obtained from an untrusted source could be vulnerable to a Denial of
+Service attack.
+
+The function DH_check() is itself called by a number of other OpenSSL
+functions. An application calling any of those other functions may
+similarly be affected. The other functions affected by this are
+DH_check_ex() and EVP_PKEY_param_check().
+
+CVE-2023-3446
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21451)
+
+(cherry picked from commit 9e0094e2aa1b3428a12d5095132f133c078d3c3d)
+---
+ crypto/dh/dh_check.c | 6 ++++++
+ include/openssl/dh.h | 6 +++++-
+ 2 files changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index 0b391910d6..84a926998e 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -152,6 +152,12 @@ int DH_check(const DH *dh, int *ret)
+ if (nid != NID_undef)
+ return 1;
+
++ /* Don't do any checks at all with an excessively large modulus */
++ if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) {
++ ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE);
++ return 0;
++ }
++
+ if (!DH_check_params(dh, ret))
+ return 0;
+
+diff --git a/include/openssl/dh.h b/include/openssl/dh.h
+index b97871eca7..36420f51d8 100644
+--- a/include/openssl/dh.h
++++ b/include/openssl/dh.h
+@@ -89,7 +89,11 @@ int EVP_PKEY_CTX_get0_dh_kdf_ukm(EVP_PKEY_CTX *ctx, unsigned char **ukm);
+ # include <openssl/dherr.h>
+
+ # ifndef OPENSSL_DH_MAX_MODULUS_BITS
+-# define OPENSSL_DH_MAX_MODULUS_BITS 10000
++# define OPENSSL_DH_MAX_MODULUS_BITS 10000
++# endif
++
++# ifndef OPENSSL_DH_CHECK_MAX_MODULUS_BITS
++# define OPENSSL_DH_CHECK_MAX_MODULUS_BITS 32768
+ # endif
+
+ # define OPENSSL_DH_FIPS_MIN_MODULUS_BITS 1024
+--
+2.27.0
+
diff --git a/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
new file mode 100644
index 0000000..91e9417
--- /dev/null
+++ b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
@@ -0,0 +1,39 @@
+From e648db50d9a63f71cab5cb78424c2932d019a744 Mon Sep 17 00:00:00 2001
+From: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Date: Sun, 23 Jul 2023 14:27:54 +0200
+Subject: [PATCH] Make DH_check set some error bits in recently added error
+
+The pre-existing error cases where DH_check returned zero are not
+related to the DH parameters at all; they are only triggered by
+out-of-memory errors, so leaving *ret set to zero is appropriate
+for them. The new error case, however, is triggered by an overly
+large p value, which is a different situation.
+On the other hand, some callers of this function might not handle
+the return value correctly and might rely only on *ret.
+Therefore we set some error bits in *ret as an additional safety
+measure.
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21524)
+
+(cherry picked from commit 81d10e61a4b7d5394d08a718bf7d6bae20e818fc)
+---
+ crypto/dh/dh_check.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index 84a926998e..aef6f9b1b7 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -155,6 +155,7 @@ int DH_check(const DH *dh, int *ret)
+ /* Don't do any checks at all with an excessively large modulus */
+ if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) {
+ ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE);
++ *ret = DH_MODULUS_TOO_LARGE | DH_CHECK_P_NOT_PRIME;
+ return 0;
+ }
+
+--
+2.27.0
+
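Taken together, the two DH_check() patches above change what an application sees when it validates parameters with an oversized modulus: the call now fails outright and also reports the problem through the flag word. A minimal sketch, not part of the patches, using an artificial p one bit over the new limit:

    #define OPENSSL_SUPPRESS_DEPRECATED
    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/dh.h>

    int main(void)
    {
        DH *dh = DH_new();
        BIGNUM *p = BN_new(), *g = BN_new();
        int flags = 0, rv;

        /* p = 2^OPENSSL_DH_CHECK_MAX_MODULUS_BITS, i.e. one bit over the limit */
        if (dh == NULL || p == NULL || g == NULL
                || !BN_set_word(p, 1)
                || !BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)
                || !BN_set_word(g, 2)
                || !DH_set0_pqg(dh, p, NULL, g))    /* dh takes ownership of p and g */
            return 1;

        rv = DH_check(dh, &flags);    /* fails fast: no expensive checks are run */
        printf("DH_check() = %d, DH_MODULUS_TOO_LARGE %s, DH_CHECK_P_NOT_PRIME %s\n",
               rv,
               (flags & DH_MODULUS_TOO_LARGE) ? "set" : "clear",
               (flags & DH_CHECK_P_NOT_PRIME) ? "set" : "clear");

        DH_free(dh);
        return 0;
    }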
diff --git a/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
new file mode 100644
index 0000000..d5d7890
--- /dev/null
+++ b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
@@ -0,0 +1,53 @@
+From 2255f6c74e6c8b702adcf352b04c5d3e6c759745 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 25 Jul 2023 15:23:43 +0200
+Subject: [PATCH] dhtest.c: Add test of DH_check() with q = p + 1
+
+This must fail with DH_CHECK_INVALID_Q_VALUE and
+with DH_CHECK_Q_NOT_PRIME unset.
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Todd Short <todd.short@me.com>
+(Merged from https://github.com/openssl/openssl/pull/21550)
+
+(cherry picked from commit ad5d35572695d7b5748b2bd4fb1afaa189b29e28)
+(cherry picked from commit 1478ffad3f123550ec1014642d5c880dfbe270ef)
+---
+ test/dhtest.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/test/dhtest.c b/test/dhtest.c
+index f8dd8f3aa7..d02b3b7c58 100644
+--- a/test/dhtest.c
++++ b/test/dhtest.c
+@@ -124,6 +124,15 @@ static int dh_test(void)
+ /* We'll have a stale error on the queue from the above test so clear it */
+ ERR_clear_error();
+
++ if (!TEST_ptr(BN_copy(q, p)) || !TEST_true(BN_add(q, q, BN_value_one())))
++ goto err3;
++
++ if (!TEST_true(DH_check(dh, &i)))
++ goto err3;
++ if (!TEST_true(i & DH_CHECK_INVALID_Q_VALUE)
++ || !TEST_false(i & DH_CHECK_Q_NOT_PRIME))
++ goto err3;
++
+ /* Modulus of size: dh check max modulus bits + 1 */
+ if (!TEST_true(BN_set_word(p, 1))
+ || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)))
+@@ -135,6 +144,9 @@ static int dh_test(void)
+ if (!TEST_false(DH_check(dh, &i)))
+ goto err3;
+
++ /* We'll have a stale error on the queue from the above test so clear it */
++ ERR_clear_error();
++
+ /*
+ * II) key generation
+ */
+--
+2.27.0
+
diff --git a/openssl-3.0-build.patch b/openssl-3.0-build.patch
new file mode 100644
index 0000000..83243e1
--- /dev/null
+++ b/openssl-3.0-build.patch
@@ -0,0 +1,38 @@
+From 262bff1615d4461120327c5a9fe904ad1c6ce813 Mon Sep 17 00:00:00 2001
+From: hzero1996 <wangcheng156@huawei.com>
+Date: Sun, 29 Jan 2023 14:53:03 +0800
+Subject: [PATCH] openssl-3.0-build
+
+---
+ Configurations/10-main.conf | 1 +
+ Configurations/unix-Makefile.tmpl | 2 +-
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf
+index b578a3c..1ad81c3 100644
+--- a/Configurations/10-main.conf
++++ b/Configurations/10-main.conf
+@@ -772,6 +772,7 @@ my %targets = (
+ inherit_from => [ "linux-generic64" ],
+ asm_arch => 'aarch64',
+ perlasm_scheme => "linux64",
++ multilib => "64",
+ },
+ "linux-arm64ilp32" => { # https://wiki.linaro.org/Platform/arm64-ilp32
+ inherit_from => [ "linux-generic32" ],
+diff --git a/Configurations/unix-Makefile.tmpl b/Configurations/unix-Makefile.tmpl
+index 110ba06..712a779 100644
+--- a/Configurations/unix-Makefile.tmpl
++++ b/Configurations/unix-Makefile.tmpl
+@@ -611,7 +611,7 @@ install_sw: install_dev install_engines install_modules install_runtime
+
+ uninstall_sw: uninstall_runtime uninstall_modules uninstall_engines uninstall_dev
+
+-install_docs: install_man_docs install_html_docs
++install_docs: install_man_docs
+
+ uninstall_docs: uninstall_man_docs uninstall_html_docs
+ $(RM) -r $(DESTDIR)$(DOCDIR)
+--
+2.27.0
+
diff --git a/openssl.spec b/openssl.spec
new file mode 100644
index 0000000..c51ad86
--- /dev/null
+++ b/openssl.spec
@@ -0,0 +1,94 @@
+%define install_prefix /opt/openssl3
+%define soversion 3
+Name: openssl3
+Epoch: 1
+Version: 3.0.9
+Release: 1
+Summary: Cryptography and SSL/TLS Toolkit
+License: Apache-2.0
+URL: https://www.openssl.org/
+Source0: https://www.openssl.org/source/openssl-%{version}.tar.gz
+Source1: Makefile.certificate
+
+Patch1: openssl-3.0-build.patch
+Patch2: Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
+Patch3: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
+Patch4: Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
+Patch5: Backport-providers-Add-SM4-GCM-implementation.patch
+Patch6: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
+Patch7: Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
+Patch8: Backport-SM4-optimization-for-ARM-by-ASIMD.patch
+Patch9: Backport-providers-Add-SM4-XTS-implementation.patch
+Patch10: Backport-Fix-SM4-CBC-regression-on-Armv8.patch
+Patch11: Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
+Patch12: Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
+Patch13: Backport-SM4-AESE-optimization-for-ARMv8.patch
+Patch14: Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
+Patch15: backport-Add-testcases-for-empty-associated-data-entries-with.patch
+Patch16: backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
+Patch17: backport-Add-a-test-for-CVE-2023-3446.patch
+Patch18: backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
+Patch19: backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
+Patch20: backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
+Patch21: backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
+Patch22: Backport-support-decode-SM2-parameters.patch
+Patch23: Feature-support-SM2-CMS-signature.patch
+Patch24: Feature-use-default-id-if-SM2-id-is-not-set.patch
+Patch25: backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
+
+BuildRequires: gcc gcc-c++ perl make lksctp-tools-devel coreutils util-linux zlib-devel
+Requires: coreutils
+
+%description
+OpenSSL is a robust, commercial-grade, and full-featured toolkit for the
+Transport Layer Security (TLS) and Secure Sockets Layer (SSL) protocols.
+
+%prep
+%autosetup -n openssl-%{version} -p1
+
+%build
+
+sslarch=%{_os}-%{_target_cpu}
+%ifarch i686
+sslarch=linux-elf
+%endif
+%ifarch riscv64
+sslarch=%{_os}64-%{_target_cpu}
+%endif
+
+%ifarch x86_64 aarch64
+sslflags=enable-ec_nistp_64_gcc_128
+%endif
+
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS -Wa,--noexecstack -Wa,--generate-missing-build-notes=yes -DPURIFY $RPM_LD_FLAGS"
+./Configure \
+ --prefix=%{install_prefix} -Wl,-rpath,%{install_prefix}/lib ${sslflags} \
+ zlib enable-camellia enable-seed enable-rfc3779 \
+ enable-cms enable-md2 enable-rc5 ${ktlsopt} enable-fips\
+ no-mdc2 no-ec2m enable-sm2 enable-sm4 enable-buildtest-c++\
+ shared ${sslarch} $RPM_OPT_FLAGS '-DDEVRANDOM="\"/dev/urandom\""' \
+ -Wl,--allow-multiple-definition
+
+
+%make_build all
+
+%install
+# Install OpenSSL.
+#install -d $RPM_BUILD_ROOT{%{_bindir},%{_includedir},%{_libdir},%{_mandir},%{_libdir}/openssl,%{_pkgdocdir}}
+
+%make_install
+
+rm -f %{buildroot}%{install_prefix}{/bin/c_rehash,/ssl/misc/tsget*,/ssl/misc/*.pl}
+
+export QA_RPATHS=$(( 0x0002 ))
+
+%check
+%make_build test
+
+%files
+%license LICENSE.txt
+%{install_prefix}
+
+%changelog
+* Mon Oct 02 2023 Funda Wang <fundawang@yeah.net> - 3.0.9-1
+- Try install into /opt
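Because everything lands under the non-default /opt/openssl3 prefix, consumers have to point both the compiler and the runtime linker at it themselves. Below is a minimal sketch of such a consumer with an assumed compile line; the actual libdir is lib or lib64 depending on the multilib setting patched in above, and the paths simply follow the %{install_prefix} macro.

    /*
     * Hypothetical compile line for this sketch (paths follow %{install_prefix}):
     *   gcc osslver.c -I/opt/openssl3/include -L/opt/openssl3/lib64 \
     *       -Wl,-rpath,/opt/openssl3/lib64 -lcrypto -o osslver
     */
    #include <stdio.h>
    #include <openssl/crypto.h>
    #include <openssl/opensslv.h>

    int main(void)
    {
        /* Header version this was built against vs. the library actually loaded */
        printf("built against: %s\n", OPENSSL_VERSION_TEXT);
        printf("running with : %s\n", OpenSSL_version(OPENSSL_VERSION));
        return 0;
    }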
diff --git a/sources b/sources
new file mode 100644
index 0000000..077b1dd
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+8b2aff668b8ce0da24b9505ebfd26b4d openssl-3.0.9.tar.gz