author     CoprDistGit <infra@openeuler.org>  2023-10-02 03:32:16 +0000
committer  CoprDistGit <infra@openeuler.org>  2023-10-02 03:32:16 +0000
commit     e879981f405f8810d1b0d9c1c77aea3e8be6a469 (patch)
tree       8698c9791c9e77d3be587c5c7ad9d43dce7c6d30
parent     80d0cbc46bb935a925d434060b67c794844558d9 (diff)
-rw-r--r--  .gitignore  1
-rw-r--r--  Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch  74
-rw-r--r--  Backport-Fix-SM4-CBC-regression-on-Armv8.patch  60
-rw-r--r--  Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch  87
-rw-r--r--  Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch  207
-rw-r--r--  Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch  67
-rw-r--r--  Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch  73
-rw-r--r--  Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch  457
-rw-r--r--  Backport-SM4-AESE-optimization-for-ARMv8.patch  2322
-rw-r--r--  Backport-SM4-optimization-for-ARM-by-ASIMD.patch  1334
-rw-r--r--  Backport-SM4-optimization-for-ARM-by-HW-instruction.patch  1228
-rw-r--r--  Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch  1521
-rw-r--r--  Backport-providers-Add-SM4-GCM-implementation.patch  360
-rw-r--r--  Backport-providers-Add-SM4-XTS-implementation.patch  763
-rw-r--r--  Backport-support-decode-SM2-parameters.patch  175
-rw-r--r--  Feature-support-SM2-CMS-signature.patch  41
-rw-r--r--  Feature-use-default-id-if-SM2-id-is-not-set.patch  59
-rw-r--r--  Makefile.certificate  82
-rw-r--r--  backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch  36
-rw-r--r--  backport-Add-a-test-for-CVE-2023-3446.patch  63
-rw-r--r--  backport-Add-testcases-for-empty-associated-data-entries-with.patch  66
-rw-r--r--  backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch  61
-rw-r--r--  backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch  57
-rw-r--r--  backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch  74
-rw-r--r--  backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch  39
-rw-r--r--  backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch  53
-rw-r--r--  openssl-3.0-build.patch  38
-rw-r--r--  openssl.spec  94
-rw-r--r--  sources  1
29 files changed, 9493 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index e69de29..40bd54c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1 @@
+/openssl-3.0.9.tar.gz
diff --git a/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
new file mode 100644
index 0000000..6536ed5
--- /dev/null
+++ b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
@@ -0,0 +1,74 @@
+From 06f13f85ee86cd7fbc546060fbe2d077176b0be4 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Mon, 31 Oct 2022 11:28:15 +0800
+Subject: [PATCH 11/13] Apply SM4 optimization patch to Kunpeng-920
+
+In the ideal scenario, performance can reach up to 2.2X.
+But in single block input or CFB/OFB mode, CBC encryption,
+performance could drop about 50%.
+
+Perf data on Kunpeng-920 2.6GHz hardware, before and after optimization:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-CTR 75318.96k 79089.62k 79736.15k 79934.12k 80325.44k 80068.61k
+SM4-ECB 80211.39k 84998.36k 86472.28k 87024.93k 87144.80k 86862.51k
+SM4-GCM 72156.19k 82012.08k 83848.02k 84322.65k 85103.65k 84896.43k
+SM4-CBC 77956.13k 80638.81k 81976.17k 81606.31k 82078.91k 81750.70k
+SM4-CFB 78078.20k 81054.87k 81841.07k 82396.38k 82203.99k 82236.76k
+SM4-OFB 78282.76k 82074.03k 82765.74k 82989.06k 83200.68k 83487.17k
+
+After:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-CTR 35678.07k 120687.25k 176632.27k 177192.62k 177586.18k 178295.18k
+SM4-ECB 35540.32k 122628.07k 175067.90k 178007.84k 178298.88k 178328.92k
+SM4-GCM 34215.75k 116720.50k 170275.16k 171770.88k 172714.21k 172272.30k
+SM4-CBC 35645.60k 36544.86k 36515.50k 36732.15k 36618.24k 36629.16k
+SM4-CFB 35528.14k 35690.99k 35954.86k 35843.42k 35809.18k 35809.96k
+SM4-OFB 35563.55k 35853.56k 35963.05k 36203.52k 36233.85k 36307.82k
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19547)
+---
+ crypto/arm_arch.h | 4 ++++
+ include/crypto/sm4_platform.h | 3 ++-
+ 2 files changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index 5b5af31d92..c10748e5f8 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -98,9 +98,13 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ */
+
+ # define ARM_CPU_IMP_ARM 0x41
++# define HISI_CPU_IMP 0x48
+
+ # define ARM_CPU_PART_CORTEX_A72 0xD08
+ # define ARM_CPU_PART_N1 0xD0C
++# define ARM_CPU_PART_V1 0xD40
++# define ARM_CPU_PART_N2 0xD49
++# define HISI_CPU_PART_KP920 0xD01
+
+ # define MIDR_PARTNUM_SHIFT 4
+ # define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT)
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 11f9b9d88b..15d8abbcb1 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -20,7 +20,8 @@ static inline int vpsm4_capable(void)
+ {
+ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ }
+ # if defined(VPSM4_ASM)
+ # define VPSM4_CAPABLE vpsm4_capable()
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-CBC-regression-on-Armv8.patch b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch
new file mode 100644
index 0000000..2176932
--- /dev/null
+++ b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch
@@ -0,0 +1,60 @@
+From d7d5490d7201dcfb1f3811ad1bfc57ed9b2c0b77 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Thu, 8 Dec 2022 10:46:27 +0000
+Subject: [PATCH 09/13] Fix SM4-CBC regression on Armv8
+
+Fixes #19858
+
+During decryption, the last ciphertext is not fed to next block
+correctly when the number of input blocks is exactly 4. Fix this
+and add the corresponding test cases.
+
+Thanks xu-yi-zhou for reporting this issue and proposing the fix.
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19872)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 2 +-
+ test/recipes/30-test_evp_data/evpciph_sm4.txt | 12 ++++++++++++
+ 2 files changed, 13 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index 095d9dae64..c842ef61d5 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -880,7 +880,7 @@ $code.=<<___;
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+- st1 {@vtmp[3].16b}, [$ivp]
++ st1 {@data[3].16b}, [$ivp]
+ b 100f
+ 1: // last block
+ subs $blocks,$blocks,#1
+diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+index 9fb16ca15c..e9a98c9898 100644
+--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt
++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+@@ -19,6 +19,18 @@ IV = 0123456789ABCDEFFEDCBA9876543210
+ Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
+ Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3B
+
++Cipher = SM4-CBC
++Key = 0123456789ABCDEFFEDCBA9876543210
++IV = 0123456789ABCDEFFEDCBA9876543210
++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
++Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3BFFF5A4F208092C0901BA02D5772977369915E3FA2356C9F4EB6460ECC457E7f8E3CFA3DEEBFE9883E3A48BCF7C4A11AA3EC9E0D317C5D319BE72A5CDDDEC640C
++
++Cipher = SM4-CBC
++Key = 0123456789ABCDEFFEDCBA9876543210
++IV = 0123456789ABCDEFFEDCBA9876543210
++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210
++Ciphertext = 2677f46b09c122cc975533105bd4a22af6125f7275ce552c3a2bbcf533de8a3bfff5a4f208092c0901ba02d5772977369915e3fa2356c9f4eb6460ecc457e7f8e3cfa3deebfe9883e3a48bcf7c4a11aa3ec9e0d317c5d319be72a5cdddec640c6fc70bfa3ddaafffdd7c09b2774dcb2cec29f0c6f0b6773e985b3e395e924238505a8f120d9ca84de5c3cf7e45f097b14b3a46c5b1068669982a5c1f5f61be291b984f331d44ffb2758f771672448fc957fa1416c446427a41e25d5524a2418b9d96b2f17582f0f1aa9c204c6807f54f7b6833c5f00856659ddabc245936868c
++
+ Cipher = SM4-OFB
+ Key = 0123456789ABCDEFFEDCBA9876543210
+ IV = 0123456789ABCDEFFEDCBA9876543210
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
new file mode 100644
index 0000000..5bfd186
--- /dev/null
+++ b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
@@ -0,0 +1,87 @@
+From 6df7707fb22e8bd1c7d778a2041c1403f9852060 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Fri, 3 Feb 2023 15:59:59 +0800
+Subject: [PATCH 13/13] Fix SM4-XTS build failure on Mac mini M1
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Richard Levitte <levitte@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/20202)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 4 +++-
+ crypto/sm4/asm/vpsm4_ex-armv8.pl | 23 ++++++++++++++++-------
+ 2 files changed, 19 insertions(+), 8 deletions(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index e19de30901..d30e78f3ce 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -524,7 +524,7 @@ sub compute_tweak_vec() {
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+ $code.=<<___;
+- ldr @qtmp[0], =0x01010101010101010101010101010187
++ ldr @qtmp[0], .Lxts_magic
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+@@ -572,6 +572,8 @@ _vpsm4_consts:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+ .Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++.Lxts_magic:
++ .dword 0x0101010101010187,0x0101010101010101
+
+ .size _vpsm4_consts,.-_vpsm4_consts
+ ___
+diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+index 3d094aa535..f2d5b6debf 100644
+--- a/crypto/sm4/asm/vpsm4_ex-armv8.pl
++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+@@ -475,12 +475,12 @@ sub load_sbox () {
+ my $data = shift;
+
+ $code.=<<___;
+- ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00
+- ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00
+- ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923
+- ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300
+- ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b
+- ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
++ ldr $MaskQ, .Lsbox_magic
++ ldr $TAHMatQ, .Lsbox_magic+16
++ ldr $TALMatQ, .Lsbox_magic+32
++ ldr $ATAHMatQ, .Lsbox_magic+48
++ ldr $ATALMatQ, .Lsbox_magic+64
++ ldr $ANDMaskQ, .Lsbox_magic+80
+ ___
+ }
+
+@@ -525,7 +525,7 @@ sub compute_tweak_vec() {
+ my $std = shift;
+ &rbit(@vtmp[2],$src,$std);
+ $code.=<<___;
+- ldr @qtmp[0], =0x01010101010101010101010101010187
++ ldr @qtmp[0], .Lxts_magic
+ shl $des.16b, @vtmp[2].16b, #1
+ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
+ ushr @vtmp[1].16b, @vtmp[1].16b, #7
+@@ -556,6 +556,15 @@ _${prefix}_consts:
+ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
+ .Lshuffles:
+ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++.Lxts_magic:
++ .dword 0x0101010101010187,0x0101010101010101
++.Lsbox_magic:
++ .dword 0x0b0e0104070a0d00,0x0306090c0f020508
++ .dword 0x62185a2042387a00,0x22581a6002783a40
++ .dword 0x15df62a89e54e923,0xc10bb67c4a803df7
++ .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
++ .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc
++ .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
+
+ .size _${prefix}_consts,.-_${prefix}_consts
+ ___
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
new file mode 100644
index 0000000..485fd65
--- /dev/null
+++ b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
@@ -0,0 +1,207 @@
+From b8f24cb95dbe70cbeef08b41f35018141b6ce994 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Thu, 15 Dec 2022 10:21:07 +0800
+Subject: [PATCH 10/13] Fix SM4 test failures on big-endian ARM processors
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Paul Yang <kaishen.yy@antfin.com>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19910)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 52 +++++++++++++++++------------------
+ 1 file changed, 26 insertions(+), 26 deletions(-)
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index c842ef61d5..73797af582 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -45,7 +45,7 @@ sub rev32() {
+
+ if ($src and ("$src" ne "$dst")) {
+ $code.=<<___;
+-#ifndef __ARMEB__
++#ifndef __AARCH64EB__
+ rev32 $dst.16b,$src.16b
+ #else
+ mov $dst.16b,$src.16b
+@@ -53,7 +53,7 @@ $code.=<<___;
+ ___
+ } else {
+ $code.=<<___;
+-#ifndef __ARMEB__
++#ifndef __AARCH64EB__
+ rev32 $dst.16b,$dst.16b
+ #endif
+ ___
+@@ -428,10 +428,10 @@ sub load_sbox () {
+
+ $code.=<<___;
+ adr $ptr,.Lsbox
+- ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
+- ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
+- ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
+- ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
++ ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
++ ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
++ ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
++ ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
+ ___
+ }
+
+@@ -492,9 +492,9 @@ ___
+ &rev32($vkey,$vkey);
+ $code.=<<___;
+ adr $pointer,.Lshuffles
+- ld1 {$vmap.4s},[$pointer]
++ ld1 {$vmap.2d},[$pointer]
+ adr $pointer,.Lfk
+- ld1 {$vfk.4s},[$pointer]
++ ld1 {$vfk.2d},[$pointer]
+ eor $vkey.16b,$vkey.16b,$vfk.16b
+ mov $schedules,#32
+ adr $pointer,.Lck
+@@ -615,7 +615,7 @@ $code.=<<___;
+ .align 5
+ ${prefix}_${dir}crypt:
+ AARCH64_VALID_CALL_TARGET
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ ___
+ &load_sbox();
+ &rev32(@data[0],@data[0]);
+@@ -624,7 +624,7 @@ $code.=<<___;
+ ___
+ &encrypt_1blk(@data[0]);
+ $code.=<<___;
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ ret
+ .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+ ___
+@@ -692,12 +692,12 @@ $code.=<<___;
+ cmp $blocks,#1
+ b.lt 100f
+ b.gt 1f
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ ___
+ &rev32(@data[0],@data[0]);
+ &encrypt_1blk(@data[0]);
+ $code.=<<___;
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ b 100f
+ 1: // process last 2 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
+@@ -798,11 +798,11 @@ ___
+ &rev32($ivec0,$ivec0);
+ &encrypt_1blk($ivec0);
+ $code.=<<___;
+- st1 {$ivec0.16b},[$outp],#16
++ st1 {$ivec0.4s},[$outp],#16
+ b 1b
+ 2:
+ // save back IV
+- st1 {$ivec0.16b},[$ivp]
++ st1 {$ivec0.4s},[$ivp]
+ ret
+
+ .Ldec:
+@@ -834,7 +834,7 @@ ___
+ &transpose(@vtmp,@datax);
+ &transpose(@data,@datax);
+ $code.=<<___;
+- ld1 {$ivec1.16b},[$ivp]
++ ld1 {$ivec1.4s},[$ivp]
+ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
+ // note ivec1 and vtmpx[3] are resuing the same register
+ // care needs to be taken to avoid conflict
+@@ -844,7 +844,7 @@ $code.=<<___;
+ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
+ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
+ // save back IV
+- st1 {$vtmpx[3].16b}, [$ivp]
++ st1 {$vtmpx[3].4s}, [$ivp]
+ eor @data[0].16b,@data[0].16b,$datax[3].16b
+ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
+ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
+@@ -855,7 +855,7 @@ $code.=<<___;
+ b.gt .Lcbc_8_blocks_dec
+ b.eq 100f
+ 1:
+- ld1 {$ivec1.16b},[$ivp]
++ ld1 {$ivec1.4s},[$ivp]
+ .Lcbc_4_blocks_dec:
+ cmp $blocks,#4
+ b.lt 1f
+@@ -880,7 +880,7 @@ $code.=<<___;
+ subs $blocks,$blocks,#4
+ b.gt .Lcbc_4_blocks_dec
+ // save back IV
+- st1 {@data[3].16b}, [$ivp]
++ st1 {@data[3].4s}, [$ivp]
+ b 100f
+ 1: // last block
+ subs $blocks,$blocks,#1
+@@ -888,13 +888,13 @@ $code.=<<___;
+ b.gt 1f
+ ld1 {@data[0].4s},[$inp],#16
+ // save back IV
+- st1 {$data[0].16b}, [$ivp]
++ st1 {$data[0].4s}, [$ivp]
+ ___
+ &rev32(@datax[0],@data[0]);
+ &encrypt_1blk(@datax[0]);
+ $code.=<<___;
+ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
+- st1 {@datax[0].16b},[$outp],#16
++ st1 {@datax[0].4s},[$outp],#16
+ b 100f
+ 1: // last two blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
+@@ -917,7 +917,7 @@ $code.=<<___;
+ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
+ // save back IV
+- st1 {@data[1].16b}, [$ivp]
++ st1 {@data[1].4s}, [$ivp]
+ b 100f
+ 1: // last 3 blocks
+ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
+@@ -937,7 +937,7 @@ $code.=<<___;
+ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
+ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
+ // save back IV
+- st1 {@data[2].16b}, [$ivp]
++ st1 {@data[2].4s}, [$ivp]
+ 100:
+ ldp d10,d11,[sp,#16]
+ ldp d12,d13,[sp,#32]
+@@ -973,9 +973,9 @@ $code.=<<___;
+ ___
+ &encrypt_1blk($ivec);
+ $code.=<<___;
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ ret
+ 1:
+ AARCH64_SIGN_LINK_REGISTER
+@@ -1053,9 +1053,9 @@ $code.=<<___;
+ ___
+ &encrypt_1blk($ivec);
+ $code.=<<___;
+- ld1 {@data[0].16b},[$inp]
++ ld1 {@data[0].4s},[$inp]
+ eor @data[0].16b,@data[0].16b,$ivec.16b
+- st1 {@data[0].16b},[$outp]
++ st1 {@data[0].4s},[$outp]
+ b 100f
+ 1: // last 2 blocks processing
+ dup @data[0].4s,$word0
+--
+2.37.3.windows.1
+
diff --git a/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
new file mode 100644
index 0000000..3ecb59c
--- /dev/null
+++ b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
@@ -0,0 +1,67 @@
+From 8746fff8f096fa35c7157199917100aa7b547d7a Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Tue, 18 Jan 2022 02:58:08 +0000
+Subject: [PATCH 03/13] Fix sm3ss1 translation issue in sm3-armv8.pl
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17542)
+---
+ crypto/sm3/asm/sm3-armv8.pl | 15 +++++++--------
+ 1 file changed, 7 insertions(+), 8 deletions(-)
+
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+index bb71b2eade..f0555fd3f2 100644
+--- a/crypto/sm3/asm/sm3-armv8.pl
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -109,7 +109,7 @@ ___
+
+ $code=<<___;
+ #include "arm_arch.h"
+-.arch armv8.2-a+sm4
++.arch armv8.2-a
+ .text
+ ___
+
+@@ -222,8 +222,8 @@ my %sm3partopcode = (
+ "sm3partw1" => 0xce60C000,
+ "sm3partw2" => 0xce60C400);
+
+-my %sm3sslopcode = (
+- "sm3ssl" => 0xce400000);
++my %sm3ss1opcode = (
++ "sm3ss1" => 0xce400000);
+
+ my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+@@ -241,14 +241,13 @@ sub unsm3part {
+ $mnemonic,$arg;
+ }
+
+-sub unsm3ssl {
++sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+- $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
+- \s*[qv](\d+)/o
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+- $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
++ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+ }
+
+@@ -274,7 +273,7 @@ foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+- s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge;
++ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
+ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
+ print $_,"\n";
+ }
+--
+2.37.3.windows.1
+
diff --git a/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
new file mode 100644
index 0000000..11129d9
--- /dev/null
+++ b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
@@ -0,0 +1,73 @@
+From 98da8a58f964e279decc1bbbe8f07d807de05f7f Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Wed, 2 Mar 2022 12:55:39 +0000
+Subject: [PATCH 06/13] Further acceleration for SM4-GCM on ARM
+
+This patch will allow the SM4-GCM function to leverage the SM4
+high-performance CTR crypto interface already implemented for ARM,
+which is faster than current single block cipher routine used
+for GCM
+
+It does not address the acceleration of GHASH function of GCM,
+which can be a future task, still we can see immediate uplift of
+performance (up to 4X)
+
+Before this patch:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+SM4-GCM 186432.92k 394234.05k 587916.46k 639365.12k 648486.91k 652924.25k
+
+After the patch:
+SM4-GCM 193924.87k 860940.35k 1696083.71k 2302548.31k 2580411.73k 2607398.91k
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17814)
+---
+ .../ciphers/cipher_sm4_gcm_hw.c | 25 ++++++++++++++++++-
+ 1 file changed, 24 insertions(+), 1 deletion(-)
+
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index c0c9b22bd3..b9633f83ed 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -42,11 +42,34 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ return 1;
+ }
+
++static int hw_gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in,
++ size_t len, unsigned char *out)
++{
++ if (ctx->enc) {
++ if (ctx->ctr != NULL) {
++ if (CRYPTO_gcm128_encrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr))
++ return 0;
++ } else {
++ if (CRYPTO_gcm128_encrypt(&ctx->gcm, in, out, len))
++ return 0;
++ }
++ } else {
++ if (ctx->ctr != NULL) {
++ if (CRYPTO_gcm128_decrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr))
++ return 0;
++ } else {
++ if (CRYPTO_gcm128_decrypt(&ctx->gcm, in, out, len))
++ return 0;
++ }
++ }
++ return 1;
++}
++
+ static const PROV_GCM_HW sm4_gcm = {
+ sm4_gcm_initkey,
+ ossl_gcm_setiv,
+ ossl_gcm_aad_update,
+- ossl_gcm_cipher_update,
++ hw_gcm_cipher_update,
+ ossl_gcm_cipher_final,
+ ossl_gcm_one_shot
+ };
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
new file mode 100644
index 0000000..0467d78
--- /dev/null
+++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
@@ -0,0 +1,457 @@
+From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Fri, 24 Dec 2021 08:29:04 +0000
+Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on
+ aarch64
+
+SM3 hardware instruction is optional feature of crypto extension for
+aarch64. This implementation accelerates SM3 via SM3 instructions. For
+the platform not supporting SM3 instruction, the original C
+implementation still works. Thanks to AliBaba for testing and reporting
+the following perf numbers for Yitian710:
+
+Benchmark on T-Head Yitian-710 2.75GHz:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k
+
+After (33% - 74% faster):
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17454)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 2 +
+ crypto/armcap.c | 10 ++
+ crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++
+ crypto/sm3/build.info | 21 ++-
+ crypto/sm3/sm3_local.h | 16 +-
+ 6 files changed, 336 insertions(+), 3 deletions(-)
+ create mode 100644 crypto/sm3/asm/sm3-armv8.pl
+
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 11f0e50279..10d267b7ad 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -96,6 +96,14 @@ _armv8_cpuid_probe:
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+
++.globl _armv8_sm3_probe
++.type _armv8_sm3_probe,%function
++_armv8_sm3_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
++ ret
++.size _armv8_sm3_probe,.-_armv8_sm3_probe
++
+ .globl OPENSSL_cleanse
+ .type OPENSSL_cleanse,%function
+ .align 5
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index a815a5c72b..c8b501f34c 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_PMULL (1<<5)
+ # define ARMV8_SHA512 (1<<6)
+ # define ARMV8_CPUID (1<<7)
++# define ARMV8_RNG (1<<8)
++# define ARMV8_SM3 (1<<9)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index c021330e32..365a48df45 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void);
+ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
++void _armv8_sm3_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA1 (1 << 5)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
++# define HWCAP_CE_SM3 (1 << 18)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void)
+
+ if (hwcap & HWCAP_CPUID)
+ OPENSSL_armcap_P |= ARMV8_CPUID;
++
++ if (hwcap & HWCAP_CE_SM3)
++ OPENSSL_armcap_P |= ARMV8_SM3;
+ # endif
+ }
+ # endif
+@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void)
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+ }
++
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm3_probe();
++ OPENSSL_armcap_P |= ARMV8_SM3;
++ }
+ # endif
+ }
+ # endif
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+new file mode 100644
+index 0000000000..bb71b2eade
+--- /dev/null
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -0,0 +1,282 @@
++#! /usr/bin/env perl
++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++#
++# This module implements support for Armv8 SM3 instructions
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++# Message expanding:
++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
++# Input: s0, s1, s2, s3
++# s0 = w0 | w1 | w2 | w3
++# s1 = w4 | w5 | w6 | w7
++# s2 = w8 | w9 | w10 | w11
++# s3 = w12 | w13 | w14 | w15
++# Output: s4
++sub msg_exp () {
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++$code.=<<___;
++ // s4 = w7 | w8 | w9 | w10
++ ext $s4.16b, $s1.16b, $s2.16b, #12
++ // vtmp1 = w3 | w4 | w5 | w6
++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
++ // vtmp2 = w10 | w11 | w12 | w13
++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
++ sm3partw1 $s4.4s, $s0.4s, $s3.4s
++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
++___
++}
++
++# A round of compresson function
++# Input:
++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
++# vstate0 - vstate1, store digest status(A - H)
++# vconst0 - vconst1, interleaved used to store Tj <<< j
++# vtmp - temporary register
++# vw - for sm3tt1ab, vw = s0 eor s1
++# s0 - for sm3tt2ab, just be s0
++# i, choose wj' or wj from vw
++sub round () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp = shift;
++my $vw = shift;
++my $s0 = shift;
++my $i = shift;
++$code.=<<___;
++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
++ shl $vconst1.4s, $vconst0.4s, #1
++ sri $vconst1.4s, $vconst0.4s, #31
++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
++___
++}
++
++sub qround () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++ if($s4) {
++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
++ }
++$code.=<<___;
++ eor $vtmp1.16b, $s0.16b, $s1.16b
++___
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 0);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 1);
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 2);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 3);
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8.2-a+sm4
++.text
++___
++
++{{{
++my ($pstate,$pdata,$num)=("x0","x1","w2");
++my ($state1,$state2)=("v5","v6");
++my ($sconst1, $sconst2)=("s16","s17");
++my ($vconst1, $vconst2)=("v16","v17");
++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
++my ($bkstate1,$bkstate2)=("v18","v19");
++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
++my ($vtmp1,$vtmp2)=("v22","v23");
++my $constaddr="x8";
++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
++$code.=<<___;
++.globl ossl_hwsm3_block_data_order
++.type ossl_hwsm3_block_data_order,%function
++.align 5
++ossl_hwsm3_block_data_order:
++ AARCH64_VALID_CALL_TARGET
++ // load state
++ ld1 {$state1.4s-$state2.4s}, [$pstate]
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++
++ adr $constaddr, .Tj
++ ldp $sconst1, $sconst2, [$constaddr]
++
++.Loop:
++ // load input
++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
++ sub $num, $num, #1
++
++ mov $bkstate1.16b, $state1.16b
++ mov $bkstate2.16b, $state2.16b
++
++#ifndef __ARMEB__
++ rev32 $s0.16b, $s0.16b
++ rev32 $s1.16b, $s1.16b
++ rev32 $s2.16b, $s2.16b
++ rev32 $s3.16b, $s3.16b
++#endif
++
++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
++___
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++
++$code.=<<___;
++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
++___
++
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1);
++
++$code.=<<___;
++ eor $state1.16b, $state1.16b, $bkstate1.16b
++ eor $state2.16b, $state2.16b, $bkstate2.16b
++
++ // any remained blocks?
++ cbnz $num, .Loop
++
++ // save state
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++ st1 {$state1.4s-$state2.4s}, [$pstate]
++ ret
++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
++
++.align 3
++.Tj:
++.word 0x79cc4519, 0x9d8a7a87
++___
++}}}
++
++#########################################
++my %sm3partopcode = (
++ "sm3partw1" => 0xce60C000,
++ "sm3partw2" => 0xce60C400);
++
++my %sm3sslopcode = (
++ "sm3ssl" => 0xce400000);
++
++my %sm3ttopcode = (
++ "sm3tt1a" => 0xce408000,
++ "sm3tt1b" => 0xce408400,
++ "sm3tt2a" => 0xce408800,
++ "sm3tt2b" => 0xce408C00);
++
++sub unsm3part {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
++ $mnemonic,$arg;
++}
++
++sub unsm3ssl {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
++ \s*[qv](\d+)/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
++ $mnemonic,$arg;
++}
++
++sub unsm3tt {
++ my ($mnemonic,$arg)=@_;
++
++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
++ $mnemonic,$arg;
++}
++
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++
++ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
++ s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge;
++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
+index eca68216f2..2fa54a4a8b 100644
+--- a/crypto/sm3/build.info
++++ b/crypto/sm3/build.info
+@@ -1,5 +1,22 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{sm3} -}]
+- SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
+-ENDIF
+\ No newline at end of file
++ IF[{- !$disabled{asm} -}]
++ $SM3ASM_aarch64=sm3-armv8.S
++ $SM3DEF_aarch64=OPENSSL_SM3_ASM
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate ones, and define the appropriate macros
++ IF[$SM3ASM_{- $target{asm_arch} -}]
++ $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
++ $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
++ ENDIF
++ ENDIF
++
++ SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
++ DEFINE[../../libcrypto]=$SM3DEF
++
++ GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
++ INCLUDE[sm3-armv8.o]=..
++ENDIF
++
+diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
+index 6daeb878a8..ac8a2bf768 100644
+--- a/crypto/sm3/sm3_local.h
++++ b/crypto/sm3/sm3_local.h
+@@ -32,7 +32,21 @@
+ ll=(c)->G; (void)HOST_l2c(ll, (s)); \
+ ll=(c)->H; (void)HOST_l2c(ll, (s)); \
+ } while (0)
+-#define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++
++#if defined(OPENSSL_SM3_ASM)
++# if defined(__aarch64__)
++# include "crypto/arm_arch.h"
++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
++# endif
++#endif
++
++#if defined(HWSM3_CAPABLE)
++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
++ : ossl_sm3_block_data_order)
++#else
++# define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++#endif
+
+ void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+ void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM4-AESE-optimization-for-ARMv8.patch b/Backport-SM4-AESE-optimization-for-ARMv8.patch
new file mode 100644
index 0000000..0866262
--- /dev/null
+++ b/Backport-SM4-AESE-optimization-for-ARMv8.patch
@@ -0,0 +1,2322 @@
+From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Wed, 18 Jan 2023 09:55:02 +0800
+Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19914)
+---
+ crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++
+ crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++
+ crypto/sm4/build.info | 4 +-
+ include/crypto/sm4_platform.h | 41 +-
+ .../implementations/ciphers/cipher_sm4_hw.c | 26 +-
+ .../implementations/ciphers/cipher_sm4_xts.c | 4 +-
+ .../implementations/ciphers/cipher_sm4_xts.h | 2 +-
+ .../ciphers/cipher_sm4_xts_hw.c | 33 +-
+ 8 files changed, 2090 insertions(+), 22 deletions(-)
+ create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl
+
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+index 73797af582..e19de30901 100755
+--- a/crypto/sm4/asm/vpsm4-armv8.pl
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+
+ $prefix="vpsm4";
+ my @vtmp=map("v$_",(0..3));
++my @qtmp=map("q$_",(0..3));
+ my @data=map("v$_",(4..7));
+ my @datax=map("v$_",(8..11));
+ my ($rk0,$rk1)=("v12","v13");
+@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15));
+ my @sbox=map("v$_",(16..31));
+ my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
+ my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($xtmp1,$xtmp2)=("x8","x9");
+ my ($ptr,$counter)=("x10","w11");
+ my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
+
+@@ -60,6 +62,51 @@ ___
+ }
+ }
+
++sub rev32_armeb() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rbit() {
++ my $dst = shift;
++ my $src = shift;
++ my $std = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ } else {
++$code.=<<___;
++ mov $dst.16b,$src.16b
++___
++ }
++ } else {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ }
++ }
++}
++
+ sub transpose() {
+ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
+
+@@ -435,6 +482,58 @@ $code.=<<___;
+ ___
+ }
+
++
++sub mov_reg_to_vec() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $desv = shift;
++$code.=<<___;
++ mov $desv.d[0],$src0
++ mov $desv.d[1],$src1
++___
++ &rev32_armeb($desv,$desv);
++}
++
++sub mov_vec_to_reg() {
++ my $srcv = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $des0,$srcv.d[0]
++ mov $des1,$srcv.d[1]
++___
++}
++
++sub compute_tweak() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $wtmp0,0x87
++ extr $xtmp2,$src1,$src1,#32
++ extr $des1,$src1,$src0,#63
++ and $wtmp1,$wtmp0,$wtmp2,asr#31
++ eor $des0,$xtmp1,$src0,lsl#1
++___
++}
++
++sub compute_tweak_vec() {
++ my $src = shift;
++ my $des = shift;
++ my $std = shift;
++ &rbit(@vtmp[2],$src,$std);
++$code.=<<___;
++ ldr @qtmp[0], =0x01010101010101010101010101010187
++ shl $des.16b, @vtmp[2].16b, #1
++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
++ ushr @vtmp[1].16b, @vtmp[1].16b, #7
++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
++ eor $des.16b, $des.16b, @vtmp[1].16b
++___
++ &rbit($des,$des,$std);
++}
++
+ $code=<<___;
+ #include "arm_arch.h"
+ .arch armv8-a
+@@ -1101,6 +1200,365 @@ $code.=<<___;
+ .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+ ___
+ }}}
++
++{{{
++my ($blocks,$len)=("x2","x2");
++my $ivp=("x5");
++my @twx=map("x$_",(12..27));
++my ($rks1,$rks2)=("x26","x27");
++my $lastBlk=("x26");
++my $enc=("w28");
++my $remain=("x29");
++
++my @tweak=@datax;
++
++sub gen_xts_cipher() {
++ my $std = shift;
++$code.=<<___;
++.globl ${prefix}_xts_encrypt${std}
++.type ${prefix}_xts_encrypt${std},%function
++.align 5
++${prefix}_xts_encrypt${std}:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x15, x16, [sp, #-0x10]!
++ stp x17, x18, [sp, #-0x10]!
++ stp x19, x20, [sp, #-0x10]!
++ stp x21, x22, [sp, #-0x10]!
++ stp x23, x24, [sp, #-0x10]!
++ stp x25, x26, [sp, #-0x10]!
++ stp x27, x28, [sp, #-0x10]!
++ stp x29, x30, [sp, #-0x10]!
++ stp d8, d9, [sp, #-0x10]!
++ stp d10, d11, [sp, #-0x10]!
++ stp d12, d13, [sp, #-0x10]!
++ stp d14, d15, [sp, #-0x10]!
++ mov $rks1,x3
++ mov $rks2,x4
++ mov $enc,w6
++ ld1 {@tweak[0].4s}, [$ivp]
++ mov $rks,$rks2
++___
++ &load_sbox();
++ &rev32(@tweak[0],@tweak[0]);
++ &encrypt_1blk(@tweak[0]);
++$code.=<<___;
++ mov $rks,$rks1
++ and $remain,$len,#0x0F
++ // convert length into blocks
++ lsr $blocks,$len,4
++ cmp $blocks,#1
++ b.lt .return${std}
++
++ cmp $remain,0
++ // If the encryption/decryption Length is N times of 16,
++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ b.eq .xts_encrypt_blocks${std}
++
++ // If the encryption/decryption length is not N times of 16,
++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ subs $blocks,$blocks,#1
++ b.eq .only_2blks_tweak${std}
++.xts_encrypt_blocks${std}:
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++.Lxts_8_blocks_process${std}:
++ cmp $blocks,#8
++ b.lt .Lxts_4_blocks_process${std}
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
++$code.=<<___;
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@vtmp[0],@vtmp[0],$std);
++ &rbit(@vtmp[1],@vtmp[1],$std);
++ &rbit(@vtmp[2],@vtmp[2],$std);
++ &rbit(@vtmp[3],@vtmp[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @vtmp[0].16b
++ eor @data[1].16b, @data[1].16b, @vtmp[1].16b
++ eor @data[2].16b, @data[2].16b, @vtmp[2].16b
++ eor @data[3].16b, @data[3].16b, @vtmp[3].16b
++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rbit(@vtmpx[0],@vtmpx[0],$std);
++ &rbit(@vtmpx[1],@vtmpx[1],$std);
++ &rbit(@vtmpx[2],@vtmpx[2],$std);
++ &rbit(@vtmpx[3],@vtmpx[3],$std);
++$code.=<<___;
++ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
++ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
++ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
++ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++ &transpose(@data,@vtmp);
++ &transpose(@datax,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++
++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++
++ // save the last tweak
++ st1 {@tweak[3].4s},[$ivp]
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lxts_8_blocks_process${std}
++ b 100f
++.Lxts_4_blocks_process${std}:
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
++$code.=<<___;
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++___
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
++$code.=<<___;
++ // save the last tweak
++ st1 {@tweak[3].4s},[$ivp]
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ st1 {@data[0].4s},[$outp],#16
++ // save the last tweak
++ st1 {@tweak[0].4s},[$ivp]
++ b 100f
++1: // process last 2 blocks
++ cmp $blocks,#2
++ b.gt 1f
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save the last tweak
++ st1 {@tweak[1].4s},[$ivp]
++ b 100f
++1: // process last 3 blocks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save the last tweak
++ st1 {@tweak[2].4s},[$ivp]
++100:
++ cmp $remain,0
++ b.eq .return${std}
++
++// This brance calculates the last two tweaks,
++// while the encryption/decryption length is larger than 32
++.last_2blks_tweak${std}:
++ ld1 {@tweak[0].4s},[$ivp]
++___
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &compute_tweak_vec(@tweak[0],@tweak[1],$std);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// This brance calculates the last two tweaks,
++// while the encryption/decryption length is equal to 32, who only need two tweaks
++.only_2blks_tweak${std}:
++ mov @tweak[1].16b,@tweak[0].16b
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${std}:
++ // encryption:1 decryption:0
++ cmp $enc,1
++ b.eq .prcess_last_2blks${std}
++ mov @vtmp[0].16B,@tweak[1].16b
++ mov @tweak[1].16B,@tweak[2].16b
++ mov @tweak[2].16B,@vtmp[0].16b
++
++.prcess_last_2blks${std}:
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp],#16
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++ st1 {@data[0].4s},[$outp],#16
++
++ sub $lastBlk,$outp,16
++ .loop${std}:
++ subs $remain,$remain,1
++ ldrb $wtmp0,[$lastBlk,$remain]
++ ldrb $wtmp1,[$inp,$remain]
++ strb $wtmp1,[$lastBlk,$remain]
++ strb $wtmp0,[$outp,$remain]
++ b.gt .loop${std}
++ ld1 {@data[0].4s}, [$lastBlk]
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++ st1 {@data[0].4s}, [$lastBlk]
++.return${std}:
++ ldp d14, d15, [sp], #0x10
++ ldp d12, d13, [sp], #0x10
++ ldp d10, d11, [sp], #0x10
++ ldp d8, d9, [sp], #0x10
++ ldp x29, x30, [sp], #0x10
++ ldp x27, x28, [sp], #0x10
++ ldp x25, x26, [sp], #0x10
++ ldp x23, x24, [sp], #0x10
++ ldp x21, x22, [sp], #0x10
++ ldp x19, x20, [sp], #0x10
++ ldp x17, x18, [sp], #0x10
++ ldp x15, x16, [sp], #0x10
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
++___
++} # end of gen_xts_cipher
++&gen_xts_cipher("_gb");
++&gen_xts_cipher("");
++}}}
+ ########################################
+ open SELF,$0;
+ while(<SELF>) {
+diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+new file mode 100644
+index 0000000000..3d094aa535
+--- /dev/null
++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl
+@@ -0,0 +1,1544 @@
++#! /usr/bin/env perl
++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 with ASIMD and AESE on AARCH64
++#
++# Dec 2022
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="vpsm4_ex";
++my @vtmp=map("v$_",(0..3));
++my @qtmp=map("q$_",(0..3));
++my @data=map("v$_",(4..7));
++my @datax=map("v$_",(8..11));
++my ($rk0,$rk1)=("v12","v13");
++my ($rka,$rkb)=("v14","v15");
++my @vtmpx=map("v$_",(12..15));
++my ($vtmp4,$vtmp5)=("v24","v25");
++my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
++my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
++
++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($xtmp1,$xtmp2)=("x8","x9");
++my ($ptr,$counter)=("x10","w11");
++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
++
++sub rev32() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifndef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifndef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rev32_armeb() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifdef __AARCH64EB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub rbit() {
++ my $dst = shift;
++ my $src = shift;
++ my $std = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ } else {
++$code.=<<___;
++ mov $dst.16b,$src.16b
++___
++ }
++ } else {
++ if ($std eq "_gb") {
++$code.=<<___;
++ rbit $dst.16b,$src.16b
++___
++ }
++ }
++}
++
++sub transpose() {
++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
++
++$code.=<<___;
++ zip1 $vt0.4s,$dat0.4s,$dat1.4s
++ zip2 $vt1.4s,$dat0.4s,$dat1.4s
++ zip1 $vt2.4s,$dat2.4s,$dat3.4s
++ zip2 $vt3.4s,$dat2.4s,$dat3.4s
++ zip1 $dat0.2d,$vt0.2d,$vt2.2d
++ zip2 $dat1.2d,$vt0.2d,$vt2.2d
++ zip1 $dat2.2d,$vt1.2d,$vt3.2d
++ zip2 $dat3.2d,$vt1.2d,$vt3.2d
++___
++}
++
++# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
++sub mul_matrix() {
++ my $x = shift;
++ my $higherMat = shift;
++ my $lowerMat = shift;
++ my $tmp = shift;
++$code.=<<___;
++ ushr $tmp.16b, $x.16b, 4
++ and $x.16b, $x.16b, $ANDMaskV.16b
++ tbl $x.16b, {$lowerMat.16b}, $x.16b
++ tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
++ eor $x.16b, $x.16b, $tmp.16b
++___
++}
++
++# sbox operations for 4-lane of words
++# sbox operation for 4-lane of words
++sub sbox() {
++ my $dat = shift;
++
++$code.=<<___;
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
++$code.=<<___;
++ mov $dat.16b,@vtmp[0].16b
++
++ // linear transformation
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ ushr @vtmp[1].4s,$dat.4s,32-10
++ ushr @vtmp[2].4s,$dat.4s,32-18
++ ushr @vtmp[3].4s,$dat.4s,32-24
++ sli @vtmp[0].4s,$dat.4s,2
++ sli @vtmp[1].4s,$dat.4s,10
++ sli @vtmp[2].4s,$dat.4s,18
++ sli @vtmp[3].4s,$dat.4s,24
++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
++ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $dat.16b,$dat.16b,$vtmp4.16b
++___
++}
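In the linear-transformation tail of sbox() above, each ushr #(32-n) / sli #n pair rotates every 32-bit lane left by n, so the sequence evaluates the SM4 linear transform L(B) = B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24); the scalar path in sbox_1word() computes the same thing with eor ... ror #(32-n). A minimal standalone C sketch of that transform:

```c
#include <stdint.h>
#include <stdio.h>

/* rol32 is what one ushr #(32-n) / sli #n pair implements per 32-bit lane */
static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* the SM4 encryption linear transform L */
static uint32_t sm4_L(uint32_t b)
{
    return b ^ rol32(b, 2) ^ rol32(b, 10) ^ rol32(b, 18) ^ rol32(b, 24);
}

int main(void)
{
    printf("L(0x01234567) = 0x%08x\n", sm4_L(0x01234567u));
    return 0;
}
```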
++
++# sbox operation for 8-lane of words
++sub sbox_double() {
++ my $dat = shift;
++ my $datx = shift;
++
++$code.=<<___;
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
++ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
++ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
++$code.=<<___;
++ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
++ aese @vtmp[0].16b,$vtmp5.16b
++ aese @vtmp[1].16b,$vtmp5.16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
++ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
++$code.=<<___;
++ mov $dat.16b,@vtmp[0].16b
++ mov $datx.16b,@vtmp[1].16b
++
++ // linear transformation
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ ushr $vtmp5.4s,$datx.4s,32-2
++ ushr @vtmp[1].4s,$dat.4s,32-10
++ ushr @vtmp[2].4s,$dat.4s,32-18
++ ushr @vtmp[3].4s,$dat.4s,32-24
++ sli @vtmp[0].4s,$dat.4s,2
++ sli $vtmp5.4s,$datx.4s,2
++ sli @vtmp[1].4s,$dat.4s,10
++ sli @vtmp[2].4s,$dat.4s,18
++ sli @vtmp[3].4s,$dat.4s,24
++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $dat.16b,$dat.16b,$vtmp4.16b
++ ushr @vtmp[1].4s,$datx.4s,32-10
++ ushr @vtmp[2].4s,$datx.4s,32-18
++ ushr @vtmp[3].4s,$datx.4s,32-24
++ sli @vtmp[1].4s,$datx.4s,10
++ sli @vtmp[2].4s,$datx.4s,18
++ sli @vtmp[3].4s,$datx.4s,24
++ eor $vtmp4.16b,$vtmp5.16b,$datx.16b
++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
++ eor $datx.16b,$datx.16b,$vtmp4.16b
++___
++}
++
++# sbox operation for one single word
++sub sbox_1word () {
++ my $word = shift;
++
++$code.=<<___;
++ mov @vtmp[3].s[0],$word
++ // optimize sbox using AESE instruction
++ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
++$code.=<<___;
++
++ mov $wtmp0,@vtmp[0].s[0]
++ eor $word,$wtmp0,$wtmp0,ror #32-2
++ eor $word,$word,$wtmp0,ror #32-10
++ eor $word,$word,$wtmp0,ror #32-18
++ eor $word,$word,$wtmp0,ror #32-24
++___
++}
++
++# sm4 for one block of data, in scalar registers word0/word1/word2/word3
++sub sm4_1blk () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$wtmp0,$word1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word0,$word0,$tmpw
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$word0,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor $word1,$word1,$tmpw
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$wtmp0,$word3
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word2,$word2,$tmpw
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$word2,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word3,$word3,$tmpw
++___
++}
++
++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
++sub sm4_4blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rk0.16b,@data[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk1.16b
++
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rk0.16b,@data[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk1.16b
++___
++}
++
++# sm4 for 8 lanes of data, in neon registers
++# data0/data1/data2/data3 datax0/datax1/datax2/datax3
++sub sm4_8blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rkb.16b,@datax[2].16b,@datax[3].16b
++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++ eor @datax[0].16b,@datax[0].16b,$rk1.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rkb.16b,$rkb.16b,@datax[0].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk0.16b
++ eor @datax[1].16b,@datax[1].16b,$rk1.16b
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rkb.16b,@datax[0].16b,@datax[1].16b
++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++ eor @datax[2].16b,@datax[2].16b,$rk1.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rkb.16b,$rkb.16b,@datax[2].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk0.16b
++ eor @datax[3].16b,@datax[3].16b,$rk1.16b
++___
++}
++
++sub encrypt_1blk_norev() {
++ my $dat = shift;
++
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++ mov $word0,$dat.s[0]
++ mov $word1,$dat.s[1]
++ mov $word2,$dat.s[2]
++ mov $word3,$dat.s[3]
++10:
++___
++ &sm4_1blk($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++ mov $dat.s[0],$word3
++ mov $dat.s[1],$word2
++ mov $dat.s[2],$word1
++ mov $dat.s[3],$word0
++___
++}
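encrypt_1blk_norev() above runs sm4_1blk() eight times for the 32 SM4 rounds and then reverses the word order, which is the standard SM4 block structure. The C sketch below shows only that control flow; the S-box is replaced by an identity placeholder (hypothetical), so it illustrates the data flow rather than real SM4.

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* placeholder: NOT the real SM4 S-box, identity just to keep the sketch small */
static uint32_t placeholder_sbox(uint32_t x)
{
    return x;
}

/* round function T = L(S(x)) as used for encryption/decryption */
static uint32_t T(uint32_t x)
{
    x = placeholder_sbox(x);
    return x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
}

static void sm4_like_encrypt_block(uint32_t b[4], const uint32_t rk[32])
{
    /* four sub-rounds per iteration, exactly as sm4_1blk() unrolls them */
    for (int i = 0; i < 32; i += 4) {
        b[0] ^= T(b[1] ^ b[2] ^ b[3] ^ rk[i + 0]);
        b[1] ^= T(b[0] ^ b[2] ^ b[3] ^ rk[i + 1]);
        b[2] ^= T(b[0] ^ b[1] ^ b[3] ^ rk[i + 2]);
        b[3] ^= T(b[0] ^ b[1] ^ b[2] ^ rk[i + 3]);
    }
    /* final word reversal, as done after the loop in encrypt_1blk_norev() */
    uint32_t t;
    t = b[0]; b[0] = b[3]; b[3] = t;
    t = b[1]; b[1] = b[2]; b[2] = t;
}

int main(void)
{
    uint32_t rk[32] = {0}, b[4] = {1, 2, 3, 4};

    sm4_like_encrypt_block(b, rk);
    printf("%08x %08x %08x %08x\n", b[0], b[1], b[2], b[3]);
    return 0;
}
```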
++
++sub encrypt_1blk() {
++ my $dat = shift;
++
++ &encrypt_1blk_norev($dat);
++ &rev32($dat,$dat);
++}
++
++sub encrypt_4blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_4blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++}
++
++sub encrypt_8blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_8blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++ &rev32(@data[3],@datax[0]);
++ &rev32(@data[2],@datax[1]);
++ &rev32(@data[1],@datax[2]);
++ &rev32(@data[0],@datax[3]);
++}
++
++sub load_sbox () {
++ my $data = shift;
++
++$code.=<<___;
++ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00
++ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00
++ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923
++ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300
++ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b
++ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f
++___
++}
++
++sub mov_reg_to_vec() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $desv = shift;
++$code.=<<___;
++ mov $desv.d[0],$src0
++ mov $desv.d[1],$src1
++___
++ &rev32_armeb($desv,$desv);
++}
++
++sub mov_vec_to_reg() {
++ my $srcv = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $des0,$srcv.d[0]
++ mov $des1,$srcv.d[1]
++___
++}
++
++sub compute_tweak() {
++ my $src0 = shift;
++ my $src1 = shift;
++ my $des0 = shift;
++ my $des1 = shift;
++$code.=<<___;
++ mov $wtmp0,0x87
++ extr $xtmp2,$src1,$src1,#32
++ extr $des1,$src1,$src0,#63
++ and $wtmp1,$wtmp0,$wtmp2,asr#31
++ eor $des0,$xtmp1,$src0,lsl#1
++___
++}
++
++sub compute_tweak_vec() {
++ my $src = shift;
++ my $des = shift;
++ my $std = shift;
++ &rbit(@vtmp[2],$src,$std);
++$code.=<<___;
++ ldr @qtmp[0], =0x01010101010101010101010101010187
++ shl $des.16b, @vtmp[2].16b, #1
++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
++ ushr @vtmp[1].16b, @vtmp[1].16b, #7
++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
++ eor $des.16b, $des.16b, @vtmp[1].16b
++___
++ &rbit($des,$des,$std);
++}
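compute_tweak() (scalar) and compute_tweak_vec() (vector) both advance the XTS tweak by multiplying it by x in GF(2^128) with the reduction constant 0x87; the _gb flavour additionally bit-reverses each byte with rbit because the GB/T XTS variant defines the multiplication on the opposite bit order. A minimal scalar C sketch of the update, assuming lo holds the low 64 bits of the 128-bit tweak:

```c
#include <stdint.h>
#include <stdio.h>

/* Multiply the 128-bit tweak by x in GF(2^128), reduction polynomial
 * x^128 + x^7 + x^2 + x + 1 (constant 0x87), as compute_tweak() does. */
static void xts_next_tweak(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry = *hi >> 63;             /* bit shifted out of the top  */

    *hi = (*hi << 1) | (*lo >> 63);         /* 128-bit shift left by one   */
    *lo = (*lo << 1) ^ (carry ? 0x87 : 0);  /* conditional reduction       */
}

int main(void)
{
    uint64_t lo = 0x0123456789abcdefULL, hi = 0xfedcba9876543210ULL;

    xts_next_tweak(&lo, &hi);
    printf("hi=%016llx lo=%016llx\n", (unsigned long long)hi,
           (unsigned long long)lo);
    return 0;
}
```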
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a+crypto
++.text
++
++.type _${prefix}_consts,%object
++.align 7
++_${prefix}_consts:
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
++.Lshuffles:
++ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++
++.size _${prefix}_consts,.-_${prefix}_consts
++___
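The .Lck table above holds the 32 SM4 key-schedule constants CK[i]; byte j of CK[i] is (4*i + j) * 7 mod 256, most significant byte first as listed. The short check below regenerates the table, and its first output line should read 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 exactly as above:

```c
#include <stdint.h>
#include <stdio.h>

/* Regenerate the SM4 CK constants: ck_{i,j} = (4*i + j) * 7 mod 256 */
int main(void)
{
    for (int i = 0; i < 32; i++) {
        uint32_t ck = 0;

        for (int j = 0; j < 4; j++)
            ck = (ck << 8) | (uint8_t)((4 * i + j) * 7);
        printf("0x%08X%s", ck, (i % 4 == 3) ? "\n" : ", ");
    }
    return 0;
}
```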
++
++{{{
++my ($key,$keys,$enc)=("x0","x1","w2");
++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
++my ($vkey,$vfk,$vmap)=("v5","v6","v7");
++$code.=<<___;
++.type _${prefix}_set_key,%function
++.align 4
++_${prefix}_set_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$vkey.4s},[$key]
++___
++ &load_sbox();
++ &rev32($vkey,$vkey);
++$code.=<<___;
++ adr $pointer,.Lshuffles
++ ld1 {$vmap.2d},[$pointer]
++ adr $pointer,.Lfk
++ ld1 {$vfk.2d},[$pointer]
++ eor $vkey.16b,$vkey.16b,$vfk.16b
++ mov $schedules,#32
++ adr $pointer,.Lck
++ movi @vtmp[0].16b,#64
++ cbnz $enc,1f
++ add $keys,$keys,124
++1:
++ mov $wtmp,$vkey.s[1]
++ ldr $roundkey,[$pointer],#4
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[2]
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[3]
++ eor $roundkey,$roundkey,$wtmp
++ // optimize sbox using AESE instruction
++ mov @data[0].s[0],$roundkey
++ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
++___
++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
++$code.=<<___;
++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
++ aese @vtmp[0].16b,@vtmp[1].16b
++___
++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
++$code.=<<___;
++ mov $wtmp,@vtmp[0].s[0]
++ eor $roundkey,$wtmp,$wtmp,ror #19
++ eor $roundkey,$roundkey,$wtmp,ror #9
++ mov $wtmp,$vkey.s[0]
++ eor $roundkey,$roundkey,$wtmp
++ mov $vkey.s[0],$roundkey
++ cbz $enc,2f
++ str $roundkey,[$keys],#4
++ b 3f
++2:
++ str $roundkey,[$keys],#-4
++3:
++ tbl $vkey.16b,{$vkey.16b},$vmap.16b
++ subs $schedules,$schedules,#1
++ b.ne 1b
++ ret
++.size _${prefix}_set_key,.-_${prefix}_set_key
++___
++}}}
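_vpsm4_ex_set_key above follows the standard SM4 key schedule: the user key is XORed with the FK constants, and each round key is K_i ^ T'(K_{i+1} ^ K_{i+2} ^ K_{i+3} ^ CK_i), where T' applies the S-box followed by L'(B) = B ^ rol(B,13) ^ rol(B,23) (the ror #19 / ror #9 pair), while decryption keys are the same values stored in reverse order (the str ...,#-4 path starting at offset 124). The sketch below shows only that loop structure; the S-box is again a hypothetical identity placeholder.

```c
#include <stdint.h>
#include <stdio.h>

static uint32_t rol32(uint32_t x, unsigned n)
{
    return (x << n) | (x >> (32 - n));
}

/* placeholder: NOT the real SM4 S-box */
static uint32_t placeholder_sbox(uint32_t x)
{
    return x;
}

/* key-schedule round function T' = L'(S(x)), L'(B) = B ^ rol13 ^ rol23 */
static uint32_t T_prime(uint32_t x)
{
    x = placeholder_sbox(x);
    return x ^ rol32(x, 13) ^ rol32(x, 23);
}

static void sm4_like_key_schedule(uint32_t rk[32], const uint32_t mk[4],
                                  const uint32_t fk[4], const uint32_t ck[32],
                                  int enc)
{
    uint32_t k[4];

    for (int i = 0; i < 4; i++)
        k[i] = mk[i] ^ fk[i];
    for (int i = 0; i < 32; i++) {
        uint32_t t = k[0] ^ T_prime(k[1] ^ k[2] ^ k[3] ^ ck[i]);

        /* slide the 4-word window, what the .Lshuffles TBL does above */
        k[0] = k[1]; k[1] = k[2]; k[2] = k[3]; k[3] = t;
        rk[enc ? i : 31 - i] = t;   /* decryption keys written backwards */
    }
}

int main(void)
{
    uint32_t mk[4] = {1, 2, 3, 4}, fk[4] = {0}, ck[32] = {0}, rk[32];

    sm4_like_key_schedule(rk, mk, fk, ck, 1);
    printf("rk[0]=%08x rk[31]=%08x\n", rk[0], rk[31]);
    return 0;
}
```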
++
++
++{{{
++$code.=<<___;
++.type _${prefix}_enc_4blks,%function
++.align 4
++_${prefix}_enc_4blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_4blks();
++$code.=<<___;
++ ret
++.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
++___
++}}}
++
++{{{
++$code.=<<___;
++.type _${prefix}_enc_8blks,%function
++.align 4
++_${prefix}_enc_8blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_8blks();
++$code.=<<___;
++ ret
++.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
++___
++}}}
++
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,1
++ bl _${prefix}_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,0
++ bl _${prefix}_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++ my $dir = shift;
++ my ($inp,$outp,$rk)=map("x$_",(0..2));
++
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@data[0].4s},[$inp]
++___
++ &load_sbox();
++ &rev32(@data[0],@data[0]);
++$code.=<<___;
++ mov $rks,$rk
++___
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].4s},[$outp]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
++ // convert length into blocks
++ lsr x2,x2,4
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++___
++ &load_sbox();
++$code.=<<___;
++.Lecb_8_blocks_process:
++ cmp $blocks,#8
++ b.lt .Lecb_4_blocks_process
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lecb_8_blocks_process
++ b 100f
++.Lecb_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].4s},[$outp]
++ b 100f
++1: // process last 2 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
++ cmp $blocks,#2
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
++ b 100f
++1: // process last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($len,$ivp,$enc)=("x2","x4","w5");
++my $ivec0=("v3");
++my $ivec1=("v15");
++
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ lsr $len,$len,4
++___
++ &load_sbox();
++$code.=<<___;
++ cbz $enc,.Ldec
++ ld1 {$ivec0.4s},[$ivp]
++.Lcbc_4_blocks_enc:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ eor @data[0].16b,@data[0].16b,$ivec0.16b
++___
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &encrypt_1blk_norev(@data[0]);
++$code.=<<___;
++ eor @data[1].16b,@data[1].16b,@data[0].16b
++___
++ &encrypt_1blk_norev(@data[1]);
++ &rev32(@data[0],@data[0]);
++
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,@data[1].16b
++___
++ &encrypt_1blk_norev(@data[2]);
++ &rev32(@data[1],@data[1]);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,@data[2].16b
++___
++ &encrypt_1blk_norev(@data[3]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ orr $ivec0.16b,@data[3].16b,@data[3].16b
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lcbc_4_blocks_enc
++ b 2f
++1:
++ subs $blocks,$blocks,#1
++ b.lt 2f
++ ld1 {@data[0].4s},[$inp],#16
++ eor $ivec0.16b,$ivec0.16b,@data[0].16b
++___
++ &rev32($ivec0,$ivec0);
++ &encrypt_1blk($ivec0);
++$code.=<<___;
++ st1 {$ivec0.4s},[$outp],#16
++ b 1b
++2:
++ // save back IV
++ st1 {$ivec0.4s},[$ivp]
++ ret
++
++.Ldec:
++ // decryption mode starts
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++.Lcbc_8_blocks_dec:
++ cmp $blocks,#8
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++ add $ptr,$inp,#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],$datax[3]);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ ld1 {$ivec1.4s},[$ivp]
++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++	// note ivec1 and vtmpx[3] are reusing the same register
++ // care needs to be taken to avoid conflict
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
++ // save back IV
++ st1 {$vtmpx[3].4s}, [$ivp]
++ eor @data[0].16b,@data[0].16b,$datax[3].16b
++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lcbc_8_blocks_dec
++ b.eq 100f
++1:
++ ld1 {$ivec1.4s},[$ivp]
++.Lcbc_4_blocks_dec:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ orr $ivec1.16b,@data[3].16b,@data[3].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.gt .Lcbc_4_blocks_dec
++ // save back IV
++ st1 {@data[3].4s}, [$ivp]
++ b 100f
++1: // last block
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++ // save back IV
++ st1 {$data[0].4s}, [$ivp]
++___
++ &rev32(@datax[0],@data[0]);
++ &encrypt_1blk(@datax[0]);
++$code.=<<___;
++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
++ st1 {@datax[0].4s},[$outp],#16
++ b 100f
++1: // last two blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
++ add $ptr,$inp,#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
++ subs $blocks,$blocks,1
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save back IV
++ st1 {@data[1].4s}, [$ivp]
++ b 100f
++1: // last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save back IV
++ st1 {@data[2].4s}, [$ivp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($ivp)=("x4");
++my ($ctr)=("w5");
++my $ivec=("v3");
++
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$ivec.4s},[$ivp]
++___
++ &rev32($ivec,$ivec);
++ &load_sbox();
++$code.=<<___;
++ cmp $blocks,#1
++ b.ne 1f
++ // fast processing for one single block without
++ // context saving overhead
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].4s},[$outp]
++ ret
++1:
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++ mov $word0,$ivec.s[0]
++ mov $word1,$ivec.s[1]
++ mov $word2,$ivec.s[2]
++ mov $ctr,$ivec.s[3]
++.Lctr32_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $data[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ cmp $blocks,#8
++ b.ge .Lctr32_8_blocks_process
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++.Lctr32_8_blocks_process:
++ dup @datax[0].4s,$word0
++ dup @datax[1].4s,$word1
++ dup @datax[2].4s,$word2
++ mov @datax[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $datax[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ bl _${prefix}_enc_8blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ eor @data[0].16b,@data[0].16b,@datax[0].16b
++ eor @data[1].16b,@data[1].16b,@datax[1].16b
++ eor @data[2].16b,@data[2].16b,@datax[2].16b
++ eor @data[3].16b,@data[3].16b,@datax[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++1: // last block processing
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ mov $ivec.s[0],$word0
++ mov $ivec.s[1],$word1
++ mov $ivec.s[2],$word2
++ mov $ivec.s[3],$ctr
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].4s},[$outp]
++ b 100f
++1: // last 2 blocks processing
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[1],$ctr
++ subs $blocks,$blocks,#1
++ b.ne 1f
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ b 100f
++1: // last 3 blocks processing
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ bl _${prefix}_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++
++
++{{{
++my ($blocks,$len)=("x2","x2");
++my $ivp=("x5");
++my @twx=map("x$_",(12..27));
++my ($rks1,$rks2)=("x26","x27");
++my $lastBlk=("x26");
++my $enc=("w28");
++my $remain=("x29");
++
++my @tweak=map("v$_",(16..23));
++my $lastTweak=("v25");
++
++sub gen_xts_cipher() {
++ my $std = shift;
++$code.=<<___;
++.globl ${prefix}_xts_encrypt${std}
++.type ${prefix}_xts_encrypt${std},%function
++.align 5
++${prefix}_xts_encrypt${std}:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x15, x16, [sp, #-0x10]!
++ stp x17, x18, [sp, #-0x10]!
++ stp x19, x20, [sp, #-0x10]!
++ stp x21, x22, [sp, #-0x10]!
++ stp x23, x24, [sp, #-0x10]!
++ stp x25, x26, [sp, #-0x10]!
++ stp x27, x28, [sp, #-0x10]!
++ stp x29, x30, [sp, #-0x10]!
++ stp d8, d9, [sp, #-0x10]!
++ stp d10, d11, [sp, #-0x10]!
++ stp d12, d13, [sp, #-0x10]!
++ stp d14, d15, [sp, #-0x10]!
++ mov $rks1,x3
++ mov $rks2,x4
++ mov $enc,w6
++ ld1 {@tweak[0].4s}, [$ivp]
++ mov $rks,$rks2
++___
++ &load_sbox();
++ &rev32(@tweak[0],@tweak[0]);
++ &encrypt_1blk(@tweak[0]);
++$code.=<<___;
++ mov $rks,$rks1
++ and $remain,$len,#0x0F
++ // convert length into blocks
++ lsr $blocks,$len,4
++ cmp $blocks,#1
++ b.lt .return${std}
++
++ cmp $remain,0
++	// If the encryption/decryption length is a multiple of 16,
++	// all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ b.eq .xts_encrypt_blocks${std}
++
++	// If the encryption/decryption length is not a multiple of 16,
++	// the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
++	// and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
++ subs $blocks,$blocks,#1
++ b.eq .only_2blks_tweak${std}
++.xts_encrypt_blocks${std}:
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rev32_armeb(@tweak[0],@tweak[0]);
++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++.Lxts_8_blocks_process${std}:
++ cmp $blocks,#8
++___
++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
++$code.=<<___;
++ b.lt .Lxts_4_blocks_process${std}
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rbit(@tweak[4],@tweak[4],$std);
++ &rbit(@tweak[5],@tweak[5],$std);
++ &rbit(@tweak[6],@tweak[6],$std);
++ &rbit(@tweak[7],@tweak[7],$std);
++$code.=<<___;
++ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
++ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
++ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
++ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++ &transpose(@data,@vtmp);
++ &transpose(@datax,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ eor @data[0].16b, @data[0].16b, @tweak[4].16b
++ eor @data[1].16b, @data[1].16b, @tweak[5].16b
++ eor @data[2].16b, @data[2].16b, @tweak[6].16b
++ eor @data[3].16b, @data[3].16b, @tweak[7].16b
++
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[7].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lxts_8_blocks_process${std}
++ b 100f
++.Lxts_4_blocks_process${std}:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++ &rbit(@tweak[3],@tweak[3],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++ eor @data[3].16b, @data[3].16b, @tweak[3].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++ mov @tweak[0].16b,@tweak[4].16b
++ mov @tweak[1].16b,@tweak[5].16b
++ mov @tweak[2].16b,@tweak[6].16b
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[3].16b
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ st1 {@data[0].4s},[$outp],#16
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[0].16b
++ b 100f
++1: // process last 2 blocks
++ cmp $blocks,#2
++ b.gt 1f
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[1].16b
++ b 100f
++1: // process last 3 blocks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &rbit(@tweak[0],@tweak[0],$std);
++ &rbit(@tweak[1],@tweak[1],$std);
++ &rbit(@tweak[2],@tweak[2],$std);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[0].16b
++ eor @data[1].16b, @data[1].16b, @tweak[1].16b
++ eor @data[2].16b, @data[2].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &transpose(@data,@vtmp);
++$code.=<<___;
++ bl _${prefix}_enc_4blks
++___
++ &transpose(@vtmp,@data);
++$code.=<<___;
++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save the last tweak
++ mov $lastTweak.16b,@tweak[2].16b
++100:
++ cmp $remain,0
++ b.eq .return${std}
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is larger than 32
++.last_2blks_tweak${std}:
++___
++ &rev32_armeb($lastTweak,$lastTweak);
++ &compute_tweak_vec($lastTweak,@tweak[1],$std);
++ &compute_tweak_vec(@tweak[1],@tweak[2],$std);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// This branch calculates the last two tweaks,
++// used when the encryption/decryption length is exactly 32, which only needs two tweaks
++.only_2blks_tweak${std}:
++ mov @tweak[1].16b,@tweak[0].16b
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &compute_tweak_vec(@tweak[1],@tweak[2]);
++$code.=<<___;
++ b .check_dec${std}
++
++
++// Determine whether encryption or decryption is required.
++// The last two tweaks need to be swapped for decryption.
++.check_dec${std}:
++ // encryption:1 decryption:0
++ cmp $enc,1
++ b.eq .prcess_last_2blks${std}
++ mov @vtmp[0].16B,@tweak[1].16b
++ mov @tweak[1].16B,@tweak[2].16b
++ mov @tweak[2].16B,@vtmp[0].16b
++
++.prcess_last_2blks${std}:
++___
++ &rev32_armeb(@tweak[1],@tweak[1]);
++ &rev32_armeb(@tweak[2],@tweak[2]);
++$code.=<<___;
++ ld1 {@data[0].4s},[$inp],#16
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[1].16b
++ st1 {@data[0].4s},[$outp],#16
++
++ sub $lastBlk,$outp,16
++ .loop${std}:
++ subs $remain,$remain,1
++ ldrb $wtmp0,[$lastBlk,$remain]
++ ldrb $wtmp1,[$inp,$remain]
++ strb $wtmp1,[$lastBlk,$remain]
++ strb $wtmp0,[$outp,$remain]
++ b.gt .loop${std}
++ ld1 {@data[0].4s}, [$lastBlk]
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ eor @data[0].16b, @data[0].16b, @tweak[2].16b
++ st1 {@data[0].4s}, [$lastBlk]
++.return${std}:
++ ldp d14, d15, [sp], #0x10
++ ldp d12, d13, [sp], #0x10
++ ldp d10, d11, [sp], #0x10
++ ldp d8, d9, [sp], #0x10
++ ldp x29, x30, [sp], #0x10
++ ldp x27, x28, [sp], #0x10
++ ldp x25, x26, [sp], #0x10
++ ldp x23, x24, [sp], #0x10
++ ldp x21, x22, [sp], #0x10
++ ldp x19, x20, [sp], #0x10
++ ldp x17, x18, [sp], #0x10
++ ldp x15, x16, [sp], #0x10
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
++___
++} # end of gen_xts_cipher
++&gen_xts_cipher("_gb");
++&gen_xts_cipher("");
++}}}
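The tail code reached via label 100, .last_2blks_tweak / .only_2blks_tweak and .prcess_last_2blks implements standard XTS ciphertext stealing: the last full block is processed with the second-to-last tweak, its leading 'remain' bytes become the final partial ciphertext, the partial plaintext is grafted into that block, and the result is processed again with the last tweak (the two tweaks are swapped for decryption in .check_dec). Below is a rough, self-contained C sketch of just that tail; enc_block() is a hypothetical byte-wise placeholder standing in for the SM4 block operation.

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* hypothetical placeholder permutation, NOT SM4 */
static void enc_block(uint8_t b[16])
{
    for (int i = 0; i < 16; i++)
        b[i] = (uint8_t)(b[i] * 5 + 1);
}

/* in/out point at the last full block followed by the 'remain'-byte tail;
 * t1 is the tweak for the last full block, t2 for the stolen block. */
static void xts_tail(uint8_t *out, const uint8_t *in, size_t remain,
                     const uint8_t t1[16], const uint8_t t2[16])
{
    uint8_t blk[16];
    size_t i;

    for (i = 0; i < 16; i++)
        blk[i] = in[i] ^ t1[i];
    enc_block(blk);
    for (i = 0; i < 16; i++)
        blk[i] ^= t1[i];

    memcpy(out + 16, blk, remain);      /* final partial ciphertext       */
    memcpy(blk, in + 16, remain);       /* steal: overlay plaintext tail  */

    for (i = 0; i < 16; i++)
        blk[i] ^= t2[i];
    enc_block(blk);
    for (i = 0; i < 16; i++)
        blk[i] ^= t2[i];
    memcpy(out, blk, 16);               /* second-to-last ciphertext      */
}

int main(void)
{
    uint8_t in[23] = "example partial input.";
    uint8_t out[32] = {0};
    uint8_t t1[16] = {1}, t2[16] = {2};
    size_t remain = sizeof(in) - 16;

    xts_tail(out, in, remain, t1, t2);
    for (size_t i = 0; i < 16 + remain; i++)
        printf("%02x", out[i]);
    puts("");
    return 0;
}
```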
++
++########################################
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index 75a215ab80..73ffe5ea09 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -2,7 +2,7 @@ LIBS=../../libcrypto
+
+ IF[{- !$disabled{asm} -}]
+ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM
+- $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate one, and define the appropriate macros
+@@ -30,5 +30,7 @@ ENDIF
+
+ GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
+ GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
++GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl
+ INCLUDE[sm4-armv8.o]=..
+ INCLUDE[vpsm4-armv8.o]=..
++INCLUDE[vpsm4_ex-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 15d8abbcb1..8b9cd10f97 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -20,11 +20,16 @@ static inline int vpsm4_capable(void)
+ {
+ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
+- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++}
++static inline int vpsm4_ex_capable(void)
++{
++ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ }
+ # if defined(VPSM4_ASM)
+ # define VPSM4_CAPABLE vpsm4_capable()
++# define VPSM4_EX_CAPABLE vpsm4_ex_capable()
+ # endif
+ # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+ # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+@@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ const unsigned char ivec[16]);
+ # endif /* HWSM4_CAPABLE */
+
+-#ifdef VPSM4_CAPABLE
++# ifdef VPSM4_CAPABLE
+ int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
+@@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ size_t len, const void *key,
+ const unsigned char ivec[16]);
++void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
++void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
+ # endif /* VPSM4_CAPABLE */
+
++# ifdef VPSM4_EX_CAPABLE
++int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char ivec[16], const int enc);
++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++ size_t len, const SM4_KEY *key1,
++ const SM4_KEY *key2, const unsigned char ivec[16],
++ const int enc);
++# endif /* VPSM4_EX_CAPABLE */
+
+ #endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 9a2e99f67c..8cabd78266 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -42,6 +42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ (void)0; /* terminate potentially open 'else' */
+ } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ vpsm4_ex_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_ex_encrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++ else if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks;
++ } else
++#endif
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_encrypt_key(key, ks);
+@@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ #endif
+ } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ vpsm4_ex_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_ex_decrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++ } else
++#endif
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
+ vpsm4_set_decrypt_key(key, ks);
+@@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ ctx->stream.cbc = NULL;
+ if (ctx->mode == EVP_CIPH_CBC_MODE)
+ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
+- else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
+ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
+ } else
+ #endif
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
+index 3c568d4d18..037055fce8 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts.c
+@@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
+ if (ctx->xts_standard) {
+ if (ctx->stream != NULL)
+ (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+- ctx->base.iv);
++ ctx->base.iv, ctx->base.enc);
+ else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
+ ctx->base.enc))
+ return 0;
+ } else {
+ if (ctx->stream_gb != NULL)
+ (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+- ctx->base.iv);
++ ctx->base.iv, ctx->base.enc);
+ else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
+ inl, ctx->base.enc))
+ return 0;
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
+index 4c369183e2..cfca596979 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.h
++++ b/providers/implementations/ciphers/cipher_sm4_xts.h
+@@ -14,7 +14,7 @@
+ PROV_CIPHER_FUNC(void, xts_stream,
+ (const unsigned char *in, unsigned char *out, size_t len,
+ const SM4_KEY *key1, const SM4_KEY *key2,
+- const unsigned char iv[16]));
++ const unsigned char iv[16], const int enc));
+
+ typedef struct prov_sm4_xts_ctx_st {
+ /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+index 403eb879b1..67a9923d94 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+@@ -11,8 +11,7 @@
+
+ #define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
+ fn_block_enc, fn_block_dec, \
+- fn_stream_enc, fn_stream_dec, \
+- fn_stream_gb_enc, fn_stream_gb_dec) { \
++ fn_stream, fn_stream_gb) { \
+ size_t bytes = keylen / 2; \
+ \
+ if (ctx->enc) { \
+@@ -26,8 +25,8 @@
+ xctx->xts.block2 = (block128_f)fn_block_enc; \
+ xctx->xts.key1 = &xctx->ks1; \
+ xctx->xts.key2 = &xctx->ks2; \
+- xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
+- xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
++ xctx->stream = fn_stream; \
++ xctx->stream_gb = fn_stream_gb; \
+ }
+
+ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+@@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+ size_t keylen)
+ {
+ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
+- OSSL_xts_stream_fn stream_enc = NULL;
+- OSSL_xts_stream_fn stream_dec = NULL;
+- OSSL_xts_stream_fn stream_gb_enc = NULL;
+- OSSL_xts_stream_fn stream_gb_dec = NULL;
++ OSSL_xts_stream_fn stream = NULL;
++ OSSL_xts_stream_fn stream_gb = NULL;
+ #ifdef HWSM4_CAPABLE
+ if (HWSM4_CAPABLE) {
+ XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
+- HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
+- stream_gb_enc, stream_gb_dec);
++ HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb);
+ return 1;
+ } else
+ #endif /* HWSM4_CAPABLE */
++#ifdef VPSM4_EX_CAPABLE
++ if (VPSM4_EX_CAPABLE) {
++ stream = vpsm4_ex_xts_encrypt;
++ stream_gb = vpsm4_ex_xts_encrypt_gb;
++ XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key,
++ vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb);
++ return 1;
++ } else
++#endif /* VPSM4_EX_CAPABLE */
+ #ifdef VPSM4_CAPABLE
+ if (VPSM4_CAPABLE) {
++ stream = vpsm4_xts_encrypt;
++ stream_gb = vpsm4_xts_encrypt_gb;
+ XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
+- vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
+- stream_gb_enc, stream_gb_dec);
++ vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb);
+ return 1;
+ } else
+ #endif /* VPSM4_CAPABLE */
+@@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+ }
+ {
+ XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
+- ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
+- stream_gb_dec);
++ ossl_sm4_decrypt, stream, stream_gb);
+ }
+ return 1;
+ }
+--
+2.37.3.windows.1
+
diff --git a/Backport-SM4-optimization-for-ARM-by-ASIMD.patch b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch
new file mode 100644
index 0000000..5d58d16
--- /dev/null
+++ b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch
@@ -0,0 +1,1334 @@
+From ca0b08e39bb619b6e62ef58c80edc784e8f20966 Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Mon, 14 Feb 2022 14:36:34 +0000
+Subject: [PATCH 07/13] SM4 optimization for ARM by ASIMD
+
+This patch optimizes SM4 for ARM processor using ASIMD instruction
+
+It will improve performance if both of the following conditions are met:
+1) Input data is equal to or more than 4 blocks
+2) The cipher mode allows parallelism, including ECB, CTR, GCM, or CBC decryption
+
+This patch implements SM4 SBOX lookup in vector registers, with the
+benefit of constant processing time over existing C implementation.
+
+It is only enabled for micro-architecture N1/V1. In the ideal scenario,
+performance can reach up to 2.7X
+
+When either of the above two conditions is not met, e.g. single-block input
+or CFB/OFB mode or CBC encryption, performance could drop by about 50%.
+
+The assembly code has been reviewed internally by ARM engineer
+Fangming.Fang@arm.com
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17951)
+---
+ crypto/evp/e_sm4.c | 24 +
+ crypto/sm4/asm/vpsm4-armv8.pl | 1118 +++++++++++++++++
+ crypto/sm4/build.info | 6 +-
+ include/crypto/sm4_platform.h | 29 +
+ .../ciphers/cipher_sm4_gcm_hw.c | 7 +
+ .../implementations/ciphers/cipher_sm4_hw.c | 24 +
+ 6 files changed, 1206 insertions(+), 2 deletions(-)
+ create mode 100755 crypto/sm4/asm/vpsm4-armv8.pl
+
+diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
+index bff79ff197..c8e8cfe9c9 100644
+--- a/crypto/evp/e_sm4.c
++++ b/crypto/evp/e_sm4.c
+@@ -76,6 +76,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
+ # endif
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_decrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) vpsm4_decrypt;
++ dat->stream.cbc = NULL;
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
++ else if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
++ } else
+ #endif
+ {
+ dat->block = (block128_f) ossl_sm4_decrypt;
+@@ -104,6 +115,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ # endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) vpsm4_encrypt;
++ dat->stream.cbc = NULL;
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt;
++ else if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt;
++ else if (mode == EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
++ } else
+ #endif
+ {
+ dat->block = (block128_f) ossl_sm4_encrypt;
+diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl
+new file mode 100755
+index 0000000000..095d9dae64
+--- /dev/null
++++ b/crypto/sm4/asm/vpsm4-armv8.pl
+@@ -0,0 +1,1118 @@
++#! /usr/bin/env perl
++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 with ASIMD on aarch64
++#
++# Feb 2022
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="vpsm4";
++my @vtmp=map("v$_",(0..3));
++my @data=map("v$_",(4..7));
++my @datax=map("v$_",(8..11));
++my ($rk0,$rk1)=("v12","v13");
++my ($rka,$rkb)=("v14","v15");
++my @vtmpx=map("v$_",(12..15));
++my @sbox=map("v$_",(16..31));
++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
++my ($ptr,$counter)=("x10","w11");
++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
++
++sub rev32() {
++ my $dst = shift;
++ my $src = shift;
++
++ if ($src and ("$src" ne "$dst")) {
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$src.16b
++#else
++ mov $dst.16b,$src.16b
++#endif
++___
++ } else {
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$dst.16b
++#endif
++___
++ }
++}
++
++sub transpose() {
++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
++
++$code.=<<___;
++ zip1 $vt0.4s,$dat0.4s,$dat1.4s
++ zip2 $vt1.4s,$dat0.4s,$dat1.4s
++ zip1 $vt2.4s,$dat2.4s,$dat3.4s
++ zip2 $vt3.4s,$dat2.4s,$dat3.4s
++ zip1 $dat0.2d,$vt0.2d,$vt2.2d
++ zip2 $dat1.2d,$vt0.2d,$vt2.2d
++ zip1 $dat2.2d,$vt1.2d,$vt3.2d
++ zip2 $dat3.2d,$vt1.2d,$vt3.2d
++___
++}
++
++# sbox operations for 4-lane of words
++sub sbox() {
++ my $dat = shift;
++
++$code.=<<___;
++ movi @vtmp[0].16b,#64
++ movi @vtmp[1].16b,#128
++ movi @vtmp[2].16b,#192
++ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
++ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
++ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
++ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
++ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
++
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ sli @vtmp[0].4s,$dat.4s,2
++ ushr @vtmp[2].4s,$dat.4s,32-10
++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
++ sli @vtmp[2].4s,$dat.4s,10
++ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
++ ushr @vtmp[0].4s,$dat.4s,32-18
++ sli @vtmp[0].4s,$dat.4s,18
++ ushr @vtmp[2].4s,$dat.4s,32-24
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$dat.4s,24
++ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
++___
++}
++
++# sbox operation for 8-lane of words
++sub sbox_double() {
++ my $dat = shift;
++ my $datx = shift;
++
++$code.=<<___;
++ movi @vtmp[3].16b,#64
++ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
++ add $dat.2d,@vtmp[2].2d,$dat.2d
++ add $dat.2d,@vtmp[1].2d,$dat.2d
++
++ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
++ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
++ add $datx.2d,@vtmp[2].2d,$datx.2d
++ add $datx.2d,@vtmp[1].2d,$datx.2d
++
++ ushr @vtmp[0].4s,$dat.4s,32-2
++ sli @vtmp[0].4s,$dat.4s,2
++ ushr @vtmp[2].4s,$datx.4s,32-2
++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
++ sli @vtmp[2].4s,$datx.4s,2
++
++ ushr @vtmp[0].4s,$dat.4s,32-10
++ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
++ sli @vtmp[0].4s,$dat.4s,10
++ ushr @vtmp[2].4s,$datx.4s,32-10
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,10
++
++ ushr @vtmp[0].4s,$dat.4s,32-18
++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
++ sli @vtmp[0].4s,$dat.4s,18
++ ushr @vtmp[2].4s,$datx.4s,32-18
++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,18
++
++ ushr @vtmp[0].4s,$dat.4s,32-24
++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
++ sli @vtmp[0].4s,$dat.4s,24
++ ushr @vtmp[2].4s,$datx.4s,32-24
++ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
++ sli @vtmp[2].4s,$datx.4s,24
++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
++___
++}
++
++# sbox operation for one single word
++sub sbox_1word () {
++ my $word = shift;
++
++$code.=<<___;
++ movi @vtmp[1].16b,#64
++ movi @vtmp[2].16b,#128
++ movi @vtmp[3].16b,#192
++ mov @vtmp[0].s[0],$word
++
++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
++ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
++ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
++
++ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
++ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
++ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
++ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
++
++ mov $word,@vtmp[0].s[0]
++ mov $wtmp0,@vtmp[1].s[0]
++ mov $wtmp2,@vtmp[2].s[0]
++ add $wtmp0,$word,$wtmp0
++ mov $word,@vtmp[3].s[0]
++ add $wtmp0,$wtmp0,$wtmp2
++ add $wtmp0,$wtmp0,$word
++
++ eor $word,$wtmp0,$wtmp0,ror #32-2
++ eor $word,$word,$wtmp0,ror #32-10
++ eor $word,$word,$wtmp0,ror #32-18
++ eor $word,$word,$wtmp0,ror #32-24
++___
++}
++
++# sm4 for one block of data, in scalar registers word0/word1/word2/word3
++sub sm4_1blk () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$wtmp0,$word1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word0,$word0,$tmpw
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $tmpw,$word2,$word3
++ eor $wtmp2,$word0,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor $word1,$word1,$tmpw
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$wtmp0,$word3
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word2,$word2,$tmpw
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $tmpw,$word0,$word1
++ eor $wtmp2,$word2,$wtmp1
++ eor $tmpw,$tmpw,$wtmp2
++___
++ &sbox_1word($tmpw);
++$code.=<<___;
++ eor $word3,$word3,$tmpw
++___
++}
++
++# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
++sub sm4_4blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rk0.16b,@data[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk1.16b
++
++ dup $rk0.4s,$wtmp0
++ dup $rk1.4s,$wtmp1
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rk0.16b,@data[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,$rk0.16b
++___
++ &sbox($rk0);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rk1.16b,$rka.16b,$rk1.16b
++___
++ &sbox($rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk1.16b
++___
++}
++
++# sm4 for 8 lanes of data, in neon registers
++# data0/data1/data2/data3 datax0/datax1/datax2/datax3
++sub sm4_8blks () {
++ my $kptr = shift;
++
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[2].16b,@data[3].16b
++ eor $rkb.16b,@datax[2].16b,@datax[3].16b
++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[0].16b,@data[0].16b,$rk0.16b
++ eor @datax[0].16b,@datax[0].16b,$rk1.16b
++
++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[0].16b
++ eor $rkb.16b,$rkb.16b,@datax[0].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ ldp $wtmp0,$wtmp1,[$kptr],8
++ eor @data[1].16b,@data[1].16b,$rk0.16b
++ eor @datax[1].16b,@datax[1].16b,$rk1.16b
++
++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
++ dup $rk0.4s,$wtmp0
++ eor $rka.16b,@data[0].16b,@data[1].16b
++ eor $rkb.16b,@datax[0].16b,@datax[1].16b
++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b
++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
++ eor $rk0.16b,$rka.16b,@vtmp[0].16b
++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,$rk0.16b
++ eor @datax[2].16b,@datax[2].16b,$rk1.16b
++
++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
++ dup $rk1.4s,$wtmp1
++ eor $rka.16b,$rka.16b,@data[2].16b
++ eor $rkb.16b,$rkb.16b,@datax[2].16b
++ eor $rk0.16b,$rka.16b,$rk1.16b
++ eor $rk1.16b,$rkb.16b,$rk1.16b
++___
++ &sbox_double($rk0,$rk1);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,$rk0.16b
++ eor @datax[3].16b,@datax[3].16b,$rk1.16b
++___
++}
++
++sub encrypt_1blk_norev() {
++ my $dat = shift;
++
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++ mov $word0,$dat.s[0]
++ mov $word1,$dat.s[1]
++ mov $word2,$dat.s[2]
++ mov $word3,$dat.s[3]
++10:
++___
++ &sm4_1blk($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++ mov $dat.s[0],$word3
++ mov $dat.s[1],$word2
++ mov $dat.s[2],$word1
++ mov $dat.s[3],$word0
++___
++}
++
++sub encrypt_1blk() {
++ my $dat = shift;
++
++ &encrypt_1blk_norev($dat);
++ &rev32($dat,$dat);
++}
++
++sub encrypt_4blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_4blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++}
++
++sub encrypt_8blks() {
++$code.=<<___;
++ mov $ptr,$rks
++ mov $counter,#8
++10:
++___
++ &sm4_8blks($ptr);
++$code.=<<___;
++ subs $counter,$counter,#1
++ b.ne 10b
++___
++ &rev32(@vtmp[3],@data[0]);
++ &rev32(@vtmp[2],@data[1]);
++ &rev32(@vtmp[1],@data[2]);
++ &rev32(@vtmp[0],@data[3]);
++ &rev32(@data[3],@datax[0]);
++ &rev32(@data[2],@datax[1]);
++ &rev32(@data[1],@datax[2]);
++ &rev32(@data[0],@datax[3]);
++}
++
++sub load_sbox () {
++ my $data = shift;
++
++$code.=<<___;
++ adr $ptr,.Lsbox
++ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
++ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
++ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
++ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
++___
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a
++.text
++
++.type _vpsm4_consts,%object
++.align 7
++_vpsm4_consts:
++.Lsbox:
++ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
++ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
++ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
++ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
++ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
++ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
++ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
++ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
++ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
++ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
++ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
++ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
++ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
++ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
++ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
++ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
++.Lshuffles:
++ .dword 0x0B0A090807060504,0x030201000F0E0D0C
++
++.size _vpsm4_consts,.-_vpsm4_consts
++___
++
++{{{
++my ($key,$keys,$enc)=("x0","x1","w2");
++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
++my ($vkey,$vfk,$vmap)=("v5","v6","v7");
++$code.=<<___;
++.type _vpsm4_set_key,%function
++.align 4
++_vpsm4_set_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$vkey.4s},[$key]
++___
++ &load_sbox();
++ &rev32($vkey,$vkey);
++$code.=<<___;
++ adr $pointer,.Lshuffles
++ ld1 {$vmap.4s},[$pointer]
++ adr $pointer,.Lfk
++ ld1 {$vfk.4s},[$pointer]
++ eor $vkey.16b,$vkey.16b,$vfk.16b
++ mov $schedules,#32
++ adr $pointer,.Lck
++ movi @vtmp[0].16b,#64
++ cbnz $enc,1f
++ add $keys,$keys,124
++1:
++ mov $wtmp,$vkey.s[1]
++ ldr $roundkey,[$pointer],#4
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[2]
++ eor $roundkey,$roundkey,$wtmp
++ mov $wtmp,$vkey.s[3]
++ eor $roundkey,$roundkey,$wtmp
++ // sbox lookup
++ mov @data[0].s[0],$roundkey
++ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b
++ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
++ mov $wtmp,@vtmp[1].s[0]
++ eor $roundkey,$wtmp,$wtmp,ror #19
++ eor $roundkey,$roundkey,$wtmp,ror #9
++ mov $wtmp,$vkey.s[0]
++ eor $roundkey,$roundkey,$wtmp
++ mov $vkey.s[0],$roundkey
++ cbz $enc,2f
++ str $roundkey,[$keys],#4
++ b 3f
++2:
++ str $roundkey,[$keys],#-4
++3:
++ tbl $vkey.16b,{$vkey.16b},$vmap.16b
++ subs $schedules,$schedules,#1
++ b.ne 1b
++ ret
++.size _vpsm4_set_key,.-_vpsm4_set_key
++___
++}}}
++
++
++{{{
++$code.=<<___;
++.type _vpsm4_enc_4blks,%function
++.align 4
++_vpsm4_enc_4blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_4blks();
++$code.=<<___;
++ ret
++.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
++___
++}}}
++
++{{{
++$code.=<<___;
++.type _vpsm4_enc_8blks,%function
++.align 4
++_vpsm4_enc_8blks:
++ AARCH64_VALID_CALL_TARGET
++___
++ &encrypt_8blks();
++$code.=<<___;
++ ret
++.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
++___
++}}}
++
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,1
++ bl _vpsm4_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_SIGN_LINK_REGISTER
++ stp x29,x30,[sp,#-16]!
++ mov w2,0
++ bl _vpsm4_set_key
++ ldp x29,x30,[sp],#16
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++ my $dir = shift;
++ my ($inp,$outp,$rk)=map("x$_",(0..2));
++
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@data[0].16b},[$inp]
++___
++ &load_sbox();
++ &rev32(@data[0],@data[0]);
++$code.=<<___;
++ mov $rks,x2
++___
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].16b},[$outp]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++my ($enc) = ("w4");
++my @dat=map("v$_",(16..23));
++
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
++ // convert length into blocks
++ lsr x2,x2,4
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++___
++ &load_sbox();
++$code.=<<___;
++.Lecb_8_blocks_process:
++ cmp $blocks,#8
++ b.lt .Lecb_4_blocks_process
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],@datax[3]);
++$code.=<<___;
++ bl _vpsm4_enc_8blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lecb_8_blocks_process
++ b 100f
++.Lecb_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ sub $blocks,$blocks,#4
++1:
++ // process last block
++ cmp $blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].16b},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &encrypt_1blk(@data[0]);
++$code.=<<___;
++ st1 {@data[0].16b},[$outp]
++ b 100f
++1: // process last 2 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
++ cmp $blocks,#2
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
++ b 100f
++1: // process last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($len,$ivp,$enc)=("x2","x4","w5");
++my $ivec0=("v3");
++my $ivec1=("v15");
++
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ lsr $len,$len,4
++___
++ &load_sbox();
++$code.=<<___;
++ cbz $enc,.Ldec
++ ld1 {$ivec0.4s},[$ivp]
++.Lcbc_4_blocks_enc:
++ cmp $blocks,#4
++ b.lt 1f
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++ eor @data[0].16b,@data[0].16b,$ivec0.16b
++___
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++ &encrypt_1blk_norev(@data[0]);
++$code.=<<___;
++ eor @data[1].16b,@data[1].16b,@data[0].16b
++___
++ &encrypt_1blk_norev(@data[1]);
++ &rev32(@data[0],@data[0]);
++
++$code.=<<___;
++ eor @data[2].16b,@data[2].16b,@data[1].16b
++___
++ &encrypt_1blk_norev(@data[2]);
++ &rev32(@data[1],@data[1]);
++$code.=<<___;
++ eor @data[3].16b,@data[3].16b,@data[2].16b
++___
++ &encrypt_1blk_norev(@data[3]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ orr $ivec0.16b,@data[3].16b,@data[3].16b
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lcbc_4_blocks_enc
++ b 2f
++1:
++ subs $blocks,$blocks,#1
++ b.lt 2f
++ ld1 {@data[0].4s},[$inp],#16
++ eor $ivec0.16b,$ivec0.16b,@data[0].16b
++___
++ &rev32($ivec0,$ivec0);
++ &encrypt_1blk($ivec0);
++$code.=<<___;
++ st1 {$ivec0.16b},[$outp],#16
++ b 1b
++2:
++ // save back IV
++ st1 {$ivec0.16b},[$ivp]
++ ret
++
++.Ldec:
++ // decryption mode starts
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++.Lcbc_8_blocks_dec:
++ cmp $blocks,#8
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++ add $ptr,$inp,#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++ &rev32(@datax[0],@datax[0]);
++ &rev32(@datax[1],@datax[1]);
++ &rev32(@datax[2],@datax[2]);
++ &rev32(@datax[3],$datax[3]);
++$code.=<<___;
++ bl _vpsm4_enc_8blks
++___
++ &transpose(@vtmp,@datax);
++ &transpose(@data,@datax);
++$code.=<<___;
++ ld1 {$ivec1.16b},[$ivp]
++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++	// note ivec1 and vtmpx[3] are reusing the same register
++ // care needs to be taken to avoid conflict
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
++ // save back IV
++ st1 {$vtmpx[3].16b}, [$ivp]
++ eor @data[0].16b,@data[0].16b,$datax[3].16b
++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.gt .Lcbc_8_blocks_dec
++ b.eq 100f
++1:
++ ld1 {$ivec1.16b},[$ivp]
++.Lcbc_4_blocks_dec:
++ cmp $blocks,#4
++ b.lt 1f
++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],$data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ orr $ivec1.16b,@data[3].16b,@data[3].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.gt .Lcbc_4_blocks_dec
++ // save back IV
++ st1 {@vtmp[3].16b}, [$ivp]
++ b 100f
++1: // last block
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ ld1 {@data[0].4s},[$inp],#16
++ // save back IV
++ st1 {$data[0].16b}, [$ivp]
++___
++ &rev32(@datax[0],@data[0]);
++ &encrypt_1blk(@datax[0]);
++$code.=<<___;
++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b
++ st1 {@datax[0].16b},[$outp],#16
++ b 100f
++1: // last two blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
++ add $ptr,$inp,#16
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
++ subs $blocks,$blocks,1
++ b.gt 1f
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
++ // save back IV
++ st1 {@data[1].16b}, [$ivp]
++ b 100f
++1: // last 3 blocks
++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
++___
++ &rev32(@data[0],@data[0]);
++ &rev32(@data[1],@data[1]);
++ &rev32(@data[2],@data[2]);
++ &rev32(@data[3],@data[3]);
++$code.=<<___;
++ bl _vpsm4_enc_4blks
++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
++___
++ &transpose(@vtmp,@datax);
++$code.=<<___;
++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
++ // save back IV
++ st1 {@data[2].16b}, [$ivp]
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($ivp)=("x4");
++my ($ctr)=("w5");
++my $ivec=("v3");
++
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$ivec.4s},[$ivp]
++___
++ &rev32($ivec,$ivec);
++ &load_sbox();
++$code.=<<___;
++ cmp $blocks,#1
++ b.ne 1f
++ // fast processing for one single block without
++ // context saving overhead
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].16b},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].16b},[$outp]
++ ret
++1:
++ AARCH64_SIGN_LINK_REGISTER
++ stp d8,d9,[sp,#-80]!
++ stp d10,d11,[sp,#16]
++ stp d12,d13,[sp,#32]
++ stp d14,d15,[sp,#48]
++ stp x29,x30,[sp,#64]
++ mov $word0,$ivec.s[0]
++ mov $word1,$ivec.s[1]
++ mov $word2,$ivec.s[2]
++ mov $ctr,$ivec.s[3]
++.Lctr32_4_blocks_process:
++ cmp $blocks,#4
++ b.lt 1f
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $data[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ cmp $blocks,#8
++ b.ge .Lctr32_8_blocks_process
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ subs $blocks,$blocks,#4
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++.Lctr32_8_blocks_process:
++ dup @datax[0].4s,$word0
++ dup @datax[1].4s,$word1
++ dup @datax[2].4s,$word2
++ mov @datax[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov $datax[3].s[1],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[2],$ctr
++ add $ctr,$ctr,#1
++ mov @datax[3].s[3],$ctr
++ add $ctr,$ctr,#1
++ bl _vpsm4_enc_8blks
++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ eor @data[0].16b,@data[0].16b,@datax[0].16b
++ eor @data[1].16b,@data[1].16b,@datax[1].16b
++ eor @data[2].16b,@data[2].16b,@datax[2].16b
++ eor @data[3].16b,@data[3].16b,@datax[3].16b
++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
++ subs $blocks,$blocks,#8
++ b.ne .Lctr32_4_blocks_process
++ b 100f
++1: // last block processing
++ subs $blocks,$blocks,#1
++ b.lt 100f
++ b.gt 1f
++ mov $ivec.s[0],$word0
++ mov $ivec.s[1],$word1
++ mov $ivec.s[2],$word2
++ mov $ivec.s[3],$ctr
++___
++ &encrypt_1blk($ivec);
++$code.=<<___;
++ ld1 {@data[0].16b},[$inp]
++ eor @data[0].16b,@data[0].16b,$ivec.16b
++ st1 {@data[0].16b},[$outp]
++ b 100f
++1: // last 2 blocks processing
++ dup @data[0].4s,$word0
++ dup @data[1].4s,$word1
++ dup @data[2].4s,$word2
++ mov @data[3].s[0],$ctr
++ add $ctr,$ctr,#1
++ mov @data[3].s[1],$ctr
++ subs $blocks,$blocks,#1
++ b.ne 1f
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ b 100f
++1: // last 3 blocks processing
++ add $ctr,$ctr,#1
++ mov @data[3].s[2],$ctr
++ bl _vpsm4_enc_4blks
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
++100:
++ ldp d10,d11,[sp,#16]
++ ldp d12,d13,[sp,#32]
++ ldp d14,d15,[sp,#48]
++ ldp x29,x30,[sp,#64]
++ ldp d8,d9,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++########################################
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index e27aa49e67..75a215ab80 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,8 +1,8 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{asm} -}]
+- $SM4DEF_aarch64=SM4_ASM
+- $SM4ASM_aarch64=sm4-armv8.S
++ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM
++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S
+
+ # Now that we have defined all the arch specific variables, use the
+ # appropriate one, and define the appropriate macros
+@@ -29,4 +29,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}]
+ ENDIF
+
+ GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
++GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl
+ INCLUDE[sm4-armv8.o]=..
++INCLUDE[vpsm4-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 42c8b44a43..11f9b9d88b 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -15,6 +15,16 @@
+ # if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+ # include "arm_arch.h"
+ # if __ARM_MAX_ARCH__>=8
++extern unsigned int OPENSSL_arm_midr;
++static inline int vpsm4_capable(void)
++{
++ return (OPENSSL_armcap_P & ARMV8_CPUID) &&
++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++}
++# if defined(VPSM4_ASM)
++# define VPSM4_CAPABLE vpsm4_capable()
++# endif
+ # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+ # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+ # define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
+@@ -45,4 +55,23 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+ const unsigned char ivec[16]);
+ # endif /* HWSM4_CAPABLE */
+
++#ifdef VPSM4_CAPABLE
++int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++# endif /* VPSM4_CAPABLE */
++
++
+ #endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index b9633f83ed..db7fe0fe2f 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -32,6 +32,13 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ # endif
+ } else
+ # endif /* HWSM4_CAPABLE */
++# ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt);
++ ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks;
++ } else
++# endif /* VPSM4_CAPABLE */
+ {
+ ossl_sm4_set_key(key, ks);
+ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 4cd3d3d669..9a2e99f67c 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -41,6 +41,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ #endif
+ (void)0; /* terminate potentially open 'else' */
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_encrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
++ else if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks;
++ } else
+ #endif
+ {
+ ossl_sm4_set_key(key, ks);
+@@ -61,6 +74,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
+ #endif
+ } else
++#endif
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ vpsm4_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)vpsm4_decrypt;
++ ctx->stream.cbc = NULL;
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
++ else if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
++ } else
+ #endif
+ {
+ ossl_sm4_set_key(key, ks);
+--
+2.37.3.windows.1
+
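For reference, the round structure that the vpsm4 code above spells out in its comments (B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) in sm4_1blk, and the shift-insert sequence by 2, 10, 18 and 24 bits in sbox_1word) is the standard SM4 round function. A minimal plain-C sketch of one round, with SM4_SBOX as a stand-in name for the 256-byte table that the .Lsbox data above encodes:

    #include <stdint.h>

    extern const uint8_t SM4_SBOX[256];   /* stand-in for the .Lsbox table above */

    static uint32_t rotl32(uint32_t x, int n)
    {
        return (x << n) | (x >> (32 - n));
    }

    /* tau: S-box applied to each byte of a word (what sbox_1word does with tbl) */
    static uint32_t sm4_tau(uint32_t x)
    {
        return ((uint32_t)SM4_SBOX[(x >> 24) & 0xff] << 24) |
               ((uint32_t)SM4_SBOX[(x >> 16) & 0xff] << 16) |
               ((uint32_t)SM4_SBOX[(x >>  8) & 0xff] <<  8) |
                (uint32_t)SM4_SBOX[x & 0xff];
    }

    /* One round: B0 ^= L(tau(B1 ^ B2 ^ B3 ^ rk)), with
     * L(x) = x ^ rotl(x,2) ^ rotl(x,10) ^ rotl(x,18) ^ rotl(x,24),
     * the same rotations the assembly builds from ushr/sli pairs. */
    static void sm4_round(uint32_t b[4], uint32_t rk)
    {
        uint32_t x = sm4_tau(b[1] ^ b[2] ^ b[3] ^ rk);

        b[0] ^= x ^ rotl32(x, 2) ^ rotl32(x, 10) ^ rotl32(x, 18) ^ rotl32(x, 24);
    }

Running this 32 times while rotating the roles of the four words, then emitting them in reverse order, is what encrypt_1blk_norev does with eight iterations of sm4_1blk (four rounds and four round keys per iteration).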
diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
new file mode 100644
index 0000000..c68f1a0
--- /dev/null
+++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
@@ -0,0 +1,1228 @@
+From 1cd480c10b8bbaa6f72d503494ff2973672ec0e4 Mon Sep 17 00:00:00 2001
+From: Daniel Hu <Daniel.Hu@arm.com>
+Date: Tue, 19 Oct 2021 22:49:05 +0100
+Subject: [PATCH 05/13] SM4 optimization for ARM by HW instruction
+
+This patch implements the SM4 optimization for ARM processors,
+using the SM4 HW instructions, which are an optional feature of
+the crypto extension for aarch64 v8.
+
+Tested on some modern ARM micro-architectures with SM4 support, the
+performance uplift can be around 8X~40X over the existing
+C implementation in OpenSSL. Algorithms that can be parallelized
+(like CTR, ECB and CBC decryption) are at the higher end, with
+algorithms like CBC encryption at the lower end (due to the
+inter-block dependency)
+
+Perf data on Yitian-710 2.75GHz hardware, before and after optimization:
+
+Before:
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k
+ SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k
+ SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k
+
+After (7.4x - 36.6x faster):
+ type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+ SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k
+ SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k
+ SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k
+
+Signed-off-by: Daniel Hu <Daniel.Hu@arm.com>
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17455)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 1 +
+ crypto/armcap.c | 10 +
+ crypto/evp/e_sm4.c | 193 ++++--
+ crypto/sm4/asm/sm4-armv8.pl | 635 ++++++++++++++++++
+ crypto/sm4/build.info | 32 +-
+ include/crypto/sm4_platform.h | 48 ++
+ .../implementations/ciphers/cipher_sm4.h | 1 +
+ .../ciphers/cipher_sm4_gcm_hw.c | 20 +-
+ .../implementations/ciphers/cipher_sm4_hw.c | 57 +-
+ 10 files changed, 945 insertions(+), 60 deletions(-)
+ create mode 100755 crypto/sm4/asm/sm4-armv8.pl
+ create mode 100644 include/crypto/sm4_platform.h
+
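The remark about CBC encryption sitting at the low end is a property of the mode rather than of the SM4 instructions: CBC feeds each ciphertext block into the next plaintext block, so blocks must be produced one after another, while CTR keystream blocks depend only on the counter and can be computed several at a time. A rough serial C sketch of both data flows, with sm4_encrypt_block as a placeholder for the accelerated one-block routine this patch adds, shows the difference:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* placeholder one-block primitive (e.g. the new sm4_v8_encrypt) */
    void sm4_encrypt_block(const uint8_t in[16], uint8_t out[16], const void *key);

    /* CBC encryption: block i needs ciphertext block i-1, so the loop is
     * inherently serial; this is the inter-block dependency noted above. */
    void cbc_encrypt(const uint8_t *in, uint8_t *out, size_t nblocks,
                     const void *key, uint8_t iv[16])
    {
        uint8_t buf[16];

        for (size_t i = 0; i < nblocks; i++) {
            for (int j = 0; j < 16; j++)
                buf[j] = in[16 * i + j] ^ iv[j];
            sm4_encrypt_block(buf, out + 16 * i, key);
            memcpy(iv, out + 16 * i, 16);   /* next block depends on this result */
        }
    }

    /* CTR: each keystream block depends only on the counter, so 4 or 8 blocks
     * can be encrypted at once, which is what the assembly below does by
     * keeping several vector registers in flight. */
    void ctr_encrypt(const uint8_t *in, uint8_t *out, size_t nblocks,
                     const void *key, uint8_t ctr[16])
    {
        uint8_t ks[16];

        for (size_t i = 0; i < nblocks; i++) {
            sm4_encrypt_block(ctr, ks, key);   /* independent of the data */
            for (int j = 0; j < 16; j++)
                out[16 * i + j] = in[16 * i + j] ^ ks[j];
            for (int j = 15; j >= 12; j--)     /* bump the low 32-bit counter */
                if (++ctr[j] != 0)
                    break;
        }
    }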
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 10d267b7ad..36af3e075b 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -80,6 +80,14 @@ _armv8_pmull_probe:
+ ret
+ .size _armv8_pmull_probe,.-_armv8_pmull_probe
+
++.globl _armv8_sm4_probe
++.type _armv8_sm4_probe,%function
++_armv8_sm4_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xcec08400 // sm4e v0.4s, v0.4s
++ ret
++.size _armv8_sm4_probe,.-_armv8_sm4_probe
++
+ .globl _armv8_sha512_probe
+ .type _armv8_sha512_probe,%function
+ _armv8_sha512_probe:
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index c8b501f34c..5b5af31d92 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -85,6 +85,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_CPUID (1<<7)
+ # define ARMV8_RNG (1<<8)
+ # define ARMV8_SM3 (1<<9)
++# define ARMV8_SM4 (1<<10)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index 365a48df45..c5aa062767 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -53,6 +53,7 @@ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
+ void _armv8_sm3_probe(void);
++void _armv8_sm4_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -139,6 +140,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
+ # define HWCAP_CE_SM3 (1 << 18)
++# define HWCAP_CE_SM4 (1 << 19)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -207,6 +209,9 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+
+ # ifdef __aarch64__
++ if (hwcap & HWCAP_CE_SM4)
++ OPENSSL_armcap_P |= ARMV8_SM4;
++
+ if (hwcap & HWCAP_CE_SHA512)
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+
+@@ -254,6 +259,11 @@ void OPENSSL_cpuid_setup(void)
+ OPENSSL_armcap_P |= ARMV8_SHA256;
+ }
+ # if defined(__aarch64__) && !defined(__APPLE__)
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm4_probe();
++ OPENSSL_armcap_P |= ARMV8_SM4;
++ }
++
+ if (sigsetjmp(ill_jmp, 1) == 0) {
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
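Everything in this patch keys off the single ARMV8_SM4 capability bit that the armcap.c hunk above sets, either from the HWCAP_CE_SM4 auxv bit or from the SIGILL probe. Outside of OpenSSL's armcap machinery, the same runtime check can be made on Linux/aarch64 straight from the auxiliary vector; a minimal sketch using the same (1 << 19) bit value:

    #include <stdio.h>
    #include <sys/auxv.h>              /* getauxval(), AT_HWCAP (Linux-specific) */

    #define HWCAP_CE_SM4 (1UL << 19)   /* same bit value as in the hunk above */

    int main(void)
    {
        unsigned long hwcap = getauxval(AT_HWCAP);

        if (hwcap & HWCAP_CE_SM4)
            printf("SM4 instructions present: the sm4e/sm4ekey path is usable\n");
        else
            printf("SM4 instructions absent: fall back to the C implementation\n");
        return 0;
    }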
+diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c
+index abd603015c..bff79ff197 100644
+--- a/crypto/evp/e_sm4.c
++++ b/crypto/evp/e_sm4.c
+@@ -17,92 +17,187 @@
+ # include <openssl/modes.h>
+ # include "crypto/sm4.h"
+ # include "crypto/evp.h"
++# include "crypto/sm4_platform.h"
+ # include "evp_local.h"
+
+ typedef struct {
+- SM4_KEY ks;
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks;
++ block128_f block;
++ union {
++ ecb128_f ecb;
++ cbc128_f cbc;
++ ctr128_f ctr;
++ } stream;
+ } EVP_SM4_KEY;
+
++# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \
++static const EVP_CIPHER sm4_##mode = { \
++ nid##_##nmode,blocksize,128/8,ivlen, \
++ flags|EVP_CIPH_##MODE##_MODE, \
++ EVP_ORIG_GLOBAL, \
++ sm4_init_key, \
++ sm4_##mode##_cipher, \
++ NULL, \
++ sizeof(EVP_SM4_KEY), \
++ NULL,NULL,NULL,NULL }; \
++const EVP_CIPHER *EVP_sm4_##mode(void) \
++{ return &sm4_##mode; }
++
++#define DEFINE_BLOCK_CIPHERS(nid,flags) \
++ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \
++ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags)
++
+ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key,
+ const unsigned char *iv, int enc)
+ {
+- ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ int mode;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ mode = EVP_CIPHER_CTX_get_mode(ctx);
++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE)
++ && !enc) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_decrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++# endif
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_decrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
++ } else
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, &dat->ks.ks);
++ dat->block = (block128_f) HWSM4_encrypt;
++ dat->stream.cbc = NULL;
++# ifdef HWSM4_cbc_encrypt
++ if (mode == EVP_CIPH_CBC_MODE)
++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt;
++ else
++# endif
++# ifdef HWSM4_ecb_encrypt
++ if (mode == EVP_CIPH_ECB_MODE)
++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt;
++ else
++# endif
++# ifdef HWSM4_ctr32_encrypt_blocks
++ if (mode == EVP_CIPH_CTR_MODE)
++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++ else
++# endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ dat->block = (block128_f) ossl_sm4_encrypt;
++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx));
++ }
+ return 1;
+ }
+
+-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out,
+- size_t len, const SM4_KEY *key,
+- unsigned char *ivec, const int enc)
++static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- CRYPTO_cbc128_encrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (dat->stream.cbc)
++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
++ else if (EVP_CIPHER_CTX_is_encrypting(ctx))
++ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv,
++ dat->block);
+ else
+- CRYPTO_cbc128_decrypt(in, out, len, key, ivec,
+- (block128_f)ossl_sm4_decrypt);
++ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks,
++ ctx->iv, dat->block);
++ return 1;
+ }
+
+-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num, const int enc)
++static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc,
+- (block128_f)ossl_sm4_encrypt);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
++
++ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num,
++ EVP_CIPHER_CTX_is_encrypting(ctx), dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
+ }
+
+-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+- const SM4_KEY *key, const int enc)
++static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- if (enc)
+- ossl_sm4_encrypt(in, out, key);
++ size_t bl = EVP_CIPHER_CTX_get_block_size(ctx);
++ size_t i;
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++
++ if (len < bl)
++ return 1;
++
++ if (dat->stream.ecb != NULL)
++ (*dat->stream.ecb) (in, out, len, &dat->ks.ks,
++ EVP_CIPHER_CTX_is_encrypting(ctx));
+ else
+- ossl_sm4_decrypt(in, out, key);
++ for (i = 0, len -= bl; i <= len; i += bl)
++ (*dat->block) (in + i, out + i, &dat->ks);
++
++ return 1;
+ }
+
+-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out,
+- size_t length, const SM4_KEY *key,
+- unsigned char *ivec, int *num)
++static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
++ const unsigned char *in, size_t len)
+ {
+- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num,
+- (block128_f)ossl_sm4_encrypt);
+-}
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
++ int num = EVP_CIPHER_CTX_get_num(ctx);
+
+-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4,
+- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1,
+- sm4_init_key, 0, 0, 0, 0)
++ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks,
++ ctx->iv, &num, dat->block);
++ EVP_CIPHER_CTX_set_num(ctx, num);
++ return 1;
++}
+
+ static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
+ const unsigned char *in, size_t len)
+ {
+ int n = EVP_CIPHER_CTX_get_num(ctx);
+ unsigned int num;
+- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx);
++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx);
+
+ if (n < 0)
+ return 0;
+ num = (unsigned int)n;
+
+- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv,
+- EVP_CIPHER_CTX_buf_noconst(ctx), &num,
+- (block128_f)ossl_sm4_encrypt);
++ if (dat->stream.ctr)
++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx),
++ &num, dat->stream.ctr);
++ else
++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks,
++ ctx->iv,
++ EVP_CIPHER_CTX_buf_noconst(ctx), &num,
++ dat->block);
+ EVP_CIPHER_CTX_set_num(ctx, num);
+ return 1;
+ }
+
+-static const EVP_CIPHER sm4_ctr_mode = {
+- NID_sm4_ctr, 1, 16, 16,
+- EVP_CIPH_CTR_MODE,
+- EVP_ORIG_GLOBAL,
+- sm4_init_key,
+- sm4_ctr_cipher,
+- NULL,
+- sizeof(EVP_SM4_KEY),
+- NULL, NULL, NULL, NULL
+-};
+-
+-const EVP_CIPHER *EVP_sm4_ctr(void)
+-{
+- return &sm4_ctr_mode;
+-}
+-
++DEFINE_BLOCK_CIPHERS(NID_sm4, 0)
+ #endif
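The reworked e_sm4.c above chooses an implementation once, at key-setup time, and records it as function pointers inside EVP_SM4_KEY, so the per-call cipher routines never re-test the CPU capability. A stripped-down sketch of that dispatch pattern, with stub backends standing in for HWSM4_encrypt and ossl_sm4_encrypt:

    #include <stdint.h>
    #include <stdio.h>

    /* one-block primitive type, mirroring block128_f above */
    typedef void (*block_fn)(const uint8_t in[16], uint8_t out[16]);

    /* stub backends; the real ones are the HW and C SM4 implementations */
    static void hw_block(const uint8_t in[16], uint8_t out[16]) { (void)in; (void)out; }
    static void sw_block(const uint8_t in[16], uint8_t out[16]) { (void)in; (void)out; }

    struct sm4_ctx {
        block_fn block;     /* chosen once; mode code only ever calls this */
    };

    static void sm4_init(struct sm4_ctx *c, int hw_capable)
    {
        c->block = hw_capable ? hw_block : sw_block;   /* decided at key setup */
    }

    int main(void)
    {
        struct sm4_ctx c;
        uint8_t in[16] = {0}, out[16];

        sm4_init(&c, 0);    /* pretend the SM4 extension is absent */
        c.block(in, out);   /* no capability check on the hot path */
        printf("dispatched via the %s backend\n", c.block == hw_block ? "hw" : "sw");
        return 0;
    }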
+diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl
+new file mode 100755
+index 0000000000..7358a6e6a2
+--- /dev/null
++++ b/crypto/sm4/asm/sm4-armv8.pl
+@@ -0,0 +1,635 @@
++#! /usr/bin/env perl
++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# This module implements SM4 hw support on aarch64
++# Oct 2021
++#
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++$prefix="sm4_v8";
++my @rks=map("v$_",(0..7));
++
++sub rev32() {
++my $dst = shift;
++my $src = shift;
++$code.=<<___;
++#ifndef __ARMEB__
++ rev32 $dst.16b,$src.16b
++#endif
++___
++}
++
++sub enc_blk () {
++my $data = shift;
++$code.=<<___;
++ sm4e $data.4s,@rks[0].4s
++ sm4e $data.4s,@rks[1].4s
++ sm4e $data.4s,@rks[2].4s
++ sm4e $data.4s,@rks[3].4s
++ sm4e $data.4s,@rks[4].4s
++ sm4e $data.4s,@rks[5].4s
++ sm4e $data.4s,@rks[6].4s
++ sm4e $data.4s,@rks[7].4s
++ rev64 $data.4S,$data.4S
++ ext $data.16b,$data.16b,$data.16b,#8
++___
++}
++
++sub enc_4blks () {
++my $data0 = shift;
++my $data1 = shift;
++my $data2 = shift;
++my $data3 = shift;
++$code.=<<___;
++ sm4e $data0.4s,@rks[0].4s
++ sm4e $data1.4s,@rks[0].4s
++ sm4e $data2.4s,@rks[0].4s
++ sm4e $data3.4s,@rks[0].4s
++
++ sm4e $data0.4s,@rks[1].4s
++ sm4e $data1.4s,@rks[1].4s
++ sm4e $data2.4s,@rks[1].4s
++ sm4e $data3.4s,@rks[1].4s
++
++ sm4e $data0.4s,@rks[2].4s
++ sm4e $data1.4s,@rks[2].4s
++ sm4e $data2.4s,@rks[2].4s
++ sm4e $data3.4s,@rks[2].4s
++
++ sm4e $data0.4s,@rks[3].4s
++ sm4e $data1.4s,@rks[3].4s
++ sm4e $data2.4s,@rks[3].4s
++ sm4e $data3.4s,@rks[3].4s
++
++ sm4e $data0.4s,@rks[4].4s
++ sm4e $data1.4s,@rks[4].4s
++ sm4e $data2.4s,@rks[4].4s
++ sm4e $data3.4s,@rks[4].4s
++
++ sm4e $data0.4s,@rks[5].4s
++ sm4e $data1.4s,@rks[5].4s
++ sm4e $data2.4s,@rks[5].4s
++ sm4e $data3.4s,@rks[5].4s
++
++ sm4e $data0.4s,@rks[6].4s
++ sm4e $data1.4s,@rks[6].4s
++ sm4e $data2.4s,@rks[6].4s
++ sm4e $data3.4s,@rks[6].4s
++
++ sm4e $data0.4s,@rks[7].4s
++ rev64 $data0.4S,$data0.4S
++ sm4e $data1.4s,@rks[7].4s
++ ext $data0.16b,$data0.16b,$data0.16b,#8
++ rev64 $data1.4S,$data1.4S
++ sm4e $data2.4s,@rks[7].4s
++ ext $data1.16b,$data1.16b,$data1.16b,#8
++ rev64 $data2.4S,$data2.4S
++ sm4e $data3.4s,@rks[7].4s
++ ext $data2.16b,$data2.16b,$data2.16b,#8
++ rev64 $data3.4S,$data3.4S
++ ext $data3.16b,$data3.16b,$data3.16b,#8
++___
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8-a+crypto
++.text
++___
++
++{{{
++$code.=<<___;
++.align 6
++.Lck:
++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
++.Lfk:
++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_encrypt_key
++.type ${prefix}_set_encrypt_key,%function
++.align 5
++${prefix}_set_encrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp,.Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b,$key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys]
++ ret
++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
++___
++}}}
++
++{{{
++my ($key,$keys)=("x0","x1");
++my ($tmp)=("x2");
++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7));
++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23));
++my ($fkconst) = ("v24");
++$code.=<<___;
++.globl ${prefix}_set_decrypt_key
++.type ${prefix}_set_decrypt_key,%function
++.align 5
++${prefix}_set_decrypt_key:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$key0.4s},[$key]
++ adr $tmp,.Lfk
++ ld1 {$fkconst.4s},[$tmp]
++ adr $tmp, .Lck
++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64
++___
++ &rev32($key0, $key0);
++$code.=<<___;
++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp]
++ eor $key0.16b, $key0.16b,$fkconst.16b;
++ sm4ekey $key0.4S,$key0.4S,$const0.4S
++ sm4ekey $key1.4S,$key0.4S,$const1.4S
++ sm4ekey $key2.4S,$key1.4S,$const2.4S
++ rev64 $key0.4s,$key0.4s
++ rev64 $key1.4s,$key1.4s
++ ext $key0.16b,$key0.16b,$key0.16b,#8
++ ext $key1.16b,$key1.16b,$key1.16b,#8
++ sm4ekey $key3.4S,$key2.4S,$const3.4S
++ sm4ekey $key4.4S,$key3.4S,$const4.4S
++ rev64 $key2.4s,$key2.4s
++ rev64 $key3.4s,$key3.4s
++ ext $key2.16b,$key2.16b,$key2.16b,#8
++ ext $key3.16b,$key3.16b,$key3.16b,#8
++ sm4ekey $key5.4S,$key4.4S,$const5.4S
++ sm4ekey $key6.4S,$key5.4S,$const6.4S
++ rev64 $key4.4s,$key4.4s
++ rev64 $key5.4s,$key5.4s
++ ext $key4.16b,$key4.16b,$key4.16b,#8
++ ext $key5.16b,$key5.16b,$key5.16b,#8
++ sm4ekey $key7.4S,$key6.4S,$const7.4S
++ rev64 $key6.4s, $key6.4s
++ rev64 $key7.4s, $key7.4s
++ ext $key6.16b,$key6.16b,$key6.16b,#8
++ ext $key7.16b,$key7.16b,$key7.16b,#8
++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64
++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys]
++ ret
++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
++___
++}}}
++
++{{{
++sub gen_block () {
++my $dir = shift;
++my ($inp,$out,$rk)=map("x$_",(0..2));
++my ($data)=("v16");
++$code.=<<___;
++.globl ${prefix}_${dir}crypt
++.type ${prefix}_${dir}crypt,%function
++.align 5
++${prefix}_${dir}crypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {$data.4s},[$inp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($data,$data);
++ &enc_blk($data);
++ &rev32($data,$data);
++$code.=<<___;
++ st1 {$data.4s},[$out]
++ ret
++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
++___
++}
++
++&gen_block("en");
++&gen_block("de");
++}}}
++
++{{{
++my ($inp,$out,$len,$rk)=map("x$_",(0..3));
++my ($enc) = ("w4");
++my @dat=map("v$_",(16..23));
++$code.=<<___;
++.globl ${prefix}_ecb_encrypt
++.type ${prefix}_ecb_encrypt,%function
++.align 5
++${prefix}_ecb_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++1:
++ cmp $len,#64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64
++ // 8 blocks
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++___
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#128
++ b.gt 1b
++ ret
++ // 4 blocks
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 1f
++ ld1 {@dat[0].4s},[$inp],#16
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ st1 {@dat[0].4s},[$out],#16
++ b.ne 1b
++1:
++ ret
++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($enc) = ("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec) = ("v8");
++$code.=<<___;
++.globl ${prefix}_cbc_encrypt
++.type ${prefix}_cbc_encrypt,%function
++.align 5
++${prefix}_cbc_encrypt:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++ ld1 {$ivec.4s},[$ivp]
++ cmp $enc,#0
++ b.eq .Ldec
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++___
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &enc_blk(@dat[0]);
++$code.=<<___;
++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b
++___
++ &enc_blk(@dat[1]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b
++___
++ &enc_blk(@dat[2]);
++ &rev32(@dat[1],@dat[1]);
++$code.=<<___;
++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b
++___
++ &enc_blk(@dat[3]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ mov $ivec.16b,@dat[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.ne 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ eor $ivec.16b,$ivec.16b,@dat[0].16b
++___
++ &rev32($ivec,$ivec);
++ &enc_blk($ivec);
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ st1 {$ivec.16b},[$out],#16
++ b.ne 1b
++ b 3f
++.Ldec:
++1:
++ cmp $len, #64
++ b.lt 1f
++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp]
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ cmp $len,#128
++ b.lt 2f
++ // 8 blocks mode
++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp]
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],$dat[7]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ mov $ivec.16b,@in[7].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ eor @dat[4].16b,$dat[4].16b,@in[3].16b
++ eor @dat[5].16b,$dat[5].16b,@in[4].16b
++ eor @dat[6].16b,$dat[6].16b,@in[5].16b
++ eor @dat[7].16b,$dat[7].16b,@in[6].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,128
++ b.gt 1b
++ b 3f
++ // 4 blocks mode
++2:
++___
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],$dat[3]);
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ eor @dat[1].16b,@dat[1].16b,@in[0].16b
++ mov $ivec.16b,@in[3].16b
++ eor @dat[2].16b,@dat[2].16b,@in[1].16b
++ eor @dat[3].16b,$dat[3].16b,@in[2].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#64
++ b.gt 1b
++1:
++ subs $len,$len,#16
++ b.lt 3f
++ ld1 {@dat[0].4s},[$inp],#16
++ mov @in[0].16b,@dat[0].16b
++___
++ &rev32(@dat[0],@dat[0]);
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,$ivec.16b
++ mov $ivec.16b,@in[0].16b
++ st1 {@dat[0].16b},[$out],#16
++ b.ne 1b
++3:
++ // save back IV
++ st1 {$ivec.16b},[$ivp]
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
++___
++}}}
++
++{{{
++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4));
++my ($ctr)=("w5");
++my @dat=map("v$_",(16..23));
++my @in=map("v$_",(24..31));
++my ($ivec)=("v8");
++$code.=<<___;
++.globl ${prefix}_ctr32_encrypt_blocks
++.type ${prefix}_ctr32_encrypt_blocks,%function
++.align 5
++${prefix}_ctr32_encrypt_blocks:
++ AARCH64_VALID_CALL_TARGET
++ stp d8,d9,[sp, #-16]!
++
++ ld1 {$ivec.4s},[$ivp]
++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64
++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk]
++___
++ &rev32($ivec,$ivec);
++$code.=<<___;
++ mov $ctr,$ivec.s[3]
++1:
++ cmp $len,#4
++ b.lt 1f
++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64
++ mov @dat[0].16b,$ivec.16b
++ mov @dat[1].16b,$ivec.16b
++ mov @dat[2].16b,$ivec.16b
++ mov @dat[3].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[1].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[2].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[3].s[3],$ctr
++ cmp $len,#8
++ b.lt 2f
++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64
++ mov @dat[4].16b,$ivec.16b
++ mov @dat[5].16b,$ivec.16b
++ mov @dat[6].16b,$ivec.16b
++ mov @dat[7].16b,$ivec.16b
++ add $ctr,$ctr,#1
++ mov $dat[4].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[5].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[6].s[3],$ctr
++ add $ctr,$ctr,#1
++ mov @dat[7].s[3],$ctr
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++ &rev32(@dat[4],@dat[4]);
++ &rev32(@dat[5],@dat[5]);
++ &rev32(@dat[6],@dat[6]);
++ &rev32(@dat[7],@dat[7]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ eor @dat[4].16b,@dat[4].16b,@in[4].16b
++ eor @dat[5].16b,@dat[5].16b,@in[5].16b
++ eor @dat[6].16b,@dat[6].16b,@in[6].16b
++ eor @dat[7].16b,@dat[7].16b,@in[7].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64
++ subs $len,$len,#8
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++2:
++___
++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]);
++ &rev32(@dat[0],@dat[0]);
++ &rev32(@dat[1],@dat[1]);
++ &rev32(@dat[2],@dat[2]);
++ &rev32(@dat[3],@dat[3]);
++$code.=<<___;
++ eor @dat[0].16b,@dat[0].16b,@in[0].16b
++ eor @dat[1].16b,@dat[1].16b,@in[1].16b
++ eor @dat[2].16b,@dat[2].16b,@in[2].16b
++ eor @dat[3].16b,@dat[3].16b,@in[3].16b
++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64
++ subs $len,$len,#4
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++1:
++ subs $len,$len,#1
++ b.lt 3f
++ mov $dat[0].16b,$ivec.16b
++ ld1 {@in[0].4s},[$inp],#16
++___
++ &enc_blk(@dat[0]);
++ &rev32(@dat[0],@dat[0]);
++$code.=<<___;
++ eor $dat[0].16b,$dat[0].16b,@in[0].16b
++ st1 {$dat[0].4s},[$out],#16
++ b.eq 3f
++ add $ctr,$ctr,#1
++ mov $ivec.s[3],$ctr
++ b 1b
++3:
++ ldp d8,d9,[sp],#16
++ ret
++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
++___
++}}}
++########################################
++{ my %opcode = (
++ "sm4e" => 0xcec08400,
++ "sm4ekey" => 0xce60c800);
++
++ sub unsm4 {
++ my ($mnemonic,$arg)=@_;
++
++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
++ &&
++ sprintf ".inst\t0x%08x\t//%s %s",
++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
++ $mnemonic,$arg;
++ }
++}
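For reference, the opcode table and unsm4() helper above let the script emit the SM4 hardware instructions even with assemblers that do not know the sm4e/sm4ekey mnemonics: the register numbers are extracted from the operands and OR-ed into the base opcode. As a worked example under this encoding scheme, "sm4e v0.4s, v16.4s" gives 0xcec08400 | 0 | (16 << 5) = 0xcec08600 and would be emitted as something like ".inst 0xcec08600 // sm4e v0.4s, v16.4s".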
++
++open SELF,$0;
++while(<SELF>) {
++ next if (/^#!/);
++ last if (!s/^#/\/\// and !/^$/);
++ print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {
++ s/\`([^\`]*)\`/eval($1)/ge;
++
++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info
+index b65a7d149e..e27aa49e67 100644
+--- a/crypto/sm4/build.info
++++ b/crypto/sm4/build.info
+@@ -1,4 +1,32 @@
+ LIBS=../../libcrypto
+-SOURCE[../../libcrypto]=\
+- sm4.c
+
++IF[{- !$disabled{asm} -}]
++ $SM4DEF_aarch64=SM4_ASM
++ $SM4ASM_aarch64=sm4-armv8.S
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate one, and define the appropriate macros
++ IF[$SM4ASM_{- $target{asm_arch} -}]
++ $SM4ASM=$SM4ASM_{- $target{asm_arch} -}
++ $SM4DEF=$SM4DEF_{- $target{asm_arch} -}
++ ENDIF
++ENDIF
++
++SOURCE[../../libcrypto]= $SM4ASM sm4.c
++
++
++# Implementations are now spread across several libraries, so the defines
++# need to be applied to all affected libraries and modules.
++DEFINE[../../libcrypto]=$SM4DEF
++DEFINE[../../providers/libfips.a]=$SM4DEF
++DEFINE[../../providers/libdefault.a]=$SM4DEF
++# We only need to include the SM4DEF stuff in the legacy provider when it's a
++# separate module and it's dynamically linked with libcrypto. Otherwise, it
++# already gets everything that the static libcrypto.a has, and doesn't need it
++# added again.
++IF[{- !$disabled{module} && !$disabled{shared} -}]
++ DEFINE[../providers/liblegacy.a]=$SM4DEF
++ENDIF
++
++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl
++INCLUDE[sm4-armv8.o]=..
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+new file mode 100644
+index 0000000000..42c8b44a43
+--- /dev/null
++++ b/include/crypto/sm4_platform.h
+@@ -0,0 +1,48 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#ifndef OSSL_SM4_PLATFORM_H
++# define OSSL_SM4_PLATFORM_H
++# pragma once
++
++# if defined(OPENSSL_CPUID_OBJ)
++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
++# include "arm_arch.h"
++# if __ARM_MAX_ARCH__>=8
++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key
++# define HWSM4_encrypt sm4_v8_encrypt
++# define HWSM4_decrypt sm4_v8_decrypt
++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt
++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt
++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks
++# endif
++# endif
++# endif /* OPENSSL_CPUID_OBJ */
++
++# if defined(HWSM4_CAPABLE)
++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void HWSM4_encrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_decrypt(const unsigned char *in, unsigned char *out,
++ const SM4_KEY *key);
++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ unsigned char *ivec, const int enc);
++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out,
++ size_t length, const SM4_KEY *key,
++ const int enc);
++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++ size_t len, const void *key,
++ const unsigned char ivec[16]);
++# endif /* HWSM4_CAPABLE */
++
++#endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h
+index f7f833fcb4..01a031a74d 100644
+--- a/providers/implementations/ciphers/cipher_sm4.h
++++ b/providers/implementations/ciphers/cipher_sm4.h
+@@ -9,6 +9,7 @@
+
+ #include "prov/ciphercommon.h"
+ #include "crypto/sm4.h"
++#include "crypto/sm4_platform.h"
+
+ typedef struct prov_cast_ctx_st {
+ PROV_CIPHER_CTX base; /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+index 6bcd1ec406..c0c9b22bd3 100644
+--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -12,6 +12,7 @@
+ */
+
+ #include "cipher_sm4_gcm.h"
++#include "crypto/sm4_platform.h"
+
+ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ size_t keylen)
+@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
+ SM4_KEY *ks = &actx->ks.ks;
+
+ ctx->ks = ks;
+- ossl_sm4_set_key(key, ks);
+- CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
+- ctx->ctr = (ctr128_f)NULL;
++# ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt);
++# ifdef HWSM4_ctr32_encrypt_blocks
++ ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks;
++# else /* HWSM4_ctr32_encrypt_blocks */
++ ctx->ctr = (ctr128_f)NULL;
++# endif
++ } else
++# endif /* HWSM4_CAPABLE */
++ {
++ ossl_sm4_set_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
++ ctx->ctr = (ctr128_f)NULL;
++ }
+ ctx->key_set = 1;
+
+ return 1;
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 0db04b1a74..4cd3d3d669 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ PROV_SM4_CTX *sctx = (PROV_SM4_CTX *)ctx;
+ SM4_KEY *ks = &sctx->ks.ks;
+
+- ossl_sm4_set_key(key, ks);
+ ctx->ks = ks;
+ if (ctx->enc
+ || (ctx->mode != EVP_CIPH_ECB_MODE
+- && ctx->mode != EVP_CIPH_CBC_MODE))
+- ctx->block = (block128_f)ossl_sm4_encrypt;
+- else
+- ctx->block = (block128_f)ossl_sm4_decrypt;
++ && ctx->mode != EVP_CIPH_CBC_MODE)) {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_encrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_encrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++ else
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++ else
++#endif
++#ifdef HWSM4_ctr32_encrypt_blocks
++ if (ctx->mode == EVP_CIPH_CTR_MODE)
++ ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks;
++ else
++#endif
++ (void)0; /* terminate potentially open 'else' */
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_encrypt;
++ }
++ } else {
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ HWSM4_set_decrypt_key(key, ks);
++ ctx->block = (block128_f)HWSM4_decrypt;
++ ctx->stream.cbc = NULL;
++#ifdef HWSM4_cbc_encrypt
++ if (ctx->mode == EVP_CIPH_CBC_MODE)
++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt;
++#endif
++#ifdef HWSM4_ecb_encrypt
++ if (ctx->mode == EVP_CIPH_ECB_MODE)
++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt;
++#endif
++ } else
++#endif
++ {
++ ossl_sm4_set_key(key, ks);
++ ctx->block = (block128_f)ossl_sm4_decrypt;
++ }
++ }
++
+ return 1;
+ }
+
+@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX)
+ # define PROV_CIPHER_HW_sm4_mode(mode) \
+ static const PROV_CIPHER_HW sm4_##mode = { \
+ cipher_hw_sm4_initkey, \
+- ossl_cipher_hw_chunked_##mode, \
++ ossl_cipher_hw_generic_##mode, \
+ cipher_hw_sm4_copyctx \
+ }; \
+ const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits) \
+--
+2.37.3.windows.1
+
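Taken together, the sm4_platform.h header and the provider changes in the patch above implement one runtime-dispatch pattern: probe OPENSSL_armcap_P once, then route through the generated sm4_v8_* assembly entry points or fall back to the portable C implementation. The sketch below distills that pattern; it is illustrative only (cipher_sm4_hw.c in the patch is the authoritative version) and omits error handling.

#include "crypto/sm4.h"
#include "crypto/sm4_platform.h"

typedef void (*sm4_block_fn)(const unsigned char *in, unsigned char *out,
                             const SM4_KEY *key);

/* Pick an SM4 block-encrypt primitive at run time, mirroring the
 * HWSM4_CAPABLE branches added to cipher_sm4_hw.c above. */
static sm4_block_fn sm4_pick_encrypt(const unsigned char *user_key, SM4_KEY *ks)
{
#ifdef HWSM4_CAPABLE
    if (HWSM4_CAPABLE) {                 /* OPENSSL_armcap_P & ARMV8_SM4 */
        HWSM4_set_encrypt_key(user_key, ks);
        return HWSM4_encrypt;            /* resolves to sm4_v8_encrypt */
    }
#endif
    ossl_sm4_set_key(user_key, ks);      /* portable table-based fallback */
    return ossl_sm4_encrypt;
}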
diff --git a/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
new file mode 100644
index 0000000..31852cb
--- /dev/null
+++ b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
@@ -0,0 +1,1521 @@
+From 44b6e3d07ae5b09255710986e61035c862ec68aa Mon Sep 17 00:00:00 2001
+From: Russ Butler <russ.butler@arm.com>
+Date: Sat, 28 Aug 2021 13:57:09 -0500
+Subject: [PATCH 01/13] aarch64: support BTI and pointer authentication in
+ assembly
+
+This change adds optional support for
+- Armv8.3-A Pointer Authentication (PAuth) and
+- Armv8.5-A Branch Target Identification (BTI)
+features to the perl scripts.
+
+Both features can be enabled with additional compiler flags.
+Unless any of these are enabled explicitly there is no code change at
+all.
+
+The extensions are briefly described below. Please read the appropriate
+chapters of the Arm Architecture Reference Manual for the complete
+specification.
+
+Scope
+-----
+
+This change only affects generated assembly code.
+
+Armv8.3-A Pointer Authentication
+--------------------------------
+
+Pointer Authentication extension supports the authentication of the
+contents of registers before they are used for indirect branching
+or load.
+
+PAuth provides a probabilistic method to detect corruption of register
+values. PAuth signing instructions generate a Pointer Authentication
+Code (PAC) based on the value of a register, a seed and a key.
+The generated PAC is inserted into the original value in the register.
+A PAuth authentication instruction recomputes the PAC, and if it matches
+the PAC in the register, restores its original value. In case of a
+mismatch, an architecturally unmapped address is generated instead.
+
+With PAuth, mitigation against ROP (Return-oriented Programming) attacks
+can be implemented. This is achieved by signing the contents of the
+link-register (LR) before it is pushed to stack. Once LR is popped,
+it is authenticated. This way a stack corruption which overwrites the
+LR on the stack is detectable.
+
+The PAuth extension adds several new instructions, some of which are not
+recognized by older hardware. To support a single codebase for both pre
+Armv8.3-A targets and newer ones, only NOP-space instructions are added
+by this patch. These instructions are treated as NOPs on hardware
+which does not support Armv8.3-A. Furthermore, this patch only considers
+cases where LR is saved to the stack and then restored before branching
+to its content. There are cases in the code where LR is pushed to stack
+but it is not used later. We do not address these cases as they are not
+affected by PAuth.
+
+There are two keys available to sign an instruction address: A and B.
+PACIASP and PACIBSP only differ in the used keys: A and B, respectively.
+The keys are typically managed by the operating system.
+
+To enable generating code for PAuth compile with
+-mbranch-protection=<mode>:
+
+- standard or pac-ret: add PACIASP and AUTIASP, also enables BTI
+ (read below)
+- pac-ret+b-key: add PACIBSP and AUTIBSP
+
+Armv8.5-A Branch Target Identification
+--------------------------------------
+
+Branch Target Identification features some new instructions which
+protect the execution of instructions on guarded pages which are not
+intended branch targets.
+
+If Armv8.5-A is supported by the hardware, execution of an instruction
+changes the value of PSTATE.BTYPE field. If an indirect branch
+lands on a guarded page the target instruction must be one of the
+BTI <jc> flavors, or in case of a direct call or jump it can be any
+other instruction. If the target instruction is not compatible with the
+value of PSTATE.BTYPE a Branch Target Exception is generated.
+
+In short, indirect jumps are compatible with BTI <j> and <jc> while
+indirect calls are compatible with BTI <c> and <jc>. Please refer to the
+specification for the details.
+
+Armv8.3-A PACIASP and PACIBSP are implicit branch target
+identification instructions which are equivalent to BTI c or BTI jc
+depending on system register configuration.
+
+BTI is used to mitigate JOP (Jump-oriented Programming) attacks by
+limiting the set of instructions which can be jumped to.
+
+BTI requires active linker support to mark the pages with BTI-enabled
+code as guarded. For ELF64 files BTI compatibility is recorded in the
+.note.gnu.property section. For a shared object or static binary it is
+required that all linked units support BTI. This means that even a
+single assembly file without the required note section turns off BTI
+for the whole binary or shared object.
+
+The new BTI instructions are treated as NOPs on hardware which does
+not support Armv8.5-A or on pages which are not guarded.
+
+To insert this new and optional instruction compile with
+-mbranch-protection=standard (also enables PAuth) or +bti.
+
+When targeting a guarded page from a non-guarded page, weaker
+compatibility restrictions apply to maintain compatibility between
+legacy and new code. For detailed rules please refer to the Arm ARM.
+
+Compiler support
+----------------
+
+Compiler support requires understanding '-mbranch-protection=<mode>'
+and emitting the appropriate feature macros (__ARM_FEATURE_BTI_DEFAULT
+and __ARM_FEATURE_PAC_DEFAULT). The current state is the following:
+
+-------------------------------------------------------
+| Compiler | -mbranch-protection | Feature macros     |
++----------+---------------------+--------------------+
+| clang    | 9.0.0               | 11.0.0             |
++----------+---------------------+--------------------+
+| gcc      | 9                   | expected in 10.1+  |
+-------------------------------------------------------
+
+Available Platforms
+------------------
+
+Arm Fast Model and QEMU support both extensions.
+
+https://developer.arm.com/tools-and-software/simulation-models/fast-models
+https://www.qemu.org/
+
+Implementation Notes
+--------------------
+
+This change adds BTI landing pads even to assembly functions which are
+likely to be directly called only. In these cases, landing pads might
+be superfluous depending on what code the linker generates.
+Code size and performance impact for these cases would be negligible.
+
+Interaction with C code
+-----------------------
+
+Pointer Authentication is a per-frame protection while Branch Target
+Identification can be turned on and off only for all code pages of a
+whole shared object or static binary. Because of these properties if
+C/C++ code is compiled without any of the above features but assembly
+files support any of them unconditionally there is no incompatibility
+between the two.
+
+Useful Links
+------------
+
+To fully understand the details of both PAuth and BTI it is advised to
+read the related chapters of the Arm Architecture Reference Manual
+(Arm ARM):
+https://developer.arm.com/documentation/ddi0487/latest/
+
+Additional materials:
+
+"Providing protection for complex software"
+https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software
+
+Arm Compiler Reference Guide Version 6.14: -mbranch-protection
+https://developer.arm.com/documentation/101754/0614/armclang-Reference/armclang-Command-line-Options/-mbranch-protection?lang=en
+
+Arm C Language Extensions (ACLE)
+https://developer.arm.com/docs/101028/latest
+
+Additional Notes
+----------------
+
+This patch is a copy of the work done by Tamas Petz in boringssl. It
+contains the changes from the following commits:
+
+aarch64: support BTI and pointer authentication in assembly
+ Change-Id: I4335f92e2ccc8e209c7d68a0a79f1acdf3aeb791
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/42084
+aarch64: Improve conditional compilation
+ Change-Id: I14902a64e5f403c2b6a117bc9f5fb1a4f4611ebf
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/43524
+aarch64: Fix name of gnu property note section
+ Change-Id: I6c432d1c852129e9c273f6469a8b60e3983671ec
+ URL: https://boringssl-review.googlesource.com/c/boringssl/+/44024
+
+Change-Id: I2d95ebc5e4aeb5610d3b226f9754ee80cf74a9af
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/16674)
+---
+ crypto/aes/asm/aesv8-armx.pl | 18 +++++++-
+ crypto/aes/asm/vpaes-armv8.pl | 39 ++++++++--------
+ crypto/aes/build.info | 1 +
+ crypto/arm64cpuid.pl | 10 +++++
+ crypto/arm_arch.h | 58 ++++++++++++++++++++++++
+ crypto/bn/asm/armv8-mont.pl | 19 +++++---
+ crypto/chacha/asm/chacha-armv8.pl | 18 ++++----
+ crypto/ec/asm/ecp_nistz256-armv8.pl | 64 ++++++++++++++++-----------
+ crypto/modes/asm/aes-gcm-armv8_64.pl | 6 +++
+ crypto/modes/asm/ghashv8-armx.pl | 11 +++++
+ crypto/poly1305/asm/poly1305-armv8.pl | 17 ++++++-
+ crypto/sha/asm/keccak1600-armv8.pl | 30 +++++++------
+ crypto/sha/asm/sha1-armv8.pl | 5 ++-
+ crypto/sha/asm/sha512-armv8.pl | 11 +++--
+ crypto/sha/build.info | 1 +
+ 15 files changed, 228 insertions(+), 80 deletions(-)
+
+diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
+index 6a7bf05d1b..ed5ae4207c 100755
+--- a/crypto/aes/asm/aesv8-armx.pl
++++ b/crypto/aes/asm/aesv8-armx.pl
+@@ -120,6 +120,8 @@ ${prefix}_set_encrypt_key:
+ .Lenc_key:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -295,7 +297,7 @@ $code.=<<___;
+ ${prefix}_set_decrypt_key:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -339,7 +341,7 @@ $code.=<<___ if ($flavour !~ /64/);
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ ___
+ $code.=<<___;
+@@ -359,6 +361,11 @@ $code.=<<___;
+ .type ${prefix}_${dir}crypt,%function
+ .align 5
+ ${prefix}_${dir}crypt:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ ldr $rounds,[$key,#240]
+ vld1.32 {$rndkey0},[$key],#16
+ vld1.8 {$inout},[$inp]
+@@ -442,6 +449,7 @@ $code.=<<___;
+ ${prefix}_ecb_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ subs $len,$len,#16
+ // Original input data size bigger than 16, jump to big size processing.
+ b.ne .Lecb_big_size
+@@ -1236,6 +1244,8 @@ $code.=<<___;
+ ${prefix}_cbc_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -1764,6 +1774,8 @@ $code.=<<___;
+ ${prefix}_ctr32_encrypt_blocks:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ ___
+@@ -2256,6 +2268,7 @@ $code.=<<___ if ($flavour =~ /64/);
+ ${prefix}_xts_encrypt:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ cmp $len,#16
+ // Original input data size bigger than 16, jump to big size processing.
+ b.ne .Lxts_enc_big_size
+@@ -2930,6 +2943,7 @@ $code.=<<___ if ($flavour =~ /64/);
+ .type ${prefix}_xts_decrypt,%function
+ .align 5
+ ${prefix}_xts_decrypt:
++ AARCH64_VALID_CALL_TARGET
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#16
+diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl
+index dcd5065e68..49988e9c2b 100755
+--- a/crypto/aes/asm/vpaes-armv8.pl
++++ b/crypto/aes/asm/vpaes-armv8.pl
+@@ -53,6 +53,8 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ *STDOUT=*OUT;
+
+ $code.=<<___;
++#include "arm_arch.h"
++
+ .text
+
+ .type _vpaes_consts,%object
+@@ -259,7 +261,7 @@ _vpaes_encrypt_core:
+ .type vpaes_encrypt,%function
+ .align 4
+ vpaes_encrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -269,7 +271,7 @@ vpaes_encrypt:
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_encrypt,.-vpaes_encrypt
+
+@@ -492,7 +494,7 @@ _vpaes_decrypt_core:
+ .type vpaes_decrypt,%function
+ .align 4
+ vpaes_decrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -502,7 +504,7 @@ vpaes_decrypt:
+ st1 {v0.16b}, [$out]
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_decrypt,.-vpaes_decrypt
+
+@@ -673,7 +675,7 @@ _vpaes_key_preheat:
+ .type _vpaes_schedule_core,%function
+ .align 4
+ _vpaes_schedule_core:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29, x30, [sp,#-16]!
+ add x29,sp,#0
+
+@@ -838,7 +840,7 @@ _vpaes_schedule_core:
+ eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6
+ eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7
+ ldp x29, x30, [sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size _vpaes_schedule_core,.-_vpaes_schedule_core
+
+@@ -1051,7 +1053,7 @@ _vpaes_schedule_mangle:
+ .type vpaes_set_encrypt_key,%function
+ .align 4
+ vpaes_set_encrypt_key:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1067,7 +1069,7 @@ vpaes_set_encrypt_key:
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key
+
+@@ -1075,7 +1077,7 @@ vpaes_set_encrypt_key:
+ .type vpaes_set_decrypt_key,%function
+ .align 4
+ vpaes_set_decrypt_key:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1095,7 +1097,7 @@ vpaes_set_decrypt_key:
+
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key
+ ___
+@@ -1108,11 +1110,11 @@ $code.=<<___;
+ .type vpaes_cbc_encrypt,%function
+ .align 4
+ vpaes_cbc_encrypt:
++ AARCH64_SIGN_LINK_REGISTER
+ cbz $len, .Lcbc_abort
+ cmp w5, #0 // check direction
+ b.eq vpaes_cbc_decrypt
+
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1135,15 +1137,16 @@ vpaes_cbc_encrypt:
+ st1 {v0.16b}, [$ivec] // write ivec
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
+ .Lcbc_abort:
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt
+
+ .type vpaes_cbc_decrypt,%function
+ .align 4
+ vpaes_cbc_decrypt:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to
++ // only from vpaes_cbc_encrypt which has already signed the return address.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1185,7 +1188,7 @@ vpaes_cbc_decrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt
+ ___
+@@ -1195,7 +1198,7 @@ $code.=<<___;
+ .type vpaes_ecb_encrypt,%function
+ .align 4
+ vpaes_ecb_encrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1229,7 +1232,7 @@ vpaes_ecb_encrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt
+
+@@ -1237,7 +1240,7 @@ vpaes_ecb_encrypt:
+ .type vpaes_ecb_decrypt,%function
+ .align 4
+ vpaes_ecb_decrypt:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#-16]! // ABI spec says so
+@@ -1271,7 +1274,7 @@ vpaes_ecb_decrypt:
+ ldp d10,d11,[sp],#16
+ ldp d8,d9,[sp],#16
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt
+ ___
+diff --git a/crypto/aes/build.info b/crypto/aes/build.info
+index b250903fa6..47f99fdf33 100644
+--- a/crypto/aes/build.info
++++ b/crypto/aes/build.info
+@@ -116,6 +116,7 @@ INCLUDE[aes-mips.o]=..
+ GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl
+ INCLUDE[aesv8-armx.o]=..
+ GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl
++INCLUDE[vpaes-armv8.o]=..
+
+ GENERATE[aes-armv4.S]=asm/aes-armv4.pl
+ INCLUDE[aes-armv4.o]=..
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index ac76dd449f..11f0e50279 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -31,6 +31,7 @@ $code.=<<___;
+ .globl _armv7_neon_probe
+ .type _armv7_neon_probe,%function
+ _armv7_neon_probe:
++ AARCH64_VALID_CALL_TARGET
+ orr v15.16b, v15.16b, v15.16b
+ ret
+ .size _armv7_neon_probe,.-_armv7_neon_probe
+@@ -38,6 +39,7 @@ _armv7_neon_probe:
+ .globl _armv7_tick
+ .type _armv7_tick,%function
+ _armv7_tick:
++ AARCH64_VALID_CALL_TARGET
+ #ifdef __APPLE__
+ mrs x0, CNTPCT_EL0
+ #else
+@@ -49,6 +51,7 @@ _armv7_tick:
+ .globl _armv8_aes_probe
+ .type _armv8_aes_probe,%function
+ _armv8_aes_probe:
++ AARCH64_VALID_CALL_TARGET
+ aese v0.16b, v0.16b
+ ret
+ .size _armv8_aes_probe,.-_armv8_aes_probe
+@@ -56,6 +59,7 @@ _armv8_aes_probe:
+ .globl _armv8_sha1_probe
+ .type _armv8_sha1_probe,%function
+ _armv8_sha1_probe:
++ AARCH64_VALID_CALL_TARGET
+ sha1h s0, s0
+ ret
+ .size _armv8_sha1_probe,.-_armv8_sha1_probe
+@@ -63,6 +67,7 @@ _armv8_sha1_probe:
+ .globl _armv8_sha256_probe
+ .type _armv8_sha256_probe,%function
+ _armv8_sha256_probe:
++ AARCH64_VALID_CALL_TARGET
+ sha256su0 v0.4s, v0.4s
+ ret
+ .size _armv8_sha256_probe,.-_armv8_sha256_probe
+@@ -70,6 +75,7 @@ _armv8_sha256_probe:
+ .globl _armv8_pmull_probe
+ .type _armv8_pmull_probe,%function
+ _armv8_pmull_probe:
++ AARCH64_VALID_CALL_TARGET
+ pmull v0.1q, v0.1d, v0.1d
+ ret
+ .size _armv8_pmull_probe,.-_armv8_pmull_probe
+@@ -77,6 +83,7 @@ _armv8_pmull_probe:
+ .globl _armv8_sha512_probe
+ .type _armv8_sha512_probe,%function
+ _armv8_sha512_probe:
++ AARCH64_VALID_CALL_TARGET
+ .long 0xcec08000 // sha512su0 v0.2d,v0.2d
+ ret
+ .size _armv8_sha512_probe,.-_armv8_sha512_probe
+@@ -84,6 +91,7 @@ _armv8_sha512_probe:
+ .globl _armv8_cpuid_probe
+ .type _armv8_cpuid_probe,%function
+ _armv8_cpuid_probe:
++ AARCH64_VALID_CALL_TARGET
+ mrs x0, midr_el1
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+@@ -92,6 +100,7 @@ _armv8_cpuid_probe:
+ .type OPENSSL_cleanse,%function
+ .align 5
+ OPENSSL_cleanse:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1,.Lret // len==0?
+ cmp x1,#15
+ b.hi .Lot // len>15
+@@ -123,6 +132,7 @@ OPENSSL_cleanse:
+ .type CRYPTO_memcmp,%function
+ .align 4
+ CRYPTO_memcmp:
++ AARCH64_VALID_CALL_TARGET
+ eor w3,w3,w3
+ cbz x2,.Lno_data // len==0?
+ cmp x2,#16
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index 45d7e15564..a815a5c72b 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -126,4 +126,62 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+
+ # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \
+ (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum))
++
++#if defined(__ASSEMBLER__)
++
++ /*
++ * Support macros for
++ * - Armv8.3-A Pointer Authentication and
++ * - Armv8.5-A Branch Target Identification
++ * features which require emitting a .note.gnu.property section with the
++ * appropriate architecture-dependent feature bits set.
++ * Read more: "ELF for the Arm® 64-bit Architecture"
++ */
++
++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1
++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */
++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */
++# else
++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */
++# define AARCH64_VALID_CALL_TARGET
++# endif
++
++# if defined(__ARM_FEATURE_PAC_DEFAULT) && \
++ (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
++ (1 << 1) /* Has Pointer Authentication */
++# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */
++# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */
++# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \
++ (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \
++ (1 << 1) /* Has Pointer Authentication */
++# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */
++# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */
++# else
++# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */
++# if GNU_PROPERTY_AARCH64_BTI != 0
++# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET
++# else
++# define AARCH64_SIGN_LINK_REGISTER
++# endif
++# define AARCH64_VALIDATE_LINK_REGISTER
++# endif
++
++# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0
++ .pushsection .note.gnu.property, "a";
++ .balign 8;
++ .long 4;
++ .long 0x10;
++ .long 0x5;
++ .asciz "GNU";
++ .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */
++ .long 4;
++ .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI);
++ .long 0;
++ .popsection;
++# endif
++
++# endif /* defined __ASSEMBLER__ */
++
+ #endif
+diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl
+index 54d2e8245f..21ab12bdf0 100755
+--- a/crypto/bn/asm/armv8-mont.pl
++++ b/crypto/bn/asm/armv8-mont.pl
+@@ -67,8 +67,8 @@ $n0="x4"; # const BN_ULONG *n0,
+ $num="x5"; # int num);
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armv8_rsa_neonized
+ .hidden OPENSSL_armv8_rsa_neonized
+ #endif
+@@ -78,6 +78,7 @@ $code.=<<___;
+ .type bn_mul_mont,%function
+ .align 5
+ bn_mul_mont:
++ AARCH64_SIGN_LINK_REGISTER
+ .Lbn_mul_mont:
+ tst $num,#3
+ b.ne .Lmul_mont
+@@ -288,6 +289,7 @@ bn_mul_mont:
+ mov x0,#1
+ ldp x23,x24,[x29,#48]
+ ldr x29,[sp],#64
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size bn_mul_mont,.-bn_mul_mont
+ ___
+@@ -309,6 +311,8 @@ $code.=<<___;
+ .type bn_mul8x_mont_neon,%function
+ .align 5
+ bn_mul8x_mont_neon:
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to
++ // only from bn_mul_mont which has already signed the return address.
+ stp x29,x30,[sp,#-80]!
+ mov x16,sp
+ stp d8,d9,[sp,#16]
+@@ -649,6 +653,7 @@ $code.=<<___;
+ ldp d10,d11,[sp,#32]
+ ldp d8,d9,[sp,#16]
+ ldr x29,[sp],#80
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret // bx lr
+
+ .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon
+@@ -671,7 +676,8 @@ __bn_sqr8x_mont:
+ cmp $ap,$bp
+ b.ne __bn_mul4x_mont
+ .Lsqr8x_mont:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to
++ // only from bn_mul_mont which has already signed the return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1425,7 +1431,8 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+- .inst 0xd50323bf // autiasp
++ // x30 is loaded earlier
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size __bn_sqr8x_mont,.-__bn_sqr8x_mont
+ ___
+@@ -1449,7 +1456,8 @@ $code.=<<___;
+ .type __bn_mul4x_mont,%function
+ .align 5
+ __bn_mul4x_mont:
+- .inst 0xd503233f // paciasp
++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to
++ // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address.
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1883,7 +1891,8 @@ __bn_mul4x_mont:
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldr x29,[sp],#128
+- .inst 0xd50323bf // autiasp
++ // x30 loaded earlier
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size __bn_mul4x_mont,.-__bn_mul4x_mont
+ ___
+diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl
+index dcdc4a04e3..e1a8b81594 100755
+--- a/crypto/chacha/asm/chacha-armv8.pl
++++ b/crypto/chacha/asm/chacha-armv8.pl
+@@ -132,8 +132,8 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -153,6 +153,7 @@ $code.=<<___;
+ .type ChaCha20_ctr32,%function
+ .align 5
+ ChaCha20_ctr32:
++ AARCH64_SIGN_LINK_REGISTER
+ cbz $len,.Labort
+ cmp $len,#192
+ b.lo .Lshort
+@@ -165,7 +166,6 @@ ChaCha20_ctr32:
+ #endif
+
+ .Lshort:
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -285,8 +285,8 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
+ .Labort:
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+ .align 4
+@@ -342,7 +342,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_ctr32,.-ChaCha20_ctr32
+ ___
+@@ -432,8 +432,8 @@ $code.=<<___;
+ .type ChaCha20_neon,%function
+ .align 5
+ ChaCha20_neon:
++ AARCH64_SIGN_LINK_REGISTER
+ .LChaCha20_neon:
+- .inst 0xd503233f // paciasp
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -667,7 +667,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+
+ .align 4
+@@ -799,7 +799,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_neon,.-ChaCha20_neon
+ ___
+@@ -844,7 +844,7 @@ $code.=<<___;
+ .type ChaCha20_512_neon,%function
+ .align 5
+ ChaCha20_512_neon:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+
+@@ -1268,7 +1268,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ChaCha20_512_neon,.-ChaCha20_512_neon
+ ___
+diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl
+index 81ee3947d7..6c5d0e8b3c 100644
+--- a/crypto/ec/asm/ecp_nistz256-armv8.pl
++++ b/crypto/ec/asm/ecp_nistz256-armv8.pl
+@@ -122,7 +122,7 @@ $code.=<<___;
+ .type ecp_nistz256_to_mont,%function
+ .align 6
+ ecp_nistz256_to_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -138,7 +138,7 @@ ecp_nistz256_to_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
+
+@@ -147,7 +147,7 @@ ecp_nistz256_to_mont:
+ .type ecp_nistz256_from_mont,%function
+ .align 4
+ ecp_nistz256_from_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -163,7 +163,7 @@ ecp_nistz256_from_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
+
+@@ -173,7 +173,7 @@ ecp_nistz256_from_mont:
+ .type ecp_nistz256_mul_mont,%function
+ .align 4
+ ecp_nistz256_mul_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -188,7 +188,7 @@ ecp_nistz256_mul_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
+
+@@ -197,7 +197,7 @@ ecp_nistz256_mul_mont:
+ .type ecp_nistz256_sqr_mont,%function
+ .align 4
+ ecp_nistz256_sqr_mont:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-32]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -211,7 +211,7 @@ ecp_nistz256_sqr_mont:
+
+ ldp x19,x20,[sp,#16]
+ ldp x29,x30,[sp],#32
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
+
+@@ -221,7 +221,7 @@ ecp_nistz256_sqr_mont:
+ .type ecp_nistz256_add,%function
+ .align 4
+ ecp_nistz256_add:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -235,7 +235,7 @@ ecp_nistz256_add:
+ bl __ecp_nistz256_add
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_add,.-ecp_nistz256_add
+
+@@ -244,7 +244,7 @@ ecp_nistz256_add:
+ .type ecp_nistz256_div_by_2,%function
+ .align 4
+ ecp_nistz256_div_by_2:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -256,7 +256,7 @@ ecp_nistz256_div_by_2:
+ bl __ecp_nistz256_div_by_2
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
+
+@@ -265,7 +265,7 @@ ecp_nistz256_div_by_2:
+ .type ecp_nistz256_mul_by_2,%function
+ .align 4
+ ecp_nistz256_mul_by_2:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -281,7 +281,7 @@ ecp_nistz256_mul_by_2:
+ bl __ecp_nistz256_add // ret = a+a // 2*a
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
+
+@@ -290,7 +290,7 @@ ecp_nistz256_mul_by_2:
+ .type ecp_nistz256_mul_by_3,%function
+ .align 4
+ ecp_nistz256_mul_by_3:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -317,7 +317,7 @@ ecp_nistz256_mul_by_3:
+ bl __ecp_nistz256_add // ret += a // 2*a+a=3*a
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
+
+@@ -327,7 +327,7 @@ ecp_nistz256_mul_by_3:
+ .type ecp_nistz256_sub,%function
+ .align 4
+ ecp_nistz256_sub:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -339,7 +339,7 @@ ecp_nistz256_sub:
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_sub,.-ecp_nistz256_sub
+
+@@ -348,7 +348,7 @@ ecp_nistz256_sub:
+ .type ecp_nistz256_neg,%function
+ .align 4
+ ecp_nistz256_neg:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -363,7 +363,7 @@ ecp_nistz256_neg:
+ bl __ecp_nistz256_sub_from
+
+ ldp x29,x30,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_neg,.-ecp_nistz256_neg
+
+@@ -724,7 +724,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_double,%function
+ .align 5
+ ecp_nistz256_point_double:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -859,7 +859,7 @@ ecp_nistz256_point_double:
+ ldp x19,x20,[x29,#16]
+ ldp x21,x22,[x29,#32]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_double,.-ecp_nistz256_point_double
+ ___
+@@ -882,7 +882,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_add,%function
+ .align 5
+ ecp_nistz256_point_add:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1117,7 +1117,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#96
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_add,.-ecp_nistz256_point_add
+ ___
+@@ -1139,7 +1139,7 @@ $code.=<<___;
+ .type ecp_nistz256_point_add_affine,%function
+ .align 5
+ ecp_nistz256_point_add_affine:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1328,7 +1328,7 @@ $code.=<<___;
+ ldp x23,x24,[x29,#48]
+ ldp x25,x26,[x29,#64]
+ ldp x29,x30,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
+ ___
+@@ -1346,6 +1346,8 @@ $code.=<<___;
+ .type ecp_nistz256_ord_mul_mont,%function
+ .align 4
+ ecp_nistz256_ord_mul_mont:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1487,6 +1489,8 @@ $code.=<<___;
+ .type ecp_nistz256_ord_sqr_mont,%function
+ .align 4
+ ecp_nistz256_ord_sqr_mont:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-64]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -1641,6 +1645,8 @@ $code.=<<___;
+ .type ecp_nistz256_scatter_w5,%function
+ .align 4
+ ecp_nistz256_scatter_w5:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1703,6 +1709,8 @@ ecp_nistz256_scatter_w5:
+ .type ecp_nistz256_gather_w5,%function
+ .align 4
+ ecp_nistz256_gather_w5:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1780,6 +1788,8 @@ ecp_nistz256_gather_w5:
+ .type ecp_nistz256_scatter_w7,%function
+ .align 4
+ ecp_nistz256_scatter_w7:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -1824,6 +1834,8 @@ ecp_nistz256_scatter_w7:
+ .type ecp_nistz256_gather_w7,%function
+ .align 4
+ ecp_nistz256_gather_w7:
++ AARCH64_VALID_CALL_TARGET
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl
+index 3b9d5b6511..ff5809ec22 100755
+--- a/crypto/modes/asm/aes-gcm-armv8_64.pl
++++ b/crypto/modes/asm/aes-gcm-armv8_64.pl
+@@ -256,6 +256,7 @@ $code.=<<___;
+ .type aes_gcm_enc_128_kernel,%function
+ .align 4
+ aes_gcm_enc_128_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L128_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -1089,6 +1090,7 @@ $code.=<<___;
+ .type aes_gcm_dec_128_kernel,%function
+ .align 4
+ aes_gcm_dec_128_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L128_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -1973,6 +1975,7 @@ $code.=<<___;
+ .type aes_gcm_enc_192_kernel,%function
+ .align 4
+ aes_gcm_enc_192_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -2858,6 +2861,7 @@ $code.=<<___;
+ .type aes_gcm_dec_192_kernel,%function
+ .align 4
+ aes_gcm_dec_192_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L192_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -3797,6 +3801,7 @@ $code.=<<___;
+ .type aes_gcm_enc_256_kernel,%function
+ .align 4
+ aes_gcm_enc_256_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L256_enc_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+@@ -4729,6 +4734,7 @@ $code.=<<___;
+ .type aes_gcm_dec_256_kernel,%function
+ .align 4
+ aes_gcm_dec_256_kernel:
++ AARCH64_VALID_CALL_TARGET
+ cbz x1, .L256_dec_ret
+ stp x19, x20, [sp, #-112]!
+ mov x16, x4
+diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl
+index b1d35d25b5..57f893e77c 100644
+--- a/crypto/modes/asm/ghashv8-armx.pl
++++ b/crypto/modes/asm/ghashv8-armx.pl
+@@ -107,6 +107,11 @@ $code.=<<___;
+ .type gcm_init_v8,%function
+ .align 4
+ gcm_init_v8:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ vld1.64 {$t1},[x1] @ load input H
+ vmov.i8 $xC2,#0xe1
+ vshl.i64 $xC2,$xC2,#57 @ 0xc2.0
+@@ -214,6 +219,11 @@ $code.=<<___;
+ .type gcm_gmult_v8,%function
+ .align 4
+ gcm_gmult_v8:
++___
++$code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
++___
++$code.=<<___;
+ vld1.64 {$t1},[$Xi] @ load Xi
+ vmov.i8 $xC2,#0xe1
+ vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ...
+@@ -268,6 +278,7 @@ $code.=<<___;
+ gcm_ghash_v8:
+ ___
+ $code.=<<___ if ($flavour =~ /64/);
++ AARCH64_VALID_CALL_TARGET
+ cmp $len,#64
+ b.hs .Lgcm_ghash_v8_4x
+ ___
+diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
+index 113a2151b6..20816c4283 100755
+--- a/crypto/poly1305/asm/poly1305-armv8.pl
++++ b/crypto/poly1305/asm/poly1305-armv8.pl
+@@ -72,6 +72,7 @@ $code.=<<___;
+ .type poly1305_init,%function
+ .align 5
+ poly1305_init:
++ AARCH64_VALID_CALL_TARGET
+ cmp $inp,xzr
+ stp xzr,xzr,[$ctx] // zero hash value
+ stp xzr,xzr,[$ctx,#16] // [along with is_base2_26]
+@@ -119,6 +120,9 @@ poly1305_init:
+ .align 5
+ poly1305_blocks:
+ .Lpoly1305_blocks:
++ // The symbol .Lpoly1305_blocks is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ands $len,$len,#-16
+ b.eq .Lno_data
+
+@@ -184,6 +188,9 @@ poly1305_blocks:
+ .align 5
+ poly1305_emit:
+ .Lpoly1305_emit:
++ // The symbol .poly1305_emit is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldp $h0,$h1,[$ctx] // load hash base 2^64
+ ldr $h2,[$ctx,#16]
+ ldp $t0,$t1,[$nonce] // load nonce
+@@ -291,13 +298,16 @@ poly1305_splat:
+ .align 5
+ poly1305_blocks_neon:
+ .Lpoly1305_blocks_neon:
++ // The symbol .Lpoly1305_blocks_neon is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldr $is_base2_26,[$ctx,#24]
+ cmp $len,#128
+ b.hs .Lblocks_neon
+ cbz $is_base2_26,.Lpoly1305_blocks
+
+ .Lblocks_neon:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+
+@@ -867,7 +877,7 @@ poly1305_blocks_neon:
+
+ .Lno_data_neon:
+ ldr x29,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size poly1305_blocks_neon,.-poly1305_blocks_neon
+
+@@ -875,6 +885,9 @@ poly1305_blocks_neon:
+ .align 5
+ poly1305_emit_neon:
+ .Lpoly1305_emit_neon:
++ // The symbol .Lpoly1305_emit_neon is not a .globl symbol
++ // but a pointer to it is returned by poly1305_init
++ AARCH64_VALID_CALL_TARGET
+ ldr $is_base2_26,[$ctx,#24]
+ cbz $is_base2_26,poly1305_emit
+
+diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl
+index 65102e7c29..cf54b62c63 100755
+--- a/crypto/sha/asm/keccak1600-armv8.pl
++++ b/crypto/sha/asm/keccak1600-armv8.pl
+@@ -80,6 +80,8 @@ my @rhotates = ([ 0, 1, 62, 28, 27 ],
+ [ 18, 2, 61, 56, 14 ]);
+
+ $code.=<<___;
++#include "arm_arch.h"
++
+ .text
+
+ .align 8 // strategic alignment and padding that allows to use
+@@ -125,7 +127,7 @@ $code.=<<___;
+ .align 5
+ KeccakF1600_int:
+ adr $C[2],iotas
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp $C[2],x30,[sp,#16] // 32 bytes on top are mine
+ b .Loop
+ .align 4
+@@ -297,14 +299,14 @@ $code.=<<___;
+ bne .Loop
+
+ ldr x30,[sp,#24]
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600_int,.-KeccakF1600_int
+
+ .type KeccakF1600,%function
+ .align 5
+ KeccakF1600:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -354,7 +356,7 @@ KeccakF1600:
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600,.-KeccakF1600
+
+@@ -362,7 +364,7 @@ KeccakF1600:
+ .type SHA3_absorb,%function
+ .align 5
+ SHA3_absorb:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -460,7 +462,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_absorb,.-SHA3_absorb
+ ___
+@@ -471,7 +473,7 @@ $code.=<<___;
+ .type SHA3_squeeze,%function
+ .align 5
+ SHA3_squeeze:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-48]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -534,7 +536,7 @@ SHA3_squeeze:
+ ldp x19,x20,[sp,#16]
+ ldp x21,x22,[sp,#32]
+ ldp x29,x30,[sp],#48
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_squeeze,.-SHA3_squeeze
+ ___
+@@ -653,7 +655,7 @@ KeccakF1600_ce:
+ .type KeccakF1600_cext,%function
+ .align 5
+ KeccakF1600_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+@@ -686,7 +688,7 @@ $code.=<<___;
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldr x29,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size KeccakF1600_cext,.-KeccakF1600_cext
+ ___
+@@ -699,7 +701,7 @@ $code.=<<___;
+ .type SHA3_absorb_cext,%function
+ .align 5
+ SHA3_absorb_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-80]!
+ add x29,sp,#0
+ stp d8,d9,[sp,#16] // per ABI requirement
+@@ -771,7 +773,7 @@ $code.=<<___;
+ ldp d12,d13,[sp,#48]
+ ldp d14,d15,[sp,#64]
+ ldp x29,x30,[sp],#80
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_absorb_cext,.-SHA3_absorb_cext
+ ___
+@@ -783,7 +785,7 @@ $code.=<<___;
+ .type SHA3_squeeze_cext,%function
+ .align 5
+ SHA3_squeeze_cext:
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+ mov x9,$ctx
+@@ -839,7 +841,7 @@ SHA3_squeeze_cext:
+
+ .Lsqueeze_done_ce:
+ ldr x29,[sp],#16
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size SHA3_squeeze_cext,.-SHA3_squeeze_cext
+ ___
+diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl
+index cdea8845af..5f23a20c1a 100644
+--- a/crypto/sha/asm/sha1-armv8.pl
++++ b/crypto/sha/asm/sha1-armv8.pl
+@@ -175,8 +175,8 @@ ___
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -187,11 +187,13 @@ $code.=<<___;
+ .type sha1_block_data_order,%function
+ .align 6
+ sha1_block_data_order:
++ AARCH64_VALID_CALL_TARGET
+ adrp x16,OPENSSL_armcap_P
+ ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
+ tst w16,#ARMV8_SHA1
+ b.ne .Lv8_entry
+
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-96]!
+ add x29,sp,#0
+ stp x19,x20,[sp,#16]
+@@ -253,6 +255,7 @@ $code.=<<___;
+ .align 6
+ sha1_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl
+index 6bcff0b7d3..f900882fee 100644
+--- a/crypto/sha/asm/sha512-armv8.pl
++++ b/crypto/sha/asm/sha512-armv8.pl
+@@ -190,8 +190,8 @@ ___
+ }
+
+ $code.=<<___;
++#include "arm_arch.h"
+ #ifndef __KERNEL__
+-# include "arm_arch.h"
+ .extern OPENSSL_armcap_P
+ .hidden OPENSSL_armcap_P
+ #endif
+@@ -202,6 +202,7 @@ $code.=<<___;
+ .type $func,%function
+ .align 6
+ $func:
++ AARCH64_VALID_CALL_TARGET
+ #ifndef __KERNEL__
+ adrp x16,OPENSSL_armcap_P
+ ldr w16,[x16,#:lo12:OPENSSL_armcap_P]
+@@ -218,7 +219,7 @@ $code.=<<___ if ($SZ==8);
+ ___
+ $code.=<<___;
+ #endif
+- .inst 0xd503233f // paciasp
++ AARCH64_SIGN_LINK_REGISTER
+ stp x29,x30,[sp,#-128]!
+ add x29,sp,#0
+
+@@ -280,7 +281,7 @@ $code.=<<___;
+ ldp x25,x26,[x29,#64]
+ ldp x27,x28,[x29,#80]
+ ldp x29,x30,[sp],#128
+- .inst 0xd50323bf // autiasp
++ AARCH64_VALIDATE_LINK_REGISTER
+ ret
+ .size $func,.-$func
+
+@@ -370,6 +371,7 @@ $code.=<<___;
+ .align 6
+ sha256_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+@@ -632,7 +634,9 @@ $code.=<<___;
+ .type sha256_block_neon,%function
+ .align 4
+ sha256_block_neon:
++ AARCH64_VALID_CALL_TARGET
+ .Lneon_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
+ stp x29, x30, [sp, #-16]!
+ mov x29, sp
+ sub sp,sp,#16*4
+@@ -743,6 +747,7 @@ $code.=<<___;
+ .align 6
+ sha512_block_armv8:
+ .Lv8_entry:
++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+
+diff --git a/crypto/sha/build.info b/crypto/sha/build.info
+index d61f7de9b6..556a658d8b 100644
+--- a/crypto/sha/build.info
++++ b/crypto/sha/build.info
+@@ -153,6 +153,7 @@ INCLUDE[sha256-armv8.o]=..
+ GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl
+ INCLUDE[sha512-armv8.o]=..
+ GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl
++INCLUDE[keccak1600-armv8.o]=..
+
+ GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl
+ INCLUDE[sha1-s390x.o]=..
+--
+2.37.3.windows.1
+
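The compiler-support table in the commit message above comes down to two ACLE feature macros. When in doubt about what a given toolchain and -mbranch-protection setting actually define, a small probe program (a sketch, not part of the patch) makes it visible:

#include <stdio.h>

/* Print the branch-protection feature macros this translation unit was
 * built with, e.g. after: gcc -mbranch-protection=standard probe.c */
int main(void)
{
#ifdef __ARM_FEATURE_BTI_DEFAULT
    printf("__ARM_FEATURE_BTI_DEFAULT = %d\n", __ARM_FEATURE_BTI_DEFAULT);
#else
    printf("__ARM_FEATURE_BTI_DEFAULT not defined\n");
#endif
#ifdef __ARM_FEATURE_PAC_DEFAULT
    printf("__ARM_FEATURE_PAC_DEFAULT = 0x%x\n", __ARM_FEATURE_PAC_DEFAULT);
#else
    printf("__ARM_FEATURE_PAC_DEFAULT not defined\n");
#endif
    return 0;
}

With both macros defined, the arm_arch.h additions in the patch resolve AARCH64_SIGN_LINK_REGISTER, AARCH64_VALIDATE_LINK_REGISTER and AARCH64_VALID_CALL_TARGET to real hint instructions and emit the .note.gnu.property section; otherwise they expand to nothing and the generated assembly is unchanged.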
diff --git a/Backport-providers-Add-SM4-GCM-implementation.patch b/Backport-providers-Add-SM4-GCM-implementation.patch
new file mode 100644
index 0000000..3e2ee23
--- /dev/null
+++ b/Backport-providers-Add-SM4-GCM-implementation.patch
@@ -0,0 +1,360 @@
+From 2f1c0b5f1b585a307f21a70ef3ae652643c25f6d Mon Sep 17 00:00:00 2001
+From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+Date: Wed, 1 Sep 2021 16:54:15 +0800
+Subject: [PATCH 04/13] providers: Add SM4 GCM implementation
+
+The GCM mode of the SM4 algorithm is specified by RFC 8998.
+
+Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
+
+Reviewed-by: Paul Yang <kaishen.yy@antfin.com>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/16491)
+---
+ providers/defltprov.c | 2 +
+ providers/implementations/ciphers/build.info | 4 +-
+ .../implementations/ciphers/cipher_sm4_ccm.c | 39 +++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_ccm.h | 22 ++++++++++
+ .../ciphers/cipher_sm4_ccm_hw.c | 41 ++++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_gcm.c | 40 +++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_gcm.h | 22 ++++++++++
+ .../ciphers/cipher_sm4_gcm_hw.c | 43 +++++++++++++++++++
+ .../include/prov/implementations.h | 2 +
+ .../implementations/include/prov/names.h | 2 +
+ test/recipes/30-test_evp_data/evpciph_sm4.txt | 20 +++++++++
+ 11 files changed, 236 insertions(+), 1 deletion(-)
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+
+diff --git a/providers/defltprov.c b/providers/defltprov.c
+index ed3f4799e7..cc0b0c3b62 100644
+--- a/providers/defltprov.c
++++ b/providers/defltprov.c
+@@ -289,6 +289,8 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
+ ALG(PROV_NAMES_DES_EDE_CFB, ossl_tdes_ede2_cfb_functions),
+ #endif /* OPENSSL_NO_DES */
+ #ifndef OPENSSL_NO_SM4
++ ALG(PROV_NAMES_SM4_GCM, ossl_sm4128gcm_functions),
++ ALG(PROV_NAMES_SM4_CCM, ossl_sm4128ccm_functions),
+ ALG(PROV_NAMES_SM4_ECB, ossl_sm4128ecb_functions),
+ ALG(PROV_NAMES_SM4_CBC, ossl_sm4128cbc_functions),
+ ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions),
+diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info
+index e4c5f4f051..b5d9d4f6c1 100644
+--- a/providers/implementations/ciphers/build.info
++++ b/providers/implementations/ciphers/build.info
+@@ -105,7 +105,9 @@ ENDIF
+
+ IF[{- !$disabled{sm4} -}]
+ SOURCE[$SM4_GOAL]=\
+- cipher_sm4.c cipher_sm4_hw.c
++ cipher_sm4.c cipher_sm4_hw.c \
++ cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \
++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c
+ ENDIF
+
+ IF[{- !$disabled{ocb} -}]
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.c b/providers/implementations/ciphers/cipher_sm4_ccm.c
+new file mode 100644
+index 0000000000..f0295a5ca2
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm.c
+@@ -0,0 +1,39 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 CCM mode */
++
++#include "cipher_sm4_ccm.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++static OSSL_FUNC_cipher_freectx_fn sm4_ccm_freectx;
++
++static void *sm4_ccm_newctx(void *provctx, size_t keybits)
++{
++ PROV_SM4_CCM_CTX *ctx;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ ctx = OPENSSL_zalloc(sizeof(*ctx));
++ if (ctx != NULL)
++ ossl_ccm_initctx(&ctx->base, keybits, ossl_prov_sm4_hw_ccm(keybits));
++ return ctx;
++}
++
++static void sm4_ccm_freectx(void *vctx)
++{
++ PROV_SM4_CCM_CTX *ctx = (PROV_SM4_CCM_CTX *)vctx;
++
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++/* sm4128ccm functions */
++IMPLEMENT_aead_cipher(sm4, ccm, CCM, AEAD_FLAGS, 128, 8, 96);
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.h b/providers/implementations/ciphers/cipher_sm4_ccm.h
+new file mode 100644
+index 0000000000..189e71e9e4
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm.h
+@@ -0,0 +1,22 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "crypto/sm4.h"
++#include "prov/ciphercommon.h"
++#include "prov/ciphercommon_ccm.h"
++
++typedef struct prov_sm4_ccm_ctx_st {
++ PROV_CCM_CTX base; /* Must be first */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks; /* SM4 key schedule to use */
++} PROV_SM4_CCM_CTX;
++
++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keylen);
+diff --git a/providers/implementations/ciphers/cipher_sm4_ccm_hw.c b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+new file mode 100644
+index 0000000000..791daf3e46
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c
+@@ -0,0 +1,41 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/*-
++ * Generic support for SM4 CCM.
++ */
++
++#include "cipher_sm4_ccm.h"
++
++static int ccm_sm4_initkey(PROV_CCM_CTX *ctx,
++ const unsigned char *key, size_t keylen)
++{
++ PROV_SM4_CCM_CTX *actx = (PROV_SM4_CCM_CTX *)ctx;
++
++ ossl_sm4_set_key(key, &actx->ks.ks);
++ CRYPTO_ccm128_init(&ctx->ccm_ctx, ctx->m, ctx->l, &actx->ks.ks,
++ (block128_f)ossl_sm4_encrypt);
++ ctx->str = NULL;
++ ctx->key_set = 1;
++ return 1;
++}
++
++static const PROV_CCM_HW ccm_sm4 = {
++ ccm_sm4_initkey,
++ ossl_ccm_generic_setiv,
++ ossl_ccm_generic_setaad,
++ ossl_ccm_generic_auth_encrypt,
++ ossl_ccm_generic_auth_decrypt,
++ ossl_ccm_generic_gettag
++};
++
++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keybits)
++{
++ return &ccm_sm4;
++}
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.c b/providers/implementations/ciphers/cipher_sm4_gcm.c
+new file mode 100644
+index 0000000000..7a936f00ee
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm.c
+@@ -0,0 +1,40 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 GCM mode */
++
++#include "cipher_sm4_gcm.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++static OSSL_FUNC_cipher_freectx_fn sm4_gcm_freectx;
++
++static void *sm4_gcm_newctx(void *provctx, size_t keybits)
++{
++ PROV_SM4_GCM_CTX *ctx;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ ctx = OPENSSL_zalloc(sizeof(*ctx));
++ if (ctx != NULL)
++ ossl_gcm_initctx(provctx, &ctx->base, keybits,
++ ossl_prov_sm4_hw_gcm(keybits));
++ return ctx;
++}
++
++static void sm4_gcm_freectx(void *vctx)
++{
++ PROV_SM4_GCM_CTX *ctx = (PROV_SM4_GCM_CTX *)vctx;
++
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++/* ossl_sm4128gcm_functions */
++IMPLEMENT_aead_cipher(sm4, gcm, GCM, AEAD_FLAGS, 128, 8, 96);
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.h b/providers/implementations/ciphers/cipher_sm4_gcm.h
+new file mode 100644
+index 0000000000..2b6b5f3ece
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm.h
+@@ -0,0 +1,22 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "crypto/sm4.h"
++#include "prov/ciphercommon.h"
++#include "prov/ciphercommon_gcm.h"
++
++typedef struct prov_sm4_gcm_ctx_st {
++ PROV_GCM_CTX base; /* must be first entry in struct */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks;
++} PROV_SM4_GCM_CTX;
++
++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits);
+diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+new file mode 100644
+index 0000000000..6bcd1ec406
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c
+@@ -0,0 +1,43 @@
++/*
++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/*-
++ * Generic support for SM4 GCM.
++ */
++
++#include "cipher_sm4_gcm.h"
++
++static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
++ size_t keylen)
++{
++ PROV_SM4_GCM_CTX *actx = (PROV_SM4_GCM_CTX *)ctx;
++ SM4_KEY *ks = &actx->ks.ks;
++
++ ctx->ks = ks;
++ ossl_sm4_set_key(key, ks);
++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt);
++ ctx->ctr = (ctr128_f)NULL;
++ ctx->key_set = 1;
++
++ return 1;
++}
++
++static const PROV_GCM_HW sm4_gcm = {
++ sm4_gcm_initkey,
++ ossl_gcm_setiv,
++ ossl_gcm_aad_update,
++ ossl_gcm_cipher_update,
++ ossl_gcm_cipher_final,
++ ossl_gcm_one_shot
++};
++
++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits)
++{
++ return &sm4_gcm;
++}
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 3f6dd7ee16..498eab4ad4 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -174,6 +174,8 @@ extern const OSSL_DISPATCH ossl_seed128ofb128_functions[];
+ extern const OSSL_DISPATCH ossl_seed128cfb128_functions[];
+ #endif /* OPENSSL_NO_SEED */
+ #ifndef OPENSSL_NO_SM4
++extern const OSSL_DISPATCH ossl_sm4128gcm_functions[];
++extern const OSSL_DISPATCH ossl_sm4128ccm_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ecb_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128cbc_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ctr_functions[];
+diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h
+index e0dbb69a9d..0fac23a850 100644
+--- a/providers/implementations/include/prov/names.h
++++ b/providers/implementations/include/prov/names.h
+@@ -162,6 +162,8 @@
+ #define PROV_NAMES_SM4_CTR "SM4-CTR:1.2.156.10197.1.104.7"
+ #define PROV_NAMES_SM4_OFB "SM4-OFB:SM4-OFB128:1.2.156.10197.1.104.3"
+ #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4"
++#define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8"
++#define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9"
+ #define PROV_NAMES_ChaCha20 "ChaCha20"
+ #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305"
+ #define PROV_NAMES_CAST5_ECB "CAST5-ECB"
+diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+index ec8a45bd3f..9fb16ca15c 100644
+--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt
++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt
+@@ -36,3 +36,23 @@ Key = 0123456789ABCDEFFEDCBA9876543210
+ IV = 0123456789ABCDEFFEDCBA9876543210
+ Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA
+ Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D
++
++Title = SM4 GCM test vectors from RFC8998
++
++Cipher = SM4-GCM
++Key = 0123456789abcdeffedcba9876543210
++IV = 00001234567800000000abcd
++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2
++Tag = 83de3541e4c2b58177e065a9bf7b62ec
++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa
++Ciphertext = 17f399f08c67d5ee19d0dc9969c4bb7d5fd46fd3756489069157b282bb200735d82710ca5c22f0ccfa7cbf93d496ac15a56834cbcf98c397b4024a2691233b8d
++
++Title = SM4 CCM test vectors from RFC8998
++
++Cipher = SM4-CCM
++Key = 0123456789abcdeffedcba9876543210
++IV = 00001234567800000000abcd
++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2
++Tag = 16842d4fa186f56ab33256971fa110f4
++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa
++Ciphertext = 48af93501fa62adbcd414cce6034d895dda1bf8f132f042098661572e7483094fd12e518ce062c98acee28d95df4416bed31a2f04476c18bb40c84a74b97dc5b
+--
+2.37.3.windows.1
+
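Editor's note: the SM4-GCM and SM4-CCM algorithms registered by the patch above are reached through the ordinary EVP AEAD interface. The following is a minimal sketch, not part of the patch; error handling is deliberately thin and the key/IV/AAD lengths match the RFC 8998 vectors quoted in the test data.

#include <openssl/evp.h>

/* Hedged sketch: one-shot SM4-GCM encryption via the EVP AEAD API.
 * Returns 1 on success. key is 16 bytes, iv is the 12-byte GCM nonce. */
static int sm4_gcm_encrypt(const unsigned char key[16],
                           const unsigned char iv[12],
                           const unsigned char *aad, size_t aadlen,
                           const unsigned char *pt, size_t ptlen,
                           unsigned char *ct, unsigned char tag[16])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "SM4-GCM", NULL);
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int len, ok = 0;

    if (cipher == NULL || ctx == NULL)
        goto done;
    if (EVP_EncryptInit_ex(ctx, cipher, NULL, key, iv) != 1)
        goto done;
    /* Associated data is fed with a NULL output buffer. */
    if (aadlen > 0 && EVP_EncryptUpdate(ctx, NULL, &len, aad, (int)aadlen) != 1)
        goto done;
    if (EVP_EncryptUpdate(ctx, ct, &len, pt, (int)ptlen) != 1)
        goto done;
    if (EVP_EncryptFinal_ex(ctx, ct + len, &len) != 1)
        goto done;
    if (EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, 16, tag) != 1)
        goto done;
    ok = 1;
done:
    EVP_CIPHER_CTX_free(ctx);
    EVP_CIPHER_free(cipher);
    return ok;
}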
diff --git a/Backport-providers-Add-SM4-XTS-implementation.patch b/Backport-providers-Add-SM4-XTS-implementation.patch
new file mode 100644
index 0000000..5136236
--- /dev/null
+++ b/Backport-providers-Add-SM4-XTS-implementation.patch
@@ -0,0 +1,763 @@
+From 57c854480481bd6b0900984d17db17426c44aa40 Mon Sep 17 00:00:00 2001
+From: Xu Yizhou <xuyizhou1@huawei.com>
+Date: Fri, 25 Nov 2022 13:52:49 +0800
+Subject: [PATCH 08/13] providers: Add SM4 XTS implementation
+
+Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com>
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/19619)
+---
+ crypto/modes/build.info | 2 +-
+ crypto/modes/xts128gb.c | 199 +++++++++++++
+ include/crypto/modes.h | 6 +
+ include/openssl/core_names.h | 1 +
+ providers/defltprov.c | 1 +
+ providers/implementations/ciphers/build.info | 4 +-
+ .../implementations/ciphers/cipher_sm4_xts.c | 281 ++++++++++++++++++
+ .../implementations/ciphers/cipher_sm4_xts.h | 46 +++
+ .../ciphers/cipher_sm4_xts_hw.c | 89 ++++++
+ .../include/prov/implementations.h | 1 +
+ .../implementations/include/prov/names.h | 1 +
+ 11 files changed, 629 insertions(+), 2 deletions(-)
+ create mode 100644 crypto/modes/xts128gb.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.c
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.h
+ create mode 100644 providers/implementations/ciphers/cipher_sm4_xts_hw.c
+
+diff --git a/crypto/modes/build.info b/crypto/modes/build.info
+index f3558fa1a4..0ee297ced8 100644
+--- a/crypto/modes/build.info
++++ b/crypto/modes/build.info
+@@ -49,7 +49,7 @@ IF[{- !$disabled{asm} -}]
+ ENDIF
+
+ $COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \
+- wrap128.c $MODESASM
++ wrap128.c xts128gb.c $MODESASM
+ SOURCE[../../libcrypto]=$COMMON \
+ cts128.c ocb128.c siv128.c
+ SOURCE[../../providers/libfips.a]=$COMMON
+diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c
+new file mode 100644
+index 0000000000..021c0597e4
+--- /dev/null
++++ b/crypto/modes/xts128gb.c
+@@ -0,0 +1,199 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include <string.h>
++#include <openssl/crypto.h>
++#include "internal/endian.h"
++#include "crypto/modes.h"
++
++#ifndef STRICT_ALIGNMENT
++# ifdef __GNUC__
++typedef u64 u64_a1 __attribute((__aligned__(1)));
++# else
++typedef u64 u64_a1;
++# endif
++#endif
++
++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
++ const unsigned char iv[16],
++ const unsigned char *inp, unsigned char *out,
++ size_t len, int enc)
++{
++ DECLARE_IS_ENDIAN;
++ union {
++ u64 u[2];
++ u32 d[4];
++ u8 c[16];
++ } tweak, scratch;
++ unsigned int i;
++
++ if (len < 16)
++ return -1;
++
++ memcpy(tweak.c, iv, 16);
++
++ (*ctx->block2) (tweak.c, tweak.c, ctx->key2);
++
++ if (!enc && (len % 16))
++ len -= 16;
++
++ while (len >= 16) {
++#if defined(STRICT_ALIGNMENT)
++ memcpy(scratch.c, inp, 16);
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++#else
++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0];
++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1];
++#endif
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++#if defined(STRICT_ALIGNMENT)
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out, scratch.c, 16);
++#else
++ ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0];
++ ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1];
++#endif
++ inp += 16;
++ out += 16;
++ len -= 16;
++
++ if (len == 0)
++ return 0;
++
++ if (IS_LITTLE_ENDIAN) {
++ u8 res;
++ u64 hi, lo;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ u8 *p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ res = (u8)lo & 1;
++ tweak.u[0] = (lo >> 1) | (hi << 63);
++ tweak.u[1] = hi >> 1;
++ if (res)
++ tweak.c[15] ^= 0xe1;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ tweak.u[0] = lo;
++ tweak.u[1] = hi;
++ } else {
++ u8 carry, res;
++ carry = 0;
++ for (i = 0; i < 16; ++i) {
++ res = (tweak.c[i] << 7) & 0x80;
++ tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
++ carry = res;
++ }
++ if (res)
++ tweak.c[0] ^= 0xe1;
++ }
++ }
++ if (enc) {
++ for (i = 0; i < len; ++i) {
++ u8 c = inp[i];
++ out[i] = scratch.c[i];
++ scratch.c[i] = c;
++ }
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out - 16, scratch.c, 16);
++ } else {
++ union {
++ u64 u[2];
++ u8 c[16];
++ } tweak1;
++
++ if (IS_LITTLE_ENDIAN) {
++ u8 res;
++ u64 hi, lo;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak.u[0]);
++ lo = BSWAP8(tweak.u[1]);
++#else
++ u8 *p = tweak.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ res = (u8)lo & 1;
++ tweak1.u[0] = (lo >> 1) | (hi << 63);
++ tweak1.u[1] = hi >> 1;
++ if (res)
++ tweak1.c[15] ^= 0xe1;
++#ifdef BSWAP8
++ hi = BSWAP8(tweak1.u[0]);
++ lo = BSWAP8(tweak1.u[1]);
++#else
++ p = tweak1.c;
++
++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
++#endif
++ tweak1.u[0] = lo;
++ tweak1.u[1] = hi;
++ } else {
++ u8 carry, res;
++ carry = 0;
++ for (i = 0; i < 16; ++i) {
++ res = (tweak.c[i] << 7) & 0x80;
++ tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff;
++ carry = res;
++ }
++ if (res)
++ tweak1.c[0] ^= 0xe1;
++ }
++#if defined(STRICT_ALIGNMENT)
++ memcpy(scratch.c, inp, 16);
++ scratch.u[0] ^= tweak1.u[0];
++ scratch.u[1] ^= tweak1.u[1];
++#else
++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0];
++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1];
++#endif
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++ scratch.u[0] ^= tweak1.u[0];
++ scratch.u[1] ^= tweak1.u[1];
++
++ for (i = 0; i < len; ++i) {
++ u8 c = inp[16 + i];
++ out[16 + i] = scratch.c[i];
++ scratch.c[i] = c;
++ }
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1);
++#if defined(STRICT_ALIGNMENT)
++ scratch.u[0] ^= tweak.u[0];
++ scratch.u[1] ^= tweak.u[1];
++ memcpy(out, scratch.c, 16);
++#else
++ ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0];
++ ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1];
++#endif
++ }
++
++ return 0;
++}
+diff --git a/include/crypto/modes.h b/include/crypto/modes.h
+index 19f9d85959..475b77f925 100644
+--- a/include/crypto/modes.h
++++ b/include/crypto/modes.h
+@@ -148,6 +148,12 @@ struct xts128_context {
+ block128_f block1, block2;
+ };
+
++/* XTS mode for SM4 algorithm specified by GB/T 17964-2021 */
++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx,
++ const unsigned char iv[16],
++ const unsigned char *inp, unsigned char *out,
++ size_t len, int enc);
++
+ struct ccm128_context {
+ union {
+ u64 u[2];
+diff --git a/include/openssl/core_names.h b/include/openssl/core_names.h
+index 6bed5a8a67..a90971099d 100644
+--- a/include/openssl/core_names.h
++++ b/include/openssl/core_names.h
+@@ -97,6 +97,7 @@ extern "C" {
+ #define OSSL_CIPHER_PARAM_CTS_MODE "cts_mode" /* utf8_string */
+ /* For passing the AlgorithmIdentifier parameter in DER form */
+ #define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS "alg_id_param" /* octet_string */
++#define OSSL_CIPHER_PARAM_XTS_STANDARD "xts_standard" /* utf8_string */
+
+ #define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT \
+ "tls1multi_maxsndfrag" /* uint */
+diff --git a/providers/defltprov.c b/providers/defltprov.c
+index cc0b0c3b62..ab898d3f44 100644
+--- a/providers/defltprov.c
++++ b/providers/defltprov.c
+@@ -296,6 +296,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = {
+ ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions),
+ ALG(PROV_NAMES_SM4_OFB, ossl_sm4128ofb128_functions),
+ ALG(PROV_NAMES_SM4_CFB, ossl_sm4128cfb128_functions),
++ ALG(PROV_NAMES_SM4_XTS, ossl_sm4128xts_functions),
+ #endif /* OPENSSL_NO_SM4 */
+ #ifndef OPENSSL_NO_CHACHA
+ ALG(PROV_NAMES_ChaCha20, ossl_chacha20_functions),
+diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info
+index b5d9d4f6c1..9f6eacf5e3 100644
+--- a/providers/implementations/ciphers/build.info
++++ b/providers/implementations/ciphers/build.info
+@@ -107,7 +107,9 @@ IF[{- !$disabled{sm4} -}]
+ SOURCE[$SM4_GOAL]=\
+ cipher_sm4.c cipher_sm4_hw.c \
+ cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \
+- cipher_sm4_ccm.c cipher_sm4_ccm_hw.c
++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c \
++ cipher_sm4_xts.c cipher_sm4_xts_hw.c
++
+ ENDIF
+
+ IF[{- !$disabled{ocb} -}]
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
+new file mode 100644
+index 0000000000..3c568d4d18
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts.c
+@@ -0,0 +1,281 @@
++
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/* Dispatch functions for SM4 XTS mode */
++
++#include <openssl/proverr.h>
++#include "cipher_sm4_xts.h"
++#include "prov/implementations.h"
++#include "prov/providercommon.h"
++
++#define SM4_XTS_FLAGS PROV_CIPHER_FLAG_CUSTOM_IV
++#define SM4_XTS_IV_BITS 128
++#define SM4_XTS_BLOCK_BITS 8
++
++/* forward declarations */
++static OSSL_FUNC_cipher_encrypt_init_fn sm4_xts_einit;
++static OSSL_FUNC_cipher_decrypt_init_fn sm4_xts_dinit;
++static OSSL_FUNC_cipher_update_fn sm4_xts_stream_update;
++static OSSL_FUNC_cipher_final_fn sm4_xts_stream_final;
++static OSSL_FUNC_cipher_cipher_fn sm4_xts_cipher;
++static OSSL_FUNC_cipher_freectx_fn sm4_xts_freectx;
++static OSSL_FUNC_cipher_dupctx_fn sm4_xts_dupctx;
++static OSSL_FUNC_cipher_set_ctx_params_fn sm4_xts_set_ctx_params;
++static OSSL_FUNC_cipher_settable_ctx_params_fn sm4_xts_settable_ctx_params;
++
++/*-
++ * Provider dispatch functions
++ */
++static int sm4_xts_init(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[], int enc)
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vctx;
++ PROV_CIPHER_CTX *ctx = &xctx->base;
++
++ if (!ossl_prov_is_running())
++ return 0;
++
++ ctx->enc = enc;
++
++ if (iv != NULL) {
++ if (!ossl_cipher_generic_initiv(vctx, iv, ivlen))
++ return 0;
++ }
++ if (key != NULL) {
++ if (keylen != ctx->keylen) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH);
++ return 0;
++ }
++ if (!ctx->hw->init(ctx, key, keylen))
++ return 0;
++ }
++ return sm4_xts_set_ctx_params(xctx, params);
++}
++
++static int sm4_xts_einit(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[])
++{
++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 1);
++}
++
++static int sm4_xts_dinit(void *vctx, const unsigned char *key, size_t keylen,
++ const unsigned char *iv, size_t ivlen,
++ const OSSL_PARAM params[])
++{
++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 0);
++}
++
++static void *sm4_xts_newctx(void *provctx, unsigned int mode, uint64_t flags,
++ size_t kbits, size_t blkbits, size_t ivbits)
++{
++ PROV_SM4_XTS_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx));
++
++ if (ctx != NULL) {
++ ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits, mode,
++ flags, ossl_prov_cipher_hw_sm4_xts(kbits),
++ NULL);
++ }
++ return ctx;
++}
++
++static void sm4_xts_freectx(void *vctx)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx);
++ OPENSSL_clear_free(ctx, sizeof(*ctx));
++}
++
++static void *sm4_xts_dupctx(void *vctx)
++{
++ PROV_SM4_XTS_CTX *in = (PROV_SM4_XTS_CTX *)vctx;
++ PROV_SM4_XTS_CTX *ret = NULL;
++
++ if (!ossl_prov_is_running())
++ return NULL;
++
++ if (in->xts.key1 != NULL) {
++ if (in->xts.key1 != &in->ks1)
++ return NULL;
++ }
++ if (in->xts.key2 != NULL) {
++ if (in->xts.key2 != &in->ks2)
++ return NULL;
++ }
++ ret = OPENSSL_malloc(sizeof(*ret));
++ if (ret == NULL)
++ return NULL;
++ in->base.hw->copyctx(&ret->base, &in->base);
++ return ret;
++}
++
++static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize, const unsigned char *in, size_t inl)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ if (!ossl_prov_is_running()
++ || ctx->xts.key1 == NULL
++ || ctx->xts.key2 == NULL
++ || !ctx->base.iv_set
++ || out == NULL
++ || in == NULL
++ || inl < SM4_BLOCK_SIZE)
++ return 0;
++
++ /*
++ * Impose a limit of 2^20 blocks per data unit as specified by
++ * IEEE Std 1619-2018. The earlier and obsolete IEEE Std 1619-2007
++ * indicated that this was a SHOULD NOT rather than a MUST NOT.
++ * NIST SP 800-38E mandates the same limit.
++ */
++ if (inl > XTS_MAX_BLOCKS_PER_DATA_UNIT * SM4_BLOCK_SIZE) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_XTS_DATA_UNIT_IS_TOO_LARGE);
++ return 0;
++ }
++ if (ctx->xts_standard) {
++ if (ctx->stream != NULL)
++ (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
++ ctx->base.iv);
++ else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
++ ctx->base.enc))
++ return 0;
++ } else {
++ if (ctx->stream_gb != NULL)
++ (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
++ ctx->base.iv);
++ else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
++ inl, ctx->base.enc))
++ return 0;
++ }
++ *outl = inl;
++ return 1;
++}
++
++static int sm4_xts_stream_update(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize, const unsigned char *in,
++ size_t inl)
++{
++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx;
++
++ if (outsize < inl) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
++ return 0;
++ }
++
++ if (!sm4_xts_cipher(ctx, out, outl, outsize, in, inl)) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_CIPHER_OPERATION_FAILED);
++ return 0;
++ }
++
++ return 1;
++}
++
++static int sm4_xts_stream_final(void *vctx, unsigned char *out, size_t *outl,
++ size_t outsize)
++{
++ if (!ossl_prov_is_running())
++ return 0;
++ *outl = 0;
++ return 1;
++}
++
++static const OSSL_PARAM sm4_xts_known_settable_ctx_params[] = {
++ OSSL_PARAM_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD, NULL, 0),
++ OSSL_PARAM_END
++};
++
++static const OSSL_PARAM *sm4_xts_settable_ctx_params(ossl_unused void *cctx,
++ ossl_unused void *provctx)
++{
++ return sm4_xts_known_settable_ctx_params;
++}
++
++static int sm4_xts_set_ctx_params(void *vxctx, const OSSL_PARAM params[])
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vxctx;
++ const OSSL_PARAM *p;
++
++ if (params == NULL)
++ return 1;
++
++ /*-
++ * Sets the XTS standard to use with SM4-XTS algorithm.
++ *
++ * Must be utf8 string "GB" or "IEEE",
++ * "GB" means the GB/T 17964-2021 standard
++ * "IEEE" means the IEEE Std 1619-2007 standard
++ */
++ p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_XTS_STANDARD);
++
++ if (p != NULL) {
++ const char *xts_standard = NULL;
++
++ if (p->data_type != OSSL_PARAM_UTF8_STRING)
++ return 0;
++
++ if (!OSSL_PARAM_get_utf8_string_ptr(p, &xts_standard)) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER);
++ return 0;
++ }
++ if (OPENSSL_strcasecmp(xts_standard, "GB") == 0) {
++ xctx->xts_standard = 0;
++ } else if (OPENSSL_strcasecmp(xts_standard, "IEEE") == 0) {
++ xctx->xts_standard = 1;
++ } else {
++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER);
++ return 0;
++ }
++ }
++
++ return 1;
++}
++
++#define IMPLEMENT_cipher(lcmode, UCMODE, kbits, flags) \
++static OSSL_FUNC_cipher_get_params_fn sm4_##kbits##_##lcmode##_get_params; \
++static int sm4_##kbits##_##lcmode##_get_params(OSSL_PARAM params[]) \
++{ \
++ return ossl_cipher_generic_get_params(params, EVP_CIPH_##UCMODE##_MODE, \
++ flags, 2 * kbits, SM4_XTS_BLOCK_BITS,\
++ SM4_XTS_IV_BITS); \
++} \
++static OSSL_FUNC_cipher_newctx_fn sm4_##kbits##_xts_newctx; \
++static void *sm4_##kbits##_xts_newctx(void *provctx) \
++{ \
++ return sm4_xts_newctx(provctx, EVP_CIPH_##UCMODE##_MODE, flags, 2 * kbits, \
++ SM4_XTS_BLOCK_BITS, SM4_XTS_IV_BITS); \
++} \
++const OSSL_DISPATCH ossl_sm4##kbits##xts_functions[] = { \
++ { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))sm4_##kbits##_xts_newctx }, \
++ { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))sm4_xts_einit }, \
++ { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))sm4_xts_dinit }, \
++ { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))sm4_xts_stream_update }, \
++ { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))sm4_xts_stream_final }, \
++ { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))sm4_xts_cipher }, \
++ { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))sm4_xts_freectx }, \
++ { OSSL_FUNC_CIPHER_DUPCTX, (void (*)(void))sm4_xts_dupctx }, \
++ { OSSL_FUNC_CIPHER_GET_PARAMS, \
++ (void (*)(void))sm4_##kbits##_##lcmode##_get_params }, \
++ { OSSL_FUNC_CIPHER_GETTABLE_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_gettable_params }, \
++ { OSSL_FUNC_CIPHER_GET_CTX_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_get_ctx_params }, \
++ { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS, \
++ (void (*)(void))ossl_cipher_generic_gettable_ctx_params }, \
++ { OSSL_FUNC_CIPHER_SET_CTX_PARAMS, \
++ (void (*)(void))sm4_xts_set_ctx_params }, \
++ { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS, \
++ (void (*)(void))sm4_xts_settable_ctx_params }, \
++ { 0, NULL } \
++}
++/* ossl_sm4128xts_functions */
++IMPLEMENT_cipher(xts, XTS, 128, SM4_XTS_FLAGS);
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
+new file mode 100644
+index 0000000000..4c369183e2
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts.h
+@@ -0,0 +1,46 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include <crypto/sm4.h>
++#include "prov/ciphercommon.h"
++#include "crypto/sm4_platform.h"
++
++PROV_CIPHER_FUNC(void, xts_stream,
++ (const unsigned char *in, unsigned char *out, size_t len,
++ const SM4_KEY *key1, const SM4_KEY *key2,
++ const unsigned char iv[16]));
++
++typedef struct prov_sm4_xts_ctx_st {
++ /* Must be first */
++ PROV_CIPHER_CTX base;
++
++ /* SM4 key schedules to use */
++ union {
++ OSSL_UNION_ALIGN;
++ SM4_KEY ks;
++ } ks1, ks2;
++
++ /*-
++ * XTS standard to use with SM4-XTS algorithm
++ *
++ * Must be 0 or 1,
++ * 0 for XTS mode specified by GB/T 17964-2021
++ * 1 for XTS mode specified by IEEE Std 1619-2007
++ */
++ int xts_standard;
++
++ XTS128_CONTEXT xts;
++
++ /* Stream function for XTS mode specified by GB/T 17964-2021 */
++ OSSL_xts_stream_fn stream_gb;
++ /* Stream function for XTS mode specified by IEEE Std 1619-2007 */
++ OSSL_xts_stream_fn stream;
++} PROV_SM4_XTS_CTX;
++
++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits);
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+new file mode 100644
+index 0000000000..403eb879b1
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+@@ -0,0 +1,89 @@
++/*
++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License"). You may not use
++ * this file except in compliance with the License. You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++#include "cipher_sm4_xts.h"
++
++#define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
++ fn_block_enc, fn_block_dec, \
++ fn_stream_enc, fn_stream_dec, \
++ fn_stream_gb_enc, fn_stream_gb_dec) { \
++ size_t bytes = keylen / 2; \
++ \
++ if (ctx->enc) { \
++ fn_set_enc_key(key, &xctx->ks1.ks); \
++ xctx->xts.block1 = (block128_f)fn_block_enc; \
++ } else { \
++ fn_set_dec_key(key, &xctx->ks1.ks); \
++ xctx->xts.block1 = (block128_f)fn_block_dec; \
++ } \
++ fn_set_enc_key(key + bytes, &xctx->ks2.ks); \
++ xctx->xts.block2 = (block128_f)fn_block_enc; \
++ xctx->xts.key1 = &xctx->ks1; \
++ xctx->xts.key2 = &xctx->ks2; \
++ xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
++ xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
++}
++
++static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
++ const unsigned char *key,
++ size_t keylen)
++{
++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
++ OSSL_xts_stream_fn stream_enc = NULL;
++ OSSL_xts_stream_fn stream_dec = NULL;
++ OSSL_xts_stream_fn stream_gb_enc = NULL;
++ OSSL_xts_stream_fn stream_gb_dec = NULL;
++#ifdef HWSM4_CAPABLE
++ if (HWSM4_CAPABLE) {
++ XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
++ HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
++ stream_gb_enc, stream_gb_dec);
++ return 1;
++ } else
++#endif /* HWSM4_CAPABLE */
++#ifdef VPSM4_CAPABLE
++ if (VPSM4_CAPABLE) {
++ XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
++ vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
++ stream_gb_enc, stream_gb_dec);
++ return 1;
++ } else
++#endif /* VPSM4_CAPABLE */
++ {
++ (void)0;
++ }
++ {
++ XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
++ ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
++ stream_gb_dec);
++ }
++ return 1;
++}
++
++static void cipher_hw_sm4_xts_copyctx(PROV_CIPHER_CTX *dst,
++ const PROV_CIPHER_CTX *src)
++{
++ PROV_SM4_XTS_CTX *sctx = (PROV_SM4_XTS_CTX *)src;
++ PROV_SM4_XTS_CTX *dctx = (PROV_SM4_XTS_CTX *)dst;
++
++ *dctx = *sctx;
++ dctx->xts.key1 = &dctx->ks1.ks;
++ dctx->xts.key2 = &dctx->ks2.ks;
++}
++
++
++static const PROV_CIPHER_HW sm4_generic_xts = {
++ cipher_hw_sm4_xts_generic_initkey,
++ NULL,
++ cipher_hw_sm4_xts_copyctx
++};
++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits)
++{
++ return &sm4_generic_xts;
++}
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 498eab4ad4..cfa32ea3ca 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -181,6 +181,7 @@ extern const OSSL_DISPATCH ossl_sm4128cbc_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ctr_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128ofb128_functions[];
+ extern const OSSL_DISPATCH ossl_sm4128cfb128_functions[];
++extern const OSSL_DISPATCH ossl_sm4128xts_functions[];
+ #endif /* OPENSSL_NO_SM4 */
+ #ifndef OPENSSL_NO_RC5
+ extern const OSSL_DISPATCH ossl_rc5128ecb_functions[];
+diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h
+index 0fac23a850..5192f4f471 100644
+--- a/providers/implementations/include/prov/names.h
++++ b/providers/implementations/include/prov/names.h
+@@ -164,6 +164,7 @@
+ #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4"
+ #define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8"
+ #define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9"
++#define PROV_NAMES_SM4_XTS "SM4-XTS:1.2.156.10197.1.104.10"
+ #define PROV_NAMES_ChaCha20 "ChaCha20"
+ #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305"
+ #define PROV_NAMES_CAST5_ECB "CAST5-ECB"
+--
+2.37.3.windows.1
+
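Editor's note: the "xts_standard" context parameter introduced above (OSSL_CIPHER_PARAM_XTS_STANDARD, values "GB" or "IEEE") selects between the GB/T 17964-2021 and IEEE Std 1619-2007 tweak schedules. A hedged sketch of initialising an SM4-XTS context with the GB variant follows; everything except the parameter name and values defined in the patch is illustrative.

#include <openssl/evp.h>
#include <openssl/params.h>
#include <openssl/core_names.h>

/* Hedged sketch: SM4-XTS with the GB/T 17964-2021 tweak schedule.
 * key is 32 bytes (two 128-bit SM4 keys), iv is the 16-byte tweak. */
static int sm4_xts_gb_init(EVP_CIPHER_CTX *ctx,
                           const unsigned char key[32],
                           const unsigned char iv[16])
{
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "SM4-XTS", NULL);
    char standard[] = "GB";            /* "IEEE" selects IEEE Std 1619-2007 */
    OSSL_PARAM params[2];
    int ok = 0;

    if (cipher == NULL)
        return 0;
    params[0] = OSSL_PARAM_construct_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD,
                                                 standard, 0);
    params[1] = OSSL_PARAM_construct_end();
    ok = EVP_EncryptInit_ex2(ctx, cipher, key, iv, params);
    EVP_CIPHER_free(cipher);
    return ok;
}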
diff --git a/Backport-support-decode-SM2-parameters.patch b/Backport-support-decode-SM2-parameters.patch
new file mode 100644
index 0000000..7f4ea20
--- /dev/null
+++ b/Backport-support-decode-SM2-parameters.patch
@@ -0,0 +1,175 @@
+From 08ae9fa627e858b9f8e96e0c6d3cf84422a11d75 Mon Sep 17 00:00:00 2001
+From: K1 <dongbeiouba@gmail.com>
+Date: Tue, 19 Jul 2022 01:18:12 +0800
+Subject: [PATCH] Support decode SM2 parameters
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/18819)
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ apps/ecparam.c | 12 ++++++++++--
+ include/openssl/pem.h | 1 +
+ providers/decoders.inc | 1 +
+ .../implementations/encode_decode/decode_der2key.c | 1 +
+ .../implementations/encode_decode/decode_pem2der.c | 1 +
+ .../implementations/encode_decode/encode_key2text.c | 8 +++++---
+ .../implementations/include/prov/implementations.h | 1 +
+ test/recipes/15-test_ecparam.t | 4 ++++
+ .../15-test_ecparam_data/valid/sm2-explicit.pem | 7 +++++++
+ .../recipes/15-test_ecparam_data/valid/sm2-named.pem | 3 +++
+ 10 files changed, 34 insertions(+), 5 deletions(-)
+ create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+ create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+
+diff --git a/apps/ecparam.c b/apps/ecparam.c
+index 5d66b65569..71f93c4ca5 100644
+--- a/apps/ecparam.c
++++ b/apps/ecparam.c
+@@ -242,9 +242,17 @@ int ecparam_main(int argc, char **argv)
+ goto end;
+ }
+ } else {
+- params_key = load_keyparams(infile, informat, 1, "EC", "EC parameters");
+- if (params_key == NULL || !EVP_PKEY_is_a(params_key, "EC"))
++ params_key = load_keyparams_suppress(infile, informat, 1, "EC",
++ "EC parameters", 1);
++ if (params_key == NULL)
++ params_key = load_keyparams_suppress(infile, informat, 1, "SM2",
++ "SM2 parameters", 1);
++
++ if (params_key == NULL) {
++ BIO_printf(bio_err, "Unable to load parameters from %s\n", infile);
+ goto end;
++ }
++
+ if (point_format
+ && !EVP_PKEY_set_utf8_string_param(
+ params_key, OSSL_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT,
+diff --git a/include/openssl/pem.h b/include/openssl/pem.h
+index ed50f081fa..0446c77019 100644
+--- a/include/openssl/pem.h
++++ b/include/openssl/pem.h
+@@ -57,6 +57,7 @@ extern "C" {
+ # define PEM_STRING_ECPRIVATEKEY "EC PRIVATE KEY"
+ # define PEM_STRING_PARAMETERS "PARAMETERS"
+ # define PEM_STRING_CMS "CMS"
++# define PEM_STRING_SM2PARAMETERS "SM2 PARAMETERS"
+
+ # define PEM_TYPE_ENCRYPTED 10
+ # define PEM_TYPE_MIC_ONLY 20
+diff --git a/providers/decoders.inc b/providers/decoders.inc
+index 2772aad05d..edca39ea36 100644
+--- a/providers/decoders.inc
++++ b/providers/decoders.inc
+@@ -69,6 +69,7 @@ DECODER_w_structure("X448", der, SubjectPublicKeyInfo, x448, yes),
+ # ifndef OPENSSL_NO_SM2
+ DECODER_w_structure("SM2", der, PrivateKeyInfo, sm2, no),
+ DECODER_w_structure("SM2", der, SubjectPublicKeyInfo, sm2, no),
++DECODER_w_structure("SM2", der, type_specific_no_pub, sm2, no),
+ # endif
+ #endif
+ DECODER_w_structure("RSA", der, PrivateKeyInfo, rsa, yes),
+diff --git a/providers/implementations/encode_decode/decode_der2key.c b/providers/implementations/encode_decode/decode_der2key.c
+index ebc2d24833..d4d3731460 100644
+--- a/providers/implementations/encode_decode/decode_der2key.c
++++ b/providers/implementations/encode_decode/decode_der2key.c
+@@ -783,6 +783,7 @@ MAKE_DECODER("ED448", ed448, ecx, SubjectPublicKeyInfo);
+ # ifndef OPENSSL_NO_SM2
+ MAKE_DECODER("SM2", sm2, ec, PrivateKeyInfo);
+ MAKE_DECODER("SM2", sm2, ec, SubjectPublicKeyInfo);
++MAKE_DECODER("SM2", sm2, sm2, type_specific_no_pub);
+ # endif
+ #endif
+ MAKE_DECODER("RSA", rsa, rsa, PrivateKeyInfo);
+diff --git a/providers/implementations/encode_decode/decode_pem2der.c b/providers/implementations/encode_decode/decode_pem2der.c
+index bc937ffb9d..648ecd4584 100644
+--- a/providers/implementations/encode_decode/decode_pem2der.c
++++ b/providers/implementations/encode_decode/decode_pem2der.c
+@@ -119,6 +119,7 @@ static int pem2der_decode(void *vctx, OSSL_CORE_BIO *cin, int selection,
+ { PEM_STRING_DSAPARAMS, OSSL_OBJECT_PKEY, "DSA", "type-specific" },
+ { PEM_STRING_ECPRIVATEKEY, OSSL_OBJECT_PKEY, "EC", "type-specific" },
+ { PEM_STRING_ECPARAMETERS, OSSL_OBJECT_PKEY, "EC", "type-specific" },
++ { PEM_STRING_SM2PARAMETERS, OSSL_OBJECT_PKEY, "SM2", "type-specific" },
+ { PEM_STRING_RSA, OSSL_OBJECT_PKEY, "RSA", "type-specific" },
+ { PEM_STRING_RSA_PUBLIC, OSSL_OBJECT_PKEY, "RSA", "type-specific" },
+
+diff --git a/providers/implementations/encode_decode/encode_key2text.c b/providers/implementations/encode_decode/encode_key2text.c
+index 7d983f5e51..a92e04a89d 100644
+--- a/providers/implementations/encode_decode/encode_key2text.c
++++ b/providers/implementations/encode_decode/encode_key2text.c
+@@ -512,7 +512,8 @@ static int ec_to_text(BIO *out, const void *key, int selection)
+ else if ((selection & OSSL_KEYMGMT_SELECT_PUBLIC_KEY) != 0)
+ type_label = "Public-Key";
+ else if ((selection & OSSL_KEYMGMT_SELECT_DOMAIN_PARAMETERS) != 0)
+- type_label = "EC-Parameters";
++ if (EC_GROUP_get_curve_name(group) != NID_sm2)
++ type_label = "EC-Parameters";
+
+ if ((selection & OSSL_KEYMGMT_SELECT_PRIVATE_KEY) != 0) {
+ const BIGNUM *priv_key = EC_KEY_get0_private_key(ec);
+@@ -538,8 +539,9 @@ static int ec_to_text(BIO *out, const void *key, int selection)
+ goto err;
+ }
+
+- if (BIO_printf(out, "%s: (%d bit)\n", type_label,
+- EC_GROUP_order_bits(group)) <= 0)
++ if (type_label != NULL
++ && BIO_printf(out, "%s: (%d bit)\n", type_label,
++ EC_GROUP_order_bits(group)) <= 0)
+ goto err;
+ if (priv != NULL
+ && !print_labeled_buf(out, "priv:", priv, priv_len))
+diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h
+index 03ce43719e..288808bb6f 100644
+--- a/providers/implementations/include/prov/implementations.h
++++ b/providers/implementations/include/prov/implementations.h
+@@ -508,6 +508,7 @@ extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_ed448_decoder_functi
+ #ifndef OPENSSL_NO_SM2
+ extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_sm2_decoder_functions[];
+ extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_sm2_decoder_functions[];
++extern const OSSL_DISPATCH ossl_type_specific_no_pub_der_to_sm2_decoder_functions[];
+ #endif
+
+ extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_rsa_decoder_functions[];
+diff --git a/test/recipes/15-test_ecparam.t b/test/recipes/15-test_ecparam.t
+index 37bf620f35..5dba866378 100644
+--- a/test/recipes/15-test_ecparam.t
++++ b/test/recipes/15-test_ecparam.t
+@@ -25,6 +25,10 @@ my @valid = glob(data_file("valid", "*.pem"));
+ my @noncanon = glob(data_file("noncanon", "*.pem"));
+ my @invalid = glob(data_file("invalid", "*.pem"));
+
++if (disabled("sm2")) {
++ @valid = grep { !/sm2-.*\.pem/} @valid;
++}
++
+ plan tests => 12;
+
+ sub checkload {
+diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+new file mode 100644
+index 0000000000..bd07654ea4
+--- /dev/null
++++ b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem
+@@ -0,0 +1,7 @@
++-----BEGIN SM2 PARAMETERS-----
++MIHgAgEBMCwGByqGSM49AQECIQD////+/////////////////////wAAAAD/////
++/////zBEBCD////+/////////////////////wAAAAD//////////AQgKOn6np2f
++XjRNWp5Lz2UJp/OXifUVq4+S3by9QU2UDpMEQQQyxK4sHxmBGV+ZBEZqOcmUj+ML
++v/JmC+FxWkWJM0x0x7w3NqL09necWb3O42tpIVPQqYd8xipHQALfMuUhOfCgAiEA
++/////v///////////////3ID32shxgUrU7v0CTnVQSMCAQE=
++-----END SM2 PARAMETERS-----
+diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-named.pem b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+new file mode 100644
+index 0000000000..d6e280f6c2
+--- /dev/null
++++ b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem
+@@ -0,0 +1,3 @@
++-----BEGIN SM2 PARAMETERS-----
++BggqgRzPVQGCLQ==
++-----END SM2 PARAMETERS-----
+--
+2.33.0
+
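Editor's note: with the "SM2 PARAMETERS" PEM handler and the type_specific_no_pub DER decoder added above, SM2 domain parameters can be loaded through the generic decoder API (or via the modified openssl ecparam command). A hedged sketch follows; the file name is purely illustrative.

#include <openssl/bio.h>
#include <openssl/decoder.h>
#include <openssl/evp.h>

/* Hedged sketch: read SM2 domain parameters from a PEM file such as the
 * sm2-named.pem test data added above. */
static EVP_PKEY *load_sm2_params(const char *path)
{
    EVP_PKEY *pkey = NULL;
    BIO *in = BIO_new_file(path, "r");
    OSSL_DECODER_CTX *dctx =
        OSSL_DECODER_CTX_new_for_pkey(&pkey, "PEM", NULL, "SM2",
                                      EVP_PKEY_KEY_PARAMETERS, NULL, NULL);

    if (in == NULL || dctx == NULL || !OSSL_DECODER_from_bio(dctx, in)) {
        EVP_PKEY_free(pkey);
        pkey = NULL;
    }
    OSSL_DECODER_CTX_free(dctx);
    BIO_free(in);
    return pkey;
}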
diff --git a/Feature-support-SM2-CMS-signature.patch b/Feature-support-SM2-CMS-signature.patch
new file mode 100644
index 0000000..b579537
--- /dev/null
+++ b/Feature-support-SM2-CMS-signature.patch
@@ -0,0 +1,41 @@
+From e7f35b6f10599a574acb3bcca40845eeccfdc63b Mon Sep 17 00:00:00 2001
+From: Huaxin Lu <luhuaxin1@huawei.com>
+Date: Fri, 1 Sep 2023 20:08:46 +0800
+Subject: [PATCH] Support SM2 CMS signature
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ crypto/cms/cms_sd.c | 2 +-
+ crypto/evp/p_lib.c | 3 +++
+ 2 files changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/cms/cms_sd.c b/crypto/cms/cms_sd.c
+index 34c021b..093b41c 100644
+--- a/crypto/cms/cms_sd.c
++++ b/crypto/cms/cms_sd.c
+@@ -232,7 +232,7 @@ static int cms_sd_asn1_ctrl(CMS_SignerInfo *si, int cmd)
+ EVP_PKEY *pkey = si->pkey;
+ int i;
+
+- if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC"))
++ if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC") || EVP_PKEY_is_a(pkey, "SM2"))
+ return ossl_cms_ecdsa_dsa_sign(si, cmd);
+ else if (EVP_PKEY_is_a(pkey, "RSA") || EVP_PKEY_is_a(pkey, "RSA-PSS"))
+ return ossl_cms_rsa_sign(si, cmd);
+diff --git a/crypto/evp/p_lib.c b/crypto/evp/p_lib.c
+index f6acb5b..9567bb0 100644
+--- a/crypto/evp/p_lib.c
++++ b/crypto/evp/p_lib.c
+@@ -982,6 +982,9 @@ int EVP_PKEY_type(int type)
+
+ int EVP_PKEY_get_id(const EVP_PKEY *pkey)
+ {
++ if (EVP_PKEY_is_a(pkey, "SM2")) {
++ return EVP_PKEY_SM2;
++ }
+ return pkey->type;
+ }
+
+--
+2.33.0
+
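Editor's note: once SM2 keys are routed to ossl_cms_ecdsa_dsa_sign() by the patch above, a CMS signature can be produced with the usual CMS_sign() call. The sketch below is a hedged illustration only: certificate and key loading are omitted, and it assumes the SM2 key's default digest (SM3) is acceptable for the signer.

#include <openssl/bio.h>
#include <openssl/x509.h>
#include <openssl/cms.h>

/* Hedged sketch: produce a CMS signature over the data in 'in'.
 * signcert must carry an SM2 certificate and pkey the matching private key. */
static int sm2_cms_sign(X509 *signcert, EVP_PKEY *pkey, BIO *in, BIO *out)
{
    CMS_ContentInfo *cms = CMS_sign(signcert, pkey, NULL, in, CMS_BINARY);
    int ok = 0;

    if (cms != NULL && SMIME_write_CMS(out, cms, in, CMS_BINARY))
        ok = 1;
    CMS_ContentInfo_free(cms);
    return ok;
}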
diff --git a/Feature-use-default-id-if-SM2-id-is-not-set.patch b/Feature-use-default-id-if-SM2-id-is-not-set.patch
new file mode 100644
index 0000000..ee6f2d8
--- /dev/null
+++ b/Feature-use-default-id-if-SM2-id-is-not-set.patch
@@ -0,0 +1,59 @@
+From 12f6ee3806c1f04a682b4c31aeb510a2dca602ef Mon Sep 17 00:00:00 2001
+From: Huaxin Lu <luhuaxin1@huawei.com>
+Date: Fri, 1 Sep 2023 20:27:45 +0800
+Subject: [PATCH] use default id if SM2 id is not set
+
+Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com>
+---
+ crypto/sm2/sm2_sign.c | 14 +++++++++++---
+ 1 file changed, 11 insertions(+), 3 deletions(-)
+
+diff --git a/crypto/sm2/sm2_sign.c b/crypto/sm2/sm2_sign.c
+index ff5be9b..33d3a73 100644
+--- a/crypto/sm2/sm2_sign.c
++++ b/crypto/sm2/sm2_sign.c
+@@ -42,6 +42,8 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ uint8_t *buf = NULL;
+ uint16_t entl = 0;
+ uint8_t e_byte = 0;
++ const uint8_t *f_id = id;
++ size_t f_id_len = id_len;
+
+ hash = EVP_MD_CTX_new();
+ ctx = BN_CTX_new_ex(ossl_ec_key_get_libctx(key));
+@@ -68,15 +70,21 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ goto done;
+ }
+
++ /* if id is not set, use default id */
++ if (f_id == NULL || f_id_len == 0) {
++ f_id = (const uint8_t *)SM2_DEFAULT_USERID;
++ f_id_len = strlen(SM2_DEFAULT_USERID);
++ }
++
+ /* Z = h(ENTL || ID || a || b || xG || yG || xA || yA) */
+
+- if (id_len >= (UINT16_MAX / 8)) {
++ if (f_id_len >= (UINT16_MAX / 8)) {
+ /* too large */
+ ERR_raise(ERR_LIB_SM2, SM2_R_ID_TOO_LARGE);
+ goto done;
+ }
+
+- entl = (uint16_t)(8 * id_len);
++ entl = (uint16_t)(8 * f_id_len);
+
+ e_byte = entl >> 8;
+ if (!EVP_DigestUpdate(hash, &e_byte, 1)) {
+@@ -89,7 +97,7 @@ int ossl_sm2_compute_z_digest(uint8_t *out,
+ goto done;
+ }
+
+- if (id_len > 0 && !EVP_DigestUpdate(hash, id, id_len)) {
++ if (f_id_len > 0 && !EVP_DigestUpdate(hash, f_id, f_id_len)) {
+ ERR_raise(ERR_LIB_SM2, ERR_R_EVP_LIB);
+ goto done;
+ }
+--
+2.33.0
+
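Editor's note: the patch above makes ossl_sm2_compute_z_digest() fall back to the library default distinguishing ID when the caller never sets one. Callers that want a specific ID still attach it with EVP_PKEY_CTX_set1_id() before the digest-sign operation, as in this hedged sketch (the ID value shown is only an example).

#include <string.h>
#include <openssl/evp.h>

/* Hedged sketch: SM2 signing with an explicitly supplied distinguishing ID.
 * With the patch above, omitting EVP_PKEY_CTX_set1_id() no longer matters:
 * the default user ID is used for the Z digest instead. */
static int sm2_sign_with_id(EVP_PKEY *sm2_key, const char *id,
                            const unsigned char *msg, size_t msglen,
                            unsigned char *sig, size_t *siglen)
{
    EVP_MD_CTX *mctx = EVP_MD_CTX_new();
    EVP_PKEY_CTX *pctx = EVP_PKEY_CTX_new_from_pkey(NULL, sm2_key, NULL);
    int ok = 0;

    if (mctx == NULL || pctx == NULL)
        goto done;
    if (EVP_PKEY_CTX_set1_id(pctx, id, strlen(id)) != 1)
        goto done;
    EVP_MD_CTX_set_pkey_ctx(mctx, pctx);
    if (EVP_DigestSignInit(mctx, NULL, EVP_sm3(), NULL, sm2_key) != 1)
        goto done;
    ok = EVP_DigestSign(mctx, sig, siglen, msg, msglen) == 1;
done:
    EVP_MD_CTX_free(mctx);
    EVP_PKEY_CTX_free(pctx);
    return ok;
}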
diff --git a/Makefile.certificate b/Makefile.certificate
new file mode 100644
index 0000000..cc88c52
--- /dev/null
+++ b/Makefile.certificate
@@ -0,0 +1,82 @@
+UTF8 := $(shell locale -c LC_CTYPE -k | grep -q charmap.*UTF-8 && echo -utf8)
+DAYS=365
+KEYLEN=2048
+TYPE=rsa:$(KEYLEN)
+EXTRA_FLAGS=
+ifdef SERIAL
+ EXTRA_FLAGS+=-set_serial $(SERIAL)
+endif
+
+.PHONY: usage
+.SUFFIXES: .key .csr .crt .pem
+.PRECIOUS: %.key %.csr %.crt %.pem
+
+usage:
+ @echo "This makefile allows you to create:"
+ @echo " o public/private key pairs"
+ @echo " o SSL certificate signing requests (CSRs)"
+ @echo " o self-signed SSL test certificates"
+ @echo
+ @echo "To create a key pair, run \"make SOMETHING.key\"."
+ @echo "To create a CSR, run \"make SOMETHING.csr\"."
+ @echo "To create a test certificate, run \"make SOMETHING.crt\"."
+ @echo "To create a key and a test certificate in one file, run \"make SOMETHING.pem\"."
+ @echo
+ @echo "To create a key for use with Apache, run \"make genkey\"."
+ @echo "To create a CSR for use with Apache, run \"make certreq\"."
+ @echo "To create a test certificate for use with Apache, run \"make testcert\"."
+ @echo
+ @echo "To create a test certificate with serial number other than random, add SERIAL=num"
+ @echo "You can also specify key length with KEYLEN=n and expiration in days with DAYS=n"
+ @echo "Any additional options can be passed to openssl req via EXTRA_FLAGS"
+ @echo
+ @echo Examples:
+ @echo " make server.key"
+ @echo " make server.csr"
+ @echo " make server.crt"
+ @echo " make stunnel.pem"
+ @echo " make genkey"
+ @echo " make certreq"
+ @echo " make testcert"
+ @echo " make server.crt SERIAL=1"
+ @echo " make stunnel.pem EXTRA_FLAGS=-sha384"
+ @echo " make testcert DAYS=600"
+
+%.pem:
+ umask 77 ; \
+ PEM1=`/bin/mktemp /tmp/openssl.XXXXXX` ; \
+ PEM2=`/bin/mktemp /tmp/openssl.XXXXXX` ; \
+ /usr/bin/openssl req $(UTF8) -newkey $(TYPE) -keyout $$PEM1 -nodes -x509 -days $(DAYS) -out $$PEM2 $(EXTRA_FLAGS) ; \
+ cat $$PEM1 > $@ ; \
+ echo "" >> $@ ; \
+ cat $$PEM2 >> $@ ; \
+ $(RM) $$PEM1 $$PEM2
+
+%.key:
+ umask 77 ; \
+ /usr/bin/openssl genrsa -aes128 $(KEYLEN) > $@
+
+%.csr: %.key
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $^ -out $@
+
+%.crt: %.key
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $^ -x509 -days $(DAYS) -out $@ $(EXTRA_FLAGS)
+
+TLSROOT=/etc/pki/tls
+KEY=$(TLSROOT)/private/localhost.key
+CSR=$(TLSROOT)/certs/localhost.csr
+CRT=$(TLSROOT)/certs/localhost.crt
+
+genkey: $(KEY)
+certreq: $(CSR)
+testcert: $(CRT)
+
+$(CSR): $(KEY)
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $(KEY) -out $(CSR)
+
+$(CRT): $(KEY)
+ umask 77 ; \
+ /usr/bin/openssl req $(UTF8) -new -key $(KEY) -x509 -days $(DAYS) -out $(CRT) $(EXTRA_FLAGS)
diff --git a/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
new file mode 100644
index 0000000..afd87ba
--- /dev/null
+++ b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
@@ -0,0 +1,36 @@
+From a8da305fa3dd6e34ba5aab3978281f652fd12883 Mon Sep 17 00:00:00 2001
+From: yangyangtiantianlonglong <yangtianlong1224@163.com>
+Date: Mon, 31 Jul 2023 07:04:41 -0700
+Subject: [PATCH] A null pointer dereference occurs when memory allocation
+ fails
+
+Fixes #21605
+
+Reviewed-by: Hugo Landau <hlandau@openssl.org>
+Reviewed-by: Matthias St. Pierre <Matthias.St.Pierre@ncp-e.com>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21606)
+---
+ ssl/ssl_sess.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/ssl/ssl_sess.c b/ssl/ssl_sess.c
+index cda6b7cc5b..2a5d21be79 100644
+--- a/ssl/ssl_sess.c
++++ b/ssl/ssl_sess.c
+@@ -139,8 +139,11 @@ SSL_SESSION *ssl_session_dup(SSL_SESSION *src, int ticket)
+ dest->references = 1;
+
+ dest->lock = CRYPTO_THREAD_lock_new();
+- if (dest->lock == NULL)
++ if (dest->lock == NULL) {
++ OPENSSL_free(dest);
++ dest = NULL;
+ goto err;
++ }
+
+ if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_SSL_SESSION, dest, &dest->ex_data))
+ goto err;
+--
+2.27.0
+
diff --git a/backport-Add-a-test-for-CVE-2023-3446.patch b/backport-Add-a-test-for-CVE-2023-3446.patch
new file mode 100644
index 0000000..6c5f734
--- /dev/null
+++ b/backport-Add-a-test-for-CVE-2023-3446.patch
@@ -0,0 +1,63 @@
+From 8a62fd996cb1c22383ec75b4155d54dec4a1b0ee Mon Sep 17 00:00:00 2001
+From: Matt Caswell <matt@openssl.org>
+Date: Fri, 7 Jul 2023 14:39:48 +0100
+Subject: [PATCH] Add a test for CVE-2023-3446
+
+Confirm that the only errors DH_check() finds with DH parameters with an
+excessively long modulus is that the modulus is too large. We should not
+be performing time consuming checks using that modulus.
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21451)
+
+(cherry picked from commit ede782b4c8868d1f09c9cd237f82b6f35b7dba8b)
+---
+ test/dhtest.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+diff --git a/test/dhtest.c b/test/dhtest.c
+index 7b587f3cfa..f8dd8f3aa7 100644
+--- a/test/dhtest.c
++++ b/test/dhtest.c
+@@ -73,7 +73,7 @@ static int dh_test(void)
+ goto err1;
+
+ /* check fails, because p is way too small */
+- if (!DH_check(dh, &i))
++ if (!TEST_true(DH_check(dh, &i)))
+ goto err2;
+ i ^= DH_MODULUS_TOO_SMALL;
+ if (!TEST_false(i & DH_CHECK_P_NOT_PRIME)
+@@ -124,6 +124,17 @@ static int dh_test(void)
+ /* We'll have a stale error on the queue from the above test so clear it */
+ ERR_clear_error();
+
++ /* Modulus of size: dh check max modulus bits + 1 */
++ if (!TEST_true(BN_set_word(p, 1))
++ || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)))
++ goto err3;
++
++ /*
++ * We expect no checks at all for an excessively large modulus
++ */
++ if (!TEST_false(DH_check(dh, &i)))
++ goto err3;
++
+ /*
+ * II) key generation
+ */
+@@ -138,7 +149,7 @@ static int dh_test(void)
+ goto err3;
+
+ /* ... and check whether it is valid */
+- if (!DH_check(a, &i))
++ if (!TEST_true(DH_check(a, &i)))
+ goto err3;
+ if (!TEST_false(i & DH_CHECK_P_NOT_PRIME)
+ || !TEST_false(i & DH_CHECK_P_NOT_SAFE_PRIME)
+--
+2.27.0
+
diff --git a/backport-Add-testcases-for-empty-associated-data-entries-with.patch b/backport-Add-testcases-for-empty-associated-data-entries-with.patch
new file mode 100644
index 0000000..74126e7
--- /dev/null
+++ b/backport-Add-testcases-for-empty-associated-data-entries-with.patch
@@ -0,0 +1,66 @@
+From 96318a8d21bed334d78797eca5b32790775d5f05 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 4 Jul 2023 17:50:37 +0200
+Subject: [PATCH] Add testcases for empty associated data entries with AES-SIV
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21384)
+
+(cherry picked from commit 3993bb0c0c87e3ed0ab4274e4688aa814e164cfc)
+---
+ .../30-test_evp_data/evpciph_aes_siv.txt | 31 +++++++++++++++++++
+ 1 file changed, 31 insertions(+)
+
+diff --git a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
+index a78a49158d..e434f13f41 100644
+--- a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
++++ b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt
+@@ -20,6 +20,19 @@ Tag = 85632d07c6e8f37f950acd320a2ecc93
+ Plaintext = 112233445566778899aabbccddee
+ Ciphertext = 40c02b9690c4dc04daef7f6afe5c
+
++Cipher = aes-128-siv
++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
++Tag = f1c5fdeac1f15a26779c1501f9fb7588
++Plaintext = 112233445566778899aabbccddee
++Ciphertext = 27e946c669088ab06da58c5c831c
++
++Cipher = aes-128-siv
++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff
++AAD =
++Tag = d1022f5b3664e5a4dfaf90f85be6f28a
++Plaintext = 112233445566778899aabbccddee
++Ciphertext = b66cff6b8eca0b79f083b39a0901
++
+ Cipher = aes-128-siv
+ Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
+ AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
+@@ -29,6 +42,24 @@ Tag = 7bdb6e3b432667eb06f4d14bff2fbd0f
+ Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
+ Ciphertext = cb900f2fddbe404326601965c889bf17dba77ceb094fa663b7a3f748ba8af829ea64ad544a272e9c485b62a3fd5c0d
+
++Cipher = aes-128-siv
++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
++AAD =
++AAD = 09f911029d74e35bd84156c5635688c0
++Tag = 83ce6593a8fa67eb6fcd2819cedfc011
++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
++Ciphertext = 30d937b42f71f71f93fc2d8d702d3eac8dc7651eefcd81120081ff29d626f97f3de17f2969b691c91b69b652bf3a6d
++
++Cipher = aes-128-siv
++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f
++AAD =
++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100
++AAD = 09f911029d74e35bd84156c5635688c0
++Tag = 77dd4a44f5a6b41302121ee7f378de25
++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553
++Ciphertext = 0fcd664c922464c88939d71fad7aefb864e501b0848a07d39201c1067a7288f3dadf0131a823a0bc3d588e8564a5fe
++
+ Cipher = aes-192-siv
+ Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfefffffefdfcfbfaf9f8f7f6f5f4f3f2f1f0
+ AAD = 101112131415161718191a1b1c1d1e1f2021222324252627
+--
+2.27.0
+
diff --git a/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
new file mode 100644
index 0000000..13ad1a2
--- /dev/null
+++ b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
@@ -0,0 +1,61 @@
+From 9002fd07327a91f35ba6c1307e71fa6fd4409b7f Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 25 Jul 2023 15:22:48 +0200
+Subject: [PATCH] DH_check(): Do not try checking q properties if it is
+ obviously invalid
+
+If |q| >= |p| then the q value is obviously wrong, as q is supposed
+to be a prime divisor of p-1.
+
+Because p is already checked against an upper size limit, this added
+test also ensures that q is not overly large when the subsequent
+tests use that q value.
+
+Otherwise, if q were too large, the additional checks on it, such as
+the primality test, could trigger a DoS through overly long
+computations.
+
+Fixes CVE-2023-3817
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Todd Short <todd.short@me.com>
+(Merged from https://github.com/openssl/openssl/pull/21550)
+
+(cherry picked from commit 1c16253f3c3a8d1e25918c3f404aae6a5b0893de)
+(cherry picked from commit 6a1eb62c29db6cb5eec707f9338aee00f44e26f5)
+---
+ crypto/dh/dh_check.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index aef6f9b1b7..fbe2797569 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -143,7 +143,7 @@ int DH_check(const DH *dh, int *ret)
+ #ifdef FIPS_MODULE
+ return DH_check_params(dh, ret);
+ #else
+- int ok = 0, r;
++ int ok = 0, r, q_good = 0;
+ BN_CTX *ctx = NULL;
+ BIGNUM *t1 = NULL, *t2 = NULL;
+ int nid = DH_get_nid((DH *)dh);
+@@ -172,6 +172,13 @@ int DH_check(const DH *dh, int *ret)
+ goto err;
+
+ if (dh->params.q != NULL) {
++ if (BN_ucmp(dh->params.p, dh->params.q) > 0)
++ q_good = 1;
++ else
++ *ret |= DH_CHECK_INVALID_Q_VALUE;
++ }
++
++ if (q_good) {
+ if (BN_cmp(dh->params.g, BN_value_one()) <= 0)
+ *ret |= DH_NOT_SUITABLE_GENERATOR;
+ else if (BN_cmp(dh->params.g, dh->params.p) >= 0)
+--
+2.27.0
+
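The hardened check above can be exercised from application code much like the dhtest.c change further below does. The following is a minimal sketch, not part of the backport, that uses the deprecated low-level DH API (hence OPENSSL_SUPPRESS_DEPRECATED); the 512-bit parameter size and the compressed error handling are illustrative only.

    #define OPENSSL_SUPPRESS_DEPRECATED
    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/dh.h>

    int main(void)
    {
        DH *dh = DH_new();
        const BIGNUM *p = NULL;
        BIGNUM *q = NULL;
        int flags = 0, rv;

        /* Generate ordinary p/g parameters, then install a bogus q = p + 1 */
        if (dh == NULL || !DH_generate_parameters_ex(dh, 512, DH_GENERATOR_2, NULL))
            return 1;
        DH_get0_pqg(dh, &p, NULL, NULL);
        q = BN_dup(p);
        if (q == NULL || !BN_add_word(q, 1)
                || !DH_set0_pqg(dh, NULL, q, NULL))   /* dh takes ownership of q */
            return 1;

        /* q >= p is reported via the flag word and the costly q checks are skipped */
        rv = DH_check(dh, &flags);
        printf("DH_check() = %d, DH_CHECK_INVALID_Q_VALUE %s\n",
               rv, (flags & DH_CHECK_INVALID_Q_VALUE) ? "set" : "clear");

        DH_free(dh);
        return 0;
    }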
diff --git a/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
new file mode 100644
index 0000000..98b1a0b
--- /dev/null
+++ b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
@@ -0,0 +1,57 @@
+From 00e2f5eea29994d19293ec4e8c8775ba73678598 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 4 Jul 2023 17:30:35 +0200
+Subject: [PATCH] Do not ignore empty associated data with AES-SIV mode
+
+The AES-SIV mode allows multiple associated data items, each
+authenticated separately, and any of them may be of zero length.
+
+The provided implementation ignores such empty associated data,
+which is incorrect with regard to RFC 5297 and is also a security
+issue, because the empty items then remain unauthenticated even
+though an application expects them to be authenticated.
+
+Fixes CVE-2023-2975
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21384)
+
+(cherry picked from commit c426c281cfc23ab182f7d7d7a35229e7db1494d9)
+---
+ .../implementations/ciphers/cipher_aes_siv.c | 18 +++++++++++-------
+ 1 file changed, 11 insertions(+), 7 deletions(-)
+
+diff --git a/providers/implementations/ciphers/cipher_aes_siv.c b/providers/implementations/ciphers/cipher_aes_siv.c
+index 45010b90db..b396c8651a 100644
+--- a/providers/implementations/ciphers/cipher_aes_siv.c
++++ b/providers/implementations/ciphers/cipher_aes_siv.c
+@@ -120,14 +120,18 @@ static int siv_cipher(void *vctx, unsigned char *out, size_t *outl,
+ if (!ossl_prov_is_running())
+ return 0;
+
+- if (inl == 0) {
+- *outl = 0;
+- return 1;
+- }
++ /* Ignore just empty encryption/decryption call and not AAD. */
++ if (out != NULL) {
++ if (inl == 0) {
++ if (outl != NULL)
++ *outl = 0;
++ return 1;
++ }
+
+- if (outsize < inl) {
+- ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
+- return 0;
++ if (outsize < inl) {
++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL);
++ return 0;
++ }
+ }
+
+ if (ctx->hw->cipher(ctx, out, in, inl) <= 0)
+--
+2.27.0
+
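For context, the way an application feeds several associated data items, including an empty one, into AES-SIV through the EVP interface is sketched below. This is not part of the patch; the key and data values are placeholders and error handling is compressed into one chain. With the fix, the zero-length item contributes to the S2V computation, and therefore to the tag, instead of being skipped.

    #include <stdio.h>
    #include <openssl/evp.h>

    int main(void)
    {
        unsigned char key[32] = {0};                /* AES-128-SIV takes a 32-byte key */
        unsigned char aad[] = "header", empty[1] = {0};
        unsigned char pt[] = "attack at dawn";
        unsigned char ct[sizeof(pt)], tag[16];
        int len = 0, ok = 0;
        EVP_CIPHER *siv = EVP_CIPHER_fetch(NULL, "AES-128-SIV", NULL);
        EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();

        if (siv != NULL && ctx != NULL
                && EVP_EncryptInit_ex(ctx, siv, NULL, key, NULL)
                /* every out == NULL update supplies one separate AAD item */
                && EVP_EncryptUpdate(ctx, NULL, &len, aad, sizeof(aad) - 1)
                && EVP_EncryptUpdate(ctx, NULL, &len, empty, 0)   /* empty AAD item */
                && EVP_EncryptUpdate(ctx, ct, &len, pt, sizeof(pt) - 1)
                && EVP_EncryptFinal_ex(ctx, ct + len, &len)
                && EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, sizeof(tag), tag))
            ok = 1;

        printf("AES-SIV encryption %s\n", ok ? "succeeded" : "failed");
        EVP_CIPHER_CTX_free(ctx);
        EVP_CIPHER_free(siv);
        return ok ? 0 : 1;
    }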
diff --git a/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
new file mode 100644
index 0000000..53ddf3b
--- /dev/null
+++ b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
@@ -0,0 +1,74 @@
+From 1fa20cf2f506113c761777127a38bce5068740eb Mon Sep 17 00:00:00 2001
+From: Matt Caswell <matt@openssl.org>
+Date: Thu, 6 Jul 2023 16:36:35 +0100
+Subject: [PATCH] Fix DH_check() excessive time with over sized modulus
+
+The DH_check() function checks numerous aspects of the key or parameters
+that have been supplied. Some of those checks use the supplied modulus
+value even if it is excessively large.
+
+There is already a maximum DH modulus size (10,000 bits) over which
+OpenSSL will not generate or derive keys. DH_check() will, however, still
+perform various validity tests on such a large modulus. We introduce a
+new maximum (32,768 bits) over which DH_check() simply fails.
+
+An application that calls DH_check() and supplies a key or parameters
+obtained from an untrusted source could be vulnerable to a Denial of
+Service attack.
+
+The function DH_check() is itself called by a number of other OpenSSL
+functions. An application calling any of those other functions may
+similarly be affected. The other functions affected by this are
+DH_check_ex() and EVP_PKEY_param_check().
+
+CVE-2023-3446
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21451)
+
+(cherry picked from commit 9e0094e2aa1b3428a12d5095132f133c078d3c3d)
+---
+ crypto/dh/dh_check.c | 6 ++++++
+ include/openssl/dh.h | 6 +++++-
+ 2 files changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index 0b391910d6..84a926998e 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -152,6 +152,12 @@ int DH_check(const DH *dh, int *ret)
+ if (nid != NID_undef)
+ return 1;
+
++ /* Don't do any checks at all with an excessively large modulus */
++ if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) {
++ ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE);
++ return 0;
++ }
++
+ if (!DH_check_params(dh, ret))
+ return 0;
+
+diff --git a/include/openssl/dh.h b/include/openssl/dh.h
+index b97871eca7..36420f51d8 100644
+--- a/include/openssl/dh.h
++++ b/include/openssl/dh.h
+@@ -89,7 +89,11 @@ int EVP_PKEY_CTX_get0_dh_kdf_ukm(EVP_PKEY_CTX *ctx, unsigned char **ukm);
+ # include <openssl/dherr.h>
+
+ # ifndef OPENSSL_DH_MAX_MODULUS_BITS
+-# define OPENSSL_DH_MAX_MODULUS_BITS 10000
++# define OPENSSL_DH_MAX_MODULUS_BITS 10000
++# endif
++
++# ifndef OPENSSL_DH_CHECK_MAX_MODULUS_BITS
++# define OPENSSL_DH_CHECK_MAX_MODULUS_BITS 32768
+ # endif
+
+ # define OPENSSL_DH_FIPS_MIN_MODULUS_BITS 1024
+--
+2.27.0
+
diff --git a/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
new file mode 100644
index 0000000..91e9417
--- /dev/null
+++ b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
@@ -0,0 +1,39 @@
+From e648db50d9a63f71cab5cb78424c2932d019a744 Mon Sep 17 00:00:00 2001
+From: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Date: Sun, 23 Jul 2023 14:27:54 +0200
+Subject: [PATCH] Make DH_check set some error bits in recently added error
+
+The pre-existing error cases where DH_check returned zero are not
+related to the DH parameters at all; they are only triggered by
+out-of-memory errors, so leaving *ret set to zero is appropriate
+for them. The new error case, however, is triggered by an overly
+large p value, which is a different situation.
+On the other hand, some callers of this function might not handle
+the return value correctly and might rely only on *ret.
+Therefore we set some error bits in *ret as an additional safety
+measure.
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/21524)
+
+(cherry picked from commit 81d10e61a4b7d5394d08a718bf7d6bae20e818fc)
+---
+ crypto/dh/dh_check.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c
+index 84a926998e..aef6f9b1b7 100644
+--- a/crypto/dh/dh_check.c
++++ b/crypto/dh/dh_check.c
+@@ -155,6 +155,7 @@ int DH_check(const DH *dh, int *ret)
+ /* Don't do any checks at all with an excessively large modulus */
+ if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) {
+ ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE);
++ *ret = DH_MODULUS_TOO_LARGE | DH_CHECK_P_NOT_PRIME;
+ return 0;
+ }
+
+--
+2.27.0
+
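Taken together, the two DH_check() patches above change what an application sees when it validates parameters with an oversized modulus: the call now fails outright and also reports the problem through the flag word. A minimal sketch, not part of the patches, using an artificial p one bit over the new limit:

    #define OPENSSL_SUPPRESS_DEPRECATED
    #include <stdio.h>
    #include <openssl/bn.h>
    #include <openssl/dh.h>

    int main(void)
    {
        DH *dh = DH_new();
        BIGNUM *p = BN_new(), *g = BN_new();
        int flags = 0, rv;

        /* p = 2^OPENSSL_DH_CHECK_MAX_MODULUS_BITS, i.e. one bit over the limit */
        if (dh == NULL || p == NULL || g == NULL
                || !BN_set_word(p, 1)
                || !BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)
                || !BN_set_word(g, 2)
                || !DH_set0_pqg(dh, p, NULL, g))    /* dh takes ownership of p and g */
            return 1;

        rv = DH_check(dh, &flags);    /* fails fast: no expensive checks are run */
        printf("DH_check() = %d, DH_MODULUS_TOO_LARGE %s, DH_CHECK_P_NOT_PRIME %s\n",
               rv,
               (flags & DH_MODULUS_TOO_LARGE) ? "set" : "clear",
               (flags & DH_CHECK_P_NOT_PRIME) ? "set" : "clear");

        DH_free(dh);
        return 0;
    }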
diff --git a/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
new file mode 100644
index 0000000..d5d7890
--- /dev/null
+++ b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
@@ -0,0 +1,53 @@
+From 2255f6c74e6c8b702adcf352b04c5d3e6c759745 Mon Sep 17 00:00:00 2001
+From: Tomas Mraz <tomas@openssl.org>
+Date: Tue, 25 Jul 2023 15:23:43 +0200
+Subject: [PATCH] dhtest.c: Add test of DH_check() with q = p + 1
+
+This must fail with DH_CHECK_INVALID_Q_VALUE and
+with DH_CHECK_Q_NOT_PRIME unset.
+
+Reviewed-by: Matt Caswell <matt@openssl.org>
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com>
+Reviewed-by: Todd Short <todd.short@me.com>
+(Merged from https://github.com/openssl/openssl/pull/21550)
+
+(cherry picked from commit ad5d35572695d7b5748b2bd4fb1afaa189b29e28)
+(cherry picked from commit 1478ffad3f123550ec1014642d5c880dfbe270ef)
+---
+ test/dhtest.c | 12 ++++++++++++
+ 1 file changed, 12 insertions(+)
+
+diff --git a/test/dhtest.c b/test/dhtest.c
+index f8dd8f3aa7..d02b3b7c58 100644
+--- a/test/dhtest.c
++++ b/test/dhtest.c
+@@ -124,6 +124,15 @@ static int dh_test(void)
+ /* We'll have a stale error on the queue from the above test so clear it */
+ ERR_clear_error();
+
++ if (!TEST_ptr(BN_copy(q, p)) || !TEST_true(BN_add(q, q, BN_value_one())))
++ goto err3;
++
++ if (!TEST_true(DH_check(dh, &i)))
++ goto err3;
++ if (!TEST_true(i & DH_CHECK_INVALID_Q_VALUE)
++ || !TEST_false(i & DH_CHECK_Q_NOT_PRIME))
++ goto err3;
++
+ /* Modulus of size: dh check max modulus bits + 1 */
+ if (!TEST_true(BN_set_word(p, 1))
+ || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS)))
+@@ -135,6 +144,9 @@ static int dh_test(void)
+ if (!TEST_false(DH_check(dh, &i)))
+ goto err3;
+
++ /* We'll have a stale error on the queue from the above test so clear it */
++ ERR_clear_error();
++
+ /*
+ * II) key generation
+ */
+--
+2.27.0
+
diff --git a/openssl-3.0-build.patch b/openssl-3.0-build.patch
new file mode 100644
index 0000000..83243e1
--- /dev/null
+++ b/openssl-3.0-build.patch
@@ -0,0 +1,38 @@
+From 262bff1615d4461120327c5a9fe904ad1c6ce813 Mon Sep 17 00:00:00 2001
+From: hzero1996 <wangcheng156@huawei.com>
+Date: Sun, 29 Jan 2023 14:53:03 +0800
+Subject: [PATCH] openssl-3.0-build
+
+---
+ Configurations/10-main.conf | 1 +
+ Configurations/unix-Makefile.tmpl | 2 +-
+ 2 files changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf
+index b578a3c..1ad81c3 100644
+--- a/Configurations/10-main.conf
++++ b/Configurations/10-main.conf
+@@ -772,6 +772,7 @@ my %targets = (
+ inherit_from => [ "linux-generic64" ],
+ asm_arch => 'aarch64',
+ perlasm_scheme => "linux64",
++ multilib => "64",
+ },
+ "linux-arm64ilp32" => { # https://wiki.linaro.org/Platform/arm64-ilp32
+ inherit_from => [ "linux-generic32" ],
+diff --git a/Configurations/unix-Makefile.tmpl b/Configurations/unix-Makefile.tmpl
+index 110ba06..712a779 100644
+--- a/Configurations/unix-Makefile.tmpl
++++ b/Configurations/unix-Makefile.tmpl
+@@ -611,7 +611,7 @@ install_sw: install_dev install_engines install_modules install_runtime
+
+ uninstall_sw: uninstall_runtime uninstall_modules uninstall_engines uninstall_dev
+
+-install_docs: install_man_docs install_html_docs
++install_docs: install_man_docs
+
+ uninstall_docs: uninstall_man_docs uninstall_html_docs
+ $(RM) -r $(DESTDIR)$(DOCDIR)
+--
+2.27.0
+
diff --git a/openssl.spec b/openssl.spec
new file mode 100644
index 0000000..c51ad86
--- /dev/null
+++ b/openssl.spec
@@ -0,0 +1,94 @@
+%define install_prefix /opt/openssl3
+%define soversion 3
+Name: openssl3
+Epoch: 1
+Version: 3.0.9
+Release: 1
+Summary: Cryptography and SSL/TLS Toolkit
+License: Apache-2.0
+URL: https://www.openssl.org/
+Source0: https://www.openssl.org/source/openssl-%{version}.tar.gz
+Source1: Makefile.certificate
+
+Patch1: openssl-3.0-build.patch
+Patch2: Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch
+Patch3: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
+Patch4: Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch
+Patch5: Backport-providers-Add-SM4-GCM-implementation.patch
+Patch6: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch
+Patch7: Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch
+Patch8: Backport-SM4-optimization-for-ARM-by-ASIMD.patch
+Patch9: Backport-providers-Add-SM4-XTS-implementation.patch
+Patch10: Backport-Fix-SM4-CBC-regression-on-Armv8.patch
+Patch11: Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch
+Patch12: Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch
+Patch13: Backport-SM4-AESE-optimization-for-ARMv8.patch
+Patch14: Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch
+Patch15: backport-Add-testcases-for-empty-associated-data-entries-with.patch
+Patch16: backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch
+Patch17: backport-Add-a-test-for-CVE-2023-3446.patch
+Patch18: backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch
+Patch19: backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch
+Patch20: backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch
+Patch21: backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch
+Patch22: Backport-support-decode-SM2-parameters.patch
+Patch23: Feature-support-SM2-CMS-signature.patch
+Patch24: Feature-use-default-id-if-SM2-id-is-not-set.patch
+Patch25: backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch
+
+BuildRequires: gcc gcc-c++ perl make lksctp-tools-devel coreutils util-linux zlib-devel
+Requires: coreutils
+
+%description
+OpenSSL is a robust, commercial-grade, and full-featured toolkit for the
+Transport Layer Security (TLS) and Secure Sockets Layer (SSL) protocols.
+
+%prep
+%autosetup -n openssl-%{version} -p1
+
+%build
+
+sslarch=%{_os}-%{_target_cpu}
+%ifarch i686
+sslarch=linux-elf
+%endif
+%ifarch riscv64
+sslarch=%{_os}64-%{_target_cpu}
+%endif
+
+%ifarch x86_64 aarch64
+sslflags=enable-ec_nistp_64_gcc_128
+%endif
+
+RPM_OPT_FLAGS="$RPM_OPT_FLAGS -Wa,--noexecstack -Wa,--generate-missing-build-notes=yes -DPURIFY $RPM_LD_FLAGS"
+./Configure \
+ --prefix=%{install_prefix} -Wl,-rpath,%{install_prefix}/lib ${sslflags} \
+ zlib enable-camellia enable-seed enable-rfc3779 \
+ enable-cms enable-md2 enable-rc5 ${ktlsopt} enable-fips\
+ no-mdc2 no-ec2m enable-sm2 enable-sm4 enable-buildtest-c++\
+ shared ${sslarch} $RPM_OPT_FLAGS '-DDEVRANDOM="\"/dev/urandom\""' \
+ -Wl,--allow-multiple-definition
+
+
+%make_build all
+
+%install
+# Install OpenSSL.
+#install -d $RPM_BUILD_ROOT{%{_bindir},%{_includedir},%{_libdir},%{_mandir},%{_libdir}/openssl,%{_pkgdocdir}}
+
+%make_install
+
+rm -f %{buildroot}%{install_prefix}{/bin/c_rehash,/ssl/misc/tsget*,/ssl/misc/*.pl}
+
+export QA_RPATHS=$(( 0x0002 ))
+
+%check
+%make_build test
+
+%files
+%license LICENSE.txt
+%{install_prefix}
+
+%changelog
+* Mon Oct 02 2023 Funda Wang <fundawang@yeah.net> - 3.0.9-1
+- Try install into /opt
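Because everything lands under the non-default /opt/openssl3 prefix, consumers have to point both the compiler and the runtime linker at it themselves. Below is a minimal sketch of such a consumer with an assumed compile line; the actual libdir is lib or lib64 depending on the multilib setting patched in above, and the paths simply follow the %{install_prefix} macro.

    /*
     * Hypothetical compile line for this sketch (paths follow %{install_prefix}):
     *   gcc osslver.c -I/opt/openssl3/include -L/opt/openssl3/lib64 \
     *       -Wl,-rpath,/opt/openssl3/lib64 -lcrypto -o osslver
     */
    #include <stdio.h>
    #include <openssl/crypto.h>
    #include <openssl/opensslv.h>

    int main(void)
    {
        /* Header version this was built against vs. the library actually loaded */
        printf("built against: %s\n", OPENSSL_VERSION_TEXT);
        printf("running with : %s\n", OpenSSL_version(OPENSSL_VERSION));
        return 0;
    }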
diff --git a/sources b/sources
new file mode 100644
index 0000000..077b1dd
--- /dev/null
+++ b/sources
@@ -0,0 +1 @@
+8b2aff668b8ce0da24b9505ebfd26b4d openssl-3.0.9.tar.gz