| author | CoprDistGit <infra@openeuler.org> | 2023-10-02 03:32:16 +0000 |
|---|---|---|
| committer | CoprDistGit <infra@openeuler.org> | 2023-10-02 03:32:16 +0000 |
| commit | e879981f405f8810d1b0d9c1c77aea3e8be6a469 (patch) | |
| tree | 8698c9791c9e77d3be587c5c7ad9d43dce7c6d30 | |
| parent | 80d0cbc46bb935a925d434060b67c794844558d9 (diff) | |
automatic import of openssl3 (openeuler22.03_LTS_SP2, openeuler22.03_LTS, openeuler20.03_LTS_SP1, openeuler20.03)
29 files changed, 9493 insertions, 0 deletions
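Several of the SM4-XTS patches below (`compute_tweak`, `compute_tweak_vec`, and the `.Lxts_magic` constant) implement per-block tweak doubling in GF(2^128) with the 0x87 reduction. As a reading aid, here is a minimal C sketch of that doubling; it is illustrative only, not part of the imported patches, and the function name `xts_tweak_double` is made up for the example.

```c
#include <stdint.h>

/*
 * Illustrative sketch (not from the patches): multiply an XTS tweak by x in
 * GF(2^128) using the x^128 + x^7 + x^2 + x + 1 reduction (0x87), as in
 * IEEE P1619.  The compute_tweak/compute_tweak_vec routines in the vpsm4
 * patches below perform the same doubling on scalar and NEON registers;
 * the "_gb" variants additionally bit-reverse the bytes, which appears to
 * correspond to the GB/T tweak convention for SM4-XTS.
 */
static void xts_tweak_double(uint8_t t[16])
{
    uint8_t carry = 0;

    for (int i = 0; i < 16; i++) {      /* little-endian 128-bit left shift */
        uint8_t c = t[i] >> 7;
        t[i] = (uint8_t)((t[i] << 1) | carry);
        carry = c;
    }
    if (carry)                          /* reduce mod x^128 + x^7 + x^2 + x + 1 */
        t[0] ^= 0x87;
}
```

In the assembly, the eight live tweak register pairs (`@twx`) in the `.Lxts_8_blocks_process` loop correspond to eight consecutive such doublings, so eight blocks can be masked per iteration.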
@@ -0,0 +1 @@ +/openssl-3.0.9.tar.gz diff --git a/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch new file mode 100644 index 0000000..6536ed5 --- /dev/null +++ b/Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch @@ -0,0 +1,74 @@ +From 06f13f85ee86cd7fbc546060fbe2d077176b0be4 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Mon, 31 Oct 2022 11:28:15 +0800 +Subject: [PATCH 11/13] Apply SM4 optimization patch to Kunpeng-920 + +In the ideal scenario, performance can reach up to 2.2X. +But in single block input or CFB/OFB mode, CBC encryption, +performance could drop about 50%. + +Perf data on Kunpeng-920 2.6GHz hardware, before and after optimization: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 75318.96k 79089.62k 79736.15k 79934.12k 80325.44k 80068.61k +SM4-ECB 80211.39k 84998.36k 86472.28k 87024.93k 87144.80k 86862.51k +SM4-GCM 72156.19k 82012.08k 83848.02k 84322.65k 85103.65k 84896.43k +SM4-CBC 77956.13k 80638.81k 81976.17k 81606.31k 82078.91k 81750.70k +SM4-CFB 78078.20k 81054.87k 81841.07k 82396.38k 82203.99k 82236.76k +SM4-OFB 78282.76k 82074.03k 82765.74k 82989.06k 83200.68k 83487.17k + +After: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-CTR 35678.07k 120687.25k 176632.27k 177192.62k 177586.18k 178295.18k +SM4-ECB 35540.32k 122628.07k 175067.90k 178007.84k 178298.88k 178328.92k +SM4-GCM 34215.75k 116720.50k 170275.16k 171770.88k 172714.21k 172272.30k +SM4-CBC 35645.60k 36544.86k 36515.50k 36732.15k 36618.24k 36629.16k +SM4-CFB 35528.14k 35690.99k 35954.86k 35843.42k 35809.18k 35809.96k +SM4-OFB 35563.55k 35853.56k 35963.05k 36203.52k 36233.85k 36307.82k + +Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> + +Reviewed-by: Hugo Landau <hlandau@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19547) +--- + crypto/arm_arch.h | 4 ++++ + include/crypto/sm4_platform.h | 3 ++- + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 5b5af31d92..c10748e5f8 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -98,9 +98,13 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; + */ + + # define ARM_CPU_IMP_ARM 0x41 ++# define HISI_CPU_IMP 0x48 + + # define ARM_CPU_PART_CORTEX_A72 0xD08 + # define ARM_CPU_PART_N1 0xD0C ++# define ARM_CPU_PART_V1 0xD40 ++# define ARM_CPU_PART_N2 0xD49 ++# define HISI_CPU_PART_KP920 0xD01 + + # define MIDR_PARTNUM_SHIFT 4 + # define MIDR_PARTNUM_MASK (0xfffU << MIDR_PARTNUM_SHIFT) +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 11f9b9d88b..15d8abbcb1 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -20,7 +20,8 @@ static inline int vpsm4_capable(void) + { + return (OPENSSL_armcap_P & ARMV8_CPUID) && + (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || +- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) || ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); + } + # if defined(VPSM4_ASM) + # define VPSM4_CAPABLE vpsm4_capable() +-- +2.37.3.windows.1 + diff --git a/Backport-Fix-SM4-CBC-regression-on-Armv8.patch b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch new file mode 100644 index 0000000..2176932 --- /dev/null +++ b/Backport-Fix-SM4-CBC-regression-on-Armv8.patch @@ 
-0,0 +1,60 @@ +From d7d5490d7201dcfb1f3811ad1bfc57ed9b2c0b77 Mon Sep 17 00:00:00 2001 +From: "fangming.fang" <fangming.fang@arm.com> +Date: Thu, 8 Dec 2022 10:46:27 +0000 +Subject: [PATCH 09/13] Fix SM4-CBC regression on Armv8 + +Fixes #19858 + +During decryption, the last ciphertext is not fed to next block +correctly when the number of input blocks is exactly 4. Fix this +and add the corresponding test cases. + +Thanks xu-yi-zhou for reporting this issue and proposing the fix. + +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19872) +--- + crypto/sm4/asm/vpsm4-armv8.pl | 2 +- + test/recipes/30-test_evp_data/evpciph_sm4.txt | 12 ++++++++++++ + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +index 095d9dae64..c842ef61d5 100755 +--- a/crypto/sm4/asm/vpsm4-armv8.pl ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -880,7 +880,7 @@ $code.=<<___; + subs $blocks,$blocks,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV +- st1 {@vtmp[3].16b}, [$ivp] ++ st1 {@data[3].16b}, [$ivp] + b 100f + 1: // last block + subs $blocks,$blocks,#1 +diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt +index 9fb16ca15c..e9a98c9898 100644 +--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt ++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt +@@ -19,6 +19,18 @@ IV = 0123456789ABCDEFFEDCBA9876543210 + Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 + Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3B + ++Cipher = SM4-CBC ++Key = 0123456789ABCDEFFEDCBA9876543210 ++IV = 0123456789ABCDEFFEDCBA9876543210 ++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 ++Ciphertext = 2677F46B09C122CC975533105BD4A22AF6125F7275CE552C3A2BBCF533DE8A3BFFF5A4F208092C0901BA02D5772977369915E3FA2356C9F4EB6460ECC457E7f8E3CFA3DEEBFE9883E3A48BCF7C4A11AA3EC9E0D317C5D319BE72A5CDDDEC640C ++ ++Cipher = SM4-CBC ++Key = 0123456789ABCDEFFEDCBA9876543210 ++IV = 0123456789ABCDEFFEDCBA9876543210 ++Plaintext = 0123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA98765432100123456789ABCDEFFEDCBA9876543210 ++Ciphertext = 2677f46b09c122cc975533105bd4a22af6125f7275ce552c3a2bbcf533de8a3bfff5a4f208092c0901ba02d5772977369915e3fa2356c9f4eb6460ecc457e7f8e3cfa3deebfe9883e3a48bcf7c4a11aa3ec9e0d317c5d319be72a5cdddec640c6fc70bfa3ddaafffdd7c09b2774dcb2cec29f0c6f0b6773e985b3e395e924238505a8f120d9ca84de5c3cf7e45f097b14b3a46c5b1068669982a5c1f5f61be291b984f331d44ffb2758f771672448fc957fa1416c446427a41e25d5524a2418b9d96b2f17582f0f1aa9c204c6807f54f7b6833c5f00856659ddabc245936868c ++ + Cipher = SM4-OFB + Key = 0123456789ABCDEFFEDCBA9876543210 + IV = 0123456789ABCDEFFEDCBA9876543210 +-- +2.37.3.windows.1 + diff --git a/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch new file mode 100644 index 0000000..5bfd186 --- /dev/null +++ 
b/Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch @@ -0,0 +1,87 @@ +From 6df7707fb22e8bd1c7d778a2041c1403f9852060 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Fri, 3 Feb 2023 15:59:59 +0800 +Subject: [PATCH 13/13] Fix SM4-XTS build failure on Mac mini M1 + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Richard Levitte <levitte@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/20202) +--- + crypto/sm4/asm/vpsm4-armv8.pl | 4 +++- + crypto/sm4/asm/vpsm4_ex-armv8.pl | 23 ++++++++++++++++------- + 2 files changed, 19 insertions(+), 8 deletions(-) + +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +index e19de30901..d30e78f3ce 100755 +--- a/crypto/sm4/asm/vpsm4-armv8.pl ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -524,7 +524,7 @@ sub compute_tweak_vec() { + my $std = shift; + &rbit(@vtmp[2],$src,$std); + $code.=<<___; +- ldr @qtmp[0], =0x01010101010101010101010101010187 ++ ldr @qtmp[0], .Lxts_magic + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 +@@ -572,6 +572,8 @@ _vpsm4_consts: + .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 + .Lshuffles: + .dword 0x0B0A090807060504,0x030201000F0E0D0C ++.Lxts_magic: ++ .dword 0x0101010101010187,0x0101010101010101 + + .size _vpsm4_consts,.-_vpsm4_consts + ___ +diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl +index 3d094aa535..f2d5b6debf 100644 +--- a/crypto/sm4/asm/vpsm4_ex-armv8.pl ++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl +@@ -475,12 +475,12 @@ sub load_sbox () { + my $data = shift; + + $code.=<<___; +- ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 +- ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 +- ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 +- ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 +- ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b +- ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++ ldr $MaskQ, .Lsbox_magic ++ ldr $TAHMatQ, .Lsbox_magic+16 ++ ldr $TALMatQ, .Lsbox_magic+32 ++ ldr $ATAHMatQ, .Lsbox_magic+48 ++ ldr $ATALMatQ, .Lsbox_magic+64 ++ ldr $ANDMaskQ, .Lsbox_magic+80 + ___ + } + +@@ -525,7 +525,7 @@ sub compute_tweak_vec() { + my $std = shift; + &rbit(@vtmp[2],$src,$std); + $code.=<<___; +- ldr @qtmp[0], =0x01010101010101010101010101010187 ++ ldr @qtmp[0], .Lxts_magic + shl $des.16b, @vtmp[2].16b, #1 + ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 + ushr @vtmp[1].16b, @vtmp[1].16b, #7 +@@ -556,6 +556,15 @@ _${prefix}_consts: + .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 + .Lshuffles: + .dword 0x0B0A090807060504,0x030201000F0E0D0C ++.Lxts_magic: ++ .dword 0x0101010101010187,0x0101010101010101 ++.Lsbox_magic: ++ .dword 0x0b0e0104070a0d00,0x0306090c0f020508 ++ .dword 0x62185a2042387a00,0x22581a6002783a40 ++ .dword 0x15df62a89e54e923,0xc10bb67c4a803df7 ++ .dword 0xb9aa6b78c1d21300,0x1407c6d56c7fbead ++ .dword 0x6404462679195b3b,0xe383c1a1fe9edcbc ++ .dword 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f + + .size _${prefix}_consts,.-_${prefix}_consts + ___ +-- +2.37.3.windows.1 + diff --git a/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch new file mode 100644 index 0000000..485fd65 --- /dev/null +++ b/Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch @@ -0,0 +1,207 @@ +From b8f24cb95dbe70cbeef08b41f35018141b6ce994 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Thu, 15 Dec 
2022 10:21:07 +0800 +Subject: [PATCH 10/13] Fix SM4 test failures on big-endian ARM processors + +Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> + +Reviewed-by: Paul Yang <kaishen.yy@antfin.com> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19910) +--- + crypto/sm4/asm/vpsm4-armv8.pl | 52 +++++++++++++++++------------------ + 1 file changed, 26 insertions(+), 26 deletions(-) + +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +index c842ef61d5..73797af582 100755 +--- a/crypto/sm4/asm/vpsm4-armv8.pl ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -45,7 +45,7 @@ sub rev32() { + + if ($src and ("$src" ne "$dst")) { + $code.=<<___; +-#ifndef __ARMEB__ ++#ifndef __AARCH64EB__ + rev32 $dst.16b,$src.16b + #else + mov $dst.16b,$src.16b +@@ -53,7 +53,7 @@ $code.=<<___; + ___ + } else { + $code.=<<___; +-#ifndef __ARMEB__ ++#ifndef __AARCH64EB__ + rev32 $dst.16b,$dst.16b + #endif + ___ +@@ -428,10 +428,10 @@ sub load_sbox () { + + $code.=<<___; + adr $ptr,.Lsbox +- ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64 +- ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64 +- ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64 +- ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr] ++ ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64 ++ ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64 ++ ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64 ++ ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr] + ___ + } + +@@ -492,9 +492,9 @@ ___ + &rev32($vkey,$vkey); + $code.=<<___; + adr $pointer,.Lshuffles +- ld1 {$vmap.4s},[$pointer] ++ ld1 {$vmap.2d},[$pointer] + adr $pointer,.Lfk +- ld1 {$vfk.4s},[$pointer] ++ ld1 {$vfk.2d},[$pointer] + eor $vkey.16b,$vkey.16b,$vfk.16b + mov $schedules,#32 + adr $pointer,.Lck +@@ -615,7 +615,7 @@ $code.=<<___; + .align 5 + ${prefix}_${dir}crypt: + AARCH64_VALID_CALL_TARGET +- ld1 {@data[0].16b},[$inp] ++ ld1 {@data[0].4s},[$inp] + ___ + &load_sbox(); + &rev32(@data[0],@data[0]); +@@ -624,7 +624,7 @@ $code.=<<___; + ___ + &encrypt_1blk(@data[0]); + $code.=<<___; +- st1 {@data[0].16b},[$outp] ++ st1 {@data[0].4s},[$outp] + ret + .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt + ___ +@@ -692,12 +692,12 @@ $code.=<<___; + cmp $blocks,#1 + b.lt 100f + b.gt 1f +- ld1 {@data[0].16b},[$inp] ++ ld1 {@data[0].4s},[$inp] + ___ + &rev32(@data[0],@data[0]); + &encrypt_1blk(@data[0]); + $code.=<<___; +- st1 {@data[0].16b},[$outp] ++ st1 {@data[0].4s},[$outp] + b 100f + 1: // process last 2 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 +@@ -798,11 +798,11 @@ ___ + &rev32($ivec0,$ivec0); + &encrypt_1blk($ivec0); + $code.=<<___; +- st1 {$ivec0.16b},[$outp],#16 ++ st1 {$ivec0.4s},[$outp],#16 + b 1b + 2: + // save back IV +- st1 {$ivec0.16b},[$ivp] ++ st1 {$ivec0.4s},[$ivp] + ret + + .Ldec: +@@ -834,7 +834,7 @@ ___ + &transpose(@vtmp,@datax); + &transpose(@data,@datax); + $code.=<<___; +- ld1 {$ivec1.16b},[$ivp] ++ ld1 {$ivec1.4s},[$ivp] + ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 + // note ivec1 and vtmpx[3] are resuing the same register + // care needs to be taken to avoid conflict +@@ -844,7 +844,7 @@ $code.=<<___; + eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b + eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b + // save back IV +- st1 {$vtmpx[3].16b}, [$ivp] ++ st1 {$vtmpx[3].4s}, [$ivp] + eor @data[0].16b,@data[0].16b,$datax[3].16b + eor 
@data[1].16b,@data[1].16b,@vtmpx[0].16b + eor @data[2].16b,@data[2].16b,@vtmpx[1].16b +@@ -855,7 +855,7 @@ $code.=<<___; + b.gt .Lcbc_8_blocks_dec + b.eq 100f + 1: +- ld1 {$ivec1.16b},[$ivp] ++ ld1 {$ivec1.4s},[$ivp] + .Lcbc_4_blocks_dec: + cmp $blocks,#4 + b.lt 1f +@@ -880,7 +880,7 @@ $code.=<<___; + subs $blocks,$blocks,#4 + b.gt .Lcbc_4_blocks_dec + // save back IV +- st1 {@data[3].16b}, [$ivp] ++ st1 {@data[3].4s}, [$ivp] + b 100f + 1: // last block + subs $blocks,$blocks,#1 +@@ -888,13 +888,13 @@ $code.=<<___; + b.gt 1f + ld1 {@data[0].4s},[$inp],#16 + // save back IV +- st1 {$data[0].16b}, [$ivp] ++ st1 {$data[0].4s}, [$ivp] + ___ + &rev32(@datax[0],@data[0]); + &encrypt_1blk(@datax[0]); + $code.=<<___; + eor @datax[0].16b,@datax[0].16b,$ivec1.16b +- st1 {@datax[0].16b},[$outp],#16 ++ st1 {@datax[0].4s},[$outp],#16 + b 100f + 1: // last two blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] +@@ -917,7 +917,7 @@ $code.=<<___; + eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b + st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 + // save back IV +- st1 {@data[1].16b}, [$ivp] ++ st1 {@data[1].4s}, [$ivp] + b 100f + 1: // last 3 blocks + ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] +@@ -937,7 +937,7 @@ $code.=<<___; + eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b + st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 + // save back IV +- st1 {@data[2].16b}, [$ivp] ++ st1 {@data[2].4s}, [$ivp] + 100: + ldp d10,d11,[sp,#16] + ldp d12,d13,[sp,#32] +@@ -973,9 +973,9 @@ $code.=<<___; + ___ + &encrypt_1blk($ivec); + $code.=<<___; +- ld1 {@data[0].16b},[$inp] ++ ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b +- st1 {@data[0].16b},[$outp] ++ st1 {@data[0].4s},[$outp] + ret + 1: + AARCH64_SIGN_LINK_REGISTER +@@ -1053,9 +1053,9 @@ $code.=<<___; + ___ + &encrypt_1blk($ivec); + $code.=<<___; +- ld1 {@data[0].16b},[$inp] ++ ld1 {@data[0].4s},[$inp] + eor @data[0].16b,@data[0].16b,$ivec.16b +- st1 {@data[0].16b},[$outp] ++ st1 {@data[0].4s},[$outp] + b 100f + 1: // last 2 blocks processing + dup @data[0].4s,$word0 +-- +2.37.3.windows.1 + diff --git a/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch new file mode 100644 index 0000000..3ecb59c --- /dev/null +++ b/Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch @@ -0,0 +1,67 @@ +From 8746fff8f096fa35c7157199917100aa7b547d7a Mon Sep 17 00:00:00 2001 +From: "fangming.fang" <fangming.fang@arm.com> +Date: Tue, 18 Jan 2022 02:58:08 +0000 +Subject: [PATCH 03/13] Fix sm3ss1 translation issue in sm3-armv8.pl + +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Matt Caswell <matt@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17542) +--- + crypto/sm3/asm/sm3-armv8.pl | 15 +++++++-------- + 1 file changed, 7 insertions(+), 8 deletions(-) + +diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl +index bb71b2eade..f0555fd3f2 100644 +--- a/crypto/sm3/asm/sm3-armv8.pl ++++ b/crypto/sm3/asm/sm3-armv8.pl +@@ -109,7 +109,7 @@ ___ + + $code=<<___; + #include "arm_arch.h" +-.arch armv8.2-a+sm4 ++.arch armv8.2-a + .text + ___ + +@@ -222,8 +222,8 @@ my %sm3partopcode = ( + "sm3partw1" => 0xce60C000, + "sm3partw2" => 0xce60C400); + +-my %sm3sslopcode = ( +- "sm3ssl" => 0xce400000); ++my %sm3ss1opcode = ( ++ "sm3ss1" => 0xce400000); + + my %sm3ttopcode = ( + "sm3tt1a" => 0xce408000, +@@ -241,14 +241,13 @@ sub unsm3part { + $mnemonic,$arg; + } + +-sub unsm3ssl { ++sub 
unsm3ss1 { + my ($mnemonic,$arg)=@_; + +- $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*, +- \s*[qv](\d+)/o ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o + && + sprintf ".inst\t0x%08x\t//%s %s", +- $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), ++ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), + $mnemonic,$arg; + } + +@@ -274,7 +273,7 @@ foreach(split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + + s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge; +- s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge; ++ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge; + s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge; + print $_,"\n"; + } +-- +2.37.3.windows.1 + diff --git a/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch new file mode 100644 index 0000000..11129d9 --- /dev/null +++ b/Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch @@ -0,0 +1,73 @@ +From 98da8a58f964e279decc1bbbe8f07d807de05f7f Mon Sep 17 00:00:00 2001 +From: Daniel Hu <Daniel.Hu@arm.com> +Date: Wed, 2 Mar 2022 12:55:39 +0000 +Subject: [PATCH 06/13] Further acceleration for SM4-GCM on ARM + +This patch will allow the SM4-GCM function to leverage the SM4 +high-performance CTR crypto interface already implemented for ARM, +which is faster than current single block cipher routine used +for GCM + +It does not address the acceleration of GHASH function of GCM, +which can be a future task, still we can see immediate uplift of +performance (up to 4X) + +Before this patch: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +SM4-GCM 186432.92k 394234.05k 587916.46k 639365.12k 648486.91k 652924.25k + +After the patch: +SM4-GCM 193924.87k 860940.35k 1696083.71k 2302548.31k 2580411.73k 2607398.91k + +Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> + +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17814) +--- + .../ciphers/cipher_sm4_gcm_hw.c | 25 ++++++++++++++++++- + 1 file changed, 24 insertions(+), 1 deletion(-) + +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +index c0c9b22bd3..b9633f83ed 100644 +--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +@@ -42,11 +42,34 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + return 1; + } + ++static int hw_gcm_cipher_update(PROV_GCM_CTX *ctx, const unsigned char *in, ++ size_t len, unsigned char *out) ++{ ++ if (ctx->enc) { ++ if (ctx->ctr != NULL) { ++ if (CRYPTO_gcm128_encrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_encrypt(&ctx->gcm, in, out, len)) ++ return 0; ++ } ++ } else { ++ if (ctx->ctr != NULL) { ++ if (CRYPTO_gcm128_decrypt_ctr32(&ctx->gcm, in, out, len, ctx->ctr)) ++ return 0; ++ } else { ++ if (CRYPTO_gcm128_decrypt(&ctx->gcm, in, out, len)) ++ return 0; ++ } ++ } ++ return 1; ++} ++ + static const PROV_GCM_HW sm4_gcm = { + sm4_gcm_initkey, + ossl_gcm_setiv, + ossl_gcm_aad_update, +- ossl_gcm_cipher_update, ++ hw_gcm_cipher_update, + ossl_gcm_cipher_final, + ossl_gcm_one_shot + }; +-- +2.37.3.windows.1 + diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch new file mode 100644 index 0000000..0467d78 --- /dev/null +++ 
b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch @@ -0,0 +1,457 @@ +From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001 +From: "fangming.fang" <fangming.fang@arm.com> +Date: Fri, 24 Dec 2021 08:29:04 +0000 +Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on + aarch64 + +SM3 hardware instruction is optional feature of crypto extension for +aarch64. This implementation accelerates SM3 via SM3 instructions. For +the platform not supporting SM3 instruction, the original C +implementation still works. Thanks to AliBaba for testing and reporting +the following perf numbers for Yitian710: + +Benchmark on T-Head Yitian-710 2.75GHz: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k + +After (33% - 74% faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17454) +--- + crypto/arm64cpuid.pl | 8 + + crypto/arm_arch.h | 2 + + crypto/armcap.c | 10 ++ + crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++ + crypto/sm3/build.info | 21 ++- + crypto/sm3/sm3_local.h | 16 +- + 6 files changed, 336 insertions(+), 3 deletions(-) + create mode 100644 crypto/sm3/asm/sm3-armv8.pl + +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 11f0e50279..10d267b7ad 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -96,6 +96,14 @@ _armv8_cpuid_probe: + ret + .size _armv8_cpuid_probe,.-_armv8_cpuid_probe + ++.globl _armv8_sm3_probe ++.type _armv8_sm3_probe,%function ++_armv8_sm3_probe: ++ AARCH64_VALID_CALL_TARGET ++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s ++ ret ++.size _armv8_sm3_probe,.-_armv8_sm3_probe ++ + .globl OPENSSL_cleanse + .type OPENSSL_cleanse,%function + .align 5 +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index a815a5c72b..c8b501f34c 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) + # define ARMV8_CPUID (1<<7) ++# define ARMV8_RNG (1<<8) ++# define ARMV8_SM3 (1<<9) + + /* + * MIDR_EL1 system register +diff --git a/crypto/armcap.c b/crypto/armcap.c +index c021330e32..365a48df45 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void); + void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ ++void _armv8_sm3_probe(void); + void _armv8_sha512_probe(void); + unsigned int _armv8_cpuid_probe(void); + # endif +@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CPUID (1 << 11) ++# define HWCAP_CE_SM3 (1 << 18) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void) + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; ++ ++ if (hwcap & HWCAP_CE_SM3) ++ OPENSSL_armcap_P |= ARMV8_SM3; + # endif + } + # endif +@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM3; ++ } + # endif + } + # endif +diff --git a/crypto/sm3/asm/sm3-armv8.pl 
b/crypto/sm3/asm/sm3-armv8.pl +new file mode 100644 +index 0000000000..bb71b2eade +--- /dev/null ++++ b/crypto/sm3/asm/sm3-armv8.pl +@@ -0,0 +1,282 @@ ++#! /usr/bin/env perl ++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# This module implements support for Armv8 SM3 instructions ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++# Message expanding: ++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6] ++# Input: s0, s1, s2, s3 ++# s0 = w0 | w1 | w2 | w3 ++# s1 = w4 | w5 | w6 | w7 ++# s2 = w8 | w9 | w10 | w11 ++# s3 = w12 | w13 | w14 | w15 ++# Output: s4 ++sub msg_exp () { ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++$code.=<<___; ++ // s4 = w7 | w8 | w9 | w10 ++ ext $s4.16b, $s1.16b, $s2.16b, #12 ++ // vtmp1 = w3 | w4 | w5 | w6 ++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12 ++ // vtmp2 = w10 | w11 | w12 | w13 ++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8 ++ sm3partw1 $s4.4s, $s0.4s, $s3.4s ++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s ++___ ++} ++ ++# A round of compresson function ++# Input: ++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b ++# vstate0 - vstate1, store digest status(A - H) ++# vconst0 - vconst1, interleaved used to store Tj <<< j ++# vtmp - temporary register ++# vw - for sm3tt1ab, vw = s0 eor s1 ++# s0 - for sm3tt2ab, just be s0 ++# i, choose wj' or wj from vw ++sub round () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp = shift; ++my $vw = shift; ++my $s0 = shift; ++my $i = shift; ++$code.=<<___; ++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s ++ shl $vconst1.4s, $vconst0.4s, #1 ++ sri $vconst1.4s, $vconst0.4s, #31 ++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i] ++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i] ++___ ++} ++ ++sub qround () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++ if($s4) { ++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2); ++ } ++$code.=<<___; ++ eor $vtmp1.16b, $s0.16b, $s1.16b ++___ ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 0); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 1); ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 2); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 3); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8.2-a+sm4 ++.text ++___ ++ ++{{{ ++my ($pstate,$pdata,$num)=("x0","x1","w2"); ++my 
($state1,$state2)=("v5","v6"); ++my ($sconst1, $sconst2)=("s16","s17"); ++my ($vconst1, $vconst2)=("v16","v17"); ++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4)); ++my ($bkstate1,$bkstate2)=("v18","v19"); ++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21"); ++my ($vtmp1,$vtmp2)=("v22","v23"); ++my $constaddr="x8"; ++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num) ++$code.=<<___; ++.globl ossl_hwsm3_block_data_order ++.type ossl_hwsm3_block_data_order,%function ++.align 5 ++ossl_hwsm3_block_data_order: ++ AARCH64_VALID_CALL_TARGET ++ // load state ++ ld1 {$state1.4s-$state2.4s}, [$pstate] ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ ++ adr $constaddr, .Tj ++ ldp $sconst1, $sconst2, [$constaddr] ++ ++.Loop: ++ // load input ++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64 ++ sub $num, $num, #1 ++ ++ mov $bkstate1.16b, $state1.16b ++ mov $bkstate2.16b, $state2.16b ++ ++#ifndef __ARMEB__ ++ rev32 $s0.16b, $s0.16b ++ rev32 $s1.16b, $s1.16b ++ rev32 $s2.16b, $s2.16b ++ rev32 $s3.16b, $s3.16b ++#endif ++ ++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4 ++___ ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ ++$code.=<<___; ++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4 ++___ ++ ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1); ++ ++$code.=<<___; ++ eor $state1.16b, $state1.16b, $bkstate1.16b ++ eor $state2.16b, $state2.16b, $bkstate2.16b ++ ++ // any remained blocks? 
++ cbnz $num, .Loop ++ ++ // save state ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ st1 {$state1.4s-$state2.4s}, [$pstate] ++ ret ++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order ++ ++.align 3 ++.Tj: ++.word 0x79cc4519, 0x9d8a7a87 ++___ ++}}} ++ ++######################################### ++my %sm3partopcode = ( ++ "sm3partw1" => 0xce60C000, ++ "sm3partw2" => 0xce60C400); ++ ++my %sm3sslopcode = ( ++ "sm3ssl" => 0xce400000); ++ ++my %sm3ttopcode = ( ++ "sm3tt1a" => 0xce408000, ++ "sm3tt1b" => 0xce408400, ++ "sm3tt2a" => 0xce408800, ++ "sm3tt2b" => 0xce408C00); ++ ++sub unsm3part { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++} ++ ++sub unsm3ssl { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*, ++ \s*[qv](\d+)/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3sslopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10), ++ $mnemonic,$arg; ++} ++ ++sub unsm3tt { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o ++ && ++ sprintf ".inst\t0x%08x\t//%s %s", ++ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12), ++ $mnemonic,$arg; ++} ++ ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge; ++ s/\b(sm3ssl)\s+([qv].*)/unsm3ssl($1,$2)/ge; ++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info +index eca68216f2..2fa54a4a8b 100644 +--- a/crypto/sm3/build.info ++++ b/crypto/sm3/build.info +@@ -1,5 +1,22 @@ + LIBS=../../libcrypto + + IF[{- !$disabled{sm3} -}] +- SOURCE[../../libcrypto]=sm3.c legacy_sm3.c +-ENDIF +\ No newline at end of file ++ IF[{- !$disabled{asm} -}] ++ $SM3ASM_aarch64=sm3-armv8.S ++ $SM3DEF_aarch64=OPENSSL_SM3_ASM ++ ++ # Now that we have defined all the arch specific variables, use the ++ # appropriate ones, and define the appropriate macros ++ IF[$SM3ASM_{- $target{asm_arch} -}] ++ $SM3ASM=$SM3ASM_{- $target{asm_arch} -} ++ $SM3DEF=$SM3DEF_{- $target{asm_arch} -} ++ ENDIF ++ ENDIF ++ ++ SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM ++ DEFINE[../../libcrypto]=$SM3DEF ++ ++ GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl ++ INCLUDE[sm3-armv8.o]=.. ++ENDIF ++ +diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h +index 6daeb878a8..ac8a2bf768 100644 +--- a/crypto/sm3/sm3_local.h ++++ b/crypto/sm3/sm3_local.h +@@ -32,7 +32,21 @@ + ll=(c)->G; (void)HOST_l2c(ll, (s)); \ + ll=(c)->H; (void)HOST_l2c(ll, (s)); \ + } while (0) +-#define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order ++ ++#if defined(OPENSSL_SM3_ASM) ++# if defined(__aarch64__) ++# include "crypto/arm_arch.h" ++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3) ++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num); ++# endif ++#endif ++ ++#if defined(HWSM3_CAPABLE) ++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? 
ossl_hwsm3_block_data_order \ ++ : ossl_sm3_block_data_order) ++#else ++# define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order ++#endif + + void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num); + void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data); +-- +2.37.3.windows.1 + diff --git a/Backport-SM4-AESE-optimization-for-ARMv8.patch b/Backport-SM4-AESE-optimization-for-ARMv8.patch new file mode 100644 index 0000000..0866262 --- /dev/null +++ b/Backport-SM4-AESE-optimization-for-ARMv8.patch @@ -0,0 +1,2322 @@ +From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Wed, 18 Jan 2023 09:55:02 +0800 +Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8 + +Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> + +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19914) +--- + crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++ + crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++ + crypto/sm4/build.info | 4 +- + include/crypto/sm4_platform.h | 41 +- + .../implementations/ciphers/cipher_sm4_hw.c | 26 +- + .../implementations/ciphers/cipher_sm4_xts.c | 4 +- + .../implementations/ciphers/cipher_sm4_xts.h | 2 +- + .../ciphers/cipher_sm4_xts_hw.c | 33 +- + 8 files changed, 2090 insertions(+), 22 deletions(-) + create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl + +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +index 73797af582..e19de30901 100755 +--- a/crypto/sm4/asm/vpsm4-armv8.pl ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + + $prefix="vpsm4"; + my @vtmp=map("v$_",(0..3)); ++my @qtmp=map("q$_",(0..3)); + my @data=map("v$_",(4..7)); + my @datax=map("v$_",(8..11)); + my ($rk0,$rk1)=("v12","v13"); +@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15)); + my @sbox=map("v$_",(16..31)); + my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); + my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($xtmp1,$xtmp2)=("x8","x9"); + my ($ptr,$counter)=("x10","w11"); + my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); + +@@ -60,6 +62,51 @@ ___ + } + } + ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ my $std = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ + sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +@@ -435,6 +482,58 @@ $code.=<<___; + ___ + } + ++ ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++___ ++ &rev32_armeb($desv,$desv); ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov 
$wtmp0,0x87 ++ extr $xtmp2,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and $wtmp1,$wtmp0,$wtmp2,asr#31 ++ eor $des0,$xtmp1,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $std = shift; ++ &rbit(@vtmp[2],$src,$std); ++$code.=<<___; ++ ldr @qtmp[0], =0x01010101010101010101010101010187 ++ shl $des.16b, @vtmp[2].16b, #1 ++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ++ ushr @vtmp[1].16b, @vtmp[1].16b, #7 ++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b ++ eor $des.16b, $des.16b, @vtmp[1].16b ++___ ++ &rbit($des,$des,$std); ++} ++ + $code=<<___; + #include "arm_arch.h" + .arch armv8-a +@@ -1101,6 +1200,365 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++ ++{{{ ++my ($blocks,$len)=("x2","x2"); ++my $ivp=("x5"); ++my @twx=map("x$_",(12..27)); ++my ($rks1,$rks2)=("x26","x27"); ++my $lastBlk=("x26"); ++my $enc=("w28"); ++my $remain=("x29"); ++ ++my @tweak=@datax; ++ ++sub gen_xts_cipher() { ++ my $std = shift; ++$code.=<<___; ++.globl ${prefix}_xts_encrypt${std} ++.type ${prefix}_xts_encrypt${std},%function ++.align 5 ++${prefix}_xts_encrypt${std}: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x15, x16, [sp, #-0x10]! ++ stp x17, x18, [sp, #-0x10]! ++ stp x19, x20, [sp, #-0x10]! ++ stp x21, x22, [sp, #-0x10]! ++ stp x23, x24, [sp, #-0x10]! ++ stp x25, x26, [sp, #-0x10]! ++ stp x27, x28, [sp, #-0x10]! ++ stp x29, x30, [sp, #-0x10]! ++ stp d8, d9, [sp, #-0x10]! ++ stp d10, d11, [sp, #-0x10]! ++ stp d12, d13, [sp, #-0x10]! ++ stp d14, d15, [sp, #-0x10]! ++ mov $rks1,x3 ++ mov $rks2,x4 ++ mov $enc,w6 ++ ld1 {@tweak[0].4s}, [$ivp] ++ mov $rks,$rks2 ++___ ++ &load_sbox(); ++ &rev32(@tweak[0],@tweak[0]); ++ &encrypt_1blk(@tweak[0]); ++$code.=<<___; ++ mov $rks,$rks1 ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 ++ b.lt .return${std} ++ ++ cmp $remain,0 ++ // If the encryption/decryption Length is N times of 16, ++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ b.eq .xts_encrypt_blocks${std} ++ ++ // If the encryption/decryption length is not N times of 16, ++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} ++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${std} ++.xts_encrypt_blocks${std}: ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++.Lxts_8_blocks_process${std}: ++ cmp $blocks,#8 ++ b.lt .Lxts_4_blocks_process${std} ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]); ++ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]); ++$code.=<<___; ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@vtmp[0],@vtmp[0],$std); ++ 
&rbit(@vtmp[1],@vtmp[1],$std); ++ &rbit(@vtmp[2],@vtmp[2],$std); ++ &rbit(@vtmp[3],@vtmp[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @vtmp[0].16b ++ eor @data[1].16b, @data[1].16b, @vtmp[1].16b ++ eor @data[2].16b, @data[2].16b, @vtmp[2].16b ++ eor @data[3].16b, @data[3].16b, @vtmp[3].16b ++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rbit(@vtmpx[0],@vtmpx[0],$std); ++ &rbit(@vtmpx[1],@vtmpx[1],$std); ++ &rbit(@vtmpx[2],@vtmpx[2],$std); ++ &rbit(@vtmpx[3],@vtmpx[3],$std); ++$code.=<<___; ++ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b ++ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b ++ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b ++ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++ &transpose(@data,@vtmp); ++ &transpose(@datax,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++ ++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++ ++ // save the last tweak ++ st1 {@tweak[3].4s},[$ivp] ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lxts_8_blocks_process${std} ++ b 100f ++.Lxts_4_blocks_process${std}: ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++$code.=<<___; ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl 
_${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++___ ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); ++$code.=<<___; ++ // save the last tweak ++ st1 {@tweak[3].4s},[$ivp] ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ st1 {@data[0].4s},[$outp],#16 ++ // save the last tweak ++ st1 {@tweak[0].4s},[$ivp] ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save the last tweak ++ st1 {@tweak[1].4s},[$ivp] ++ b 100f ++1: // process last 3 blocks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save the last tweak ++ st1 {@tweak[2].4s},[$ivp] ++100: ++ cmp $remain,0 ++ b.eq .return${std} ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is larger than 32 ++.last_2blks_tweak${std}: ++ ld1 {@tweak[0].4s},[$ivp] ++___ ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &compute_tweak_vec(@tweak[0],@tweak[1],$std); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$std); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is equal to 32, who only need two tweaks ++.only_2blks_tweak${std}: ++ mov @tweak[1].16b,@tweak[0].16b ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &compute_tweak_vec(@tweak[1],@tweak[2]); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// Determine whether encryption or decryption is required. ++// The last two tweaks need to be swapped for decryption. 
++.check_dec${std}: ++ // encryption:1 decryption:0 ++ cmp $enc,1 ++ b.eq .prcess_last_2blks${std} ++ mov @vtmp[0].16B,@tweak[1].16b ++ mov @tweak[1].16B,@tweak[2].16b ++ mov @tweak[2].16B,@vtmp[0].16b ++ ++.prcess_last_2blks${std}: ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &rev32_armeb(@tweak[2],@tweak[2]); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp],#16 ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++ st1 {@data[0].4s},[$outp],#16 ++ ++ sub $lastBlk,$outp,16 ++ .loop${std}: ++ subs $remain,$remain,1 ++ ldrb $wtmp0,[$lastBlk,$remain] ++ ldrb $wtmp1,[$inp,$remain] ++ strb $wtmp1,[$lastBlk,$remain] ++ strb $wtmp0,[$outp,$remain] ++ b.gt .loop${std} ++ ld1 {@data[0].4s}, [$lastBlk] ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++ st1 {@data[0].4s}, [$lastBlk] ++.return${std}: ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} ++___ ++} # end of gen_xts_cipher ++&gen_xts_cipher("_gb"); ++&gen_xts_cipher(""); ++}}} + ######################################## + open SELF,$0; + while(<SELF>) { +diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl +new file mode 100644 +index 0000000000..3d094aa535 +--- /dev/null ++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl +@@ -0,0 +1,1544 @@ ++#! /usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements SM4 with ASIMD and AESE on AARCH64 ++# ++# Dec 2022 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="vpsm4_ex"; ++my @vtmp=map("v$_",(0..3)); ++my @qtmp=map("q$_",(0..3)); ++my @data=map("v$_",(4..7)); ++my @datax=map("v$_",(8..11)); ++my ($rk0,$rk1)=("v12","v13"); ++my ($rka,$rkb)=("v14","v15"); ++my @vtmpx=map("v$_",(12..15)); ++my ($vtmp4,$vtmp5)=("v24","v25"); ++my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); ++my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); ++ ++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); ++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($xtmp1,$xtmp2)=("x8","x9"); ++my ($ptr,$counter)=("x10","w11"); ++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); ++ ++sub rev32() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifndef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifndef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ my $std = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ ++sub transpose() { ++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; ++ ++$code.=<<___; ++ zip1 $vt0.4s,$dat0.4s,$dat1.4s ++ zip2 $vt1.4s,$dat0.4s,$dat1.4s ++ zip1 $vt2.4s,$dat2.4s,$dat3.4s ++ zip2 $vt3.4s,$dat2.4s,$dat3.4s ++ zip1 $dat0.2d,$vt0.2d,$vt2.2d ++ zip2 $dat1.2d,$vt0.2d,$vt2.2d ++ zip1 $dat2.2d,$vt1.2d,$vt3.2d ++ zip2 $dat3.2d,$vt1.2d,$vt3.2d ++___ ++} ++ ++# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) ++sub mul_matrix() { ++ my $x = shift; ++ my $higherMat = shift; ++ my $lowerMat = shift; ++ my $tmp = shift; ++$code.=<<___; ++ ushr $tmp.16b, $x.16b, 4 ++ and $x.16b, $x.16b, $ANDMaskV.16b ++ tbl $x.16b, {$lowerMat.16b}, $x.16b ++ tbl $tmp.16b, {$higherMat.16b}, $tmp.16b ++ eor $x.16b, $x.16b, $tmp.16b ++___ ++} ++ ++# sbox operations for 4-lane of words ++# sbox operation for 4-lane of words ++sub sbox() { ++ my $dat = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli 
@vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++___ ++} ++ ++# sbox operation for 8-lane of words ++sub sbox_double() { ++ my $dat = shift; ++ my $datx = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b ++ aese @vtmp[0].16b,$vtmp5.16b ++ aese @vtmp[1].16b,$vtmp5.16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); ++ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ mov $datx.16b,@vtmp[1].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr $vtmp5.4s,$datx.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli $vtmp5.4s,$datx.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli @vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++ ushr @vtmp[1].4s,$datx.4s,32-10 ++ ushr @vtmp[2].4s,$datx.4s,32-18 ++ ushr @vtmp[3].4s,$datx.4s,32-24 ++ sli @vtmp[1].4s,$datx.4s,10 ++ sli @vtmp[2].4s,$datx.4s,18 ++ sli @vtmp[3].4s,$datx.4s,24 ++ eor $vtmp4.16b,$vtmp5.16b,$datx.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $datx.16b,$datx.16b,$vtmp4.16b ++___ ++} ++ ++# sbox operation for one single word ++sub sbox_1word () { ++ my $word = shift; ++ ++$code.=<<___; ++ mov @vtmp[3].s[0],$word ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ ++ mov $wtmp0,@vtmp[0].s[0] ++ eor $word,$wtmp0,$wtmp0,ror #32-2 ++ eor $word,$word,$wtmp0,ror #32-10 ++ eor $word,$word,$wtmp0,ror #32-18 ++ eor $word,$word,$wtmp0,ror #32-24 ++___ ++} ++ ++# sm4 for one block of data, in scalar registers word0/word1/word2/word3 ++sub sm4_1blk () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$wtmp0,$word1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word0,$word0,$tmpw ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$word0,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor $word1,$word1,$tmpw ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$wtmp0,$word3 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word2,$word2,$tmpw ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$word2,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word3,$word3,$tmpw ++___ ++} ++ ++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 ++sub sm4_4blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ dup 
$rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rk0.16b,@data[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk1.16b ++ ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rk0.16b,@data[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk1.16b ++___ ++} ++ ++# sm4 for 8 lanes of data, in neon registers ++# data0/data1/data2/data3 datax0/datax1/datax2/datax3 ++sub sm4_8blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rkb.16b,@datax[2].16b,@datax[3].16b ++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ eor @datax[0].16b,@datax[0].16b,$rk1.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rkb.16b,$rkb.16b,@datax[0].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk0.16b ++ eor @datax[1].16b,@datax[1].16b,$rk1.16b ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rkb.16b,@datax[0].16b,@datax[1].16b ++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ eor @datax[2].16b,@datax[2].16b,$rk1.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rkb.16b,$rkb.16b,@datax[2].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk0.16b ++ eor @datax[3].16b,@datax[3].16b,$rk1.16b ++___ ++} ++ ++sub encrypt_1blk_norev() { ++ my $dat = shift; ++ ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++ mov $word0,$dat.s[0] ++ mov $word1,$dat.s[1] ++ mov $word2,$dat.s[2] ++ mov $word3,$dat.s[3] ++10: ++___ ++ &sm4_1blk($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++ mov $dat.s[0],$word3 ++ mov $dat.s[1],$word2 ++ mov $dat.s[2],$word1 ++ mov $dat.s[3],$word0 ++___ ++} ++ ++sub encrypt_1blk() { ++ my $dat = shift; ++ ++ &encrypt_1blk_norev($dat); ++ &rev32($dat,$dat); ++} ++ ++sub encrypt_4blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_4blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ 
&rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++} ++ ++sub encrypt_8blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_8blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++ &rev32(@data[3],@datax[0]); ++ &rev32(@data[2],@datax[1]); ++ &rev32(@data[1],@datax[2]); ++ &rev32(@data[0],@datax[3]); ++} ++ ++sub load_sbox () { ++ my $data = shift; ++ ++$code.=<<___; ++ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 ++ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 ++ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 ++ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 ++ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b ++ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++___ ++} ++ ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++___ ++ &rev32_armeb($desv,$desv); ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $wtmp0,0x87 ++ extr $xtmp2,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and $wtmp1,$wtmp0,$wtmp2,asr#31 ++ eor $des0,$xtmp1,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $std = shift; ++ &rbit(@vtmp[2],$src,$std); ++$code.=<<___; ++ ldr @qtmp[0], =0x01010101010101010101010101010187 ++ shl $des.16b, @vtmp[2].16b, #1 ++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ++ ushr @vtmp[1].16b, @vtmp[1].16b, #7 ++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b ++ eor $des.16b, $des.16b, @vtmp[1].16b ++___ ++ &rbit($des,$des,$std); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++ ++.type _${prefix}_consts,%object ++.align 7 ++_${prefix}_consts: ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 ++.Lshuffles: ++ .dword 0x0B0A090807060504,0x030201000F0E0D0C ++ ++.size _${prefix}_consts,.-_${prefix}_consts ++___ ++ ++{{{ ++my ($key,$keys,$enc)=("x0","x1","w2"); ++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); ++my ($vkey,$vfk,$vmap)=("v5","v6","v7"); ++$code.=<<___; ++.type _${prefix}_set_key,%function ++.align 4 ++_${prefix}_set_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$vkey.4s},[$key] ++___ ++ &load_sbox(); ++ &rev32($vkey,$vkey); ++$code.=<<___; ++ adr $pointer,.Lshuffles ++ ld1 {$vmap.2d},[$pointer] ++ adr $pointer,.Lfk ++ ld1 {$vfk.2d},[$pointer] ++ eor $vkey.16b,$vkey.16b,$vfk.16b ++ mov $schedules,#32 ++ adr $pointer,.Lck ++ movi @vtmp[0].16b,#64 ++ cbnz $enc,1f ++ add $keys,$keys,124 ++1: ++ mov $wtmp,$vkey.s[1] ++ ldr $roundkey,[$pointer],#4 ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[2] ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[3] ++ eor $roundkey,$roundkey,$wtmp ++ // 
optimize sbox using AESE instruction ++ mov @data[0].s[0],$roundkey ++ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ mov $wtmp,@vtmp[0].s[0] ++ eor $roundkey,$wtmp,$wtmp,ror #19 ++ eor $roundkey,$roundkey,$wtmp,ror #9 ++ mov $wtmp,$vkey.s[0] ++ eor $roundkey,$roundkey,$wtmp ++ mov $vkey.s[0],$roundkey ++ cbz $enc,2f ++ str $roundkey,[$keys],#4 ++ b 3f ++2: ++ str $roundkey,[$keys],#-4 ++3: ++ tbl $vkey.16b,{$vkey.16b},$vmap.16b ++ subs $schedules,$schedules,#1 ++ b.ne 1b ++ ret ++.size _${prefix}_set_key,.-_${prefix}_set_key ++___ ++}}} ++ ++ ++{{{ ++$code.=<<___; ++.type _${prefix}_enc_4blks,%function ++.align 4 ++_${prefix}_enc_4blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_4blks(); ++$code.=<<___; ++ ret ++.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks ++___ ++}}} ++ ++{{{ ++$code.=<<___; ++.type _${prefix}_enc_8blks,%function ++.align 4 ++_${prefix}_enc_8blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_8blks(); ++$code.=<<___; ++ ret ++.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks ++___ ++}}} ++ ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,1 ++ bl _${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,0 ++ bl _${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++ my $dir = shift; ++ my ($inp,$outp,$rk)=map("x$_",(0..2)); ++ ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {@data[0].4s},[$inp] ++___ ++ &load_sbox(); ++ &rev32(@data[0],@data[0]); ++$code.=<<___; ++ mov $rks,$rk ++___ ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].4s},[$outp] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ AARCH64_SIGN_LINK_REGISTER ++ // convert length into blocks ++ lsr x2,x2,4 ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++___ ++ &load_sbox(); ++$code.=<<___; ++.Lecb_8_blocks_process: ++ cmp $blocks,#8 ++ b.lt .Lecb_4_blocks_process ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lecb_8_blocks_process ++ b 100f ++.Lecb_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].4s},[$outp] ++ b 100f ++1: // process last 2 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 ++ cmp $blocks,#2 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] ++ b 100f ++1: // process last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($len,$ivp,$enc)=("x2","x4","w5"); ++my $ivec0=("v3"); ++my $ivec1=("v15"); ++ ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ lsr $len,$len,4 ++___ ++ &load_sbox(); ++$code.=<<___; ++ cbz $enc,.Ldec ++ ld1 {$ivec0.4s},[$ivp] ++.Lcbc_4_blocks_enc: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ eor @data[0].16b,@data[0].16b,$ivec0.16b ++___ ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &encrypt_1blk_norev(@data[0]); ++$code.=<<___; ++ eor @data[1].16b,@data[1].16b,@data[0].16b ++___ ++ &encrypt_1blk_norev(@data[1]); ++ &rev32(@data[0],@data[0]); ++ ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,@data[1].16b ++___ ++ &encrypt_1blk_norev(@data[2]); ++ &rev32(@data[1],@data[1]); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,@data[2].16b 
++___ ++ &encrypt_1blk_norev(@data[3]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ orr $ivec0.16b,@data[3].16b,@data[3].16b ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lcbc_4_blocks_enc ++ b 2f ++1: ++ subs $blocks,$blocks,#1 ++ b.lt 2f ++ ld1 {@data[0].4s},[$inp],#16 ++ eor $ivec0.16b,$ivec0.16b,@data[0].16b ++___ ++ &rev32($ivec0,$ivec0); ++ &encrypt_1blk($ivec0); ++$code.=<<___; ++ st1 {$ivec0.4s},[$outp],#16 ++ b 1b ++2: ++ // save back IV ++ st1 {$ivec0.4s},[$ivp] ++ ret ++ ++.Ldec: ++ // decryption mode starts ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! ++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++.Lcbc_8_blocks_dec: ++ cmp $blocks,#8 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++ add $ptr,$inp,#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],$datax[3]); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ ld1 {$ivec1.4s},[$ivp] ++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ // note ivec1 and vtmpx[3] are reusing the same register ++ // care needs to be taken to avoid conflict ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b ++ // save back IV ++ st1 {$vtmpx[3].4s}, [$ivp] ++ eor @data[0].16b,@data[0].16b,$datax[3].16b ++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b ++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b ++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lcbc_8_blocks_dec ++ b.eq 100f ++1: ++ ld1 {$ivec1.4s},[$ivp] ++.Lcbc_4_blocks_dec: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ orr $ivec1.16b,@data[3].16b,@data[3].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.gt .Lcbc_4_blocks_dec ++ // save back IV ++ st1 {@data[3].4s}, [$ivp] ++ b 100f ++1: // last block ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++ // save back IV ++ st1 {$data[0].4s}, [$ivp] ++___ ++ &rev32(@datax[0],@data[0]); ++ &encrypt_1blk(@datax[0]); ++$code.=<<___; ++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b ++ st1 {@datax[0].4s},[$outp],#16 ++ b 100f ++1: // last two blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] ++ add $ptr,$inp,#16 ++ ld4 
{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 ++ subs $blocks,$blocks,1 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save back IV ++ st1 {@data[1].4s}, [$ivp] ++ b 100f ++1: // last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save back IV ++ st1 {@data[2].4s}, [$ivp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($ivp)=("x4"); ++my ($ctr)=("w5"); ++my $ivec=("v3"); ++ ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$ivec.4s},[$ivp] ++___ ++ &rev32($ivec,$ivec); ++ &load_sbox(); ++$code.=<<___; ++ cmp $blocks,#1 ++ b.ne 1f ++ // fast processing for one single block without ++ // context saving overhead ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].4s},[$outp] ++ ret ++1: ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++ mov $word0,$ivec.s[0] ++ mov $word1,$ivec.s[1] ++ mov $word2,$ivec.s[2] ++ mov $ctr,$ivec.s[3] ++.Lctr32_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $data[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ cmp $blocks,#8 ++ b.ge .Lctr32_8_blocks_process ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++.Lctr32_8_blocks_process: ++ dup @datax[0].4s,$word0 ++ dup @datax[1].4s,$word1 ++ dup @datax[2].4s,$word2 ++ mov @datax[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $datax[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ bl _${prefix}_enc_8blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ eor @data[0].16b,@data[0].16b,@datax[0].16b ++ eor @data[1].16b,@data[1].16b,@datax[1].16b ++ eor @data[2].16b,@data[2].16b,@datax[2].16b ++ eor @data[3].16b,@data[3].16b,@datax[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++1: // last block processing ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ mov $ivec.s[0],$word0 ++ mov $ivec.s[1],$word1 ++ mov $ivec.s[2],$word2 ++ mov $ivec.s[3],$ctr ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].4s},[$outp] ++ b 100f ++1: // last 2 blocks processing ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[1],$ctr ++ subs $blocks,$blocks,#1 ++ b.ne 1f ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ b 100f ++1: // last 3 blocks processing ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor 
@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++ ++ ++{{{ ++my ($blocks,$len)=("x2","x2"); ++my $ivp=("x5"); ++my @twx=map("x$_",(12..27)); ++my ($rks1,$rks2)=("x26","x27"); ++my $lastBlk=("x26"); ++my $enc=("w28"); ++my $remain=("x29"); ++ ++my @tweak=map("v$_",(16..23)); ++my $lastTweak=("v25"); ++ ++sub gen_xts_cipher() { ++ my $std = shift; ++$code.=<<___; ++.globl ${prefix}_xts_encrypt${std} ++.type ${prefix}_xts_encrypt${std},%function ++.align 5 ++${prefix}_xts_encrypt${std}: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x15, x16, [sp, #-0x10]! ++ stp x17, x18, [sp, #-0x10]! ++ stp x19, x20, [sp, #-0x10]! ++ stp x21, x22, [sp, #-0x10]! ++ stp x23, x24, [sp, #-0x10]! ++ stp x25, x26, [sp, #-0x10]! ++ stp x27, x28, [sp, #-0x10]! ++ stp x29, x30, [sp, #-0x10]! ++ stp d8, d9, [sp, #-0x10]! ++ stp d10, d11, [sp, #-0x10]! ++ stp d12, d13, [sp, #-0x10]! ++ stp d14, d15, [sp, #-0x10]! ++ mov $rks1,x3 ++ mov $rks2,x4 ++ mov $enc,w6 ++ ld1 {@tweak[0].4s}, [$ivp] ++ mov $rks,$rks2 ++___ ++ &load_sbox(); ++ &rev32(@tweak[0],@tweak[0]); ++ &encrypt_1blk(@tweak[0]); ++$code.=<<___; ++ mov $rks,$rks1 ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 ++ b.lt .return${std} ++ ++ cmp $remain,0 ++ // If the encryption/decryption Length is N times of 16, ++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ b.eq .xts_encrypt_blocks${std} ++ ++ // If the encryption/decryption length is not N times of 16, ++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} ++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${std} ++.xts_encrypt_blocks${std}: ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++.Lxts_8_blocks_process${std}: ++ cmp $blocks,#8 ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ 
&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++ b.lt .Lxts_4_blocks_process${std} ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[4],@tweak[4],$std); ++ &rbit(@tweak[5],@tweak[5],$std); ++ &rbit(@tweak[6],@tweak[6],$std); ++ &rbit(@tweak[7],@tweak[7],$std); ++$code.=<<___; ++ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b ++ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b ++ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b ++ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++ &transpose(@data,@vtmp); ++ &transpose(@datax,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ eor @data[0].16b, @data[0].16b, @tweak[4].16b ++ eor @data[1].16b, @data[1].16b, @tweak[5].16b ++ eor @data[2].16b, @data[2].16b, @tweak[6].16b ++ eor @data[3].16b, @data[3].16b, @tweak[7].16b ++ ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[7].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lxts_8_blocks_process${std} ++ b 100f ++.Lxts_4_blocks_process${std}: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++ mov @tweak[0].16b,@tweak[4].16b ++ mov @tweak[1].16b,@tweak[5].16b ++ mov @tweak[2].16b,@tweak[6].16b ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[3].16b ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++___ ++ 
&rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ st1 {@data[0].4s},[$outp],#16 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[0].16b ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[1].16b ++ b 100f ++1: // process last 3 blocks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[2].16b ++100: ++ cmp $remain,0 ++ b.eq .return${std} ++ ++// This branch calculates the last two tweaks, ++// while the encryption/decryption length is larger than 32 ++.last_2blks_tweak${std}: ++___ ++ &rev32_armeb($lastTweak,$lastTweak); ++ &compute_tweak_vec($lastTweak,@tweak[1],$std); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$std); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// This branch calculates the last two tweaks, ++// while the encryption/decryption length is equal to 32, which only needs two tweaks ++.only_2blks_tweak${std}: ++ mov @tweak[1].16b,@tweak[0].16b ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &compute_tweak_vec(@tweak[1],@tweak[2]); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// Determine whether encryption or decryption is required. ++// The last two tweaks need to be swapped for decryption. 
++.check_dec${std}: ++ // encryption:1 decryption:0 ++ cmp $enc,1 ++ b.eq .prcess_last_2blks${std} ++ mov @vtmp[0].16B,@tweak[1].16b ++ mov @tweak[1].16B,@tweak[2].16b ++ mov @tweak[2].16B,@vtmp[0].16b ++ ++.prcess_last_2blks${std}: ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &rev32_armeb(@tweak[2],@tweak[2]); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp],#16 ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++ st1 {@data[0].4s},[$outp],#16 ++ ++ sub $lastBlk,$outp,16 ++ .loop${std}: ++ subs $remain,$remain,1 ++ ldrb $wtmp0,[$lastBlk,$remain] ++ ldrb $wtmp1,[$inp,$remain] ++ strb $wtmp1,[$lastBlk,$remain] ++ strb $wtmp0,[$outp,$remain] ++ b.gt .loop${std} ++ ld1 {@data[0].4s}, [$lastBlk] ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++ st1 {@data[0].4s}, [$lastBlk] ++.return${std}: ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} ++___ ++} # end of gen_xts_cipher ++&gen_xts_cipher("_gb"); ++&gen_xts_cipher(""); ++}}} ++ ++######################################## ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index 75a215ab80..73ffe5ea09 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -2,7 +2,7 @@ LIBS=../../libcrypto + + IF[{- !$disabled{asm} -}] + $SM4DEF_aarch64=SM4_ASM VPSM4_ASM +- $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S ++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros +@@ -30,5 +30,7 @@ ENDIF + + GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl + GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl ++GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl + INCLUDE[sm4-armv8.o]=.. + INCLUDE[vpsm4-armv8.o]=.. ++INCLUDE[vpsm4_ex-armv8.o]=.. 
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 15d8abbcb1..8b9cd10f97 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -20,11 +20,16 @@ static inline int vpsm4_capable(void) + { + return (OPENSSL_armcap_P & ARMV8_CPUID) && + (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || +- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) || +- MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); ++} ++static inline int vpsm4_ex_capable(void) ++{ ++ return (OPENSSL_armcap_P & ARMV8_CPUID) && ++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920)); + } + # if defined(VPSM4_ASM) + # define VPSM4_CAPABLE vpsm4_capable() ++# define VPSM4_EX_CAPABLE vpsm4_ex_capable() + # endif + # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) + # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key +@@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + const unsigned char ivec[16]); + # endif /* HWSM4_CAPABLE */ + +-#ifdef VPSM4_CAPABLE ++# ifdef VPSM4_CAPABLE + int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); + int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); + void vpsm4_encrypt(const unsigned char *in, unsigned char *out, +@@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out, + void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + size_t len, const void *key, + const unsigned char ivec[16]); ++void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char ivec[16], const int enc); ++void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char ivec[16], const int enc); + # endif /* VPSM4_CAPABLE */ + ++# ifdef VPSM4_EX_CAPABLE ++int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char ivec[16], const int enc); ++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out, ++ size_t len, const SM4_KEY *key1, ++ const SM4_KEY *key2, const unsigned char ivec[16], ++ const int enc); ++# endif /* VPSM4_EX_CAPABLE */ + + #endif /* OSSL_SM4_PLATFORM_H */ +diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c +index 9a2e99f67c..8cabd78266 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_hw.c +@@ -42,6 
+42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + (void)0; /* terminate potentially open 'else' */ + } else + #endif ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ vpsm4_ex_set_encrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_ex_encrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; ++ else if (ctx->mode == EVP_CIPH_CTR_MODE) ++ ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks; ++ } else ++#endif + #ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_encrypt_key(key, ks); +@@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + #endif + } else + #endif ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ vpsm4_ex_set_decrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_ex_decrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt; ++ } else ++#endif + #ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { + vpsm4_set_decrypt_key(key, ks); +@@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + ctx->stream.cbc = NULL; + if (ctx->mode == EVP_CIPH_CBC_MODE) + ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; +- else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) + ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; + } else + #endif +diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c +index 3c568d4d18..037055fce8 100644 +--- a/providers/implementations/ciphers/cipher_sm4_xts.c ++++ b/providers/implementations/ciphers/cipher_sm4_xts.c +@@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl, + if (ctx->xts_standard) { + if (ctx->stream != NULL) + (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2, +- ctx->base.iv); ++ ctx->base.iv, ctx->base.enc); + else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl, + ctx->base.enc)) + return 0; + } else { + if (ctx->stream_gb != NULL) + (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2, +- ctx->base.iv); ++ ctx->base.iv, ctx->base.enc); + else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out, + inl, ctx->base.enc)) + return 0; +diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h +index 4c369183e2..cfca596979 100644 +--- a/providers/implementations/ciphers/cipher_sm4_xts.h ++++ b/providers/implementations/ciphers/cipher_sm4_xts.h +@@ -14,7 +14,7 @@ + PROV_CIPHER_FUNC(void, xts_stream, + (const unsigned char *in, unsigned char *out, size_t len, + const SM4_KEY *key1, const SM4_KEY *key2, +- const unsigned char iv[16])); ++ const unsigned char iv[16], const int enc)); + + typedef struct prov_sm4_xts_ctx_st { + /* Must be first */ +diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c +index 403eb879b1..67a9923d94 100644 +--- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c +@@ -11,8 +11,7 @@ + + #define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \ + fn_block_enc, fn_block_dec, \ +- fn_stream_enc, fn_stream_dec, \ +- fn_stream_gb_enc, fn_stream_gb_dec) { \ ++ fn_stream, 
fn_stream_gb) { \ + size_t bytes = keylen / 2; \ + \ + if (ctx->enc) { \ +@@ -26,8 +25,8 @@ + xctx->xts.block2 = (block128_f)fn_block_enc; \ + xctx->xts.key1 = &xctx->ks1; \ + xctx->xts.key2 = &xctx->ks2; \ +- xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \ +- xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \ ++ xctx->stream = fn_stream; \ ++ xctx->stream_gb = fn_stream_gb; \ + } + + static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, +@@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, + size_t keylen) + { + PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx; +- OSSL_xts_stream_fn stream_enc = NULL; +- OSSL_xts_stream_fn stream_dec = NULL; +- OSSL_xts_stream_fn stream_gb_enc = NULL; +- OSSL_xts_stream_fn stream_gb_dec = NULL; ++ OSSL_xts_stream_fn stream = NULL; ++ OSSL_xts_stream_fn stream_gb = NULL; + #ifdef HWSM4_CAPABLE + if (HWSM4_CAPABLE) { + XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key, +- HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec, +- stream_gb_enc, stream_gb_dec); ++ HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb); + return 1; + } else + #endif /* HWSM4_CAPABLE */ ++#ifdef VPSM4_EX_CAPABLE ++ if (VPSM4_EX_CAPABLE) { ++ stream = vpsm4_ex_xts_encrypt; ++ stream_gb = vpsm4_ex_xts_encrypt_gb; ++ XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key, ++ vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb); ++ return 1; ++ } else ++#endif /* VPSM4_EX_CAPABLE */ + #ifdef VPSM4_CAPABLE + if (VPSM4_CAPABLE) { ++ stream = vpsm4_xts_encrypt; ++ stream_gb = vpsm4_xts_encrypt_gb; + XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key, +- vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec, +- stream_gb_enc, stream_gb_dec); ++ vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb); + return 1; + } else + #endif /* VPSM4_CAPABLE */ +@@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, + } + { + XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt, +- ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc, +- stream_gb_dec); ++ ossl_sm4_decrypt, stream, stream_gb); + } + return 1; + } +-- +2.37.3.windows.1 + diff --git a/Backport-SM4-optimization-for-ARM-by-ASIMD.patch b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch new file mode 100644 index 0000000..5d58d16 --- /dev/null +++ b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch @@ -0,0 +1,1334 @@ +From ca0b08e39bb619b6e62ef58c80edc784e8f20966 Mon Sep 17 00:00:00 2001 +From: Daniel Hu <Daniel.Hu@arm.com> +Date: Mon, 14 Feb 2022 14:36:34 +0000 +Subject: [PATCH 07/13] SM4 optimization for ARM by ASIMD + +This patch optimizes SM4 for ARM processor using ASIMD instruction + +It will improve performance if both of following conditions are met: +1) Input data equal to or more than 4 blocks +2) Cipher mode allows parallelism, including ECB,CTR,GCM or CBC decryption + +This patch implements SM4 SBOX lookup in vector registers, with the +benefit of constant processing time over existing C implementation. + +It is only enabled for micro-architecture N1/V1. In the ideal scenario, +performance can reach up to 2.7X + +When either of above two conditions is not met, e.g. single block input +or CFB/OFB mode, CBC encryption, performance could drop about 50%. 
+ +The assembly code has been reviewed internally by ARM engineer +Fangming.Fang@arm.com + +Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17951) +--- + crypto/evp/e_sm4.c | 24 + + crypto/sm4/asm/vpsm4-armv8.pl | 1118 +++++++++++++++++ + crypto/sm4/build.info | 6 +- + include/crypto/sm4_platform.h | 29 + + .../ciphers/cipher_sm4_gcm_hw.c | 7 + + .../implementations/ciphers/cipher_sm4_hw.c | 24 + + 6 files changed, 1206 insertions(+), 2 deletions(-) + create mode 100755 crypto/sm4/asm/vpsm4-armv8.pl + +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index bff79ff197..c8e8cfe9c9 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -76,6 +76,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + # endif + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_decrypt; ++ dat->stream.cbc = NULL; ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; ++ else if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; ++ } else + #endif + { + dat->block = (block128_f) ossl_sm4_decrypt; +@@ -104,6 +115,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + # endif + (void)0; /* terminate potentially open 'else' */ + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_encrypt; ++ dat->stream.cbc = NULL; ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; ++ else if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; ++ else if (mode == EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; ++ } else + #endif + { + dat->block = (block128_f) ossl_sm4_encrypt; +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +new file mode 100755 +index 0000000000..095d9dae64 +--- /dev/null ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -0,0 +1,1118 @@ ++#! /usr/bin/env perl ++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements SM4 with ASIMD on aarch64 ++# ++# Feb 2022 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="vpsm4"; ++my @vtmp=map("v$_",(0..3)); ++my @data=map("v$_",(4..7)); ++my @datax=map("v$_",(8..11)); ++my ($rk0,$rk1)=("v12","v13"); ++my ($rka,$rkb)=("v14","v15"); ++my @vtmpx=map("v$_",(12..15)); ++my @sbox=map("v$_",(16..31)); ++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); ++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($ptr,$counter)=("x10","w11"); ++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); ++ ++sub rev32() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub transpose() { ++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; ++ ++$code.=<<___; ++ zip1 $vt0.4s,$dat0.4s,$dat1.4s ++ zip2 $vt1.4s,$dat0.4s,$dat1.4s ++ zip1 $vt2.4s,$dat2.4s,$dat3.4s ++ zip2 $vt3.4s,$dat2.4s,$dat3.4s ++ zip1 $dat0.2d,$vt0.2d,$vt2.2d ++ zip2 $dat1.2d,$vt0.2d,$vt2.2d ++ zip1 $dat2.2d,$vt1.2d,$vt3.2d ++ zip2 $dat3.2d,$vt1.2d,$vt3.2d ++___ ++} ++ ++# sbox operations for 4-lane of words ++sub sbox() { ++ my $dat = shift; ++ ++$code.=<<___; ++ movi @vtmp[0].16b,#64 ++ movi @vtmp[1].16b,#128 ++ movi @vtmp[2].16b,#192 ++ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b ++ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b ++ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b ++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d ++ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d ++ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d ++ ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ sli @vtmp[0].4s,$dat.4s,2 ++ ushr @vtmp[2].4s,$dat.4s,32-10 ++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b ++ sli @vtmp[2].4s,$dat.4s,10 ++ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b ++ ushr @vtmp[0].4s,$dat.4s,32-18 ++ sli @vtmp[0].4s,$dat.4s,18 ++ ushr @vtmp[2].4s,$dat.4s,32-24 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$dat.4s,24 ++ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b ++___ ++} ++ ++# sbox operation for 8-lane of words ++sub sbox_double() { ++ my $dat = shift; ++ my $datx = shift; ++ ++$code.=<<___; ++ movi @vtmp[3].16b,#64 ++ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b ++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b ++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d ++ add $dat.2d,@vtmp[2].2d,$dat.2d ++ add $dat.2d,@vtmp[1].2d,$dat.2d ++ ++ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b ++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b 
++ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d ++ add $datx.2d,@vtmp[2].2d,$datx.2d ++ add $datx.2d,@vtmp[1].2d,$datx.2d ++ ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ sli @vtmp[0].4s,$dat.4s,2 ++ ushr @vtmp[2].4s,$datx.4s,32-2 ++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b ++ sli @vtmp[2].4s,$datx.4s,2 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-10 ++ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b ++ sli @vtmp[0].4s,$dat.4s,10 ++ ushr @vtmp[2].4s,$datx.4s,32-10 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,10 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-18 ++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b ++ sli @vtmp[0].4s,$dat.4s,18 ++ ushr @vtmp[2].4s,$datx.4s,32-18 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,18 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-24 ++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b ++ sli @vtmp[0].4s,$dat.4s,24 ++ ushr @vtmp[2].4s,$datx.4s,32-24 ++ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,24 ++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b ++___ ++} ++ ++# sbox operation for one single word ++sub sbox_1word () { ++ my $word = shift; ++ ++$code.=<<___; ++ movi @vtmp[1].16b,#64 ++ movi @vtmp[2].16b,#128 ++ movi @vtmp[3].16b,#192 ++ mov @vtmp[0].s[0],$word ++ ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b ++ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b ++ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b ++ ++ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b ++ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b ++ ++ mov $word,@vtmp[0].s[0] ++ mov $wtmp0,@vtmp[1].s[0] ++ mov $wtmp2,@vtmp[2].s[0] ++ add $wtmp0,$word,$wtmp0 ++ mov $word,@vtmp[3].s[0] ++ add $wtmp0,$wtmp0,$wtmp2 ++ add $wtmp0,$wtmp0,$word ++ ++ eor $word,$wtmp0,$wtmp0,ror #32-2 ++ eor $word,$word,$wtmp0,ror #32-10 ++ eor $word,$word,$wtmp0,ror #32-18 ++ eor $word,$word,$wtmp0,ror #32-24 ++___ ++} ++ ++# sm4 for one block of data, in scalar registers word0/word1/word2/word3 ++sub sm4_1blk () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$wtmp0,$word1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word0,$word0,$tmpw ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$word0,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor $word1,$word1,$tmpw ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$wtmp0,$word3 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word2,$word2,$tmpw ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$word2,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word3,$word3,$tmpw ++___ ++} ++ ++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 ++sub sm4_4blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 
++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rk0.16b,@data[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk1.16b ++ ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rk0.16b,@data[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk1.16b ++___ ++} ++ ++# sm4 for 8 lanes of data, in neon registers ++# data0/data1/data2/data3 datax0/datax1/datax2/datax3 ++sub sm4_8blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rkb.16b,@datax[2].16b,@datax[3].16b ++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ eor @datax[0].16b,@datax[0].16b,$rk1.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rkb.16b,$rkb.16b,@datax[0].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk0.16b ++ eor @datax[1].16b,@datax[1].16b,$rk1.16b ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rkb.16b,@datax[0].16b,@datax[1].16b ++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ eor @datax[2].16b,@datax[2].16b,$rk1.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rkb.16b,$rkb.16b,@datax[2].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk0.16b ++ eor @datax[3].16b,@datax[3].16b,$rk1.16b ++___ ++} ++ ++sub encrypt_1blk_norev() { ++ my $dat = shift; ++ ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++ mov $word0,$dat.s[0] ++ mov $word1,$dat.s[1] ++ mov $word2,$dat.s[2] ++ mov $word3,$dat.s[3] ++10: ++___ ++ &sm4_1blk($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++ mov $dat.s[0],$word3 ++ mov $dat.s[1],$word2 ++ mov $dat.s[2],$word1 ++ mov $dat.s[3],$word0 ++___ ++} ++ ++sub encrypt_1blk() { ++ my $dat = shift; ++ ++ &encrypt_1blk_norev($dat); ++ &rev32($dat,$dat); ++} ++ ++sub encrypt_4blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_4blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ 
&rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++} ++ ++sub encrypt_8blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_8blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++ &rev32(@data[3],@datax[0]); ++ &rev32(@data[2],@datax[1]); ++ &rev32(@data[1],@datax[2]); ++ &rev32(@data[0],@datax[3]); ++} ++ ++sub load_sbox () { ++ my $data = shift; ++ ++$code.=<<___; ++ adr $ptr,.Lsbox ++ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64 ++ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64 ++ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64 ++ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr] ++___ ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a ++.text ++ ++.type _vpsm4_consts,%object ++.align 7 ++_vpsm4_consts: ++.Lsbox: ++ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05 ++ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99 ++ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62 ++ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6 ++ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8 ++ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35 ++ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87 ++ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E ++ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1 ++ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3 ++ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F ++ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51 ++ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8 ++ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0 ++ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84 ++ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48 ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 ++.Lshuffles: ++ .dword 0x0B0A090807060504,0x030201000F0E0D0C ++ ++.size _vpsm4_consts,.-_vpsm4_consts ++___ ++ ++{{{ ++my ($key,$keys,$enc)=("x0","x1","w2"); ++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); ++my ($vkey,$vfk,$vmap)=("v5","v6","v7"); ++$code.=<<___; ++.type _vpsm4_set_key,%function ++.align 4 ++_vpsm4_set_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$vkey.4s},[$key] ++___ ++ &load_sbox(); ++ &rev32($vkey,$vkey); ++$code.=<<___; ++ adr $pointer,.Lshuffles ++ ld1 {$vmap.4s},[$pointer] ++ adr $pointer,.Lfk ++ ld1 {$vfk.4s},[$pointer] ++ eor $vkey.16b,$vkey.16b,$vfk.16b ++ mov $schedules,#32 ++ 
adr $pointer,.Lck ++ movi @vtmp[0].16b,#64 ++ cbnz $enc,1f ++ add $keys,$keys,124 ++1: ++ mov $wtmp,$vkey.s[1] ++ ldr $roundkey,[$pointer],#4 ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[2] ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[3] ++ eor $roundkey,$roundkey,$wtmp ++ // sbox lookup ++ mov @data[0].s[0],$roundkey ++ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b ++ mov $wtmp,@vtmp[1].s[0] ++ eor $roundkey,$wtmp,$wtmp,ror #19 ++ eor $roundkey,$roundkey,$wtmp,ror #9 ++ mov $wtmp,$vkey.s[0] ++ eor $roundkey,$roundkey,$wtmp ++ mov $vkey.s[0],$roundkey ++ cbz $enc,2f ++ str $roundkey,[$keys],#4 ++ b 3f ++2: ++ str $roundkey,[$keys],#-4 ++3: ++ tbl $vkey.16b,{$vkey.16b},$vmap.16b ++ subs $schedules,$schedules,#1 ++ b.ne 1b ++ ret ++.size _vpsm4_set_key,.-_vpsm4_set_key ++___ ++}}} ++ ++ ++{{{ ++$code.=<<___; ++.type _vpsm4_enc_4blks,%function ++.align 4 ++_vpsm4_enc_4blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_4blks(); ++$code.=<<___; ++ ret ++.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks ++___ ++}}} ++ ++{{{ ++$code.=<<___; ++.type _vpsm4_enc_8blks,%function ++.align 4 ++_vpsm4_enc_8blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_8blks(); ++$code.=<<___; ++ ret ++.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks ++___ ++}}} ++ ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,1 ++ bl _vpsm4_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,0 ++ bl _vpsm4_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++ my $dir = shift; ++ my ($inp,$outp,$rk)=map("x$_",(0..2)); ++ ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {@data[0].16b},[$inp] ++___ ++ &load_sbox(); ++ &rev32(@data[0],@data[0]); ++$code.=<<___; ++ mov $rks,x2 ++___ ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].16b},[$outp] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++my ($enc) = ("w4"); ++my @dat=map("v$_",(16..23)); ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ AARCH64_SIGN_LINK_REGISTER ++ // convert length into blocks ++ lsr x2,x2,4 ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++___ ++ &load_sbox(); ++$code.=<<___; ++.Lecb_8_blocks_process: ++ cmp $blocks,#8 ++ b.lt .Lecb_4_blocks_process ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++$code.=<<___; ++ bl _vpsm4_enc_8blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lecb_8_blocks_process ++ b 100f ++.Lecb_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].16b},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].16b},[$outp] ++ b 100f ++1: // process last 2 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 ++ cmp $blocks,#2 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] ++ b 100f ++1: // process last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($len,$ivp,$enc)=("x2","x4","w5"); ++my $ivec0=("v3"); ++my $ivec1=("v15"); ++ ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ lsr $len,$len,4 ++___ ++ &load_sbox(); ++$code.=<<___; ++ cbz $enc,.Ldec ++ ld1 {$ivec0.4s},[$ivp] ++.Lcbc_4_blocks_enc: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ eor @data[0].16b,@data[0].16b,$ivec0.16b ++___ ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &encrypt_1blk_norev(@data[0]); ++$code.=<<___; ++ eor @data[1].16b,@data[1].16b,@data[0].16b ++___ ++ &encrypt_1blk_norev(@data[1]); ++ &rev32(@data[0],@data[0]); ++ ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,@data[1].16b ++___ ++ &encrypt_1blk_norev(@data[2]); ++ &rev32(@data[1],@data[1]); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,@data[2].16b ++___ ++ 
&encrypt_1blk_norev(@data[3]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ orr $ivec0.16b,@data[3].16b,@data[3].16b ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lcbc_4_blocks_enc ++ b 2f ++1: ++ subs $blocks,$blocks,#1 ++ b.lt 2f ++ ld1 {@data[0].4s},[$inp],#16 ++ eor $ivec0.16b,$ivec0.16b,@data[0].16b ++___ ++ &rev32($ivec0,$ivec0); ++ &encrypt_1blk($ivec0); ++$code.=<<___; ++ st1 {$ivec0.16b},[$outp],#16 ++ b 1b ++2: ++ // save back IV ++ st1 {$ivec0.16b},[$ivp] ++ ret ++ ++.Ldec: ++ // decryption mode starts ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! ++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++.Lcbc_8_blocks_dec: ++ cmp $blocks,#8 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++ add $ptr,$inp,#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],$datax[3]); ++$code.=<<___; ++ bl _vpsm4_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ ld1 {$ivec1.16b},[$ivp] ++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ // note ivec1 and vtmpx[3] are resuing the same register ++ // care needs to be taken to avoid conflict ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b ++ // save back IV ++ st1 {$vtmpx[3].16b}, [$ivp] ++ eor @data[0].16b,@data[0].16b,$datax[3].16b ++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b ++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b ++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lcbc_8_blocks_dec ++ b.eq 100f ++1: ++ ld1 {$ivec1.16b},[$ivp] ++.Lcbc_4_blocks_dec: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ orr $ivec1.16b,@data[3].16b,@data[3].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.gt .Lcbc_4_blocks_dec ++ // save back IV ++ st1 {@vtmp[3].16b}, [$ivp] ++ b 100f ++1: // last block ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++ // save back IV ++ st1 {$data[0].16b}, [$ivp] ++___ ++ &rev32(@datax[0],@data[0]); ++ &encrypt_1blk(@datax[0]); ++$code.=<<___; ++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b ++ st1 {@datax[0].16b},[$outp],#16 ++ b 100f ++1: // last two blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] ++ add $ptr,$inp,#16 ++ ld4 
{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 ++ subs $blocks,$blocks,1 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save back IV ++ st1 {@data[1].16b}, [$ivp] ++ b 100f ++1: // last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save back IV ++ st1 {@data[2].16b}, [$ivp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($ivp)=("x4"); ++my ($ctr)=("w5"); ++my $ivec=("v3"); ++ ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$ivec.4s},[$ivp] ++___ ++ &rev32($ivec,$ivec); ++ &load_sbox(); ++$code.=<<___; ++ cmp $blocks,#1 ++ b.ne 1f ++ // fast processing for one single block without ++ // context saving overhead ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].16b},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].16b},[$outp] ++ ret ++1: ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++ mov $word0,$ivec.s[0] ++ mov $word1,$ivec.s[1] ++ mov $word2,$ivec.s[2] ++ mov $ctr,$ivec.s[3] ++.Lctr32_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $data[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ cmp $blocks,#8 ++ b.ge .Lctr32_8_blocks_process ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++.Lctr32_8_blocks_process: ++ dup @datax[0].4s,$word0 ++ dup @datax[1].4s,$word1 ++ dup @datax[2].4s,$word2 ++ mov @datax[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $datax[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ bl _vpsm4_enc_8blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ eor @data[0].16b,@data[0].16b,@datax[0].16b ++ eor @data[1].16b,@data[1].16b,@datax[1].16b ++ eor @data[2].16b,@data[2].16b,@datax[2].16b ++ eor @data[3].16b,@data[3].16b,@datax[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++1: // last block processing ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ mov $ivec.s[0],$word0 ++ mov $ivec.s[1],$word1 ++ mov $ivec.s[2],$word2 ++ mov $ivec.s[3],$ctr ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].16b},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].16b},[$outp] ++ b 100f ++1: // last 2 blocks processing ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[1],$ctr ++ subs $blocks,$blocks,#1 ++ b.ne 1f ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ b 100f ++1: // last 3 blocks processing ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor 
@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++######################################## ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index e27aa49e67..75a215ab80 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -1,8 +1,8 @@ + LIBS=../../libcrypto + + IF[{- !$disabled{asm} -}] +- $SM4DEF_aarch64=SM4_ASM +- $SM4ASM_aarch64=sm4-armv8.S ++ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM ++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros +@@ -29,4 +29,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}] + ENDIF + + GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl ++GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl + INCLUDE[sm4-armv8.o]=.. ++INCLUDE[vpsm4-armv8.o]=.. +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 42c8b44a43..11f9b9d88b 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -15,6 +15,16 @@ + # if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) + # include "arm_arch.h" + # if __ARM_MAX_ARCH__>=8 ++extern unsigned int OPENSSL_arm_midr; ++static inline int vpsm4_capable(void) ++{ ++ return (OPENSSL_armcap_P & ARMV8_CPUID) && ++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); ++} ++# if defined(VPSM4_ASM) ++# define VPSM4_CAPABLE vpsm4_capable() ++# endif + # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) + # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key + # define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key +@@ -45,4 +55,23 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + const unsigned char ivec[16]); + # endif /* HWSM4_CAPABLE */ + ++#ifdef VPSM4_CAPABLE ++int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++# endif /* VPSM4_CAPABLE */ ++ ++ + #endif /* OSSL_SM4_PLATFORM_H */ +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +index b9633f83ed..db7fe0fe2f 100644 +--- 
a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +@@ -32,6 +32,13 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + # endif + } else + # endif /* HWSM4_CAPABLE */ ++# ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, ks); ++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt); ++ ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; ++ } else ++# endif /* VPSM4_CAPABLE */ + { + ossl_sm4_set_key(key, ks); + CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); +diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c +index 4cd3d3d669..9a2e99f67c 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_hw.c +@@ -41,6 +41,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + #endif + (void)0; /* terminate potentially open 'else' */ + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_encrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; ++ else if (ctx->mode == EVP_CIPH_CTR_MODE) ++ ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks; ++ } else + #endif + { + ossl_sm4_set_key(key, ks); +@@ -61,6 +74,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; + #endif + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_decrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_decrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; ++ } else + #endif + { + ossl_sm4_set_key(key, ks); +-- +2.37.3.windows.1 + diff --git a/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch new file mode 100644 index 0000000..c68f1a0 --- /dev/null +++ b/Backport-SM4-optimization-for-ARM-by-HW-instruction.patch @@ -0,0 +1,1228 @@ +From 1cd480c10b8bbaa6f72d503494ff2973672ec0e4 Mon Sep 17 00:00:00 2001 +From: Daniel Hu <Daniel.Hu@arm.com> +Date: Tue, 19 Oct 2021 22:49:05 +0100 +Subject: [PATCH 05/13] SM4 optimization for ARM by HW instruction + +This patch implements the SM4 optimization for ARM processor, +using SM4 HW instruction, which is an optional feature of +crypto extension for aarch64 V8. + +Tested on some modern ARM micro-architectures with SM4 support, the +performance uplift can be observed around 8X~40X over existing +C implementation in openssl. 
Algorithms that can be parallelized +(like CTR, ECB, CBC decryption) are on higher end, with algorithm +like CBC encryption on lower end (due to inter-block dependency) + +Perf data on Yitian-710 2.75GHz hardware, before and after optimization: + +Before: + type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes + SM4-CTR 105787.80k 107837.87k 108380.84k 108462.08k 108549.46k 108554.92k + SM4-ECB 111924.58k 118173.76k 119776.00k 120093.70k 120264.02k 120274.94k + SM4-CBC 106428.09k 109190.98k 109674.33k 109774.51k 109827.41k 109827.41k + +After (7.4x - 36.6x faster): + type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes + SM4-CTR 781979.02k 2432994.28k 3437753.86k 3834177.88k 3963715.58k 3974556.33k + SM4-ECB 937590.69k 2941689.02k 3945751.81k 4328655.87k 4459181.40k 4468692.31k + SM4-CBC 890639.88k 1027746.58k 1050621.78k 1056696.66k 1058613.93k 1058701.31k + +Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17455) +--- + crypto/arm64cpuid.pl | 8 + + crypto/arm_arch.h | 1 + + crypto/armcap.c | 10 + + crypto/evp/e_sm4.c | 193 ++++-- + crypto/sm4/asm/sm4-armv8.pl | 635 ++++++++++++++++++ + crypto/sm4/build.info | 32 +- + include/crypto/sm4_platform.h | 48 ++ + .../implementations/ciphers/cipher_sm4.h | 1 + + .../ciphers/cipher_sm4_gcm_hw.c | 20 +- + .../implementations/ciphers/cipher_sm4_hw.c | 57 +- + 10 files changed, 945 insertions(+), 60 deletions(-) + create mode 100755 crypto/sm4/asm/sm4-armv8.pl + create mode 100644 include/crypto/sm4_platform.h + +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 10d267b7ad..36af3e075b 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -80,6 +80,14 @@ _armv8_pmull_probe: + ret + .size _armv8_pmull_probe,.-_armv8_pmull_probe + ++.globl _armv8_sm4_probe ++.type _armv8_sm4_probe,%function ++_armv8_sm4_probe: ++ AARCH64_VALID_CALL_TARGET ++ .long 0xcec08400 // sm4e v0.4s, v0.4s ++ ret ++.size _armv8_sm4_probe,.-_armv8_sm4_probe ++ + .globl _armv8_sha512_probe + .type _armv8_sha512_probe,%function + _armv8_sha512_probe: +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index c8b501f34c..5b5af31d92 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -85,6 +85,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; + # define ARMV8_CPUID (1<<7) + # define ARMV8_RNG (1<<8) + # define ARMV8_SM3 (1<<9) ++# define ARMV8_SM4 (1<<10) + + /* + * MIDR_EL1 system register +diff --git a/crypto/armcap.c b/crypto/armcap.c +index 365a48df45..c5aa062767 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -53,6 +53,7 @@ void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ + void _armv8_sm3_probe(void); ++void _armv8_sm4_probe(void); + void _armv8_sha512_probe(void); + unsigned int _armv8_cpuid_probe(void); + # endif +@@ -139,6 +140,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CPUID (1 << 11) + # define HWCAP_CE_SM3 (1 << 18) ++# define HWCAP_CE_SM4 (1 << 19) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -207,6 +209,9 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + + # ifdef __aarch64__ ++ if (hwcap & HWCAP_CE_SM4) ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ + if (hwcap & HWCAP_CE_SHA512) + OPENSSL_armcap_P |= ARMV8_SHA512; + +@@ -254,6 +259,11 @@ void OPENSSL_cpuid_setup(void) + OPENSSL_armcap_P |= ARMV8_SHA256; + } + # if defined(__aarch64__) 
&& !defined(__APPLE__) ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm4_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM4; ++ } ++ + if (sigsetjmp(ill_jmp, 1) == 0) { + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index abd603015c..bff79ff197 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -17,92 +17,187 @@ + # include <openssl/modes.h> + # include "crypto/sm4.h" + # include "crypto/evp.h" ++# include "crypto/sm4_platform.h" + # include "evp_local.h" + + typedef struct { +- SM4_KEY ks; ++ union { ++ OSSL_UNION_ALIGN; ++ SM4_KEY ks; ++ } ks; ++ block128_f block; ++ union { ++ ecb128_f ecb; ++ cbc128_f cbc; ++ ctr128_f ctr; ++ } stream; + } EVP_SM4_KEY; + ++# define BLOCK_CIPHER_generic(nid,blocksize,ivlen,nmode,mode,MODE,flags) \ ++static const EVP_CIPHER sm4_##mode = { \ ++ nid##_##nmode,blocksize,128/8,ivlen, \ ++ flags|EVP_CIPH_##MODE##_MODE, \ ++ EVP_ORIG_GLOBAL, \ ++ sm4_init_key, \ ++ sm4_##mode##_cipher, \ ++ NULL, \ ++ sizeof(EVP_SM4_KEY), \ ++ NULL,NULL,NULL,NULL }; \ ++const EVP_CIPHER *EVP_sm4_##mode(void) \ ++{ return &sm4_##mode; } ++ ++#define DEFINE_BLOCK_CIPHERS(nid,flags) \ ++ BLOCK_CIPHER_generic(nid,16,16,cbc,cbc,CBC,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,16,0,ecb,ecb,ECB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,ofb128,ofb,OFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,cfb128,cfb,CFB,flags|EVP_CIPH_FLAG_DEFAULT_ASN1) \ ++ BLOCK_CIPHER_generic(nid,1,16,ctr,ctr,CTR,flags) ++ + static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + const unsigned char *iv, int enc) + { +- ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ int mode; ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ mode = EVP_CIPHER_CTX_get_mode(ctx); ++ if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) ++ && !enc) { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_decrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++# endif ++ } else ++#endif ++ { ++ dat->block = (block128_f) ossl_sm4_decrypt; ++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ } ++ } else ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) HWSM4_encrypt; ++ dat->stream.cbc = NULL; ++# ifdef HWSM4_cbc_encrypt ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) HWSM4_cbc_encrypt; ++ else ++# endif ++# ifdef HWSM4_ecb_encrypt ++ if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; ++ else ++# endif ++# ifdef HWSM4_ctr32_encrypt_blocks ++ if (mode == EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; ++ else ++# endif ++ (void)0; /* terminate potentially open 'else' */ ++ } else ++#endif ++ { ++ dat->block = (block128_f) ossl_sm4_encrypt; ++ ossl_sm4_set_key(key, EVP_CIPHER_CTX_get_cipher_data(ctx)); ++ } + return 1; + } + +-static void sm4_cbc_encrypt(const unsigned char *in, unsigned char *out, +- size_t len, const SM4_KEY *key, +- unsigned char *ivec, const int enc) ++static int sm4_cbc_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- if (enc) +- 
CRYPTO_cbc128_encrypt(in, out, len, key, ivec, +- (block128_f)ossl_sm4_encrypt); ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ if (dat->stream.cbc) ++ (*dat->stream.cbc) (in, out, len, &dat->ks.ks, ctx->iv, ++ EVP_CIPHER_CTX_is_encrypting(ctx)); ++ else if (EVP_CIPHER_CTX_is_encrypting(ctx)) ++ CRYPTO_cbc128_encrypt(in, out, len, &dat->ks, ctx->iv, ++ dat->block); + else +- CRYPTO_cbc128_decrypt(in, out, len, key, ivec, +- (block128_f)ossl_sm4_decrypt); ++ CRYPTO_cbc128_decrypt(in, out, len, &dat->ks, ++ ctx->iv, dat->block); ++ return 1; + } + +-static void sm4_cfb128_encrypt(const unsigned char *in, unsigned char *out, +- size_t length, const SM4_KEY *key, +- unsigned char *ivec, int *num, const int enc) ++static int sm4_cfb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- CRYPTO_cfb128_encrypt(in, out, length, key, ivec, num, enc, +- (block128_f)ossl_sm4_encrypt); ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ int num = EVP_CIPHER_CTX_get_num(ctx); ++ ++ CRYPTO_cfb128_encrypt(in, out, len, &dat->ks, ++ ctx->iv, &num, ++ EVP_CIPHER_CTX_is_encrypting(ctx), dat->block); ++ EVP_CIPHER_CTX_set_num(ctx, num); ++ return 1; + } + +-static void sm4_ecb_encrypt(const unsigned char *in, unsigned char *out, +- const SM4_KEY *key, const int enc) ++static int sm4_ecb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- if (enc) +- ossl_sm4_encrypt(in, out, key); ++ size_t bl = EVP_CIPHER_CTX_get_block_size(ctx); ++ size_t i; ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ ++ if (len < bl) ++ return 1; ++ ++ if (dat->stream.ecb != NULL) ++ (*dat->stream.ecb) (in, out, len, &dat->ks.ks, ++ EVP_CIPHER_CTX_is_encrypting(ctx)); + else +- ossl_sm4_decrypt(in, out, key); ++ for (i = 0, len -= bl; i <= len; i += bl) ++ (*dat->block) (in + i, out + i, &dat->ks); ++ ++ return 1; + } + +-static void sm4_ofb128_encrypt(const unsigned char *in, unsigned char *out, +- size_t length, const SM4_KEY *key, +- unsigned char *ivec, int *num) ++static int sm4_ofb_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, ++ const unsigned char *in, size_t len) + { +- CRYPTO_ofb128_encrypt(in, out, length, key, ivec, num, +- (block128_f)ossl_sm4_encrypt); +-} ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); ++ int num = EVP_CIPHER_CTX_get_num(ctx); + +-IMPLEMENT_BLOCK_CIPHER(sm4, ks, sm4, EVP_SM4_KEY, NID_sm4, +- 16, 16, 16, 128, EVP_CIPH_FLAG_DEFAULT_ASN1, +- sm4_init_key, 0, 0, 0, 0) ++ CRYPTO_ofb128_encrypt(in, out, len, &dat->ks, ++ ctx->iv, &num, dat->block); ++ EVP_CIPHER_CTX_set_num(ctx, num); ++ return 1; ++} + + static int sm4_ctr_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out, + const unsigned char *in, size_t len) + { + int n = EVP_CIPHER_CTX_get_num(ctx); + unsigned int num; +- EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY, ctx); ++ EVP_SM4_KEY *dat = EVP_C_DATA(EVP_SM4_KEY,ctx); + + if (n < 0) + return 0; + num = (unsigned int)n; + +- CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ctx->iv, +- EVP_CIPHER_CTX_buf_noconst(ctx), &num, +- (block128_f)ossl_sm4_encrypt); ++ if (dat->stream.ctr) ++ CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), ++ &num, dat->stream.ctr); ++ else ++ CRYPTO_ctr128_encrypt(in, out, len, &dat->ks, ++ ctx->iv, ++ EVP_CIPHER_CTX_buf_noconst(ctx), &num, ++ dat->block); + EVP_CIPHER_CTX_set_num(ctx, num); + return 1; + } + +-static const EVP_CIPHER sm4_ctr_mode = { +- NID_sm4_ctr, 1, 16, 16, +- EVP_CIPH_CTR_MODE, +- EVP_ORIG_GLOBAL, +- 
sm4_init_key, +- sm4_ctr_cipher, +- NULL, +- sizeof(EVP_SM4_KEY), +- NULL, NULL, NULL, NULL +-}; +- +-const EVP_CIPHER *EVP_sm4_ctr(void) +-{ +- return &sm4_ctr_mode; +-} +- ++DEFINE_BLOCK_CIPHERS(NID_sm4, 0) + #endif +diff --git a/crypto/sm4/asm/sm4-armv8.pl b/crypto/sm4/asm/sm4-armv8.pl +new file mode 100755 +index 0000000000..7358a6e6a2 +--- /dev/null ++++ b/crypto/sm4/asm/sm4-armv8.pl +@@ -0,0 +1,635 @@ ++#! /usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements support for SM4 hw support on aarch64 ++# Oct 2021 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="sm4_v8"; ++my @rks=map("v$_",(0..7)); ++ ++sub rev32() { ++my $dst = shift; ++my $src = shift; ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#endif ++___ ++} ++ ++sub enc_blk () { ++my $data = shift; ++$code.=<<___; ++ sm4e $data.4s,@rks[0].4s ++ sm4e $data.4s,@rks[1].4s ++ sm4e $data.4s,@rks[2].4s ++ sm4e $data.4s,@rks[3].4s ++ sm4e $data.4s,@rks[4].4s ++ sm4e $data.4s,@rks[5].4s ++ sm4e $data.4s,@rks[6].4s ++ sm4e $data.4s,@rks[7].4s ++ rev64 $data.4S,$data.4S ++ ext $data.16b,$data.16b,$data.16b,#8 ++___ ++} ++ ++sub enc_4blks () { ++my $data0 = shift; ++my $data1 = shift; ++my $data2 = shift; ++my $data3 = shift; ++$code.=<<___; ++ sm4e $data0.4s,@rks[0].4s ++ sm4e $data1.4s,@rks[0].4s ++ sm4e $data2.4s,@rks[0].4s ++ sm4e $data3.4s,@rks[0].4s ++ ++ sm4e $data0.4s,@rks[1].4s ++ sm4e $data1.4s,@rks[1].4s ++ sm4e $data2.4s,@rks[1].4s ++ sm4e $data3.4s,@rks[1].4s ++ ++ sm4e $data0.4s,@rks[2].4s ++ sm4e $data1.4s,@rks[2].4s ++ sm4e $data2.4s,@rks[2].4s ++ sm4e $data3.4s,@rks[2].4s ++ ++ sm4e $data0.4s,@rks[3].4s ++ sm4e $data1.4s,@rks[3].4s ++ sm4e $data2.4s,@rks[3].4s ++ sm4e $data3.4s,@rks[3].4s ++ ++ sm4e $data0.4s,@rks[4].4s ++ sm4e $data1.4s,@rks[4].4s ++ sm4e $data2.4s,@rks[4].4s ++ sm4e $data3.4s,@rks[4].4s ++ ++ sm4e $data0.4s,@rks[5].4s ++ sm4e $data1.4s,@rks[5].4s ++ sm4e $data2.4s,@rks[5].4s ++ sm4e $data3.4s,@rks[5].4s ++ ++ sm4e $data0.4s,@rks[6].4s ++ sm4e $data1.4s,@rks[6].4s ++ sm4e $data2.4s,@rks[6].4s ++ sm4e $data3.4s,@rks[6].4s ++ ++ sm4e $data0.4s,@rks[7].4s ++ rev64 $data0.4S,$data0.4S ++ sm4e $data1.4s,@rks[7].4s ++ ext $data0.16b,$data0.16b,$data0.16b,#8 ++ rev64 $data1.4S,$data1.4S ++ sm4e $data2.4s,@rks[7].4s ++ ext $data1.16b,$data1.16b,$data1.16b,#8 ++ rev64 $data2.4S,$data2.4S ++ sm4e $data3.4s,@rks[7].4s ++ ext $data2.16b,$data2.16b,$data2.16b,#8 ++ rev64 $data3.4S,$data3.4S ++ ext $data3.16b,$data3.16b,$data3.16b,#8 ++___ ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++___ ++ ++{{{ ++$code.=<<___; ++.align 6 ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 
0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .long 0xa3b1bac6, 0x56aa3350, 0x677d9197, 0xb27022dc ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key0,$key1,$key2,$key3,$key4,$key5,$key6,$key7)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp,.Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b,$key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ st1 {$key0.4s,$key1.4s,$key2.4s,$key3.4s},[$keys],64 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ st1 {$key4.4s,$key5.4s,$key6.4s,$key7.4s},[$keys] ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++my ($tmp)=("x2"); ++my ($key7,$key6,$key5,$key4,$key3,$key2,$key1,$key0)=map("v$_",(0..7)); ++my ($const0,$const1,$const2,$const3,$const4,$const5,$const6,$const7)=map("v$_",(16..23)); ++my ($fkconst) = ("v24"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$key0.4s},[$key] ++ adr $tmp,.Lfk ++ ld1 {$fkconst.4s},[$tmp] ++ adr $tmp, .Lck ++ ld1 {$const0.4s,$const1.4s,$const2.4s,$const3.4s},[$tmp],64 ++___ ++ &rev32($key0, $key0); ++$code.=<<___; ++ ld1 {$const4.4s,$const5.4s,$const6.4s,$const7.4s},[$tmp] ++ eor $key0.16b, $key0.16b,$fkconst.16b; ++ sm4ekey $key0.4S,$key0.4S,$const0.4S ++ sm4ekey $key1.4S,$key0.4S,$const1.4S ++ sm4ekey $key2.4S,$key1.4S,$const2.4S ++ rev64 $key0.4s,$key0.4s ++ rev64 $key1.4s,$key1.4s ++ ext $key0.16b,$key0.16b,$key0.16b,#8 ++ ext $key1.16b,$key1.16b,$key1.16b,#8 ++ sm4ekey $key3.4S,$key2.4S,$const3.4S ++ sm4ekey $key4.4S,$key3.4S,$const4.4S ++ rev64 $key2.4s,$key2.4s ++ rev64 $key3.4s,$key3.4s ++ ext $key2.16b,$key2.16b,$key2.16b,#8 ++ ext $key3.16b,$key3.16b,$key3.16b,#8 ++ sm4ekey $key5.4S,$key4.4S,$const5.4S ++ sm4ekey $key6.4S,$key5.4S,$const6.4S ++ rev64 $key4.4s,$key4.4s ++ rev64 $key5.4s,$key5.4s ++ ext $key4.16b,$key4.16b,$key4.16b,#8 ++ ext $key5.16b,$key5.16b,$key5.16b,#8 ++ sm4ekey $key7.4S,$key6.4S,$const7.4S ++ rev64 $key6.4s, $key6.4s ++ rev64 $key7.4s, $key7.4s ++ ext $key6.16b,$key6.16b,$key6.16b,#8 ++ ext $key7.16b,$key7.16b,$key7.16b,#8 ++ st1 {$key7.4s,$key6.4s,$key5.4s,$key4.4s},[$keys],64 ++ st1 {$key3.4s,$key2.4s,$key1.4s,$key0.4s},[$keys] ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++my $dir = shift; ++my ($inp,$out,$rk)=map("x$_",(0..2)); ++my ($data)=("v16"); 
++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$data.4s},[$inp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($data,$data); ++ &enc_blk($data); ++ &rev32($data,$data); ++$code.=<<___; ++ st1 {$data.4s},[$out] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++ ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk)=map("x$_",(0..3)); ++my ($enc) = ("w4"); ++my @dat=map("v$_",(16..23)); ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++1: ++ cmp $len,#64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp],#64 ++ // 8 blocks ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++___ ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#128 ++ b.gt 1b ++ ret ++ // 4 blocks ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 1f ++ ld1 {@dat[0].4s},[$inp],#16 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ st1 {@dat[0].4s},[$out],#16 ++ b.ne 1b ++1: ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($enc) = ("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec) = ("v8"); ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! 
++ ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],#64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++ ld1 {$ivec.4s},[$ivp] ++ cmp $enc,#0 ++ b.eq .Ldec ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp],#64 ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++___ ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &enc_blk(@dat[0]); ++$code.=<<___; ++ eor @dat[1].16b,@dat[1].16b,@dat[0].16b ++___ ++ &enc_blk(@dat[1]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[2].16b,@dat[2].16b,@dat[1].16b ++___ ++ &enc_blk(@dat[2]); ++ &rev32(@dat[1],@dat[1]); ++$code.=<<___; ++ eor @dat[3].16b,@dat[3].16b,@dat[2].16b ++___ ++ &enc_blk(@dat[3]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ mov $ivec.16b,@dat[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.ne 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ eor $ivec.16b,$ivec.16b,@dat[0].16b ++___ ++ &rev32($ivec,$ivec); ++ &enc_blk($ivec); ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ st1 {$ivec.16b},[$out],#16 ++ b.ne 1b ++ b 3f ++.Ldec: ++1: ++ cmp $len, #64 ++ b.lt 1f ++ ld1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$inp] ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ cmp $len,#128 ++ b.lt 2f ++ // 8 blocks mode ++ ld1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$inp] ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],$dat[7]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ mov $ivec.16b,@in[7].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ eor @dat[4].16b,$dat[4].16b,@in[3].16b ++ eor @dat[5].16b,$dat[5].16b,@in[4].16b ++ eor @dat[6].16b,$dat[6].16b,@in[5].16b ++ eor @dat[7].16b,$dat[7].16b,@in[6].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,128 ++ b.gt 1b ++ b 3f ++ // 4 blocks mode ++2: ++___ ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],$dat[3]); ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ eor @dat[1].16b,@dat[1].16b,@in[0].16b ++ mov $ivec.16b,@in[3].16b ++ eor @dat[2].16b,@dat[2].16b,@in[1].16b ++ eor @dat[3].16b,$dat[3].16b,@in[2].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#64 ++ b.gt 1b ++1: ++ subs $len,$len,#16 ++ b.lt 3f ++ ld1 {@dat[0].4s},[$inp],#16 ++ mov @in[0].16b,@dat[0].16b ++___ ++ &rev32(@dat[0],@dat[0]); ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,$ivec.16b ++ mov $ivec.16b,@in[0].16b ++ st1 
{@dat[0].16b},[$out],#16 ++ b.ne 1b ++3: ++ // save back IV ++ st1 {$ivec.16b},[$ivp] ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($inp,$out,$len,$rk,$ivp)=map("x$_",(0..4)); ++my ($ctr)=("w5"); ++my @dat=map("v$_",(16..23)); ++my @in=map("v$_",(24..31)); ++my ($ivec)=("v8"); ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ stp d8,d9,[sp, #-16]! ++ ++ ld1 {$ivec.4s},[$ivp] ++ ld1 {@rks[0].4s,@rks[1].4s,@rks[2].4s,@rks[3].4s},[$rk],64 ++ ld1 {@rks[4].4s,@rks[5].4s,@rks[6].4s,@rks[7].4s},[$rk] ++___ ++ &rev32($ivec,$ivec); ++$code.=<<___; ++ mov $ctr,$ivec.s[3] ++1: ++ cmp $len,#4 ++ b.lt 1f ++ ld1 {@in[0].4s,@in[1].4s,@in[2].4s,@in[3].4s},[$inp],#64 ++ mov @dat[0].16b,$ivec.16b ++ mov @dat[1].16b,$ivec.16b ++ mov @dat[2].16b,$ivec.16b ++ mov @dat[3].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[1].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[2].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[3].s[3],$ctr ++ cmp $len,#8 ++ b.lt 2f ++ ld1 {@in[4].4s,@in[5].4s,@in[6].4s,@in[7].4s},[$inp],#64 ++ mov @dat[4].16b,$ivec.16b ++ mov @dat[5].16b,$ivec.16b ++ mov @dat[6].16b,$ivec.16b ++ mov @dat[7].16b,$ivec.16b ++ add $ctr,$ctr,#1 ++ mov $dat[4].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[5].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[6].s[3],$ctr ++ add $ctr,$ctr,#1 ++ mov @dat[7].s[3],$ctr ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &enc_4blks(@dat[4],@dat[5],@dat[6],@dat[7]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++ &rev32(@dat[4],@dat[4]); ++ &rev32(@dat[5],@dat[5]); ++ &rev32(@dat[6],@dat[6]); ++ &rev32(@dat[7],@dat[7]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ eor @dat[4].16b,@dat[4].16b,@in[4].16b ++ eor @dat[5].16b,@dat[5].16b,@in[5].16b ++ eor @dat[6].16b,@dat[6].16b,@in[6].16b ++ eor @dat[7].16b,@dat[7].16b,@in[7].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ st1 {@dat[4].4s,@dat[5].4s,@dat[6].4s,@dat[7].4s},[$out],#64 ++ subs $len,$len,#8 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++2: ++___ ++ &enc_4blks(@dat[0],@dat[1],@dat[2],@dat[3]); ++ &rev32(@dat[0],@dat[0]); ++ &rev32(@dat[1],@dat[1]); ++ &rev32(@dat[2],@dat[2]); ++ &rev32(@dat[3],@dat[3]); ++$code.=<<___; ++ eor @dat[0].16b,@dat[0].16b,@in[0].16b ++ eor @dat[1].16b,@dat[1].16b,@in[1].16b ++ eor @dat[2].16b,@dat[2].16b,@in[2].16b ++ eor @dat[3].16b,@dat[3].16b,@in[3].16b ++ st1 {@dat[0].4s,@dat[1].4s,@dat[2].4s,@dat[3].4s},[$out],#64 ++ subs $len,$len,#4 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++1: ++ subs $len,$len,#1 ++ b.lt 3f ++ mov $dat[0].16b,$ivec.16b ++ ld1 {@in[0].4s},[$inp],#16 ++___ ++ &enc_blk(@dat[0]); ++ &rev32(@dat[0],@dat[0]); ++$code.=<<___; ++ eor $dat[0].16b,$dat[0].16b,@in[0].16b ++ st1 {$dat[0].4s},[$out],#16 ++ b.eq 3f ++ add $ctr,$ctr,#1 ++ mov $ivec.s[3],$ctr ++ b 1b ++3: ++ ldp d8,d9,[sp],#16 ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++######################################## ++{ my %opcode = ( ++ "sm4e" => 0xcec08400, ++ "sm4ekey" => 0xce60c800); ++ ++ sub unsm4 { ++ my ($mnemonic,$arg)=@_; ++ ++ $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o ++ && ++ 
sprintf ".inst\t0x%08x\t//%s %s", ++ $opcode{$mnemonic}|$1|($2<<5)|($3<<16), ++ $mnemonic,$arg; ++ } ++} ++ ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ ++ s/\b(sm4\w+)\s+([qv].*)/unsm4($1,$2)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index b65a7d149e..e27aa49e67 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -1,4 +1,32 @@ + LIBS=../../libcrypto +-SOURCE[../../libcrypto]=\ +- sm4.c + ++IF[{- !$disabled{asm} -}] ++ $SM4DEF_aarch64=SM4_ASM ++ $SM4ASM_aarch64=sm4-armv8.S ++ ++ # Now that we have defined all the arch specific variables, use the ++ # appropriate one, and define the appropriate macros ++ IF[$SM4ASM_{- $target{asm_arch} -}] ++ $SM4ASM=$SM4ASM_{- $target{asm_arch} -} ++ $SM4DEF=$SM4DEF_{- $target{asm_arch} -} ++ ENDIF ++ENDIF ++ ++SOURCE[../../libcrypto]= $SM4ASM sm4.c ++ ++ ++# Implementations are now spread across several libraries, so the defines ++# need to be applied to all affected libraries and modules. ++DEFINE[../../libcrypto]=$SM4DEF ++DEFINE[../../providers/libfips.a]=$SM4DEF ++DEFINE[../../providers/libdefault.a]=$SM4DEF ++# We only need to include the SM4DEF stuff in the legacy provider when it's a ++# separate module and it's dynamically linked with libcrypto. Otherwise, it ++# already gets everything that the static libcrypto.a has, and doesn't need it ++# added again. ++IF[{- !$disabled{module} && !$disabled{shared} -}] ++ DEFINE[../providers/liblegacy.a]=$SM4DEF ++ENDIF ++ ++GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl ++INCLUDE[sm4-armv8.o]=.. +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +new file mode 100644 +index 0000000000..42c8b44a43 +--- /dev/null ++++ b/include/crypto/sm4_platform.h +@@ -0,0 +1,48 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#ifndef OSSL_SM4_PLATFORM_H ++# define OSSL_SM4_PLATFORM_H ++# pragma once ++ ++# if defined(OPENSSL_CPUID_OBJ) ++# if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) ++# include "arm_arch.h" ++# if __ARM_MAX_ARCH__>=8 ++# define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) ++# define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key ++# define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key ++# define HWSM4_encrypt sm4_v8_encrypt ++# define HWSM4_decrypt sm4_v8_decrypt ++# define HWSM4_cbc_encrypt sm4_v8_cbc_encrypt ++# define HWSM4_ecb_encrypt sm4_v8_ecb_encrypt ++# define HWSM4_ctr32_encrypt_blocks sm4_v8_ctr32_encrypt_blocks ++# endif ++# endif ++# endif /* OPENSSL_CPUID_OBJ */ ++ ++# if defined(HWSM4_CAPABLE) ++int HWSM4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int HWSM4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void HWSM4_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void HWSM4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void HWSM4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++# endif /* HWSM4_CAPABLE */ ++ ++#endif /* OSSL_SM4_PLATFORM_H */ +diff --git a/providers/implementations/ciphers/cipher_sm4.h b/providers/implementations/ciphers/cipher_sm4.h +index f7f833fcb4..01a031a74d 100644 +--- a/providers/implementations/ciphers/cipher_sm4.h ++++ b/providers/implementations/ciphers/cipher_sm4.h +@@ -9,6 +9,7 @@ + + #include "prov/ciphercommon.h" + #include "crypto/sm4.h" ++#include "crypto/sm4_platform.h" + + typedef struct prov_cast_ctx_st { + PROV_CIPHER_CTX base; /* Must be first */ +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +index 6bcd1ec406..c0c9b22bd3 100644 +--- a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +@@ -12,6 +12,7 @@ + */ + + #include "cipher_sm4_gcm.h" ++#include "crypto/sm4_platform.h" + + static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + size_t keylen) +@@ -20,9 +21,22 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + SM4_KEY *ks = &actx->ks.ks; + + ctx->ks = ks; +- ossl_sm4_set_key(key, ks); +- CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); +- ctx->ctr = (ctr128_f)NULL; ++# ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_encrypt_key(key, ks); ++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) HWSM4_encrypt); ++# ifdef HWSM4_ctr32_encrypt_blocks ++ ctx->ctr = (ctr128_f) HWSM4_ctr32_encrypt_blocks; ++# else /* HWSM4_ctr32_encrypt_blocks */ ++ ctx->ctr = (ctr128_f)NULL; ++# endif ++ } else ++# endif /* HWSM4_CAPABLE */ ++ { ++ ossl_sm4_set_key(key, ks); ++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); ++ ctx->ctr = (ctr128_f)NULL; ++ } + ctx->key_set = 1; + + return 1; +diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c +index 0db04b1a74..4cd3d3d669 100644 
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_hw.c +@@ -15,14 +15,59 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + PROV_SM4_CTX *sctx = (PROV_SM4_CTX *)ctx; + SM4_KEY *ks = &sctx->ks.ks; + +- ossl_sm4_set_key(key, ks); + ctx->ks = ks; + if (ctx->enc + || (ctx->mode != EVP_CIPH_ECB_MODE +- && ctx->mode != EVP_CIPH_CBC_MODE)) +- ctx->block = (block128_f)ossl_sm4_encrypt; +- else +- ctx->block = (block128_f)ossl_sm4_decrypt; ++ && ctx->mode != EVP_CIPH_CBC_MODE)) { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_encrypt_key(key, ks); ++ ctx->block = (block128_f)HWSM4_encrypt; ++ ctx->stream.cbc = NULL; ++#ifdef HWSM4_cbc_encrypt ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt; ++ else ++#endif ++#ifdef HWSM4_ecb_encrypt ++ if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; ++ else ++#endif ++#ifdef HWSM4_ctr32_encrypt_blocks ++ if (ctx->mode == EVP_CIPH_CTR_MODE) ++ ctx->stream.ctr = (ctr128_f)HWSM4_ctr32_encrypt_blocks; ++ else ++#endif ++ (void)0; /* terminate potentially open 'else' */ ++ } else ++#endif ++ { ++ ossl_sm4_set_key(key, ks); ++ ctx->block = (block128_f)ossl_sm4_encrypt; ++ } ++ } else { ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ HWSM4_set_decrypt_key(key, ks); ++ ctx->block = (block128_f)HWSM4_decrypt; ++ ctx->stream.cbc = NULL; ++#ifdef HWSM4_cbc_encrypt ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)HWSM4_cbc_encrypt; ++#endif ++#ifdef HWSM4_ecb_encrypt ++ if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; ++#endif ++ } else ++#endif ++ { ++ ossl_sm4_set_key(key, ks); ++ ctx->block = (block128_f)ossl_sm4_decrypt; ++ } ++ } ++ + return 1; + } + +@@ -31,7 +76,7 @@ IMPLEMENT_CIPHER_HW_COPYCTX(cipher_hw_sm4_copyctx, PROV_SM4_CTX) + # define PROV_CIPHER_HW_sm4_mode(mode) \ + static const PROV_CIPHER_HW sm4_##mode = { \ + cipher_hw_sm4_initkey, \ +- ossl_cipher_hw_chunked_##mode, \ ++ ossl_cipher_hw_generic_##mode, \ + cipher_hw_sm4_copyctx \ + }; \ + const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_##mode(size_t keybits) \ +-- +2.37.3.windows.1 + diff --git a/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch new file mode 100644 index 0000000..31852cb --- /dev/null +++ b/Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch @@ -0,0 +1,1521 @@ +From 44b6e3d07ae5b09255710986e61035c862ec68aa Mon Sep 17 00:00:00 2001 +From: Russ Butler <russ.butler@arm.com> +Date: Sat, 28 Aug 2021 13:57:09 -0500 +Subject: [PATCH 01/13] aarch64: support BTI and pointer authentication in + assembly + +This change adds optional support for +- Armv8.3-A Pointer Authentication (PAuth) and +- Armv8.5-A Branch Target Identification (BTI) +features to the perl scripts. + +Both features can be enabled with additional compiler flags. +Unless any of these are enabled explicitly there is no code change at +all. + +The extensions are briefly described below. Please read the appropriate +chapters of the Arm Architecture Reference Manual for the complete +specification. + +Scope +----- + +This change only affects generated assembly code. + +Armv8.3-A Pointer Authentication +-------------------------------- + +Pointer Authentication extension supports the authentication of the +contents of registers before they are used for indirect branching +or load. 
+ +PAuth provides a probabilistic method to detect corruption of register +values. PAuth signing instructions generate a Pointer Authentication +Code (PAC) based on the value of a register, a seed and a key. +The generated PAC is inserted into the original value in the register. +A PAuth authentication instruction recomputes the PAC, and if it matches +the PAC in the register, restores its original value. In case of a +mismatch, an architecturally unmapped address is generated instead. + +With PAuth, mitigation against ROP (Return-oriented Programming) attacks +can be implemented. This is achieved by signing the contents of the +link-register (LR) before it is pushed to stack. Once LR is popped, +it is authenticated. This way a stack corruption which overwrites the +LR on the stack is detectable. + +The PAuth extension adds several new instructions, some of which are not +recognized by older hardware. To support a single codebase for both pre +Armv8.3-A targets and newer ones, only NOP-space instructions are added +by this patch. These instructions are treated as NOPs on hardware +which does not support Armv8.3-A. Furthermore, this patch only considers +cases where LR is saved to the stack and then restored before branching +to its content. There are cases in the code where LR is pushed to stack +but it is not used later. We do not address these cases as they are not +affected by PAuth. + +There are two keys available to sign an instruction address: A and B. +PACIASP and PACIBSP only differ in the used keys: A and B, respectively. +The keys are typically managed by the operating system. + +To enable generating code for PAuth compile with +-mbranch-protection=<mode>: + +- standard or pac-ret: add PACIASP and AUTIASP, also enables BTI + (read below) +- pac-ret+b-key: add PACIBSP and AUTIBSP + +Armv8.5-A Branch Target Identification +-------------------------------------- + +Branch Target Identification features some new instructions which +protect the execution of instructions on guarded pages which are not +intended branch targets. + +If Armv8.5-A is supported by the hardware, execution of an instruction +changes the value of PSTATE.BTYPE field. If an indirect branch +lands on a guarded page the target instruction must be one of the +BTI <jc> flavors, or in case of a direct call or jump it can be any +other instruction. If the target instruction is not compatible with the +value of PSTATE.BTYPE a Branch Target Exception is generated. + +In short, indirect jumps are compatible with BTI <j> and <jc> while +indirect calls are compatible with BTI <c> and <jc>. Please refer to the +specification for the details. + +Armv8.3-A PACIASP and PACIBSP are implicit branch target +identification instructions which are equivalent with BTI c or BTI jc +depending on system register configuration. + +BTI is used to mitigate JOP (Jump-oriented Programming) attacks by +limiting the set of instructions which can be jumped to. + +BTI requires active linker support to mark the pages with BTI-enabled +code as guarded. For ELF64 files BTI compatibility is recorded in the +.note.gnu.property section. For a shared object or static binary it is +required that all linked units support BTI. This means that even a +single assembly file without the required note section turns-off BTI +for the whole binary or shared object. + +The new BTI instructions are treated as NOPs on hardware which does +not support Armv8.5-A or on pages which are not guarded. 
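+
+A minimal sketch of the resulting pattern (assumed function names, not taken
+from any of the patched files) using the macros this patch adds to
+crypto/arm_arch.h; the hint encodings quoted in the comments are the ones
+defined there (PACIASP = hint #25, AUTIASP = hint #29, BTI 'c' = hint #34):
+
+    example_fn:                            // spills LR, so it gets PAuth protection
+        AARCH64_SIGN_LINK_REGISTER         // PACIASP: sign x30 against SP; NOP-space,
+                                           // so harmless on pre-Armv8.3-A cores
+        stp     x29, x30, [sp, #-16]!      // push frame pointer and the signed LR
+        add     x29, sp, #0
+        bl      example_leaf               // body may clobber x30 via calls
+        ldp     x29, x30, [sp], #16        // pop frame pointer and the signed LR
+        AARCH64_VALIDATE_LINK_REGISTER     // AUTIASP: a tampered LR authenticates to an
+                                           // unmapped address, so the ret below faults
+        ret
+
+    example_leaf:                          // never spills x30, so only a landing pad is needed
+        AARCH64_VALID_CALL_TARGET          // BTI 'c': valid target for indirect calls
+                                           // on guarded pages
+        mov     x0, #0
+        ret
+
+With -mbranch-protection=standard all three macros expand as in the comments;
+when neither feature is enabled they expand to nothing and the generated
+assembly is unchanged.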
+ +To insert this new and optional instruction compile with +-mbranch-protection=standard (also enables PAuth) or +bti. + +When targeting a guarded page from a non-guarded page, weaker +compatibility restrictions apply to maintain compatibility between +legacy and new code. For detailed rules please refer to the Arm ARM. + +Compiler support +---------------- + +Compiler support requires understanding '-mbranch-protection=<mode>' +and emitting the appropriate feature macros (__ARM_FEATURE_BTI_DEFAULT +and __ARM_FEATURE_PAC_DEFAULT). The current state is the following: + +------------------------------------------------------- +| Compiler | -mbranch-protection | Feature macros | ++----------+---------------------+--------------------+ +| clang | 9.0.0 | 11.0.0 | ++----------+---------------------+--------------------+ +| gcc | 9 | expected in 10.1+ | +------------------------------------------------------- + +Available Platforms +------------------ + +Arm Fast Model and QEMU support both extensions. + +https://developer.arm.com/tools-and-software/simulation-models/fast-models +https://www.qemu.org/ + +Implementation Notes +-------------------- + +This change adds BTI landing pads even to assembly functions which are +likely to be directly called only. In these cases, landing pads might +be superfluous depending on what code the linker generates. +Code size and performance impact for these cases would be negligible. + +Interaction with C code +----------------------- + +Pointer Authentication is a per-frame protection while Branch Target +Identification can be turned on and off only for all code pages of a +whole shared object or static binary. Because of these properties if +C/C++ code is compiled without any of the above features but assembly +files support any of them unconditionally there is no incompatibility +between the two. + +Useful Links +------------ + +To fully understand the details of both PAuth and BTI it is advised to +read the related chapters of the Arm Architecture Reference Manual +(Arm ARM): +https://developer.arm.com/documentation/ddi0487/latest/ + +Additional materials: + +"Providing protection for complex software" +https://developer.arm.com/architectures/learn-the-architecture/providing-protection-for-complex-software + +Arm Compiler Reference Guide Version 6.14: -mbranch-protection +https://developer.arm.com/documentation/101754/0614/armclang-Reference/armclang-Command-line-Options/-mbranch-protection?lang=en + +Arm C Language Extensions (ACLE) +https://developer.arm.com/docs/101028/latest + +Addional Notes +-------------- + +This patch is a copy of the work done by Tamas Petz in boringssl. 
It +contains the changes from the following commits: + +aarch64: support BTI and pointer authentication in assembly + Change-Id: I4335f92e2ccc8e209c7d68a0a79f1acdf3aeb791 + URL: https://boringssl-review.googlesource.com/c/boringssl/+/42084 +aarch64: Improve conditional compilation + Change-Id: I14902a64e5f403c2b6a117bc9f5fb1a4f4611ebf + URL: https://boringssl-review.googlesource.com/c/boringssl/+/43524 +aarch64: Fix name of gnu property note section + Change-Id: I6c432d1c852129e9c273f6469a8b60e3983671ec + URL: https://boringssl-review.googlesource.com/c/boringssl/+/44024 + +Change-Id: I2d95ebc5e4aeb5610d3b226f9754ee80cf74a9af + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/16674) +--- + crypto/aes/asm/aesv8-armx.pl | 18 +++++++- + crypto/aes/asm/vpaes-armv8.pl | 39 ++++++++-------- + crypto/aes/build.info | 1 + + crypto/arm64cpuid.pl | 10 +++++ + crypto/arm_arch.h | 58 ++++++++++++++++++++++++ + crypto/bn/asm/armv8-mont.pl | 19 +++++--- + crypto/chacha/asm/chacha-armv8.pl | 18 ++++---- + crypto/ec/asm/ecp_nistz256-armv8.pl | 64 ++++++++++++++++----------- + crypto/modes/asm/aes-gcm-armv8_64.pl | 6 +++ + crypto/modes/asm/ghashv8-armx.pl | 11 +++++ + crypto/poly1305/asm/poly1305-armv8.pl | 17 ++++++- + crypto/sha/asm/keccak1600-armv8.pl | 30 +++++++------ + crypto/sha/asm/sha1-armv8.pl | 5 ++- + crypto/sha/asm/sha512-armv8.pl | 11 +++-- + crypto/sha/build.info | 1 + + 15 files changed, 228 insertions(+), 80 deletions(-) + +diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl +index 6a7bf05d1b..ed5ae4207c 100755 +--- a/crypto/aes/asm/aesv8-armx.pl ++++ b/crypto/aes/asm/aesv8-armx.pl +@@ -120,6 +120,8 @@ ${prefix}_set_encrypt_key: + .Lenc_key: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ___ +@@ -295,7 +297,7 @@ $code.=<<___; + ${prefix}_set_decrypt_key: + ___ + $code.=<<___ if ($flavour =~ /64/); +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ___ +@@ -339,7 +341,7 @@ $code.=<<___ if ($flavour !~ /64/); + ___ + $code.=<<___ if ($flavour =~ /64/); + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + ___ + $code.=<<___; +@@ -359,6 +361,11 @@ $code.=<<___; + .type ${prefix}_${dir}crypt,%function + .align 5 + ${prefix}_${dir}crypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++___ ++$code.=<<___; + ldr $rounds,[$key,#240] + vld1.32 {$rndkey0},[$key],#16 + vld1.8 {$inout},[$inp] +@@ -442,6 +449,7 @@ $code.=<<___; + ${prefix}_ecb_encrypt: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET + subs $len,$len,#16 + // Original input data size bigger than 16, jump to big size processing. + b.ne .Lecb_big_size +@@ -1236,6 +1244,8 @@ $code.=<<___; + ${prefix}_cbc_encrypt: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ___ +@@ -1764,6 +1774,8 @@ $code.=<<___; + ${prefix}_ctr32_encrypt_blocks: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + ___ +@@ -2256,6 +2268,7 @@ $code.=<<___ if ($flavour =~ /64/); + ${prefix}_xts_encrypt: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET + cmp $len,#16 + // Original input data size bigger than 16, jump to big size processing. + b.ne .Lxts_enc_big_size +@@ -2930,6 +2943,7 @@ $code.=<<___ if ($flavour =~ /64/); + .type ${prefix}_xts_decrypt,%function + .align 5 + ${prefix}_xts_decrypt: ++ AARCH64_VALID_CALL_TARGET + ___ + $code.=<<___ if ($flavour =~ /64/); + cmp $len,#16 +diff --git a/crypto/aes/asm/vpaes-armv8.pl b/crypto/aes/asm/vpaes-armv8.pl +index dcd5065e68..49988e9c2b 100755 +--- a/crypto/aes/asm/vpaes-armv8.pl ++++ b/crypto/aes/asm/vpaes-armv8.pl +@@ -53,6 +53,8 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + *STDOUT=*OUT; + + $code.=<<___; ++#include "arm_arch.h" ++ + .text + + .type _vpaes_consts,%object +@@ -259,7 +261,7 @@ _vpaes_encrypt_core: + .type vpaes_encrypt,%function + .align 4 + vpaes_encrypt: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -269,7 +271,7 @@ vpaes_encrypt: + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_encrypt,.-vpaes_encrypt + +@@ -492,7 +494,7 @@ _vpaes_decrypt_core: + .type vpaes_decrypt,%function + .align 4 + vpaes_decrypt: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -502,7 +504,7 @@ vpaes_decrypt: + st1 {v0.16b}, [$out] + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_decrypt,.-vpaes_decrypt + +@@ -673,7 +675,7 @@ _vpaes_key_preheat: + .type _vpaes_schedule_core,%function + .align 4 + _vpaes_schedule_core: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + +@@ -838,7 +840,7 @@ _vpaes_schedule_core: + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ -1051,7 +1053,7 @@ _vpaes_schedule_mangle: + .type vpaes_set_encrypt_key,%function + .align 4 + vpaes_set_encrypt_key: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so +@@ -1067,7 +1069,7 @@ vpaes_set_encrypt_key: + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +@@ -1075,7 +1077,7 @@ vpaes_set_encrypt_key: + .type vpaes_set_decrypt_key,%function + .align 4 + vpaes_set_decrypt_key: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so +@@ -1095,7 +1097,7 @@ vpaes_set_decrypt_key: + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + ___ +@@ -1108,11 +1110,11 @@ $code.=<<___; + .type vpaes_cbc_encrypt,%function + .align 4 + vpaes_cbc_encrypt: ++ AARCH64_SIGN_LINK_REGISTER + cbz $len, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + +- .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + +@@ -1135,15 +1137,16 @@ vpaes_cbc_encrypt: + st1 {v0.16b}, [$ivec] // write ivec + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp + .Lcbc_abort: ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + + .type vpaes_cbc_decrypt,%function + .align 4 + vpaes_cbc_decrypt: +- .inst 0xd503233f // paciasp ++ // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to ++ // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so +@@ -1185,7 +1188,7 @@ vpaes_cbc_decrypt: + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt + ___ +@@ -1195,7 +1198,7 @@ $code.=<<___; + .type vpaes_ecb_encrypt,%function + .align 4 + vpaes_ecb_encrypt: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so +@@ -1229,7 +1232,7 @@ vpaes_ecb_encrypt: + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_ecb_encrypt,.-vpaes_ecb_encrypt + +@@ -1237,7 +1240,7 @@ vpaes_ecb_encrypt: + .type vpaes_ecb_decrypt,%function + .align 4 + vpaes_ecb_decrypt: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so +@@ -1271,7 +1274,7 @@ vpaes_ecb_decrypt: + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size vpaes_ecb_decrypt,.-vpaes_ecb_decrypt + ___ +diff --git a/crypto/aes/build.info b/crypto/aes/build.info +index b250903fa6..47f99fdf33 100644 +--- a/crypto/aes/build.info ++++ b/crypto/aes/build.info +@@ -116,6 +116,7 @@ INCLUDE[aes-mips.o]=.. + GENERATE[aesv8-armx.S]=asm/aesv8-armx.pl + INCLUDE[aesv8-armx.o]=.. + GENERATE[vpaes-armv8.S]=asm/vpaes-armv8.pl ++INCLUDE[vpaes-armv8.o]=.. + + GENERATE[aes-armv4.S]=asm/aes-armv4.pl + INCLUDE[aes-armv4.o]=.. 
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index ac76dd449f..11f0e50279 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -31,6 +31,7 @@ $code.=<<___; + .globl _armv7_neon_probe + .type _armv7_neon_probe,%function + _armv7_neon_probe: ++ AARCH64_VALID_CALL_TARGET + orr v15.16b, v15.16b, v15.16b + ret + .size _armv7_neon_probe,.-_armv7_neon_probe +@@ -38,6 +39,7 @@ _armv7_neon_probe: + .globl _armv7_tick + .type _armv7_tick,%function + _armv7_tick: ++ AARCH64_VALID_CALL_TARGET + #ifdef __APPLE__ + mrs x0, CNTPCT_EL0 + #else +@@ -49,6 +51,7 @@ _armv7_tick: + .globl _armv8_aes_probe + .type _armv8_aes_probe,%function + _armv8_aes_probe: ++ AARCH64_VALID_CALL_TARGET + aese v0.16b, v0.16b + ret + .size _armv8_aes_probe,.-_armv8_aes_probe +@@ -56,6 +59,7 @@ _armv8_aes_probe: + .globl _armv8_sha1_probe + .type _armv8_sha1_probe,%function + _armv8_sha1_probe: ++ AARCH64_VALID_CALL_TARGET + sha1h s0, s0 + ret + .size _armv8_sha1_probe,.-_armv8_sha1_probe +@@ -63,6 +67,7 @@ _armv8_sha1_probe: + .globl _armv8_sha256_probe + .type _armv8_sha256_probe,%function + _armv8_sha256_probe: ++ AARCH64_VALID_CALL_TARGET + sha256su0 v0.4s, v0.4s + ret + .size _armv8_sha256_probe,.-_armv8_sha256_probe +@@ -70,6 +75,7 @@ _armv8_sha256_probe: + .globl _armv8_pmull_probe + .type _armv8_pmull_probe,%function + _armv8_pmull_probe: ++ AARCH64_VALID_CALL_TARGET + pmull v0.1q, v0.1d, v0.1d + ret + .size _armv8_pmull_probe,.-_armv8_pmull_probe +@@ -77,6 +83,7 @@ _armv8_pmull_probe: + .globl _armv8_sha512_probe + .type _armv8_sha512_probe,%function + _armv8_sha512_probe: ++ AARCH64_VALID_CALL_TARGET + .long 0xcec08000 // sha512su0 v0.2d,v0.2d + ret + .size _armv8_sha512_probe,.-_armv8_sha512_probe +@@ -84,6 +91,7 @@ _armv8_sha512_probe: + .globl _armv8_cpuid_probe + .type _armv8_cpuid_probe,%function + _armv8_cpuid_probe: ++ AARCH64_VALID_CALL_TARGET + mrs x0, midr_el1 + ret + .size _armv8_cpuid_probe,.-_armv8_cpuid_probe +@@ -92,6 +100,7 @@ _armv8_cpuid_probe: + .type OPENSSL_cleanse,%function + .align 5 + OPENSSL_cleanse: ++ AARCH64_VALID_CALL_TARGET + cbz x1,.Lret // len==0? + cmp x1,#15 + b.hi .Lot // len>15 +@@ -123,6 +132,7 @@ OPENSSL_cleanse: + .type CRYPTO_memcmp,%function + .align 4 + CRYPTO_memcmp: ++ AARCH64_VALID_CALL_TARGET + eor w3,w3,w3 + cbz x2,.Lno_data // len==0? + cmp x2,#16 +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index 45d7e15564..a815a5c72b 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -126,4 +126,62 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; + + # define MIDR_IS_CPU_MODEL(midr, imp, partnum) \ + (((midr) & MIDR_CPU_MODEL_MASK) == MIDR_CPU_MODEL(imp, partnum)) ++ ++#if defined(__ASSEMBLER__) ++ ++ /* ++ * Support macros for ++ * - Armv8.3-A Pointer Authentication and ++ * - Armv8.5-A Branch Target Identification ++ * features which require emitting a .note.gnu.property section with the ++ * appropriate architecture-dependent feature bits set. 
++ * Read more: "ELF for the Arm® 64-bit Architecture" ++ */ ++ ++# if defined(__ARM_FEATURE_BTI_DEFAULT) && __ARM_FEATURE_BTI_DEFAULT == 1 ++# define GNU_PROPERTY_AARCH64_BTI (1 << 0) /* Has Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET hint #34 /* BTI 'c' */ ++# else ++# define GNU_PROPERTY_AARCH64_BTI 0 /* No Branch Target Identification */ ++# define AARCH64_VALID_CALL_TARGET ++# endif ++ ++# if defined(__ARM_FEATURE_PAC_DEFAULT) && \ ++ (__ARM_FEATURE_PAC_DEFAULT & 1) == 1 /* Signed with A-key */ ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ ++ (1 << 1) /* Has Pointer Authentication */ ++# define AARCH64_SIGN_LINK_REGISTER hint #25 /* PACIASP */ ++# define AARCH64_VALIDATE_LINK_REGISTER hint #29 /* AUTIASP */ ++# elif defined(__ARM_FEATURE_PAC_DEFAULT) && \ ++ (__ARM_FEATURE_PAC_DEFAULT & 2) == 2 /* Signed with B-key */ ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH \ ++ (1 << 1) /* Has Pointer Authentication */ ++# define AARCH64_SIGN_LINK_REGISTER hint #27 /* PACIBSP */ ++# define AARCH64_VALIDATE_LINK_REGISTER hint #31 /* AUTIBSP */ ++# else ++# define GNU_PROPERTY_AARCH64_POINTER_AUTH 0 /* No Pointer Authentication */ ++# if GNU_PROPERTY_AARCH64_BTI != 0 ++# define AARCH64_SIGN_LINK_REGISTER AARCH64_VALID_CALL_TARGET ++# else ++# define AARCH64_SIGN_LINK_REGISTER ++# endif ++# define AARCH64_VALIDATE_LINK_REGISTER ++# endif ++ ++# if GNU_PROPERTY_AARCH64_POINTER_AUTH != 0 || GNU_PROPERTY_AARCH64_BTI != 0 ++ .pushsection .note.gnu.property, "a"; ++ .balign 8; ++ .long 4; ++ .long 0x10; ++ .long 0x5; ++ .asciz "GNU"; ++ .long 0xc0000000; /* GNU_PROPERTY_AARCH64_FEATURE_1_AND */ ++ .long 4; ++ .long (GNU_PROPERTY_AARCH64_POINTER_AUTH | GNU_PROPERTY_AARCH64_BTI); ++ .long 0; ++ .popsection; ++# endif ++ ++# endif /* defined __ASSEMBLER__ */ ++ + #endif +diff --git a/crypto/bn/asm/armv8-mont.pl b/crypto/bn/asm/armv8-mont.pl +index 54d2e8245f..21ab12bdf0 100755 +--- a/crypto/bn/asm/armv8-mont.pl ++++ b/crypto/bn/asm/armv8-mont.pl +@@ -67,8 +67,8 @@ $n0="x4"; # const BN_ULONG *n0, + $num="x5"; # int num); + + $code.=<<___; ++#include "arm_arch.h" + #ifndef __KERNEL__ +-# include "arm_arch.h" + .extern OPENSSL_armv8_rsa_neonized + .hidden OPENSSL_armv8_rsa_neonized + #endif +@@ -78,6 +78,7 @@ $code.=<<___; + .type bn_mul_mont,%function + .align 5 + bn_mul_mont: ++ AARCH64_SIGN_LINK_REGISTER + .Lbn_mul_mont: + tst $num,#3 + b.ne .Lmul_mont +@@ -288,6 +289,7 @@ bn_mul_mont: + mov x0,#1 + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size bn_mul_mont,.-bn_mul_mont + ___ +@@ -309,6 +311,8 @@ $code.=<<___; + .type bn_mul8x_mont_neon,%function + .align 5 + bn_mul8x_mont_neon: ++ // Not adding AARCH64_SIGN_LINK_REGISTER here because bn_mul8x_mont_neon is jumped to ++ // only from bn_mul_mont which has already signed the return address. + stp x29,x30,[sp,#-80]! + mov x16,sp + stp d8,d9,[sp,#16] +@@ -649,6 +653,7 @@ $code.=<<___; + ldp d10,d11,[sp,#32] + ldp d8,d9,[sp,#16] + ldr x29,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER + ret // bx lr + + .size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +@@ -671,7 +676,8 @@ __bn_sqr8x_mont: + cmp $ap,$bp + b.ne __bn_mul4x_mont + .Lsqr8x_mont: +- .inst 0xd503233f // paciasp ++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to ++ // only from bn_mul_mont which has already signed the return address. + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1425,7 +1431,8 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +- .inst 0xd50323bf // autiasp ++ // x30 is loaded earlier ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size __bn_sqr8x_mont,.-__bn_sqr8x_mont + ___ +@@ -1449,7 +1456,8 @@ $code.=<<___; + .type __bn_mul4x_mont,%function + .align 5 + __bn_mul4x_mont: +- .inst 0xd503233f // paciasp ++ // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to ++ // only from bn_mul_mont (or __bn_sqr8x_mont from bn_mul_mont) which has already signed the return address. + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1883,7 +1891,8 @@ __bn_mul4x_mont: + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 +- .inst 0xd50323bf // autiasp ++ // x30 loaded earlier ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size __bn_mul4x_mont,.-__bn_mul4x_mont + ___ +diff --git a/crypto/chacha/asm/chacha-armv8.pl b/crypto/chacha/asm/chacha-armv8.pl +index dcdc4a04e3..e1a8b81594 100755 +--- a/crypto/chacha/asm/chacha-armv8.pl ++++ b/crypto/chacha/asm/chacha-armv8.pl +@@ -132,8 +132,8 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); + } + + $code.=<<___; ++#include "arm_arch.h" + #ifndef __KERNEL__ +-# include "arm_arch.h" + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P + #endif +@@ -153,6 +153,7 @@ $code.=<<___; + .type ChaCha20_ctr32,%function + .align 5 + ChaCha20_ctr32: ++ AARCH64_SIGN_LINK_REGISTER + cbz $len,.Labort + cmp $len,#192 + b.lo .Lshort +@@ -165,7 +166,6 @@ ChaCha20_ctr32: + #endif + + .Lshort: +- .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + +@@ -285,8 +285,8 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp + .Labort: ++ AARCH64_VALIDATE_LINK_REGISTER + ret + + .align 4 +@@ -342,7 +342,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ChaCha20_ctr32,.-ChaCha20_ctr32 + ___ +@@ -432,8 +432,8 @@ $code.=<<___; + .type ChaCha20_neon,%function + .align 5 + ChaCha20_neon: ++ AARCH64_SIGN_LINK_REGISTER + .LChaCha20_neon: +- .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + +@@ -667,7 +667,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + + .align 4 +@@ -799,7 +799,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ChaCha20_neon,.-ChaCha20_neon + ___ +@@ -844,7 +844,7 @@ $code.=<<___; + .type ChaCha20_512_neon,%function + .align 5 + ChaCha20_512_neon: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + +@@ -1268,7 +1268,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ChaCha20_512_neon,.-ChaCha20_512_neon + ___ +diff --git a/crypto/ec/asm/ecp_nistz256-armv8.pl b/crypto/ec/asm/ecp_nistz256-armv8.pl +index 81ee3947d7..6c5d0e8b3c 100644 +--- a/crypto/ec/asm/ecp_nistz256-armv8.pl ++++ b/crypto/ec/asm/ecp_nistz256-armv8.pl +@@ -122,7 +122,7 @@ $code.=<<___; + .type ecp_nistz256_to_mont,%function + .align 6 + ecp_nistz256_to_mont: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -138,7 +138,7 @@ ecp_nistz256_to_mont: + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont + +@@ -147,7 +147,7 @@ ecp_nistz256_to_mont: + .type ecp_nistz256_from_mont,%function + .align 4 + ecp_nistz256_from_mont: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -163,7 +163,7 @@ ecp_nistz256_from_mont: + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont + +@@ -173,7 +173,7 @@ ecp_nistz256_from_mont: + .type ecp_nistz256_mul_mont,%function + .align 4 + ecp_nistz256_mul_mont: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -188,7 +188,7 @@ ecp_nistz256_mul_mont: + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +@@ -197,7 +197,7 @@ ecp_nistz256_mul_mont: + .type ecp_nistz256_sqr_mont,%function + .align 4 + ecp_nistz256_sqr_mont: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -211,7 +211,7 @@ ecp_nistz256_sqr_mont: + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +@@ -221,7 +221,7 @@ ecp_nistz256_sqr_mont: + .type ecp_nistz256_add,%function + .align 4 + ecp_nistz256_add: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -235,7 +235,7 @@ ecp_nistz256_add: + bl __ecp_nistz256_add + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_add,.-ecp_nistz256_add + +@@ -244,7 +244,7 @@ ecp_nistz256_add: + .type ecp_nistz256_div_by_2,%function + .align 4 + ecp_nistz256_div_by_2: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -256,7 +256,7 @@ ecp_nistz256_div_by_2: + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +@@ -265,7 +265,7 @@ ecp_nistz256_div_by_2: + .type ecp_nistz256_mul_by_2,%function + .align 4 + ecp_nistz256_mul_by_2: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + +@@ -281,7 +281,7 @@ ecp_nistz256_mul_by_2: + bl __ecp_nistz256_add // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +@@ -290,7 +290,7 @@ ecp_nistz256_mul_by_2: + .type ecp_nistz256_mul_by_3,%function + .align 4 + ecp_nistz256_mul_by_3: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -317,7 +317,7 @@ ecp_nistz256_mul_by_3: + bl __ecp_nistz256_add // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +@@ -327,7 +327,7 @@ ecp_nistz256_mul_by_3: + .type ecp_nistz256_sub,%function + .align 4 + ecp_nistz256_sub: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -339,7 +339,7 @@ ecp_nistz256_sub: + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_sub,.-ecp_nistz256_sub + +@@ -348,7 +348,7 @@ ecp_nistz256_sub: + .type ecp_nistz256_neg,%function + .align 4 + ecp_nistz256_neg: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -363,7 +363,7 @@ ecp_nistz256_neg: + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_neg,.-ecp_nistz256_neg + +@@ -724,7 +724,7 @@ $code.=<<___; + .type ecp_nistz256_point_double,%function + .align 5 + ecp_nistz256_point_double: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -859,7 +859,7 @@ ecp_nistz256_point_double: + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_point_double,.-ecp_nistz256_point_double + ___ +@@ -882,7 +882,7 @@ $code.=<<___; + .type ecp_nistz256_point_add,%function + .align 5 + ecp_nistz256_point_add: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1117,7 +1117,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_point_add,.-ecp_nistz256_point_add + ___ +@@ -1139,7 +1139,7 @@ $code.=<<___; + .type ecp_nistz256_point_add_affine,%function + .align 5 + ecp_nistz256_point_add_affine: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1328,7 +1328,7 @@ $code.=<<___; + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine + ___ +@@ -1346,6 +1346,8 @@ $code.=<<___; + .type ecp_nistz256_ord_mul_mont,%function + .align 4 + ecp_nistz256_ord_mul_mont: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1487,6 +1489,8 @@ $code.=<<___; + .type ecp_nistz256_ord_sqr_mont,%function + .align 4 + ecp_nistz256_ord_sqr_mont: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -1641,6 +1645,8 @@ $code.=<<___; + .type ecp_nistz256_scatter_w5,%function + .align 4 + ecp_nistz256_scatter_w5: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -1703,6 +1709,8 @@ ecp_nistz256_scatter_w5: + .type ecp_nistz256_gather_w5,%function + .align 4 + ecp_nistz256_gather_w5: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -1780,6 +1788,8 @@ ecp_nistz256_gather_w5: + .type ecp_nistz256_scatter_w7,%function + .align 4 + ecp_nistz256_scatter_w7: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -1824,6 +1834,8 @@ ecp_nistz256_scatter_w7: + .type ecp_nistz256_gather_w7,%function + .align 4 + ecp_nistz256_gather_w7: ++ AARCH64_VALID_CALL_TARGET ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +diff --git a/crypto/modes/asm/aes-gcm-armv8_64.pl b/crypto/modes/asm/aes-gcm-armv8_64.pl +index 3b9d5b6511..ff5809ec22 100755 +--- a/crypto/modes/asm/aes-gcm-armv8_64.pl ++++ b/crypto/modes/asm/aes-gcm-armv8_64.pl +@@ -256,6 +256,7 @@ $code.=<<___; + .type aes_gcm_enc_128_kernel,%function + .align 4 + aes_gcm_enc_128_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L128_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 +@@ -1089,6 +1090,7 @@ $code.=<<___; + .type aes_gcm_dec_128_kernel,%function + .align 4 + aes_gcm_dec_128_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L128_dec_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 +@@ -1973,6 +1975,7 @@ $code.=<<___; + .type aes_gcm_enc_192_kernel,%function + .align 4 + aes_gcm_enc_192_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L192_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 +@@ -2858,6 +2861,7 @@ $code.=<<___; + .type aes_gcm_dec_192_kernel,%function + .align 4 + aes_gcm_dec_192_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L192_dec_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 +@@ -3797,6 +3801,7 @@ $code.=<<___; + .type aes_gcm_enc_256_kernel,%function + .align 4 + aes_gcm_enc_256_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L256_enc_ret + stp x19, x20, [sp, #-112]! + mov x16, x4 +@@ -4729,6 +4734,7 @@ $code.=<<___; + .type aes_gcm_dec_256_kernel,%function + .align 4 + aes_gcm_dec_256_kernel: ++ AARCH64_VALID_CALL_TARGET + cbz x1, .L256_dec_ret + stp x19, x20, [sp, #-112]! 
+ mov x16, x4 +diff --git a/crypto/modes/asm/ghashv8-armx.pl b/crypto/modes/asm/ghashv8-armx.pl +index b1d35d25b5..57f893e77c 100644 +--- a/crypto/modes/asm/ghashv8-armx.pl ++++ b/crypto/modes/asm/ghashv8-armx.pl +@@ -107,6 +107,11 @@ $code.=<<___; + .type gcm_init_v8,%function + .align 4 + gcm_init_v8: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++___ ++$code.=<<___; + vld1.64 {$t1},[x1] @ load input H + vmov.i8 $xC2,#0xe1 + vshl.i64 $xC2,$xC2,#57 @ 0xc2.0 +@@ -214,6 +219,11 @@ $code.=<<___; + .type gcm_gmult_v8,%function + .align 4 + gcm_gmult_v8: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET ++___ ++$code.=<<___; + vld1.64 {$t1},[$Xi] @ load Xi + vmov.i8 $xC2,#0xe1 + vld1.64 {$H-$Hhl},[$Htbl] @ load twisted H, ... +@@ -268,6 +278,7 @@ $code.=<<___; + gcm_ghash_v8: + ___ + $code.=<<___ if ($flavour =~ /64/); ++ AARCH64_VALID_CALL_TARGET + cmp $len,#64 + b.hs .Lgcm_ghash_v8_4x + ___ +diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl +index 113a2151b6..20816c4283 100755 +--- a/crypto/poly1305/asm/poly1305-armv8.pl ++++ b/crypto/poly1305/asm/poly1305-armv8.pl +@@ -72,6 +72,7 @@ $code.=<<___; + .type poly1305_init,%function + .align 5 + poly1305_init: ++ AARCH64_VALID_CALL_TARGET + cmp $inp,xzr + stp xzr,xzr,[$ctx] // zero hash value + stp xzr,xzr,[$ctx,#16] // [along with is_base2_26] +@@ -119,6 +120,9 @@ poly1305_init: + .align 5 + poly1305_blocks: + .Lpoly1305_blocks: ++ // The symbol .Lpoly1305_blocks is not a .globl symbol ++ // but a pointer to it is returned by poly1305_init ++ AARCH64_VALID_CALL_TARGET + ands $len,$len,#-16 + b.eq .Lno_data + +@@ -184,6 +188,9 @@ poly1305_blocks: + .align 5 + poly1305_emit: + .Lpoly1305_emit: ++ // The symbol .poly1305_emit is not a .globl symbol ++ // but a pointer to it is returned by poly1305_init ++ AARCH64_VALID_CALL_TARGET + ldp $h0,$h1,[$ctx] // load hash base 2^64 + ldr $h2,[$ctx,#16] + ldp $t0,$t1,[$nonce] // load nonce +@@ -291,13 +298,16 @@ poly1305_splat: + .align 5 + poly1305_blocks_neon: + .Lpoly1305_blocks_neon: ++ // The symbol .Lpoly1305_blocks_neon is not a .globl symbol ++ // but a pointer to it is returned by poly1305_init ++ AARCH64_VALID_CALL_TARGET + ldr $is_base2_26,[$ctx,#24] + cmp $len,#128 + b.hs .Lblocks_neon + cbz $is_base2_26,.Lpoly1305_blocks + + .Lblocks_neon: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + +@@ -867,7 +877,7 @@ poly1305_blocks_neon: + + .Lno_data_neon: + ldr x29,[sp],#80 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size poly1305_blocks_neon,.-poly1305_blocks_neon + +@@ -875,6 +885,9 @@ poly1305_blocks_neon: + .align 5 + poly1305_emit_neon: + .Lpoly1305_emit_neon: ++ // The symbol .Lpoly1305_emit_neon is not a .globl symbol ++ // but a pointer to it is returned by poly1305_init ++ AARCH64_VALID_CALL_TARGET + ldr $is_base2_26,[$ctx,#24] + cbz $is_base2_26,poly1305_emit + +diff --git a/crypto/sha/asm/keccak1600-armv8.pl b/crypto/sha/asm/keccak1600-armv8.pl +index 65102e7c29..cf54b62c63 100755 +--- a/crypto/sha/asm/keccak1600-armv8.pl ++++ b/crypto/sha/asm/keccak1600-armv8.pl +@@ -80,6 +80,8 @@ my @rhotates = ([ 0, 1, 62, 28, 27 ], + [ 18, 2, 61, 56, 14 ]); + + $code.=<<___; ++#include "arm_arch.h" ++ + .text + + .align 8 // strategic alignment and padding that allows to use +@@ -125,7 +127,7 @@ $code.=<<___; + .align 5 + KeccakF1600_int: + adr $C[2],iotas +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp $C[2],x30,[sp,#16] // 32 bytes on top are mine + b .Loop + .align 4 +@@ -297,14 +299,14 @@ $code.=<<___; + bne .Loop + + ldr x30,[sp,#24] +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size KeccakF1600_int,.-KeccakF1600_int + + .type KeccakF1600,%function + .align 5 + KeccakF1600: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -354,7 +356,7 @@ KeccakF1600: + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size KeccakF1600,.-KeccakF1600 + +@@ -362,7 +364,7 @@ KeccakF1600: + .type SHA3_absorb,%function + .align 5 + SHA3_absorb: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -460,7 +462,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size SHA3_absorb,.-SHA3_absorb + ___ +@@ -471,7 +473,7 @@ $code.=<<___; + .type SHA3_squeeze,%function + .align 5 + SHA3_squeeze: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-48]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -534,7 +536,7 @@ SHA3_squeeze: + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x29,x30,[sp],#48 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size SHA3_squeeze,.-SHA3_squeeze + ___ +@@ -653,7 +655,7 @@ KeccakF1600_ce: + .type KeccakF1600_cext,%function + .align 5 + KeccakF1600_cext: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement +@@ -686,7 +688,7 @@ $code.=<<___; + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldr x29,[sp],#80 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size KeccakF1600_cext,.-KeccakF1600_cext + ___ +@@ -699,7 +701,7 @@ $code.=<<___; + .type SHA3_absorb_cext,%function + .align 5 + SHA3_absorb_cext: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#16] // per ABI requirement +@@ -771,7 +773,7 @@ $code.=<<___; + ldp d12,d13,[sp,#48] + ldp d14,d15,[sp,#64] + ldp x29,x30,[sp],#80 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size SHA3_absorb_cext,.-SHA3_absorb_cext + ___ +@@ -783,7 +785,7 @@ $code.=<<___; + .type SHA3_squeeze_cext,%function + .align 5 + SHA3_squeeze_cext: +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x9,$ctx +@@ -839,7 +841,7 @@ SHA3_squeeze_cext: + + .Lsqueeze_done_ce: + ldr x29,[sp],#16 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size SHA3_squeeze_cext,.-SHA3_squeeze_cext + ___ +diff --git a/crypto/sha/asm/sha1-armv8.pl b/crypto/sha/asm/sha1-armv8.pl +index cdea8845af..5f23a20c1a 100644 +--- a/crypto/sha/asm/sha1-armv8.pl ++++ b/crypto/sha/asm/sha1-armv8.pl +@@ -175,8 +175,8 @@ ___ + } + + $code.=<<___; ++#include "arm_arch.h" + #ifndef __KERNEL__ +-# include "arm_arch.h" + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P + #endif +@@ -187,11 +187,13 @@ $code.=<<___; + .type sha1_block_data_order,%function + .align 6 + sha1_block_data_order: ++ AARCH64_VALID_CALL_TARGET + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] + tst w16,#ARMV8_SHA1 + b.ne .Lv8_entry + ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] +@@ -253,6 +255,7 @@ $code.=<<___; + .align 6 + sha1_block_armv8: + .Lv8_entry: ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl +index 6bcff0b7d3..f900882fee 100644 +--- a/crypto/sha/asm/sha512-armv8.pl ++++ b/crypto/sha/asm/sha512-armv8.pl +@@ -190,8 +190,8 @@ ___ + } + + $code.=<<___; ++#include "arm_arch.h" + #ifndef __KERNEL__ +-# include "arm_arch.h" + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P + #endif +@@ -202,6 +202,7 @@ $code.=<<___; + .type $func,%function + .align 6 + $func: ++ AARCH64_VALID_CALL_TARGET + #ifndef __KERNEL__ + adrp x16,OPENSSL_armcap_P + ldr w16,[x16,#:lo12:OPENSSL_armcap_P] +@@ -218,7 +219,7 @@ $code.=<<___ if ($SZ==8); + ___ + $code.=<<___; + #endif +- .inst 0xd503233f // paciasp ++ AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + +@@ -280,7 +281,7 @@ $code.=<<___; + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 +- .inst 0xd50323bf // autiasp ++ AARCH64_VALIDATE_LINK_REGISTER + ret + .size $func,.-$func + +@@ -370,6 +371,7 @@ $code.=<<___; + .align 6 + sha256_block_armv8: + .Lv8_entry: ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +@@ -632,7 +634,9 @@ $code.=<<___; + .type sha256_block_neon,%function + .align 4 + sha256_block_neon: ++ AARCH64_VALID_CALL_TARGET + .Lneon_entry: ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later + stp x29, x30, [sp, #-16]! + mov x29, sp + sub sp,sp,#16*4 +@@ -743,6 +747,7 @@ $code.=<<___; + .align 6 + sha512_block_armv8: + .Lv8_entry: ++ // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + +diff --git a/crypto/sha/build.info b/crypto/sha/build.info +index d61f7de9b6..556a658d8b 100644 +--- a/crypto/sha/build.info ++++ b/crypto/sha/build.info +@@ -153,6 +153,7 @@ INCLUDE[sha256-armv8.o]=.. 
+ GENERATE[sha512-armv8.S]=asm/sha512-armv8.pl + INCLUDE[sha512-armv8.o]=.. + GENERATE[keccak1600-armv8.S]=asm/keccak1600-armv8.pl ++INCLUDE[keccak1600-armv8.o]=.. + + GENERATE[sha1-s390x.S]=asm/sha1-s390x.pl + INCLUDE[sha1-s390x.o]=.. +-- +2.37.3.windows.1 + diff --git a/Backport-providers-Add-SM4-GCM-implementation.patch b/Backport-providers-Add-SM4-GCM-implementation.patch new file mode 100644 index 0000000..3e2ee23 --- /dev/null +++ b/Backport-providers-Add-SM4-GCM-implementation.patch @@ -0,0 +1,360 @@ +From 2f1c0b5f1b585a307f21a70ef3ae652643c25f6d Mon Sep 17 00:00:00 2001 +From: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> +Date: Wed, 1 Sep 2021 16:54:15 +0800 +Subject: [PATCH 04/13] providers: Add SM4 GCM implementation + +The GCM mode of the SM4 algorithm is specifieded by RFC8998. + +Signed-off-by: Tianjia Zhang <tianjia.zhang@linux.alibaba.com> + +Reviewed-by: Paul Yang <kaishen.yy@antfin.com> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/16491) +--- + providers/defltprov.c | 2 + + providers/implementations/ciphers/build.info | 4 +- + .../implementations/ciphers/cipher_sm4_ccm.c | 39 +++++++++++++++++ + .../implementations/ciphers/cipher_sm4_ccm.h | 22 ++++++++++ + .../ciphers/cipher_sm4_ccm_hw.c | 41 ++++++++++++++++++ + .../implementations/ciphers/cipher_sm4_gcm.c | 40 +++++++++++++++++ + .../implementations/ciphers/cipher_sm4_gcm.h | 22 ++++++++++ + .../ciphers/cipher_sm4_gcm_hw.c | 43 +++++++++++++++++++ + .../include/prov/implementations.h | 2 + + .../implementations/include/prov/names.h | 2 + + test/recipes/30-test_evp_data/evpciph_sm4.txt | 20 +++++++++ + 11 files changed, 236 insertions(+), 1 deletion(-) + create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.c + create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm.h + create mode 100644 providers/implementations/ciphers/cipher_sm4_ccm_hw.c + create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.c + create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm.h + create mode 100644 providers/implementations/ciphers/cipher_sm4_gcm_hw.c + +diff --git a/providers/defltprov.c b/providers/defltprov.c +index ed3f4799e7..cc0b0c3b62 100644 +--- a/providers/defltprov.c ++++ b/providers/defltprov.c +@@ -289,6 +289,8 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { + ALG(PROV_NAMES_DES_EDE_CFB, ossl_tdes_ede2_cfb_functions), + #endif /* OPENSSL_NO_DES */ + #ifndef OPENSSL_NO_SM4 ++ ALG(PROV_NAMES_SM4_GCM, ossl_sm4128gcm_functions), ++ ALG(PROV_NAMES_SM4_CCM, ossl_sm4128ccm_functions), + ALG(PROV_NAMES_SM4_ECB, ossl_sm4128ecb_functions), + ALG(PROV_NAMES_SM4_CBC, ossl_sm4128cbc_functions), + ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions), +diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info +index e4c5f4f051..b5d9d4f6c1 100644 +--- a/providers/implementations/ciphers/build.info ++++ b/providers/implementations/ciphers/build.info +@@ -105,7 +105,9 @@ ENDIF + + IF[{- !$disabled{sm4} -}] + SOURCE[$SM4_GOAL]=\ +- cipher_sm4.c cipher_sm4_hw.c ++ cipher_sm4.c cipher_sm4_hw.c \ ++ cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \ ++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c + ENDIF + + IF[{- !$disabled{ocb} -}] +diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.c b/providers/implementations/ciphers/cipher_sm4_ccm.c +new file mode 100644 +index 0000000000..f0295a5ca2 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_ccm.c +@@ -0,0 +1,39 @@ ++/* ++ * Copyright 
2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/* Dispatch functions for SM4 CCM mode */ ++ ++#include "cipher_sm4_ccm.h" ++#include "prov/implementations.h" ++#include "prov/providercommon.h" ++ ++static OSSL_FUNC_cipher_freectx_fn sm4_ccm_freectx; ++ ++static void *sm4_ccm_newctx(void *provctx, size_t keybits) ++{ ++ PROV_SM4_CCM_CTX *ctx; ++ ++ if (!ossl_prov_is_running()) ++ return NULL; ++ ++ ctx = OPENSSL_zalloc(sizeof(*ctx)); ++ if (ctx != NULL) ++ ossl_ccm_initctx(&ctx->base, keybits, ossl_prov_sm4_hw_ccm(keybits)); ++ return ctx; ++} ++ ++static void sm4_ccm_freectx(void *vctx) ++{ ++ PROV_SM4_CCM_CTX *ctx = (PROV_SM4_CCM_CTX *)vctx; ++ ++ OPENSSL_clear_free(ctx, sizeof(*ctx)); ++} ++ ++/* sm4128ccm functions */ ++IMPLEMENT_aead_cipher(sm4, ccm, CCM, AEAD_FLAGS, 128, 8, 96); +diff --git a/providers/implementations/ciphers/cipher_sm4_ccm.h b/providers/implementations/ciphers/cipher_sm4_ccm.h +new file mode 100644 +index 0000000000..189e71e9e4 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_ccm.h +@@ -0,0 +1,22 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include "crypto/sm4.h" ++#include "prov/ciphercommon.h" ++#include "prov/ciphercommon_ccm.h" ++ ++typedef struct prov_sm4_ccm_ctx_st { ++ PROV_CCM_CTX base; /* Must be first */ ++ union { ++ OSSL_UNION_ALIGN; ++ SM4_KEY ks; ++ } ks; /* SM4 key schedule to use */ ++} PROV_SM4_CCM_CTX; ++ ++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keylen); +diff --git a/providers/implementations/ciphers/cipher_sm4_ccm_hw.c b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c +new file mode 100644 +index 0000000000..791daf3e46 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_ccm_hw.c +@@ -0,0 +1,41 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/*- ++ * Generic support for SM4 CCM. 
++ */ ++ ++#include "cipher_sm4_ccm.h" ++ ++static int ccm_sm4_initkey(PROV_CCM_CTX *ctx, ++ const unsigned char *key, size_t keylen) ++{ ++ PROV_SM4_CCM_CTX *actx = (PROV_SM4_CCM_CTX *)ctx; ++ ++ ossl_sm4_set_key(key, &actx->ks.ks); ++ CRYPTO_ccm128_init(&ctx->ccm_ctx, ctx->m, ctx->l, &actx->ks.ks, ++ (block128_f)ossl_sm4_encrypt); ++ ctx->str = NULL; ++ ctx->key_set = 1; ++ return 1; ++} ++ ++static const PROV_CCM_HW ccm_sm4 = { ++ ccm_sm4_initkey, ++ ossl_ccm_generic_setiv, ++ ossl_ccm_generic_setaad, ++ ossl_ccm_generic_auth_encrypt, ++ ossl_ccm_generic_auth_decrypt, ++ ossl_ccm_generic_gettag ++}; ++ ++const PROV_CCM_HW *ossl_prov_sm4_hw_ccm(size_t keybits) ++{ ++ return &ccm_sm4; ++} +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.c b/providers/implementations/ciphers/cipher_sm4_gcm.c +new file mode 100644 +index 0000000000..7a936f00ee +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_gcm.c +@@ -0,0 +1,40 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/* Dispatch functions for SM4 GCM mode */ ++ ++#include "cipher_sm4_gcm.h" ++#include "prov/implementations.h" ++#include "prov/providercommon.h" ++ ++static OSSL_FUNC_cipher_freectx_fn sm4_gcm_freectx; ++ ++static void *sm4_gcm_newctx(void *provctx, size_t keybits) ++{ ++ PROV_SM4_GCM_CTX *ctx; ++ ++ if (!ossl_prov_is_running()) ++ return NULL; ++ ++ ctx = OPENSSL_zalloc(sizeof(*ctx)); ++ if (ctx != NULL) ++ ossl_gcm_initctx(provctx, &ctx->base, keybits, ++ ossl_prov_sm4_hw_gcm(keybits)); ++ return ctx; ++} ++ ++static void sm4_gcm_freectx(void *vctx) ++{ ++ PROV_SM4_GCM_CTX *ctx = (PROV_SM4_GCM_CTX *)vctx; ++ ++ OPENSSL_clear_free(ctx, sizeof(*ctx)); ++} ++ ++/* ossl_sm4128gcm_functions */ ++IMPLEMENT_aead_cipher(sm4, gcm, GCM, AEAD_FLAGS, 128, 8, 96); +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm.h b/providers/implementations/ciphers/cipher_sm4_gcm.h +new file mode 100644 +index 0000000000..2b6b5f3ece +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_gcm.h +@@ -0,0 +1,22 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include "crypto/sm4.h" ++#include "prov/ciphercommon.h" ++#include "prov/ciphercommon_gcm.h" ++ ++typedef struct prov_sm4_gcm_ctx_st { ++ PROV_GCM_CTX base; /* must be first entry in struct */ ++ union { ++ OSSL_UNION_ALIGN; ++ SM4_KEY ks; ++ } ks; ++} PROV_SM4_GCM_CTX; ++ ++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits); +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +new file mode 100644 +index 0000000000..6bcd1ec406 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +@@ -0,0 +1,43 @@ ++/* ++ * Copyright 2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/*- ++ * Generic support for SM4 GCM. ++ */ ++ ++#include "cipher_sm4_gcm.h" ++ ++static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, ++ size_t keylen) ++{ ++ PROV_SM4_GCM_CTX *actx = (PROV_SM4_GCM_CTX *)ctx; ++ SM4_KEY *ks = &actx->ks.ks; ++ ++ ctx->ks = ks; ++ ossl_sm4_set_key(key, ks); ++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); ++ ctx->ctr = (ctr128_f)NULL; ++ ctx->key_set = 1; ++ ++ return 1; ++} ++ ++static const PROV_GCM_HW sm4_gcm = { ++ sm4_gcm_initkey, ++ ossl_gcm_setiv, ++ ossl_gcm_aad_update, ++ ossl_gcm_cipher_update, ++ ossl_gcm_cipher_final, ++ ossl_gcm_one_shot ++}; ++ ++const PROV_GCM_HW *ossl_prov_sm4_hw_gcm(size_t keybits) ++{ ++ return &sm4_gcm; ++} +diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h +index 3f6dd7ee16..498eab4ad4 100644 +--- a/providers/implementations/include/prov/implementations.h ++++ b/providers/implementations/include/prov/implementations.h +@@ -174,6 +174,8 @@ extern const OSSL_DISPATCH ossl_seed128ofb128_functions[]; + extern const OSSL_DISPATCH ossl_seed128cfb128_functions[]; + #endif /* OPENSSL_NO_SEED */ + #ifndef OPENSSL_NO_SM4 ++extern const OSSL_DISPATCH ossl_sm4128gcm_functions[]; ++extern const OSSL_DISPATCH ossl_sm4128ccm_functions[]; + extern const OSSL_DISPATCH ossl_sm4128ecb_functions[]; + extern const OSSL_DISPATCH ossl_sm4128cbc_functions[]; + extern const OSSL_DISPATCH ossl_sm4128ctr_functions[]; +diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h +index e0dbb69a9d..0fac23a850 100644 +--- a/providers/implementations/include/prov/names.h ++++ b/providers/implementations/include/prov/names.h +@@ -162,6 +162,8 @@ + #define PROV_NAMES_SM4_CTR "SM4-CTR:1.2.156.10197.1.104.7" + #define PROV_NAMES_SM4_OFB "SM4-OFB:SM4-OFB128:1.2.156.10197.1.104.3" + #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4" ++#define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8" ++#define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9" + #define PROV_NAMES_ChaCha20 "ChaCha20" + #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305" + #define PROV_NAMES_CAST5_ECB "CAST5-ECB" +diff --git a/test/recipes/30-test_evp_data/evpciph_sm4.txt b/test/recipes/30-test_evp_data/evpciph_sm4.txt +index ec8a45bd3f..9fb16ca15c 100644 +--- a/test/recipes/30-test_evp_data/evpciph_sm4.txt ++++ b/test/recipes/30-test_evp_data/evpciph_sm4.txt +@@ -36,3 +36,23 @@ Key = 0123456789ABCDEFFEDCBA9876543210 + IV = 0123456789ABCDEFFEDCBA9876543210 + Plaintext = AAAAAAAAAAAAAAAABBBBBBBBBBBBBBBBCCCCCCCCCCCCCCCCDDDDDDDDDDDDDDDDEEEEEEEEEEEEEEEEFFFFFFFFFFFFFFFFEEEEEEEEEEEEEEEEAAAAAAAAAAAAAAAA + Ciphertext = C2B4759E78AC3CF43D0852F4E8D5F9FD7256E8A5FCB65A350EE00630912E44492A0B17E1B85B060D0FBA612D8A95831638B361FD5FFACD942F081485A83CA35D ++ ++Title = SM4 GCM test vectors from RFC8998 ++ ++Cipher = SM4-GCM ++Key = 0123456789abcdeffedcba9876543210 ++IV = 00001234567800000000abcd ++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2 ++Tag = 83de3541e4c2b58177e065a9bf7b62ec ++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa ++Ciphertext = 17f399f08c67d5ee19d0dc9969c4bb7d5fd46fd3756489069157b282bb200735d82710ca5c22f0ccfa7cbf93d496ac15a56834cbcf98c397b4024a2691233b8d ++ ++Title = 
SM4 CCM test vectors from RFC8998 ++ ++Cipher = SM4-CCM ++Key = 0123456789abcdeffedcba9876543210 ++IV = 00001234567800000000abcd ++AAD = feedfacedeadbeeffeedfacedeadbeefabaddad2 ++Tag = 16842d4fa186f56ab33256971fa110f4 ++Plaintext = aaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbccccccccccccccccddddddddddddddddeeeeeeeeeeeeeeeeffffffffffffffffeeeeeeeeeeeeeeeeaaaaaaaaaaaaaaaa ++Ciphertext = 48af93501fa62adbcd414cce6034d895dda1bf8f132f042098661572e7483094fd12e518ce062c98acee28d95df4416bed31a2f04476c18bb40c84a74b97dc5b +-- +2.37.3.windows.1 + diff --git a/Backport-providers-Add-SM4-XTS-implementation.patch b/Backport-providers-Add-SM4-XTS-implementation.patch new file mode 100644 index 0000000..5136236 --- /dev/null +++ b/Backport-providers-Add-SM4-XTS-implementation.patch @@ -0,0 +1,763 @@ +From 57c854480481bd6b0900984d17db17426c44aa40 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Fri, 25 Nov 2022 13:52:49 +0800 +Subject: [PATCH 08/13] providers: Add SM4 XTS implementation + +Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> + +Reviewed-by: Hugo Landau <hlandau@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19619) +--- + crypto/modes/build.info | 2 +- + crypto/modes/xts128gb.c | 199 +++++++++++++ + include/crypto/modes.h | 6 + + include/openssl/core_names.h | 1 + + providers/defltprov.c | 1 + + providers/implementations/ciphers/build.info | 4 +- + .../implementations/ciphers/cipher_sm4_xts.c | 281 ++++++++++++++++++ + .../implementations/ciphers/cipher_sm4_xts.h | 46 +++ + .../ciphers/cipher_sm4_xts_hw.c | 89 ++++++ + .../include/prov/implementations.h | 1 + + .../implementations/include/prov/names.h | 1 + + 11 files changed, 629 insertions(+), 2 deletions(-) + create mode 100644 crypto/modes/xts128gb.c + create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.c + create mode 100644 providers/implementations/ciphers/cipher_sm4_xts.h + create mode 100644 providers/implementations/ciphers/cipher_sm4_xts_hw.c + +diff --git a/crypto/modes/build.info b/crypto/modes/build.info +index f3558fa1a4..0ee297ced8 100644 +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -49,7 +49,7 @@ IF[{- !$disabled{asm} -}] + ENDIF + + $COMMON=cbc128.c ctr128.c cfb128.c ofb128.c gcm128.c ccm128.c xts128.c \ +- wrap128.c $MODESASM ++ wrap128.c xts128gb.c $MODESASM + SOURCE[../../libcrypto]=$COMMON \ + cts128.c ocb128.c siv128.c + SOURCE[../../providers/libfips.a]=$COMMON +diff --git a/crypto/modes/xts128gb.c b/crypto/modes/xts128gb.c +new file mode 100644 +index 0000000000..021c0597e4 +--- /dev/null ++++ b/crypto/modes/xts128gb.c +@@ -0,0 +1,199 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include <string.h> ++#include <openssl/crypto.h> ++#include "internal/endian.h" ++#include "crypto/modes.h" ++ ++#ifndef STRICT_ALIGNMENT ++# ifdef __GNUC__ ++typedef u64 u64_a1 __attribute((__aligned__(1))); ++# else ++typedef u64 u64_a1; ++# endif ++#endif ++ ++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx, ++ const unsigned char iv[16], ++ const unsigned char *inp, unsigned char *out, ++ size_t len, int enc) ++{ ++ DECLARE_IS_ENDIAN; ++ union { ++ u64 u[2]; ++ u32 d[4]; ++ u8 c[16]; ++ } tweak, scratch; ++ unsigned int i; ++ ++ if (len < 16) ++ return -1; ++ ++ memcpy(tweak.c, iv, 16); ++ ++ (*ctx->block2) (tweak.c, tweak.c, ctx->key2); ++ ++ if (!enc && (len % 16)) ++ len -= 16; ++ ++ while (len >= 16) { ++#if defined(STRICT_ALIGNMENT) ++ memcpy(scratch.c, inp, 16); ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++#else ++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak.u[0]; ++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak.u[1]; ++#endif ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++#if defined(STRICT_ALIGNMENT) ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out, scratch.c, 16); ++#else ++ ((u64_a1 *)out)[0] = scratch.u[0] ^= tweak.u[0]; ++ ((u64_a1 *)out)[1] = scratch.u[1] ^= tweak.u[1]; ++#endif ++ inp += 16; ++ out += 16; ++ len -= 16; ++ ++ if (len == 0) ++ return 0; ++ ++ if (IS_LITTLE_ENDIAN) { ++ u8 res; ++ u64 hi, lo; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ u8 *p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ res = (u8)lo & 1; ++ tweak.u[0] = (lo >> 1) | (hi << 63); ++ tweak.u[1] = hi >> 1; ++ if (res) ++ tweak.c[15] ^= 0xe1; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ tweak.u[0] = lo; ++ tweak.u[1] = hi; ++ } else { ++ u8 carry, res; ++ carry = 0; ++ for (i = 0; i < 16; ++i) { ++ res = (tweak.c[i] << 7) & 0x80; ++ tweak.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff; ++ carry = res; ++ } ++ if (res) ++ tweak.c[0] ^= 0xe1; ++ } ++ } ++ if (enc) { ++ for (i = 0; i < len; ++i) { ++ u8 c = inp[i]; ++ out[i] = scratch.c[i]; ++ scratch.c[i] = c; ++ } ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out - 16, scratch.c, 16); ++ } else { ++ union { ++ u64 u[2]; ++ u8 c[16]; ++ } tweak1; ++ ++ if (IS_LITTLE_ENDIAN) { ++ u8 res; ++ u64 hi, lo; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak.u[0]); ++ lo = BSWAP8(tweak.u[1]); ++#else ++ u8 *p = tweak.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ res = (u8)lo & 1; ++ tweak1.u[0] = (lo >> 1) | (hi << 63); ++ tweak1.u[1] = hi >> 1; ++ if (res) ++ tweak1.c[15] ^= 0xe1; ++#ifdef BSWAP8 ++ hi = BSWAP8(tweak1.u[0]); ++ lo = BSWAP8(tweak1.u[1]); ++#else ++ p = tweak1.c; ++ ++ hi = (u64)GETU32(p) << 32 | GETU32(p + 4); ++ lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12); ++#endif ++ tweak1.u[0] = lo; ++ tweak1.u[1] = hi; ++ } else { ++ u8 carry, res; ++ carry = 0; ++ for (i = 0; i < 16; ++i) { ++ res = (tweak.c[i] << 7) & 0x80; ++ tweak1.c[i] = ((tweak.c[i] >> 1) + carry) & 0xff; ++ carry = 
res; ++ } ++ if (res) ++ tweak1.c[0] ^= 0xe1; ++ } ++#if defined(STRICT_ALIGNMENT) ++ memcpy(scratch.c, inp, 16); ++ scratch.u[0] ^= tweak1.u[0]; ++ scratch.u[1] ^= tweak1.u[1]; ++#else ++ scratch.u[0] = ((u64_a1 *)inp)[0] ^ tweak1.u[0]; ++ scratch.u[1] = ((u64_a1 *)inp)[1] ^ tweak1.u[1]; ++#endif ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++ scratch.u[0] ^= tweak1.u[0]; ++ scratch.u[1] ^= tweak1.u[1]; ++ ++ for (i = 0; i < len; ++i) { ++ u8 c = inp[16 + i]; ++ out[16 + i] = scratch.c[i]; ++ scratch.c[i] = c; ++ } ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ (*ctx->block1) (scratch.c, scratch.c, ctx->key1); ++#if defined(STRICT_ALIGNMENT) ++ scratch.u[0] ^= tweak.u[0]; ++ scratch.u[1] ^= tweak.u[1]; ++ memcpy(out, scratch.c, 16); ++#else ++ ((u64_a1 *)out)[0] = scratch.u[0] ^ tweak.u[0]; ++ ((u64_a1 *)out)[1] = scratch.u[1] ^ tweak.u[1]; ++#endif ++ } ++ ++ return 0; ++} +diff --git a/include/crypto/modes.h b/include/crypto/modes.h +index 19f9d85959..475b77f925 100644 +--- a/include/crypto/modes.h ++++ b/include/crypto/modes.h +@@ -148,6 +148,12 @@ struct xts128_context { + block128_f block1, block2; + }; + ++/* XTS mode for SM4 algorithm specified by GB/T 17964-2021 */ ++int ossl_crypto_xts128gb_encrypt(const XTS128_CONTEXT *ctx, ++ const unsigned char iv[16], ++ const unsigned char *inp, unsigned char *out, ++ size_t len, int enc); ++ + struct ccm128_context { + union { + u64 u[2]; +diff --git a/include/openssl/core_names.h b/include/openssl/core_names.h +index 6bed5a8a67..a90971099d 100644 +--- a/include/openssl/core_names.h ++++ b/include/openssl/core_names.h +@@ -97,6 +97,7 @@ extern "C" { + #define OSSL_CIPHER_PARAM_CTS_MODE "cts_mode" /* utf8_string */ + /* For passing the AlgorithmIdentifier parameter in DER form */ + #define OSSL_CIPHER_PARAM_ALGORITHM_ID_PARAMS "alg_id_param" /* octet_string */ ++#define OSSL_CIPHER_PARAM_XTS_STANDARD "xts_standard" /* utf8_string */ + + #define OSSL_CIPHER_PARAM_TLS1_MULTIBLOCK_MAX_SEND_FRAGMENT \ + "tls1multi_maxsndfrag" /* uint */ +diff --git a/providers/defltprov.c b/providers/defltprov.c +index cc0b0c3b62..ab898d3f44 100644 +--- a/providers/defltprov.c ++++ b/providers/defltprov.c +@@ -296,6 +296,7 @@ static const OSSL_ALGORITHM_CAPABLE deflt_ciphers[] = { + ALG(PROV_NAMES_SM4_CTR, ossl_sm4128ctr_functions), + ALG(PROV_NAMES_SM4_OFB, ossl_sm4128ofb128_functions), + ALG(PROV_NAMES_SM4_CFB, ossl_sm4128cfb128_functions), ++ ALG(PROV_NAMES_SM4_XTS, ossl_sm4128xts_functions), + #endif /* OPENSSL_NO_SM4 */ + #ifndef OPENSSL_NO_CHACHA + ALG(PROV_NAMES_ChaCha20, ossl_chacha20_functions), +diff --git a/providers/implementations/ciphers/build.info b/providers/implementations/ciphers/build.info +index b5d9d4f6c1..9f6eacf5e3 100644 +--- a/providers/implementations/ciphers/build.info ++++ b/providers/implementations/ciphers/build.info +@@ -107,7 +107,9 @@ IF[{- !$disabled{sm4} -}] + SOURCE[$SM4_GOAL]=\ + cipher_sm4.c cipher_sm4_hw.c \ + cipher_sm4_gcm.c cipher_sm4_gcm_hw.c \ +- cipher_sm4_ccm.c cipher_sm4_ccm_hw.c ++ cipher_sm4_ccm.c cipher_sm4_ccm_hw.c \ ++ cipher_sm4_xts.c cipher_sm4_xts_hw.c ++ + ENDIF + + IF[{- !$disabled{ocb} -}] +diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c +new file mode 100644 +index 0000000000..3c568d4d18 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_xts.c +@@ -0,0 +1,281 @@ ++ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. 
++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/* Dispatch functions for SM4 XTS mode */ ++ ++#include <openssl/proverr.h> ++#include "cipher_sm4_xts.h" ++#include "prov/implementations.h" ++#include "prov/providercommon.h" ++ ++#define SM4_XTS_FLAGS PROV_CIPHER_FLAG_CUSTOM_IV ++#define SM4_XTS_IV_BITS 128 ++#define SM4_XTS_BLOCK_BITS 8 ++ ++/* forward declarations */ ++static OSSL_FUNC_cipher_encrypt_init_fn sm4_xts_einit; ++static OSSL_FUNC_cipher_decrypt_init_fn sm4_xts_dinit; ++static OSSL_FUNC_cipher_update_fn sm4_xts_stream_update; ++static OSSL_FUNC_cipher_final_fn sm4_xts_stream_final; ++static OSSL_FUNC_cipher_cipher_fn sm4_xts_cipher; ++static OSSL_FUNC_cipher_freectx_fn sm4_xts_freectx; ++static OSSL_FUNC_cipher_dupctx_fn sm4_xts_dupctx; ++static OSSL_FUNC_cipher_set_ctx_params_fn sm4_xts_set_ctx_params; ++static OSSL_FUNC_cipher_settable_ctx_params_fn sm4_xts_settable_ctx_params; ++ ++/*- ++ * Provider dispatch functions ++ */ ++static int sm4_xts_init(void *vctx, const unsigned char *key, size_t keylen, ++ const unsigned char *iv, size_t ivlen, ++ const OSSL_PARAM params[], int enc) ++{ ++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vctx; ++ PROV_CIPHER_CTX *ctx = &xctx->base; ++ ++ if (!ossl_prov_is_running()) ++ return 0; ++ ++ ctx->enc = enc; ++ ++ if (iv != NULL) { ++ if (!ossl_cipher_generic_initiv(vctx, iv, ivlen)) ++ return 0; ++ } ++ if (key != NULL) { ++ if (keylen != ctx->keylen) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_INVALID_KEY_LENGTH); ++ return 0; ++ } ++ if (!ctx->hw->init(ctx, key, keylen)) ++ return 0; ++ } ++ return sm4_xts_set_ctx_params(xctx, params); ++} ++ ++static int sm4_xts_einit(void *vctx, const unsigned char *key, size_t keylen, ++ const unsigned char *iv, size_t ivlen, ++ const OSSL_PARAM params[]) ++{ ++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 1); ++} ++ ++static int sm4_xts_dinit(void *vctx, const unsigned char *key, size_t keylen, ++ const unsigned char *iv, size_t ivlen, ++ const OSSL_PARAM params[]) ++{ ++ return sm4_xts_init(vctx, key, keylen, iv, ivlen, params, 0); ++} ++ ++static void *sm4_xts_newctx(void *provctx, unsigned int mode, uint64_t flags, ++ size_t kbits, size_t blkbits, size_t ivbits) ++{ ++ PROV_SM4_XTS_CTX *ctx = OPENSSL_zalloc(sizeof(*ctx)); ++ ++ if (ctx != NULL) { ++ ossl_cipher_generic_initkey(&ctx->base, kbits, blkbits, ivbits, mode, ++ flags, ossl_prov_cipher_hw_sm4_xts(kbits), ++ NULL); ++ } ++ return ctx; ++} ++ ++static void sm4_xts_freectx(void *vctx) ++{ ++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx; ++ ++ ossl_cipher_generic_reset_ctx((PROV_CIPHER_CTX *)vctx); ++ OPENSSL_clear_free(ctx, sizeof(*ctx)); ++} ++ ++static void *sm4_xts_dupctx(void *vctx) ++{ ++ PROV_SM4_XTS_CTX *in = (PROV_SM4_XTS_CTX *)vctx; ++ PROV_SM4_XTS_CTX *ret = NULL; ++ ++ if (!ossl_prov_is_running()) ++ return NULL; ++ ++ if (in->xts.key1 != NULL) { ++ if (in->xts.key1 != &in->ks1) ++ return NULL; ++ } ++ if (in->xts.key2 != NULL) { ++ if (in->xts.key2 != &in->ks2) ++ return NULL; ++ } ++ ret = OPENSSL_malloc(sizeof(*ret)); ++ if (ret == NULL) ++ return NULL; ++ in->base.hw->copyctx(&ret->base, &in->base); ++ return ret; ++} ++ ++static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl, ++ size_t outsize, const unsigned char *in, size_t inl) ++{ ++ PROV_SM4_XTS_CTX *ctx = 
(PROV_SM4_XTS_CTX *)vctx; ++ ++ if (!ossl_prov_is_running() ++ || ctx->xts.key1 == NULL ++ || ctx->xts.key2 == NULL ++ || !ctx->base.iv_set ++ || out == NULL ++ || in == NULL ++ || inl < SM4_BLOCK_SIZE) ++ return 0; ++ ++ /* ++ * Impose a limit of 2^20 blocks per data unit as specified by ++ * IEEE Std 1619-2018. The earlier and obsolete IEEE Std 1619-2007 ++ * indicated that this was a SHOULD NOT rather than a MUST NOT. ++ * NIST SP 800-38E mandates the same limit. ++ */ ++ if (inl > XTS_MAX_BLOCKS_PER_DATA_UNIT * SM4_BLOCK_SIZE) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_XTS_DATA_UNIT_IS_TOO_LARGE); ++ return 0; ++ } ++ if (ctx->xts_standard) { ++ if (ctx->stream != NULL) ++ (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2, ++ ctx->base.iv); ++ else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl, ++ ctx->base.enc)) ++ return 0; ++ } else { ++ if (ctx->stream_gb != NULL) ++ (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2, ++ ctx->base.iv); ++ else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out, ++ inl, ctx->base.enc)) ++ return 0; ++ } ++ *outl = inl; ++ return 1; ++} ++ ++static int sm4_xts_stream_update(void *vctx, unsigned char *out, size_t *outl, ++ size_t outsize, const unsigned char *in, ++ size_t inl) ++{ ++ PROV_SM4_XTS_CTX *ctx = (PROV_SM4_XTS_CTX *)vctx; ++ ++ if (outsize < inl) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL); ++ return 0; ++ } ++ ++ if (!sm4_xts_cipher(ctx, out, outl, outsize, in, inl)) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_CIPHER_OPERATION_FAILED); ++ return 0; ++ } ++ ++ return 1; ++} ++ ++static int sm4_xts_stream_final(void *vctx, unsigned char *out, size_t *outl, ++ size_t outsize) ++{ ++ if (!ossl_prov_is_running()) ++ return 0; ++ *outl = 0; ++ return 1; ++} ++ ++static const OSSL_PARAM sm4_xts_known_settable_ctx_params[] = { ++ OSSL_PARAM_utf8_string(OSSL_CIPHER_PARAM_XTS_STANDARD, NULL, 0), ++ OSSL_PARAM_END ++}; ++ ++static const OSSL_PARAM *sm4_xts_settable_ctx_params(ossl_unused void *cctx, ++ ossl_unused void *provctx) ++{ ++ return sm4_xts_known_settable_ctx_params; ++} ++ ++static int sm4_xts_set_ctx_params(void *vxctx, const OSSL_PARAM params[]) ++{ ++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)vxctx; ++ const OSSL_PARAM *p; ++ ++ if (params == NULL) ++ return 1; ++ ++ /*- ++ * Sets the XTS standard to use with SM4-XTS algorithm. 
++ * ++ * Must be utf8 string "GB" or "IEEE", ++ * "GB" means the GB/T 17964-2021 standard ++ * "IEEE" means the IEEE Std 1619-2007 standard ++ */ ++ p = OSSL_PARAM_locate_const(params, OSSL_CIPHER_PARAM_XTS_STANDARD); ++ ++ if (p != NULL) { ++ const char *xts_standard = NULL; ++ ++ if (p->data_type != OSSL_PARAM_UTF8_STRING) ++ return 0; ++ ++ if (!OSSL_PARAM_get_utf8_string_ptr(p, &xts_standard)) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_GET_PARAMETER); ++ return 0; ++ } ++ if (OPENSSL_strcasecmp(xts_standard, "GB") == 0) { ++ xctx->xts_standard = 0; ++ } else if (OPENSSL_strcasecmp(xts_standard, "IEEE") == 0) { ++ xctx->xts_standard = 1; ++ } else { ++ ERR_raise(ERR_LIB_PROV, PROV_R_FAILED_TO_SET_PARAMETER); ++ return 0; ++ } ++ } ++ ++ return 1; ++} ++ ++#define IMPLEMENT_cipher(lcmode, UCMODE, kbits, flags) \ ++static OSSL_FUNC_cipher_get_params_fn sm4_##kbits##_##lcmode##_get_params; \ ++static int sm4_##kbits##_##lcmode##_get_params(OSSL_PARAM params[]) \ ++{ \ ++ return ossl_cipher_generic_get_params(params, EVP_CIPH_##UCMODE##_MODE, \ ++ flags, 2 * kbits, SM4_XTS_BLOCK_BITS,\ ++ SM4_XTS_IV_BITS); \ ++} \ ++static OSSL_FUNC_cipher_newctx_fn sm4_##kbits##_xts_newctx; \ ++static void *sm4_##kbits##_xts_newctx(void *provctx) \ ++{ \ ++ return sm4_xts_newctx(provctx, EVP_CIPH_##UCMODE##_MODE, flags, 2 * kbits, \ ++ SM4_XTS_BLOCK_BITS, SM4_XTS_IV_BITS); \ ++} \ ++const OSSL_DISPATCH ossl_sm4##kbits##xts_functions[] = { \ ++ { OSSL_FUNC_CIPHER_NEWCTX, (void (*)(void))sm4_##kbits##_xts_newctx }, \ ++ { OSSL_FUNC_CIPHER_ENCRYPT_INIT, (void (*)(void))sm4_xts_einit }, \ ++ { OSSL_FUNC_CIPHER_DECRYPT_INIT, (void (*)(void))sm4_xts_dinit }, \ ++ { OSSL_FUNC_CIPHER_UPDATE, (void (*)(void))sm4_xts_stream_update }, \ ++ { OSSL_FUNC_CIPHER_FINAL, (void (*)(void))sm4_xts_stream_final }, \ ++ { OSSL_FUNC_CIPHER_CIPHER, (void (*)(void))sm4_xts_cipher }, \ ++ { OSSL_FUNC_CIPHER_FREECTX, (void (*)(void))sm4_xts_freectx }, \ ++ { OSSL_FUNC_CIPHER_DUPCTX, (void (*)(void))sm4_xts_dupctx }, \ ++ { OSSL_FUNC_CIPHER_GET_PARAMS, \ ++ (void (*)(void))sm4_##kbits##_##lcmode##_get_params }, \ ++ { OSSL_FUNC_CIPHER_GETTABLE_PARAMS, \ ++ (void (*)(void))ossl_cipher_generic_gettable_params }, \ ++ { OSSL_FUNC_CIPHER_GET_CTX_PARAMS, \ ++ (void (*)(void))ossl_cipher_generic_get_ctx_params }, \ ++ { OSSL_FUNC_CIPHER_GETTABLE_CTX_PARAMS, \ ++ (void (*)(void))ossl_cipher_generic_gettable_ctx_params }, \ ++ { OSSL_FUNC_CIPHER_SET_CTX_PARAMS, \ ++ (void (*)(void))sm4_xts_set_ctx_params }, \ ++ { OSSL_FUNC_CIPHER_SETTABLE_CTX_PARAMS, \ ++ (void (*)(void))sm4_xts_settable_ctx_params }, \ ++ { 0, NULL } \ ++} ++/* ossl_sm4128xts_functions */ ++IMPLEMENT_cipher(xts, XTS, 128, SM4_XTS_FLAGS); +diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h +new file mode 100644 +index 0000000000..4c369183e2 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_xts.h +@@ -0,0 +1,46 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. 
You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include <crypto/sm4.h> ++#include "prov/ciphercommon.h" ++#include "crypto/sm4_platform.h" ++ ++PROV_CIPHER_FUNC(void, xts_stream, ++ (const unsigned char *in, unsigned char *out, size_t len, ++ const SM4_KEY *key1, const SM4_KEY *key2, ++ const unsigned char iv[16])); ++ ++typedef struct prov_sm4_xts_ctx_st { ++ /* Must be first */ ++ PROV_CIPHER_CTX base; ++ ++ /* SM4 key schedules to use */ ++ union { ++ OSSL_UNION_ALIGN; ++ SM4_KEY ks; ++ } ks1, ks2; ++ ++ /*- ++ * XTS standard to use with SM4-XTS algorithm ++ * ++ * Must be 0 or 1, ++ * 0 for XTS mode specified by GB/T 17964-2021 ++ * 1 for XTS mode specified by IEEE Std 1619-2007 ++ */ ++ int xts_standard; ++ ++ XTS128_CONTEXT xts; ++ ++ /* Stream function for XTS mode specified by GB/T 17964-2021 */ ++ OSSL_xts_stream_fn stream_gb; ++ /* Stream function for XTS mode specified by IEEE Std 1619-2007 */ ++ OSSL_xts_stream_fn stream; ++} PROV_SM4_XTS_CTX; ++ ++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits); +diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c +new file mode 100644 +index 0000000000..403eb879b1 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c +@@ -0,0 +1,89 @@ ++/* ++ * Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++#include "cipher_sm4_xts.h" ++ ++#define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \ ++ fn_block_enc, fn_block_dec, \ ++ fn_stream_enc, fn_stream_dec, \ ++ fn_stream_gb_enc, fn_stream_gb_dec) { \ ++ size_t bytes = keylen / 2; \ ++ \ ++ if (ctx->enc) { \ ++ fn_set_enc_key(key, &xctx->ks1.ks); \ ++ xctx->xts.block1 = (block128_f)fn_block_enc; \ ++ } else { \ ++ fn_set_dec_key(key, &xctx->ks1.ks); \ ++ xctx->xts.block1 = (block128_f)fn_block_dec; \ ++ } \ ++ fn_set_enc_key(key + bytes, &xctx->ks2.ks); \ ++ xctx->xts.block2 = (block128_f)fn_block_enc; \ ++ xctx->xts.key1 = &xctx->ks1; \ ++ xctx->xts.key2 = &xctx->ks2; \ ++ xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \ ++ xctx->stream_gb = ctx->enc ? 
fn_stream_gb_enc : fn_stream_gb_dec; \ ++} ++ ++static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx, ++ const unsigned char *key, ++ size_t keylen) ++{ ++ PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx; ++ OSSL_xts_stream_fn stream_enc = NULL; ++ OSSL_xts_stream_fn stream_dec = NULL; ++ OSSL_xts_stream_fn stream_gb_enc = NULL; ++ OSSL_xts_stream_fn stream_gb_dec = NULL; ++#ifdef HWSM4_CAPABLE ++ if (HWSM4_CAPABLE) { ++ XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key, ++ HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec, ++ stream_gb_enc, stream_gb_dec); ++ return 1; ++ } else ++#endif /* HWSM4_CAPABLE */ ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key, ++ vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec, ++ stream_gb_enc, stream_gb_dec); ++ return 1; ++ } else ++#endif /* VPSM4_CAPABLE */ ++ { ++ (void)0; ++ } ++ { ++ XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt, ++ ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc, ++ stream_gb_dec); ++ } ++ return 1; ++} ++ ++static void cipher_hw_sm4_xts_copyctx(PROV_CIPHER_CTX *dst, ++ const PROV_CIPHER_CTX *src) ++{ ++ PROV_SM4_XTS_CTX *sctx = (PROV_SM4_XTS_CTX *)src; ++ PROV_SM4_XTS_CTX *dctx = (PROV_SM4_XTS_CTX *)dst; ++ ++ *dctx = *sctx; ++ dctx->xts.key1 = &dctx->ks1.ks; ++ dctx->xts.key2 = &dctx->ks2.ks; ++} ++ ++ ++static const PROV_CIPHER_HW sm4_generic_xts = { ++ cipher_hw_sm4_xts_generic_initkey, ++ NULL, ++ cipher_hw_sm4_xts_copyctx ++}; ++const PROV_CIPHER_HW *ossl_prov_cipher_hw_sm4_xts(size_t keybits) ++{ ++ return &sm4_generic_xts; ++} +diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h +index 498eab4ad4..cfa32ea3ca 100644 +--- a/providers/implementations/include/prov/implementations.h ++++ b/providers/implementations/include/prov/implementations.h +@@ -181,6 +181,7 @@ extern const OSSL_DISPATCH ossl_sm4128cbc_functions[]; + extern const OSSL_DISPATCH ossl_sm4128ctr_functions[]; + extern const OSSL_DISPATCH ossl_sm4128ofb128_functions[]; + extern const OSSL_DISPATCH ossl_sm4128cfb128_functions[]; ++extern const OSSL_DISPATCH ossl_sm4128xts_functions[]; + #endif /* OPENSSL_NO_SM4 */ + #ifndef OPENSSL_NO_RC5 + extern const OSSL_DISPATCH ossl_rc5128ecb_functions[]; +diff --git a/providers/implementations/include/prov/names.h b/providers/implementations/include/prov/names.h +index 0fac23a850..5192f4f471 100644 +--- a/providers/implementations/include/prov/names.h ++++ b/providers/implementations/include/prov/names.h +@@ -164,6 +164,7 @@ + #define PROV_NAMES_SM4_CFB "SM4-CFB:SM4-CFB128:1.2.156.10197.1.104.4" + #define PROV_NAMES_SM4_GCM "SM4-GCM:1.2.156.10197.1.104.8" + #define PROV_NAMES_SM4_CCM "SM4-CCM:1.2.156.10197.1.104.9" ++#define PROV_NAMES_SM4_XTS "SM4-XTS:1.2.156.10197.1.104.10" + #define PROV_NAMES_ChaCha20 "ChaCha20" + #define PROV_NAMES_ChaCha20_Poly1305 "ChaCha20-Poly1305" + #define PROV_NAMES_CAST5_ECB "CAST5-ECB" +-- +2.37.3.windows.1 + diff --git a/Backport-support-decode-SM2-parameters.patch b/Backport-support-decode-SM2-parameters.patch new file mode 100644 index 0000000..7f4ea20 --- /dev/null +++ b/Backport-support-decode-SM2-parameters.patch @@ -0,0 +1,175 @@ +From 08ae9fa627e858b9f8e96e0c6d3cf84422a11d75 Mon Sep 17 00:00:00 2001 +From: K1 <dongbeiouba@gmail.com> +Date: Tue, 19 Jul 2022 01:18:12 +0800 +Subject: [PATCH] Support decode SM2 parameters + +Reviewed-by: Hugo Landau <hlandau@openssl.org> +Reviewed-by: Paul Dale 
<pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/18819) + +Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com> +--- + apps/ecparam.c | 12 ++++++++++-- + include/openssl/pem.h | 1 + + providers/decoders.inc | 1 + + .../implementations/encode_decode/decode_der2key.c | 1 + + .../implementations/encode_decode/decode_pem2der.c | 1 + + .../implementations/encode_decode/encode_key2text.c | 8 +++++--- + .../implementations/include/prov/implementations.h | 1 + + test/recipes/15-test_ecparam.t | 4 ++++ + .../15-test_ecparam_data/valid/sm2-explicit.pem | 7 +++++++ + .../recipes/15-test_ecparam_data/valid/sm2-named.pem | 3 +++ + 10 files changed, 34 insertions(+), 5 deletions(-) + create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem + create mode 100644 test/recipes/15-test_ecparam_data/valid/sm2-named.pem + +diff --git a/apps/ecparam.c b/apps/ecparam.c +index 5d66b65569..71f93c4ca5 100644 +--- a/apps/ecparam.c ++++ b/apps/ecparam.c +@@ -242,9 +242,17 @@ int ecparam_main(int argc, char **argv) + goto end; + } + } else { +- params_key = load_keyparams(infile, informat, 1, "EC", "EC parameters"); +- if (params_key == NULL || !EVP_PKEY_is_a(params_key, "EC")) ++ params_key = load_keyparams_suppress(infile, informat, 1, "EC", ++ "EC parameters", 1); ++ if (params_key == NULL) ++ params_key = load_keyparams_suppress(infile, informat, 1, "SM2", ++ "SM2 parameters", 1); ++ ++ if (params_key == NULL) { ++ BIO_printf(bio_err, "Unable to load parameters from %s\n", infile); + goto end; ++ } ++ + if (point_format + && !EVP_PKEY_set_utf8_string_param( + params_key, OSSL_PKEY_PARAM_EC_POINT_CONVERSION_FORMAT, +diff --git a/include/openssl/pem.h b/include/openssl/pem.h +index ed50f081fa..0446c77019 100644 +--- a/include/openssl/pem.h ++++ b/include/openssl/pem.h +@@ -57,6 +57,7 @@ extern "C" { + # define PEM_STRING_ECPRIVATEKEY "EC PRIVATE KEY" + # define PEM_STRING_PARAMETERS "PARAMETERS" + # define PEM_STRING_CMS "CMS" ++# define PEM_STRING_SM2PARAMETERS "SM2 PARAMETERS" + + # define PEM_TYPE_ENCRYPTED 10 + # define PEM_TYPE_MIC_ONLY 20 +diff --git a/providers/decoders.inc b/providers/decoders.inc +index 2772aad05d..edca39ea36 100644 +--- a/providers/decoders.inc ++++ b/providers/decoders.inc +@@ -69,6 +69,7 @@ DECODER_w_structure("X448", der, SubjectPublicKeyInfo, x448, yes), + # ifndef OPENSSL_NO_SM2 + DECODER_w_structure("SM2", der, PrivateKeyInfo, sm2, no), + DECODER_w_structure("SM2", der, SubjectPublicKeyInfo, sm2, no), ++DECODER_w_structure("SM2", der, type_specific_no_pub, sm2, no), + # endif + #endif + DECODER_w_structure("RSA", der, PrivateKeyInfo, rsa, yes), +diff --git a/providers/implementations/encode_decode/decode_der2key.c b/providers/implementations/encode_decode/decode_der2key.c +index ebc2d24833..d4d3731460 100644 +--- a/providers/implementations/encode_decode/decode_der2key.c ++++ b/providers/implementations/encode_decode/decode_der2key.c +@@ -783,6 +783,7 @@ MAKE_DECODER("ED448", ed448, ecx, SubjectPublicKeyInfo); + # ifndef OPENSSL_NO_SM2 + MAKE_DECODER("SM2", sm2, ec, PrivateKeyInfo); + MAKE_DECODER("SM2", sm2, ec, SubjectPublicKeyInfo); ++MAKE_DECODER("SM2", sm2, sm2, type_specific_no_pub); + # endif + #endif + MAKE_DECODER("RSA", rsa, rsa, PrivateKeyInfo); +diff --git a/providers/implementations/encode_decode/decode_pem2der.c b/providers/implementations/encode_decode/decode_pem2der.c +index bc937ffb9d..648ecd4584 100644 +--- a/providers/implementations/encode_decode/decode_pem2der.c ++++ b/providers/implementations/encode_decode/decode_pem2der.c 
+@@ -119,6 +119,7 @@ static int pem2der_decode(void *vctx, OSSL_CORE_BIO *cin, int selection, + { PEM_STRING_DSAPARAMS, OSSL_OBJECT_PKEY, "DSA", "type-specific" }, + { PEM_STRING_ECPRIVATEKEY, OSSL_OBJECT_PKEY, "EC", "type-specific" }, + { PEM_STRING_ECPARAMETERS, OSSL_OBJECT_PKEY, "EC", "type-specific" }, ++ { PEM_STRING_SM2PARAMETERS, OSSL_OBJECT_PKEY, "SM2", "type-specific" }, + { PEM_STRING_RSA, OSSL_OBJECT_PKEY, "RSA", "type-specific" }, + { PEM_STRING_RSA_PUBLIC, OSSL_OBJECT_PKEY, "RSA", "type-specific" }, + +diff --git a/providers/implementations/encode_decode/encode_key2text.c b/providers/implementations/encode_decode/encode_key2text.c +index 7d983f5e51..a92e04a89d 100644 +--- a/providers/implementations/encode_decode/encode_key2text.c ++++ b/providers/implementations/encode_decode/encode_key2text.c +@@ -512,7 +512,8 @@ static int ec_to_text(BIO *out, const void *key, int selection) + else if ((selection & OSSL_KEYMGMT_SELECT_PUBLIC_KEY) != 0) + type_label = "Public-Key"; + else if ((selection & OSSL_KEYMGMT_SELECT_DOMAIN_PARAMETERS) != 0) +- type_label = "EC-Parameters"; ++ if (EC_GROUP_get_curve_name(group) != NID_sm2) ++ type_label = "EC-Parameters"; + + if ((selection & OSSL_KEYMGMT_SELECT_PRIVATE_KEY) != 0) { + const BIGNUM *priv_key = EC_KEY_get0_private_key(ec); +@@ -538,8 +539,9 @@ static int ec_to_text(BIO *out, const void *key, int selection) + goto err; + } + +- if (BIO_printf(out, "%s: (%d bit)\n", type_label, +- EC_GROUP_order_bits(group)) <= 0) ++ if (type_label != NULL ++ && BIO_printf(out, "%s: (%d bit)\n", type_label, ++ EC_GROUP_order_bits(group)) <= 0) + goto err; + if (priv != NULL + && !print_labeled_buf(out, "priv:", priv, priv_len)) +diff --git a/providers/implementations/include/prov/implementations.h b/providers/implementations/include/prov/implementations.h +index 03ce43719e..288808bb6f 100644 +--- a/providers/implementations/include/prov/implementations.h ++++ b/providers/implementations/include/prov/implementations.h +@@ -508,6 +508,7 @@ extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_ed448_decoder_functi + #ifndef OPENSSL_NO_SM2 + extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_sm2_decoder_functions[]; + extern const OSSL_DISPATCH ossl_SubjectPublicKeyInfo_der_to_sm2_decoder_functions[]; ++extern const OSSL_DISPATCH ossl_type_specific_no_pub_der_to_sm2_decoder_functions[]; + #endif + + extern const OSSL_DISPATCH ossl_PrivateKeyInfo_der_to_rsa_decoder_functions[]; +diff --git a/test/recipes/15-test_ecparam.t b/test/recipes/15-test_ecparam.t +index 37bf620f35..5dba866378 100644 +--- a/test/recipes/15-test_ecparam.t ++++ b/test/recipes/15-test_ecparam.t +@@ -25,6 +25,10 @@ my @valid = glob(data_file("valid", "*.pem")); + my @noncanon = glob(data_file("noncanon", "*.pem")); + my @invalid = glob(data_file("invalid", "*.pem")); + ++if (disabled("sm2")) { ++ @valid = grep { !/sm2-.*\.pem/} @valid; ++} ++ + plan tests => 12; + + sub checkload { +diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem +new file mode 100644 +index 0000000000..bd07654ea4 +--- /dev/null ++++ b/test/recipes/15-test_ecparam_data/valid/sm2-explicit.pem +@@ -0,0 +1,7 @@ ++-----BEGIN SM2 PARAMETERS----- ++MIHgAgEBMCwGByqGSM49AQECIQD////+/////////////////////wAAAAD///// ++/////zBEBCD////+/////////////////////wAAAAD//////////AQgKOn6np2f ++XjRNWp5Lz2UJp/OXifUVq4+S3by9QU2UDpMEQQQyxK4sHxmBGV+ZBEZqOcmUj+ML ++v/JmC+FxWkWJM0x0x7w3NqL09necWb3O42tpIVPQqYd8xipHQALfMuUhOfCgAiEA 
++/////v///////////////3ID32shxgUrU7v0CTnVQSMCAQE= ++-----END SM2 PARAMETERS----- +diff --git a/test/recipes/15-test_ecparam_data/valid/sm2-named.pem b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem +new file mode 100644 +index 0000000000..d6e280f6c2 +--- /dev/null ++++ b/test/recipes/15-test_ecparam_data/valid/sm2-named.pem +@@ -0,0 +1,3 @@ ++-----BEGIN SM2 PARAMETERS----- ++BggqgRzPVQGCLQ== ++-----END SM2 PARAMETERS----- +-- +2.33.0 + diff --git a/Feature-support-SM2-CMS-signature.patch b/Feature-support-SM2-CMS-signature.patch new file mode 100644 index 0000000..b579537 --- /dev/null +++ b/Feature-support-SM2-CMS-signature.patch @@ -0,0 +1,41 @@ +From e7f35b6f10599a574acb3bcca40845eeccfdc63b Mon Sep 17 00:00:00 2001 +From: Huaxin Lu <luhuaxin1@huawei.com> +Date: Fri, 1 Sep 2023 20:08:46 +0800 +Subject: [PATCH] Support SM2 CMS signature + +Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com> +--- + crypto/cms/cms_sd.c | 2 +- + crypto/evp/p_lib.c | 3 +++ + 2 files changed, 4 insertions(+), 1 deletion(-) + +diff --git a/crypto/cms/cms_sd.c b/crypto/cms/cms_sd.c +index 34c021b..093b41c 100644 +--- a/crypto/cms/cms_sd.c ++++ b/crypto/cms/cms_sd.c +@@ -232,7 +232,7 @@ static int cms_sd_asn1_ctrl(CMS_SignerInfo *si, int cmd) + EVP_PKEY *pkey = si->pkey; + int i; + +- if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC")) ++ if (EVP_PKEY_is_a(pkey, "DSA") || EVP_PKEY_is_a(pkey, "EC") || EVP_PKEY_is_a(pkey, "SM2")) + return ossl_cms_ecdsa_dsa_sign(si, cmd); + else if (EVP_PKEY_is_a(pkey, "RSA") || EVP_PKEY_is_a(pkey, "RSA-PSS")) + return ossl_cms_rsa_sign(si, cmd); +diff --git a/crypto/evp/p_lib.c b/crypto/evp/p_lib.c +index f6acb5b..9567bb0 100644 +--- a/crypto/evp/p_lib.c ++++ b/crypto/evp/p_lib.c +@@ -982,6 +982,9 @@ int EVP_PKEY_type(int type) + + int EVP_PKEY_get_id(const EVP_PKEY *pkey) + { ++ if (EVP_PKEY_is_a(pkey, "SM2")) { ++ return EVP_PKEY_SM2; ++ } + return pkey->type; + } + +-- +2.33.0 + diff --git a/Feature-use-default-id-if-SM2-id-is-not-set.patch b/Feature-use-default-id-if-SM2-id-is-not-set.patch new file mode 100644 index 0000000..ee6f2d8 --- /dev/null +++ b/Feature-use-default-id-if-SM2-id-is-not-set.patch @@ -0,0 +1,59 @@ +From 12f6ee3806c1f04a682b4c31aeb510a2dca602ef Mon Sep 17 00:00:00 2001 +From: Huaxin Lu <luhuaxin1@huawei.com> +Date: Fri, 1 Sep 2023 20:27:45 +0800 +Subject: [PATCH] use default id if SM2 id is not set + +Signed-off-by: Huaxin Lu <luhuaxin1@huawei.com> +--- + crypto/sm2/sm2_sign.c | 14 +++++++++++--- + 1 file changed, 11 insertions(+), 3 deletions(-) + +diff --git a/crypto/sm2/sm2_sign.c b/crypto/sm2/sm2_sign.c +index ff5be9b..33d3a73 100644 +--- a/crypto/sm2/sm2_sign.c ++++ b/crypto/sm2/sm2_sign.c +@@ -42,6 +42,8 @@ int ossl_sm2_compute_z_digest(uint8_t *out, + uint8_t *buf = NULL; + uint16_t entl = 0; + uint8_t e_byte = 0; ++ const uint8_t *f_id = id; ++ size_t f_id_len = id_len; + + hash = EVP_MD_CTX_new(); + ctx = BN_CTX_new_ex(ossl_ec_key_get_libctx(key)); +@@ -68,15 +70,21 @@ int ossl_sm2_compute_z_digest(uint8_t *out, + goto done; + } + ++ /* if id is not set, use default id */ ++ if (f_id == NULL || f_id_len == 0) { ++ f_id = (const uint8_t *)SM2_DEFAULT_USERID; ++ f_id_len = strlen(SM2_DEFAULT_USERID); ++ } ++ + /* Z = h(ENTL || ID || a || b || xG || yG || xA || yA) */ + +- if (id_len >= (UINT16_MAX / 8)) { ++ if (f_id_len >= (UINT16_MAX / 8)) { + /* too large */ + ERR_raise(ERR_LIB_SM2, SM2_R_ID_TOO_LARGE); + goto done; + } + +- entl = (uint16_t)(8 * id_len); ++ entl = (uint16_t)(8 * f_id_len); + + e_byte = entl >> 8; + if 
(!EVP_DigestUpdate(hash, &e_byte, 1)) { +@@ -89,7 +97,7 @@ int ossl_sm2_compute_z_digest(uint8_t *out, + goto done; + } + +- if (id_len > 0 && !EVP_DigestUpdate(hash, id, id_len)) { ++ if (f_id_len > 0 && !EVP_DigestUpdate(hash, f_id, f_id_len)) { + ERR_raise(ERR_LIB_SM2, ERR_R_EVP_LIB); + goto done; + } +-- +2.33.0 + diff --git a/Makefile.certificate b/Makefile.certificate new file mode 100644 index 0000000..cc88c52 --- /dev/null +++ b/Makefile.certificate @@ -0,0 +1,82 @@ +UTF8 := $(shell locale -c LC_CTYPE -k | grep -q charmap.*UTF-8 && echo -utf8) +DAYS=365 +KEYLEN=2048 +TYPE=rsa:$(KEYLEN) +EXTRA_FLAGS= +ifdef SERIAL + EXTRA_FLAGS+=-set_serial $(SERIAL) +endif + +.PHONY: usage +.SUFFIXES: .key .csr .crt .pem +.PRECIOUS: %.key %.csr %.crt %.pem + +usage: + @echo "This makefile allows you to create:" + @echo " o public/private key pairs" + @echo " o SSL certificate signing requests (CSRs)" + @echo " o self-signed SSL test certificates" + @echo + @echo "To create a key pair, run \"make SOMETHING.key\"." + @echo "To create a CSR, run \"make SOMETHING.csr\"." + @echo "To create a test certificate, run \"make SOMETHING.crt\"." + @echo "To create a key and a test certificate in one file, run \"make SOMETHING.pem\"." + @echo + @echo "To create a key for use with Apache, run \"make genkey\"." + @echo "To create a CSR for use with Apache, run \"make certreq\"." + @echo "To create a test certificate for use with Apache, run \"make testcert\"." + @echo + @echo "To create a test certificate with serial number other than random, add SERIAL=num" + @echo "You can also specify key length with KEYLEN=n and expiration in days with DAYS=n" + @echo "Any additional options can be passed to openssl req via EXTRA_FLAGS" + @echo + @echo Examples: + @echo " make server.key" + @echo " make server.csr" + @echo " make server.crt" + @echo " make stunnel.pem" + @echo " make genkey" + @echo " make certreq" + @echo " make testcert" + @echo " make server.crt SERIAL=1" + @echo " make stunnel.pem EXTRA_FLAGS=-sha384" + @echo " make testcert DAYS=600" + +%.pem: + umask 77 ; \ + PEM1=`/bin/mktemp /tmp/openssl.XXXXXX` ; \ + PEM2=`/bin/mktemp /tmp/openssl.XXXXXX` ; \ + /usr/bin/openssl req $(UTF8) -newkey $(TYPE) -keyout $$PEM1 -nodes -x509 -days $(DAYS) -out $$PEM2 $(EXTRA_FLAGS) ; \ + cat $$PEM1 > $@ ; \ + echo "" >> $@ ; \ + cat $$PEM2 >> $@ ; \ + $(RM) $$PEM1 $$PEM2 + +%.key: + umask 77 ; \ + /usr/bin/openssl genrsa -aes128 $(KEYLEN) > $@ + +%.csr: %.key + umask 77 ; \ + /usr/bin/openssl req $(UTF8) -new -key $^ -out $@ + +%.crt: %.key + umask 77 ; \ + /usr/bin/openssl req $(UTF8) -new -key $^ -x509 -days $(DAYS) -out $@ $(EXTRA_FLAGS) + +TLSROOT=/etc/pki/tls +KEY=$(TLSROOT)/private/localhost.key +CSR=$(TLSROOT)/certs/localhost.csr +CRT=$(TLSROOT)/certs/localhost.crt + +genkey: $(KEY) +certreq: $(CSR) +testcert: $(CRT) + +$(CSR): $(KEY) + umask 77 ; \ + /usr/bin/openssl req $(UTF8) -new -key $(KEY) -out $(CSR) + +$(CRT): $(KEY) + umask 77 ; \ + /usr/bin/openssl req $(UTF8) -new -key $(KEY) -x509 -days $(DAYS) -out $(CRT) $(EXTRA_FLAGS) diff --git a/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch new file mode 100644 index 0000000..afd87ba --- /dev/null +++ b/backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch @@ -0,0 +1,36 @@ +From a8da305fa3dd6e34ba5aab3978281f652fd12883 Mon Sep 17 00:00:00 2001 +From: yangyangtiantianlonglong <yangtianlong1224@163.com> +Date: Mon, 31 Jul 2023 07:04:41 -0700 +Subject: [PATCH] A null 
pointer dereference occurs when memory allocation + fails + +Fixes #21605 + +Reviewed-by: Hugo Landau <hlandau@openssl.org> +Reviewed-by: Matthias St. Pierre <Matthias.St.Pierre@ncp-e.com> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21606) +--- + ssl/ssl_sess.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/ssl/ssl_sess.c b/ssl/ssl_sess.c +index cda6b7cc5b..2a5d21be79 100644 +--- a/ssl/ssl_sess.c ++++ b/ssl/ssl_sess.c +@@ -139,8 +139,11 @@ SSL_SESSION *ssl_session_dup(SSL_SESSION *src, int ticket) + dest->references = 1; + + dest->lock = CRYPTO_THREAD_lock_new(); +- if (dest->lock == NULL) ++ if (dest->lock == NULL) { ++ OPENSSL_free(dest); ++ dest = NULL; + goto err; ++ } + + if (!CRYPTO_new_ex_data(CRYPTO_EX_INDEX_SSL_SESSION, dest, &dest->ex_data)) + goto err; +-- +2.27.0 + diff --git a/backport-Add-a-test-for-CVE-2023-3446.patch b/backport-Add-a-test-for-CVE-2023-3446.patch new file mode 100644 index 0000000..6c5f734 --- /dev/null +++ b/backport-Add-a-test-for-CVE-2023-3446.patch @@ -0,0 +1,63 @@ +From 8a62fd996cb1c22383ec75b4155d54dec4a1b0ee Mon Sep 17 00:00:00 2001 +From: Matt Caswell <matt@openssl.org> +Date: Fri, 7 Jul 2023 14:39:48 +0100 +Subject: [PATCH] Add a test for CVE-2023-3446 + +Confirm that the only errors DH_check() finds with DH parameters with an +excessively long modulus is that the modulus is too large. We should not +be performing time consuming checks using that modulus. + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com> +Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21451) + +(cherry picked from commit ede782b4c8868d1f09c9cd237f82b6f35b7dba8b) +--- + test/dhtest.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +diff --git a/test/dhtest.c b/test/dhtest.c +index 7b587f3cfa..f8dd8f3aa7 100644 +--- a/test/dhtest.c ++++ b/test/dhtest.c +@@ -73,7 +73,7 @@ static int dh_test(void) + goto err1; + + /* check fails, because p is way too small */ +- if (!DH_check(dh, &i)) ++ if (!TEST_true(DH_check(dh, &i))) + goto err2; + i ^= DH_MODULUS_TOO_SMALL; + if (!TEST_false(i & DH_CHECK_P_NOT_PRIME) +@@ -124,6 +124,17 @@ static int dh_test(void) + /* We'll have a stale error on the queue from the above test so clear it */ + ERR_clear_error(); + ++ /* Modulus of size: dh check max modulus bits + 1 */ ++ if (!TEST_true(BN_set_word(p, 1)) ++ || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS))) ++ goto err3; ++ ++ /* ++ * We expect no checks at all for an excessively large modulus ++ */ ++ if (!TEST_false(DH_check(dh, &i))) ++ goto err3; ++ + /* + * II) key generation + */ +@@ -138,7 +149,7 @@ static int dh_test(void) + goto err3; + + /* ... 
and check whether it is valid */ +- if (!DH_check(a, &i)) ++ if (!TEST_true(DH_check(a, &i))) + goto err3; + if (!TEST_false(i & DH_CHECK_P_NOT_PRIME) + || !TEST_false(i & DH_CHECK_P_NOT_SAFE_PRIME) +-- +2.27.0 + diff --git a/backport-Add-testcases-for-empty-associated-data-entries-with.patch b/backport-Add-testcases-for-empty-associated-data-entries-with.patch new file mode 100644 index 0000000..74126e7 --- /dev/null +++ b/backport-Add-testcases-for-empty-associated-data-entries-with.patch @@ -0,0 +1,66 @@ +From 96318a8d21bed334d78797eca5b32790775d5f05 Mon Sep 17 00:00:00 2001 +From: Tomas Mraz <tomas@openssl.org> +Date: Tue, 4 Jul 2023 17:50:37 +0200 +Subject: [PATCH] Add testcases for empty associated data entries with AES-SIV + +Reviewed-by: Matt Caswell <matt@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21384) + +(cherry picked from commit 3993bb0c0c87e3ed0ab4274e4688aa814e164cfc) +--- + .../30-test_evp_data/evpciph_aes_siv.txt | 31 +++++++++++++++++++ + 1 file changed, 31 insertions(+) + +diff --git a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt +index a78a49158d..e434f13f41 100644 +--- a/test/recipes/30-test_evp_data/evpciph_aes_siv.txt ++++ b/test/recipes/30-test_evp_data/evpciph_aes_siv.txt +@@ -20,6 +20,19 @@ Tag = 85632d07c6e8f37f950acd320a2ecc93 + Plaintext = 112233445566778899aabbccddee + Ciphertext = 40c02b9690c4dc04daef7f6afe5c + ++Cipher = aes-128-siv ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ++Tag = f1c5fdeac1f15a26779c1501f9fb7588 ++Plaintext = 112233445566778899aabbccddee ++Ciphertext = 27e946c669088ab06da58c5c831c ++ ++Cipher = aes-128-siv ++Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff ++AAD = ++Tag = d1022f5b3664e5a4dfaf90f85be6f28a ++Plaintext = 112233445566778899aabbccddee ++Ciphertext = b66cff6b8eca0b79f083b39a0901 ++ + Cipher = aes-128-siv + Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f + AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100 +@@ -29,6 +42,24 @@ Tag = 7bdb6e3b432667eb06f4d14bff2fbd0f + Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553 + Ciphertext = cb900f2fddbe404326601965c889bf17dba77ceb094fa663b7a3f748ba8af829ea64ad544a272e9c485b62a3fd5c0d + ++Cipher = aes-128-siv ++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f ++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100 ++AAD = ++AAD = 09f911029d74e35bd84156c5635688c0 ++Tag = 83ce6593a8fa67eb6fcd2819cedfc011 ++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553 ++Ciphertext = 30d937b42f71f71f93fc2d8d702d3eac8dc7651eefcd81120081ff29d626f97f3de17f2969b691c91b69b652bf3a6d ++ ++Cipher = aes-128-siv ++Key = 7f7e7d7c7b7a79787776757473727170404142434445464748494a4b4c4d4e4f ++AAD = ++AAD = 00112233445566778899aabbccddeeffdeaddadadeaddadaffeeddccbbaa99887766554433221100 ++AAD = 09f911029d74e35bd84156c5635688c0 ++Tag = 77dd4a44f5a6b41302121ee7f378de25 ++Plaintext = 7468697320697320736f6d6520706c61696e7465787420746f20656e6372797074207573696e67205349562d414553 ++Ciphertext = 0fcd664c922464c88939d71fad7aefb864e501b0848a07d39201c1067a7288f3dadf0131a823a0bc3d588e8564a5fe ++ + Cipher = aes-192-siv + Key = fffefdfcfbfaf9f8f7f6f5f4f3f2f1f0f0f1f2f3f4f5f6f7f8f9fafbfcfdfefffffefdfcfbfaf9f8f7f6f5f4f3f2f1f0 + AAD = 
101112131415161718191a1b1c1d1e1f2021222324252627 +-- +2.27.0 + diff --git a/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch new file mode 100644 index 0000000..13ad1a2 --- /dev/null +++ b/backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch @@ -0,0 +1,61 @@ +From 9002fd07327a91f35ba6c1307e71fa6fd4409b7f Mon Sep 17 00:00:00 2001 +From: Tomas Mraz <tomas@openssl.org> +Date: Tue, 25 Jul 2023 15:22:48 +0200 +Subject: [PATCH] DH_check(): Do not try checking q properties if it is + obviously invalid + +If |q| >= |p| then the q value is obviously wrong as q +is supposed to be a prime divisor of p-1. + +We check if p is overly large so this added test implies that +q is not large either when performing subsequent tests using that +q value. + +Otherwise if it is too large these additional checks of the q value +such as the primality test can then trigger DoS by doing overly long +computations. + +Fixes CVE-2023-3817 + +Reviewed-by: Matt Caswell <matt@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com> +Reviewed-by: Todd Short <todd.short@me.com> +(Merged from https://github.com/openssl/openssl/pull/21550) + +(cherry picked from commit 1c16253f3c3a8d1e25918c3f404aae6a5b0893de) +(cherry picked from commit 6a1eb62c29db6cb5eec707f9338aee00f44e26f5) +--- + crypto/dh/dh_check.c | 9 ++++++++- + 1 file changed, 8 insertions(+), 1 deletion(-) + +diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c +index aef6f9b1b7..fbe2797569 100644 +--- a/crypto/dh/dh_check.c ++++ b/crypto/dh/dh_check.c +@@ -143,7 +143,7 @@ int DH_check(const DH *dh, int *ret) + #ifdef FIPS_MODULE + return DH_check_params(dh, ret); + #else +- int ok = 0, r; ++ int ok = 0, r, q_good = 0; + BN_CTX *ctx = NULL; + BIGNUM *t1 = NULL, *t2 = NULL; + int nid = DH_get_nid((DH *)dh); +@@ -172,6 +172,13 @@ int DH_check(const DH *dh, int *ret) + goto err; + + if (dh->params.q != NULL) { ++ if (BN_ucmp(dh->params.p, dh->params.q) > 0) ++ q_good = 1; ++ else ++ *ret |= DH_CHECK_INVALID_Q_VALUE; ++ } ++ ++ if (q_good) { + if (BN_cmp(dh->params.g, BN_value_one()) <= 0) + *ret |= DH_NOT_SUITABLE_GENERATOR; + else if (BN_cmp(dh->params.g, dh->params.p) >= 0) +-- +2.27.0 + diff --git a/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch new file mode 100644 index 0000000..98b1a0b --- /dev/null +++ b/backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch @@ -0,0 +1,57 @@ +From 00e2f5eea29994d19293ec4e8c8775ba73678598 Mon Sep 17 00:00:00 2001 +From: Tomas Mraz <tomas@openssl.org> +Date: Tue, 4 Jul 2023 17:30:35 +0200 +Subject: [PATCH] Do not ignore empty associated data with AES-SIV mode + +The AES-SIV mode allows for multiple associated data items +authenticated separately with any of these being 0 length. + +The provided implementation ignores such empty associated data +which is incorrect in regards to the RFC 5297 and is also +a security issue because such empty associated data then become +unauthenticated if an application expects to authenticate them. 
+ +Fixes CVE-2023-2975 + +Reviewed-by: Matt Caswell <matt@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21384) + +(cherry picked from commit c426c281cfc23ab182f7d7d7a35229e7db1494d9) +--- + .../implementations/ciphers/cipher_aes_siv.c | 18 +++++++++++------- + 1 file changed, 11 insertions(+), 7 deletions(-) + +diff --git a/providers/implementations/ciphers/cipher_aes_siv.c b/providers/implementations/ciphers/cipher_aes_siv.c +index 45010b90db..b396c8651a 100644 +--- a/providers/implementations/ciphers/cipher_aes_siv.c ++++ b/providers/implementations/ciphers/cipher_aes_siv.c +@@ -120,14 +120,18 @@ static int siv_cipher(void *vctx, unsigned char *out, size_t *outl, + if (!ossl_prov_is_running()) + return 0; + +- if (inl == 0) { +- *outl = 0; +- return 1; +- } ++ /* Ignore just empty encryption/decryption call and not AAD. */ ++ if (out != NULL) { ++ if (inl == 0) { ++ if (outl != NULL) ++ *outl = 0; ++ return 1; ++ } + +- if (outsize < inl) { +- ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL); +- return 0; ++ if (outsize < inl) { ++ ERR_raise(ERR_LIB_PROV, PROV_R_OUTPUT_BUFFER_TOO_SMALL); ++ return 0; ++ } + } + + if (ctx->hw->cipher(ctx, out, in, inl) <= 0) +-- +2.27.0 + diff --git a/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch new file mode 100644 index 0000000..53ddf3b --- /dev/null +++ b/backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch @@ -0,0 +1,74 @@ +From 1fa20cf2f506113c761777127a38bce5068740eb Mon Sep 17 00:00:00 2001 +From: Matt Caswell <matt@openssl.org> +Date: Thu, 6 Jul 2023 16:36:35 +0100 +Subject: [PATCH] Fix DH_check() excessive time with over sized modulus + +The DH_check() function checks numerous aspects of the key or parameters +that have been supplied. Some of those checks use the supplied modulus +value even if it is excessively large. + +There is already a maximum DH modulus size (10,000 bits) over which +OpenSSL will not generate or derive keys. DH_check() will however still +perform various tests for validity on such a large modulus. We introduce a +new maximum (32,768) over which DH_check() will just fail. + +An application that calls DH_check() and supplies a key or parameters +obtained from an untrusted source could be vulnerable to a Denial of +Service attack. + +The function DH_check() is itself called by a number of other OpenSSL +functions. An application calling any of those other functions may +similarly be affected. The other functions affected by this are +DH_check_ex() and EVP_PKEY_param_check(). 
+ +CVE-2023-3446 + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com> +Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21451) + +(cherry picked from commit 9e0094e2aa1b3428a12d5095132f133c078d3c3d) +--- + crypto/dh/dh_check.c | 6 ++++++ + include/openssl/dh.h | 6 +++++- + 2 files changed, 11 insertions(+), 1 deletion(-) + +diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c +index 0b391910d6..84a926998e 100644 +--- a/crypto/dh/dh_check.c ++++ b/crypto/dh/dh_check.c +@@ -152,6 +152,12 @@ int DH_check(const DH *dh, int *ret) + if (nid != NID_undef) + return 1; + ++ /* Don't do any checks at all with an excessively large modulus */ ++ if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) { ++ ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE); ++ return 0; ++ } ++ + if (!DH_check_params(dh, ret)) + return 0; + +diff --git a/include/openssl/dh.h b/include/openssl/dh.h +index b97871eca7..36420f51d8 100644 +--- a/include/openssl/dh.h ++++ b/include/openssl/dh.h +@@ -89,7 +89,11 @@ int EVP_PKEY_CTX_get0_dh_kdf_ukm(EVP_PKEY_CTX *ctx, unsigned char **ukm); + # include <openssl/dherr.h> + + # ifndef OPENSSL_DH_MAX_MODULUS_BITS +-# define OPENSSL_DH_MAX_MODULUS_BITS 10000 ++# define OPENSSL_DH_MAX_MODULUS_BITS 10000 ++# endif ++ ++# ifndef OPENSSL_DH_CHECK_MAX_MODULUS_BITS ++# define OPENSSL_DH_CHECK_MAX_MODULUS_BITS 32768 + # endif + + # define OPENSSL_DH_FIPS_MIN_MODULUS_BITS 1024 +-- +2.27.0 + diff --git a/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch new file mode 100644 index 0000000..91e9417 --- /dev/null +++ b/backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch @@ -0,0 +1,39 @@ +From e648db50d9a63f71cab5cb78424c2932d019a744 Mon Sep 17 00:00:00 2001 +From: Bernd Edlinger <bernd.edlinger@hotmail.de> +Date: Sun, 23 Jul 2023 14:27:54 +0200 +Subject: [PATCH] Make DH_check set some error bits in recently added error + +The pre-existing error cases where DH_check returned zero +are not related to the dh params in any way, but are only +triggered by out-of-memory errors, therefore having *ret +set to zero feels right, but since the new error case is +triggered by too large p values that is something different. +On the other hand some callers of this function might not +be prepared to handle the return value correctly but only +rely on *ret. Therefore we set some error bits in *ret as +additional safety measure. 
+ +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/21524) + +(cherry picked from commit 81d10e61a4b7d5394d08a718bf7d6bae20e818fc) +--- + crypto/dh/dh_check.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/crypto/dh/dh_check.c b/crypto/dh/dh_check.c +index 84a926998e..aef6f9b1b7 100644 +--- a/crypto/dh/dh_check.c ++++ b/crypto/dh/dh_check.c +@@ -155,6 +155,7 @@ int DH_check(const DH *dh, int *ret) + /* Don't do any checks at all with an excessively large modulus */ + if (BN_num_bits(dh->params.p) > OPENSSL_DH_CHECK_MAX_MODULUS_BITS) { + ERR_raise(ERR_LIB_DH, DH_R_MODULUS_TOO_LARGE); ++ *ret = DH_MODULUS_TOO_LARGE | DH_CHECK_P_NOT_PRIME; + return 0; + } + +-- +2.27.0 + diff --git a/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch new file mode 100644 index 0000000..d5d7890 --- /dev/null +++ b/backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch @@ -0,0 +1,53 @@ +From 2255f6c74e6c8b702adcf352b04c5d3e6c759745 Mon Sep 17 00:00:00 2001 +From: Tomas Mraz <tomas@openssl.org> +Date: Tue, 25 Jul 2023 15:23:43 +0200 +Subject: [PATCH] dhtest.c: Add test of DH_check() with q = p + 1 + +This must fail with DH_CHECK_INVALID_Q_VALUE and +with DH_CHECK_Q_NOT_PRIME unset. + +Reviewed-by: Matt Caswell <matt@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tom Cosgrove <tom.cosgrove@arm.com> +Reviewed-by: Todd Short <todd.short@me.com> +(Merged from https://github.com/openssl/openssl/pull/21550) + +(cherry picked from commit ad5d35572695d7b5748b2bd4fb1afaa189b29e28) +(cherry picked from commit 1478ffad3f123550ec1014642d5c880dfbe270ef) +--- + test/dhtest.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/test/dhtest.c b/test/dhtest.c +index f8dd8f3aa7..d02b3b7c58 100644 +--- a/test/dhtest.c ++++ b/test/dhtest.c +@@ -124,6 +124,15 @@ static int dh_test(void) + /* We'll have a stale error on the queue from the above test so clear it */ + ERR_clear_error(); + ++ if (!TEST_ptr(BN_copy(q, p)) || !TEST_true(BN_add(q, q, BN_value_one()))) ++ goto err3; ++ ++ if (!TEST_true(DH_check(dh, &i))) ++ goto err3; ++ if (!TEST_true(i & DH_CHECK_INVALID_Q_VALUE) ++ || !TEST_false(i & DH_CHECK_Q_NOT_PRIME)) ++ goto err3; ++ + /* Modulus of size: dh check max modulus bits + 1 */ + if (!TEST_true(BN_set_word(p, 1)) + || !TEST_true(BN_lshift(p, p, OPENSSL_DH_CHECK_MAX_MODULUS_BITS))) +@@ -135,6 +144,9 @@ static int dh_test(void) + if (!TEST_false(DH_check(dh, &i))) + goto err3; + ++ /* We'll have a stale error on the queue from the above test so clear it */ ++ ERR_clear_error(); ++ + /* + * II) key generation + */ +-- +2.27.0 + diff --git a/openssl-3.0-build.patch b/openssl-3.0-build.patch new file mode 100644 index 0000000..83243e1 --- /dev/null +++ b/openssl-3.0-build.patch @@ -0,0 +1,38 @@ +From 262bff1615d4461120327c5a9fe904ad1c6ce813 Mon Sep 17 00:00:00 2001 +From: hzero1996 <wangcheng156@huawei.com> +Date: Sun, 29 Jan 2023 14:53:03 +0800 +Subject: [PATCH] openssl-3.0-build + +--- + Configurations/10-main.conf | 1 + + Configurations/unix-Makefile.tmpl | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +diff --git a/Configurations/10-main.conf b/Configurations/10-main.conf +index b578a3c..1ad81c3 100644 +--- a/Configurations/10-main.conf ++++ b/Configurations/10-main.conf +@@ -772,6 +772,7 @@ my %targets = ( + inherit_from => [ "linux-generic64" ], + asm_arch => 'aarch64', + perlasm_scheme => "linux64", ++ 
multilib => "64", + }, + "linux-arm64ilp32" => { # https://wiki.linaro.org/Platform/arm64-ilp32 + inherit_from => [ "linux-generic32" ], +diff --git a/Configurations/unix-Makefile.tmpl b/Configurations/unix-Makefile.tmpl +index 110ba06..712a779 100644 +--- a/Configurations/unix-Makefile.tmpl ++++ b/Configurations/unix-Makefile.tmpl +@@ -611,7 +611,7 @@ install_sw: install_dev install_engines install_modules install_runtime + + uninstall_sw: uninstall_runtime uninstall_modules uninstall_engines uninstall_dev + +-install_docs: install_man_docs install_html_docs ++install_docs: install_man_docs + + uninstall_docs: uninstall_man_docs uninstall_html_docs + $(RM) -r $(DESTDIR)$(DOCDIR) +-- +2.27.0 + diff --git a/openssl.spec b/openssl.spec new file mode 100644 index 0000000..c51ad86 --- /dev/null +++ b/openssl.spec @@ -0,0 +1,94 @@ +%define install_prefix /opt/openssl3 +%define soversion 3 +Name: openssl3 +Epoch: 1 +Version: 3.0.9 +Release: 1 +Summary: Cryptography and SSL/TLS Toolkit +License: OpenSSL and SSLeay +URL: https://www.openssl.org/ +Source0: https://www.openssl.org/source/openssl-%{version}.tar.gz +Source1: Makefile.certificate + +Patch1: openssl-3.0-build.patch +Patch2: Backport-aarch64-support-BTI-and-pointer-authentication-in-as.patch +Patch3: Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch +Patch4: Backport-Fix-sm3ss1-translation-issue-in-sm3-armv8.pl.patch +Patch5: Backport-providers-Add-SM4-GCM-implementation.patch +Patch6: Backport-SM4-optimization-for-ARM-by-HW-instruction.patch +Patch7: Backport-Further-acceleration-for-SM4-GCM-on-ARM.patch +Patch8: Backport-SM4-optimization-for-ARM-by-ASIMD.patch +Patch9: Backport-providers-Add-SM4-XTS-implementation.patch +Patch10: Backport-Fix-SM4-CBC-regression-on-Armv8.patch +Patch11: Backport-Fix-SM4-test-failures-on-big-endian-ARM-processors.patch +Patch12: Backport-Apply-SM4-optimization-patch-to-Kunpeng-920.patch +Patch13: Backport-SM4-AESE-optimization-for-ARMv8.patch +Patch14: Backport-Fix-SM4-XTS-build-failure-on-Mac-mini-M1.patch +Patch15: backport-Add-testcases-for-empty-associated-data-entries-with.patch +Patch16: backport-Do-not-ignore-empty-associated-data-with-AES-SIV-mod.patch +Patch17: backport-Add-a-test-for-CVE-2023-3446.patch +Patch18: backport-Fix-DH_check-excessive-time-with-over-sized-modulus.patch +Patch19: backport-Make-DH_check-set-some-error-bits-in-recently-added-.patch +Patch20: backport-DH_check-Do-not-try-checking-q-properties-if-it-is-o.patch +Patch21: backport-dhtest.c-Add-test-of-DH_check-with-q-p-1.patch +Patch22: Backport-support-decode-SM2-parameters.patch +Patch23: Feature-support-SM2-CMS-signature.patch +Patch24: Feature-use-default-id-if-SM2-id-is-not-set.patch +Patch25: backport-A-null-pointer-dereference-occurs-when-memory-alloca.patch + +BuildRequires: gcc gcc-c++ perl make lksctp-tools-devel coreutils util-linux zlib-devel +Requires: coreutils + +%description +OpenSSL is a robust, commercial-grade, and full-featured toolkit for the +Transport Layer Security (TLS) and Secure Sockets Layer (SSL) protocols. 
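
The patch stack listed above backports SM4-GCM/XTS provider support along with the AArch64 SM3/SM4 optimizations. A minimal sketch of exercising that support through the generic EVP fetch API follows; the algorithm name "SM4-GCM" is an assumption based on the backported provider patches, the helper function is illustrative rather than part of the package, and error handling is abbreviated.

#include <openssl/evp.h>

/* One-shot SM4-GCM encryption of ptlen bytes; returns 1 on success. */
static int sm4_gcm_seal(const unsigned char key[16], const unsigned char iv[12],
                        const unsigned char *pt, int ptlen,
                        unsigned char *ct, unsigned char tag[16])
{
    /* "SM4-GCM" is assumed to be registered by the backported provider. */
    EVP_CIPHER *cipher = EVP_CIPHER_fetch(NULL, "SM4-GCM", NULL);
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, finl = 0, ok = 0;

    if (cipher == NULL || ctx == NULL)
        goto done;
    /* 12-byte IV matches the default GCM IV length, so no IVLEN ctrl needed. */
    if (EVP_EncryptInit_ex2(ctx, cipher, key, iv, NULL) == 1
            && EVP_EncryptUpdate(ctx, ct, &outl, pt, ptlen) == 1
            && EVP_EncryptFinal_ex(ctx, ct + outl, &finl) == 1
            && EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_AEAD_GET_TAG, 16, tag) == 1)
        ok = 1;
 done:
    EVP_CIPHER_CTX_free(ctx);
    EVP_CIPHER_free(cipher);
    return ok;
}
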
+ +%prep +%autosetup -n openssl-%{version} -p1 + +%build + +sslarch=%{_os}-%{_target_cpu} +%ifarch i686 +sslarch=linux-elf +%endif +%ifarch riscv64 +sslarch=%{_os}64-%{_target_cpu} +%endif + +%ifarch x86_64 aarch64 +sslflags=enable-ec_nistp_64_gcc_128 +%endif + +RPM_OPT_FLAGS="$RPM_OPT_FLAGS -Wa,--noexecstack -Wa,--generate-missing-build-notes=yes -DPURIFY $RPM_LD_FLAGS" +./Configure \ + --prefix=%{install_prefix} -Wl,-rpath,%{install_prefix}/lib ${sslflags} \ + zlib enable-camellia enable-seed enable-rfc3779 \ + enable-cms enable-md2 enable-rc5 ${ktlsopt} enable-fips\ + no-mdc2 no-ec2m enable-sm2 enable-sm4 enable-buildtest-c++\ + shared ${sslarch} $RPM_OPT_FLAGS '-DDEVRANDOM="\"/dev/urandom\""' \ + -Wl,--allow-multiple-definition + + +%make_build all + +%install +# Install OpenSSL. +#install -d $RPM_BUILD_ROOT{%{_bindir},%{_includedir},%{_libdir},%{_mandir},%{_libdir}/openssl,%{_pkgdocdir}} + +%make_install + +rm -f %{buildroot}%{install_prefix}{/bin/c_rehash,/ssl/misc/tsget*,/ssl/misc/*.pl} + +export QA_RPATHS=$(( 0x0002 )) + +%check +%make_build test + +%files +%license LICENSE.txt +%{install_prefix} + +%changelog +* Mon Oct 02 2023 Funda Wang <fundawang@yeah.net> - 3.0.9-1 +- Try install into /opt @@ -0,0 +1 @@ +8b2aff668b8ce0da24b9505ebfd26b4d openssl-3.0.9.tar.gz |
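
Because the package installs a parallel toolkit under /opt/openssl3 (see install_prefix and the rpath passed to Configure above), a short sketch for confirming that a consumer links against it rather than the system OpenSSL. The compile line is an assumption derived from that prefix, and the library directory may be lib64 rather than lib depending on the multilib setting.

/*
 * Build roughly as:
 *   cc version.c -I/opt/openssl3/include -L/opt/openssl3/lib \
 *      -Wl,-rpath,/opt/openssl3/lib -lcrypto
 */
#include <stdio.h>
#include <openssl/crypto.h>
#include <openssl/opensslv.h>

int main(void)
{
    /* Header version this program was compiled against. */
    printf("built with  : %s\n", OPENSSL_VERSION_TEXT);
    /* Library version actually loaded at run time (should report 3.0.9). */
    printf("running with: %s\n", OpenSSL_version(OPENSSL_VERSION));
    return 0;
}
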