diff options
Diffstat (limited to 'Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch')
-rw-r--r-- | Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch | 457 |
1 files changed, 457 insertions, 0 deletions
diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch new file mode 100644 index 0000000..0467d78 --- /dev/null +++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch @@ -0,0 +1,457 @@ +From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001 +From: "fangming.fang" <fangming.fang@arm.com> +Date: Fri, 24 Dec 2021 08:29:04 +0000 +Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on + aarch64 + +SM3 hardware instruction is optional feature of crypto extension for +aarch64. This implementation accelerates SM3 via SM3 instructions. For +the platform not supporting SM3 instruction, the original C +implementation still works. Thanks to AliBaba for testing and reporting +the following perf numbers for Yitian710: + +Benchmark on T-Head Yitian-710 2.75GHz: + +Before: +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k + +After (33% - 74% faster): +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17454) +--- + crypto/arm64cpuid.pl | 8 + + crypto/arm_arch.h | 2 + + crypto/armcap.c | 10 ++ + crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++ + crypto/sm3/build.info | 21 ++- + crypto/sm3/sm3_local.h | 16 +- + 6 files changed, 336 insertions(+), 3 deletions(-) + create mode 100644 crypto/sm3/asm/sm3-armv8.pl + +diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl +index 11f0e50279..10d267b7ad 100755 +--- a/crypto/arm64cpuid.pl ++++ b/crypto/arm64cpuid.pl +@@ -96,6 +96,14 @@ _armv8_cpuid_probe: + ret + .size _armv8_cpuid_probe,.-_armv8_cpuid_probe + ++.globl _armv8_sm3_probe ++.type 
_armv8_sm3_probe,%function ++_armv8_sm3_probe: ++ AARCH64_VALID_CALL_TARGET ++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s ++ ret ++.size _armv8_sm3_probe,.-_armv8_sm3_probe ++ + .globl OPENSSL_cleanse + .type OPENSSL_cleanse,%function + .align 5 +diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h +index a815a5c72b..c8b501f34c 100644 +--- a/crypto/arm_arch.h ++++ b/crypto/arm_arch.h +@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized; + # define ARMV8_PMULL (1<<5) + # define ARMV8_SHA512 (1<<6) + # define ARMV8_CPUID (1<<7) ++# define ARMV8_RNG (1<<8) ++# define ARMV8_SM3 (1<<9) + + /* + * MIDR_EL1 system register +diff --git a/crypto/armcap.c b/crypto/armcap.c +index c021330e32..365a48df45 100644 +--- a/crypto/armcap.c ++++ b/crypto/armcap.c +@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void); + void _armv8_sha256_probe(void); + void _armv8_pmull_probe(void); + # ifdef __aarch64__ ++void _armv8_sm3_probe(void); + void _armv8_sha512_probe(void); + unsigned int _armv8_cpuid_probe(void); + # endif +@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key) + # define HWCAP_CE_SHA1 (1 << 5) + # define HWCAP_CE_SHA256 (1 << 6) + # define HWCAP_CPUID (1 << 11) ++# define HWCAP_CE_SM3 (1 << 18) + # define HWCAP_CE_SHA512 (1 << 21) + # endif + +@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void) + + if (hwcap & HWCAP_CPUID) + OPENSSL_armcap_P |= ARMV8_CPUID; ++ ++ if (hwcap & HWCAP_CE_SM3) ++ OPENSSL_armcap_P |= ARMV8_SM3; + # endif + } + # endif +@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void) + _armv8_sha512_probe(); + OPENSSL_armcap_P |= ARMV8_SHA512; + } ++ ++ if (sigsetjmp(ill_jmp, 1) == 0) { ++ _armv8_sm3_probe(); ++ OPENSSL_armcap_P |= ARMV8_SM3; ++ } + # endif + } + # endif +diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl +new file mode 100644 +index 0000000000..bb71b2eade +--- /dev/null ++++ b/crypto/sm3/asm/sm3-armv8.pl +@@ -0,0 +1,282 @@ ++#! /usr/bin/env perl ++# Copyright 2021 The OpenSSL Project Authors. 
All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++# This module implements support for Armv8 SM3 instructions ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++# Message expanding: ++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6] ++# Input: s0, s1, s2, s3 ++# s0 = w0 | w1 | w2 | w3 ++# s1 = w4 | w5 | w6 | w7 ++# s2 = w8 | w9 | w10 | w11 ++# s3 = w12 | w13 | w14 | w15 ++# Output: s4 ++sub msg_exp () { ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++$code.=<<___; ++ // s4 = w7 | w8 | w9 | w10 ++ ext $s4.16b, $s1.16b, $s2.16b, #12 ++ // vtmp1 = w3 | w4 | w5 | w6 ++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12 ++ // vtmp2 = w10 | w11 | w12 | w13 ++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8 ++ sm3partw1 $s4.4s, $s0.4s, $s3.4s ++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s ++___ ++} ++ ++# A round of the compression function ++# Input: ++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b ++# vstate0 - vstate1, store digest status(A - H) ++# vconst0 - vconst1, interleaved used to store Tj <<< j ++# vtmp - temporary register ++# vw - for sm3tt1ab, vw = s0 eor s1 ++# s0 - for sm3tt2ab, just be s0 ++# i, choose wj' or wj from 
vw ++sub round () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp = shift; ++my $vw = shift; ++my $s0 = shift; ++my $i = shift; ++$code.=<<___; ++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s ++ shl $vconst1.4s, $vconst0.4s, #1 ++ sri $vconst1.4s, $vconst0.4s, #31 ++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i] ++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i] ++___ ++} ++ ++sub qround () { ++my $ab = shift; ++my $vstate0 = shift; ++my $vstate1 = shift; ++my $vconst0 = shift; ++my $vconst1 = shift; ++my $vtmp1 = shift; ++my $vtmp2 = shift; ++my $s0 = shift; ++my $s1 = shift; ++my $s2 = shift; ++my $s3 = shift; ++my $s4 = shift; ++ if($s4) { ++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2); ++ } ++$code.=<<___; ++ eor $vtmp1.16b, $s0.16b, $s1.16b ++___ ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 0); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 1); ++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2, ++ $vtmp1, $s0, 2); ++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2, ++ $vtmp1, $s0, 3); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8.2-a+sm4 ++.text ++___ ++ ++{{{ ++my ($pstate,$pdata,$num)=("x0","x1","w2"); ++my ($state1,$state2)=("v5","v6"); ++my ($sconst1, $sconst2)=("s16","s17"); ++my ($vconst1, $vconst2)=("v16","v17"); ++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4)); ++my ($bkstate1,$bkstate2)=("v18","v19"); ++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21"); ++my ($vtmp1,$vtmp2)=("v22","v23"); ++my $constaddr="x8"; ++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num) ++$code.=<<___; ++.globl ossl_hwsm3_block_data_order ++.type ossl_hwsm3_block_data_order,%function ++.align 5 ++ossl_hwsm3_block_data_order: ++ AARCH64_VALID_CALL_TARGET ++ // load state ++ ld1 {$state1.4s-$state2.4s}, [$pstate] ++ rev64 $state1.4s, $state1.4s ++ rev64 $state2.4s, $state2.4s ++ 
ext $state1.16b, $state1.16b, $state1.16b, #8 ++ ext $state2.16b, $state2.16b, $state2.16b, #8 ++ ++ adr $constaddr, .Tj ++ ldp $sconst1, $sconst2, [$constaddr] ++ ++.Loop: ++ // load input ++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64 ++ sub $num, $num, #1 ++ ++ mov $bkstate1.16b, $state1.16b ++ mov $bkstate2.16b, $state2.16b ++ ++#ifndef __ARMEB__ ++ rev32 $s0.16b, $s0.16b ++ rev32 $s1.16b, $s1.16b ++ rev32 $s2.16b, $s2.16b ++ rev32 $s3.16b, $s3.16b ++#endif ++ ++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4 ++___ ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ ++$code.=<<___; ++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4 ++___ ++ ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4,$s0,$s1,$s2); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s4,$s0,$s1,$s2,$s3); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s0,$s1,$s2,$s3,$s4); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s1,$s2,$s3,$s4,$s0); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s2,$s3,$s4,$s0,$s1); ++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2, ++ $s3,$s4); ++ 
&qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++                $s4,$s0);
++        &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++                $s0,$s1);
++
++$code.=<<___;
++        eor     $state1.16b, $state1.16b, $bkstate1.16b
++        eor     $state2.16b, $state2.16b, $bkstate2.16b
++
++        // any remained blocks?
++        cbnz    $num, .Loop
++
++        // save state
++        rev64   $state1.4s, $state1.4s
++        rev64   $state2.4s, $state2.4s
++        ext     $state1.16b, $state1.16b, $state1.16b, #8
++        ext     $state2.16b, $state2.16b, $state2.16b, #8
++        st1     {$state1.4s-$state2.4s}, [$pstate]
++        ret
++.size   ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
++
++.align  3
++.Tj:
++.word   0x79cc4519, 0x9d8a7a87
++___
++}}}
++
++#########################################
++my %sm3partopcode = (           # base opcodes; register fields are OR-ed in below
++        "sm3partw1"         =>   0xce60C000,
++        "sm3partw2"         =>   0xce60C400);
++
++my %sm3ss1opcode = (            # key must be "sm3ss1" (digit one), the mnemonic round() emits
++        "sm3ss1"            =>   0xce400000);
++
++my %sm3ttopcode = (
++        "sm3tt1a"           =>   0xce408000,
++        "sm3tt1b"           =>   0xce408400,
++        "sm3tt2a"           =>   0xce408800,
++        "sm3tt2b"           =>   0xce408C00);
++
++sub unsm3part {                 # three registers -> bits 0, 5 and 16
++        my ($mnemonic,$arg)=@_;
++
++        $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
++        &&
++        sprintf ".inst\t0x%08x\t//%s %s",
++                        $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
++                        $mnemonic,$arg;
++}
++
++# Four registers -> bits 0, 5, 16 and 10. The pattern stays on one line: no /x, so a line break would be matched literally.
++sub unsm3ss1 {
++        my ($mnemonic,$arg)=@_;
++
++        $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
++        &&
++        sprintf ".inst\t0x%08x\t//%s %s",
++                        $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
++                        $mnemonic,$arg;
++}
++
++sub unsm3tt {                   # three registers plus a lane index [0-3] -> bits 0, 5, 16 and 12
++        my ($mnemonic,$arg)=@_;
++
++        $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
++        &&
++        sprintf ".inst\t0x%08x\t//%s %s",
++                        $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
++                        $mnemonic,$arg;
++}
++
++open SELF,$0;
++while(<SELF>) {                 # copy this script's leading comment block into the output as // lines
++        next if (/^#!/);
++        last if (!s/^#/\/\// and !/^$/);
++        print;
++}
++close SELF;
++
++foreach(split("\n",$code)) {    # rewrite every SM3 mnemonic in $code into a raw .inst word
++        s/\`([^\`]*)\`/eval($1)/ge;
++
++        s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
++        s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
++        s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
++        print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
+index eca68216f2..2fa54a4a8b 100644
+--- a/crypto/sm3/build.info
++++ b/crypto/sm3/build.info
+@@ -1,5 +1,22 @@
+ LIBS=../../libcrypto
+ 
+ IF[{- !$disabled{sm3} -}]
+-  SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
+-ENDIF
+\ No newline at end of file
++  IF[{- !$disabled{asm} -}]
++    $SM3ASM_aarch64=sm3-armv8.S
++    $SM3DEF_aarch64=OPENSSL_SM3_ASM
++
++    # Now that we have defined all the arch specific variables, use the
++    # appropriate ones, and define the appropriate macros
++    IF[$SM3ASM_{- $target{asm_arch} -}]
++      $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
++      $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
++    ENDIF
++  ENDIF
++
++  SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
++  DEFINE[../../libcrypto]=$SM3DEF
++
++  GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
++  INCLUDE[sm3-armv8.o]=..
++ENDIF
++
+diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
+index 6daeb878a8..ac8a2bf768 100644
+--- a/crypto/sm3/sm3_local.h
++++ b/crypto/sm3/sm3_local.h
+@@ -32,7 +32,21 @@
+        ll=(c)->G; (void)HOST_l2c(ll, (s)); \
+        ll=(c)->H; (void)HOST_l2c(ll, (s)); \
+       } while (0)
+-#define HASH_BLOCK_DATA_ORDER   ossl_sm3_block_data_order
++
++#if defined(OPENSSL_SM3_ASM)
++# if defined(__aarch64__)
++#  include "crypto/arm_arch.h"
++#  define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
++# endif
++#endif
++
++#if defined(HWSM3_CAPABLE)
++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
++                                              : ossl_sm3_block_data_order)
++#else
++# define HASH_BLOCK_DATA_ORDER   ossl_sm3_block_data_order
++#endif
+ 
+ void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+ void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);
+-- 
+2.37.3.windows.1
+ |