summaryrefslogtreecommitdiff
path: root/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
diff options
context:
space:
mode:
Diffstat (limited to 'Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch')
-rw-r--r--Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch457
1 files changed, 457 insertions, 0 deletions
diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
new file mode 100644
index 0000000..0467d78
--- /dev/null
+++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
@@ -0,0 +1,457 @@
+From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Fri, 24 Dec 2021 08:29:04 +0000
+Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on
+ aarch64
+
+SM3 hardware instruction is optional feature of crypto extension for
+aarch64. This implementation accelerates SM3 via SM3 instructions. For
+the platform not supporting SM3 instruction, the original C
+implementation still works. Thanks to AliBaba for testing and reporting
+the following perf numbers for Yitian710:
+
+Benchmark on T-Head Yitian-710 2.75GHz:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k
+
+After (33% - 74% faster):
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17454)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 2 +
+ crypto/armcap.c | 10 ++
+ crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++
+ crypto/sm3/build.info | 21 ++-
+ crypto/sm3/sm3_local.h | 16 +-
+ 6 files changed, 336 insertions(+), 3 deletions(-)
+ create mode 100644 crypto/sm3/asm/sm3-armv8.pl
+
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 11f0e50279..10d267b7ad 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -96,6 +96,14 @@ _armv8_cpuid_probe:
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+
++.globl _armv8_sm3_probe
++.type _armv8_sm3_probe,%function
++_armv8_sm3_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
++ ret
++.size _armv8_sm3_probe,.-_armv8_sm3_probe
++
+ .globl OPENSSL_cleanse
+ .type OPENSSL_cleanse,%function
+ .align 5
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index a815a5c72b..c8b501f34c 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_PMULL (1<<5)
+ # define ARMV8_SHA512 (1<<6)
+ # define ARMV8_CPUID (1<<7)
++# define ARMV8_RNG (1<<8)
++# define ARMV8_SM3 (1<<9)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index c021330e32..365a48df45 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void);
+ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
++void _armv8_sm3_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA1 (1 << 5)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
++# define HWCAP_CE_SM3 (1 << 18)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void)
+
+ if (hwcap & HWCAP_CPUID)
+ OPENSSL_armcap_P |= ARMV8_CPUID;
++
++ if (hwcap & HWCAP_CE_SM3)
++ OPENSSL_armcap_P |= ARMV8_SM3;
+ # endif
+ }
+ # endif
+@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void)
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+ }
++
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm3_probe();
++ OPENSSL_armcap_P |= ARMV8_SM3;
++ }
+ # endif
+ }
+ # endif
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+new file mode 100644
+index 0000000000..bb71b2eade
+--- /dev/null
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -0,0 +1,282 @@
++#! /usr/bin/env perl
++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++#
++# This module implements support for Armv8 SM3 instructions
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++# Message expanding:
++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
++# Input: s0, s1, s2, s3
++# s0 = w0 | w1 | w2 | w3
++# s1 = w4 | w5 | w6 | w7
++# s2 = w8 | w9 | w10 | w11
++# s3 = w12 | w13 | w14 | w15
++# Output: s4
++sub msg_exp () {
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++$code.=<<___;
++ // s4 = w7 | w8 | w9 | w10
++ ext $s4.16b, $s1.16b, $s2.16b, #12
++ // vtmp1 = w3 | w4 | w5 | w6
++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
++ // vtmp2 = w10 | w11 | w12 | w13
++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
++ sm3partw1 $s4.4s, $s0.4s, $s3.4s
++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
++___
++}
++
++# A round of compresson function
++# Input:
++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
++# vstate0 - vstate1, store digest status(A - H)
++# vconst0 - vconst1, interleaved used to store Tj <<< j
++# vtmp - temporary register
++# vw - for sm3tt1ab, vw = s0 eor s1
++# s0 - for sm3tt2ab, just be s0
++# i, choose wj' or wj from vw
++sub round () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp = shift;
++my $vw = shift;
++my $s0 = shift;
++my $i = shift;
++$code.=<<___;
++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
++ shl $vconst1.4s, $vconst0.4s, #1
++ sri $vconst1.4s, $vconst0.4s, #31
++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
++___
++}
++
++sub qround () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++ if($s4) {
++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
++ }
++$code.=<<___;
++ eor $vtmp1.16b, $s0.16b, $s1.16b
++___
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 0);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 1);
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 2);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 3);
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8.2-a+sm4
++.text
++___
++
++{{{
++my ($pstate,$pdata,$num)=("x0","x1","w2");
++my ($state1,$state2)=("v5","v6");
++my ($sconst1, $sconst2)=("s16","s17");
++my ($vconst1, $vconst2)=("v16","v17");
++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
++my ($bkstate1,$bkstate2)=("v18","v19");
++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
++my ($vtmp1,$vtmp2)=("v22","v23");
++my $constaddr="x8";
++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
++$code.=<<___;
++.globl ossl_hwsm3_block_data_order
++.type ossl_hwsm3_block_data_order,%function
++.align 5
++ossl_hwsm3_block_data_order:
++ AARCH64_VALID_CALL_TARGET
++ // load state
++ ld1 {$state1.4s-$state2.4s}, [$pstate]
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++
++ adr $constaddr, .Tj
++ ldp $sconst1, $sconst2, [$constaddr]
++
++.Loop:
++ // load input
++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
++ sub $num, $num, #1
++
++ mov $bkstate1.16b, $state1.16b
++ mov $bkstate2.16b, $state2.16b
++
++#ifndef __ARMEB__
++ rev32 $s0.16b, $s0.16b
++ rev32 $s1.16b, $s1.16b
++ rev32 $s2.16b, $s2.16b
++ rev32 $s3.16b, $s3.16b
++#endif
++
++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
++___
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++
++$code.=<<___;
++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
++___
++
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1);
++
++$code.=<<___;
++ eor $state1.16b, $state1.16b, $bkstate1.16b
++ eor $state2.16b, $state2.16b, $bkstate2.16b
++
++ // any remained blocks?
++ cbnz $num, .Loop
++
++ // save state
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++ st1 {$state1.4s-$state2.4s}, [$pstate]
++ ret
++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
++
++.align 3
++.Tj:
++.word 0x79cc4519, 0x9d8a7a87
++___
++}}}
++
++#########################################
++my %sm3partopcode = (
++ "sm3partw1" => 0xce60C000,
++ "sm3partw2" => 0xce60C400);
++
+my %sm3ss1opcode = (
+ "sm3ss1" => 0xce400000);
+
+my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+ "sm3tt1b" => 0xce408400,
+ "sm3tt2a" => 0xce408800,
+ "sm3tt2b" => 0xce408C00);
+
+sub unsm3part {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
+ \s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+}
+
+sub unsm3tt {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+ $mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
+index eca68216f2..2fa54a4a8b 100644
+--- a/crypto/sm3/build.info
++++ b/crypto/sm3/build.info
+@@ -1,5 +1,22 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{sm3} -}]
+- SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
+-ENDIF
+\ No newline at end of file
++ IF[{- !$disabled{asm} -}]
++ $SM3ASM_aarch64=sm3-armv8.S
++ $SM3DEF_aarch64=OPENSSL_SM3_ASM
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate ones, and define the appropriate macros
++ IF[$SM3ASM_{- $target{asm_arch} -}]
++ $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
++ $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
++ ENDIF
++ ENDIF
++
++ SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
++ DEFINE[../../libcrypto]=$SM3DEF
++
++ GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
++ INCLUDE[sm3-armv8.o]=..
++ENDIF
++
+diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
+index 6daeb878a8..ac8a2bf768 100644
+--- a/crypto/sm3/sm3_local.h
++++ b/crypto/sm3/sm3_local.h
+@@ -32,7 +32,21 @@
+ ll=(c)->G; (void)HOST_l2c(ll, (s)); \
+ ll=(c)->H; (void)HOST_l2c(ll, (s)); \
+ } while (0)
+-#define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++
++#if defined(OPENSSL_SM3_ASM)
++# if defined(__aarch64__)
++# include "crypto/arm_arch.h"
++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
++# endif
++#endif
++
++#if defined(HWSM3_CAPABLE)
++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
++ : ossl_sm3_block_data_order)
++#else
++# define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++#endif
+
+ void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+ void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);
+--
+2.37.3.windows.1
+