summaryrefslogtreecommitdiff
path: root/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
diff options
context:
space:
mode:
Diffstat (limited to 'Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch')
-rw-r--r--Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch457
1 files changed, 457 insertions, 0 deletions
diff --git a/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
new file mode 100644
index 0000000..0467d78
--- /dev/null
+++ b/Backport-SM3-acceleration-with-SM3-hardware-instruction-on-aa.patch
@@ -0,0 +1,457 @@
+From 8a83d735057dde1f727eb0921446e4ca8b085267 Mon Sep 17 00:00:00 2001
+From: "fangming.fang" <fangming.fang@arm.com>
+Date: Fri, 24 Dec 2021 08:29:04 +0000
+Subject: [PATCH 02/13] SM3 acceleration with SM3 hardware instruction on
+ aarch64
+
+SM3 hardware instruction is optional feature of crypto extension for
+aarch64. This implementation accelerates SM3 via SM3 instructions. For
+the platform not supporting SM3 instruction, the original C
+implementation still works. Thanks to AliBaba for testing and reporting
+the following perf numbers for Yitian710:
+
+Benchmark on T-Head Yitian-710 2.75GHz:
+
+Before:
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 49297.82k 121062.63k 223106.05k 283371.52k 307574.10k 309400.92k
+
+After (33% - 74% faster):
+type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes
+sm3 65640.01k 179121.79k 359854.59k 481448.96k 534055.59k 538274.47k
+
+Reviewed-by: Paul Dale <pauli@openssl.org>
+Reviewed-by: Tomas Mraz <tomas@openssl.org>
+(Merged from https://github.com/openssl/openssl/pull/17454)
+---
+ crypto/arm64cpuid.pl | 8 +
+ crypto/arm_arch.h | 2 +
+ crypto/armcap.c | 10 ++
+ crypto/sm3/asm/sm3-armv8.pl | 282 ++++++++++++++++++++++++++++++++++++
+ crypto/sm3/build.info | 21 ++-
+ crypto/sm3/sm3_local.h | 16 +-
+ 6 files changed, 336 insertions(+), 3 deletions(-)
+ create mode 100644 crypto/sm3/asm/sm3-armv8.pl
+
+diff --git a/crypto/arm64cpuid.pl b/crypto/arm64cpuid.pl
+index 11f0e50279..10d267b7ad 100755
+--- a/crypto/arm64cpuid.pl
++++ b/crypto/arm64cpuid.pl
+@@ -96,6 +96,14 @@ _armv8_cpuid_probe:
+ ret
+ .size _armv8_cpuid_probe,.-_armv8_cpuid_probe
+
++.globl _armv8_sm3_probe
++.type _armv8_sm3_probe,%function
++_armv8_sm3_probe:
++ AARCH64_VALID_CALL_TARGET
++ .long 0xce63c004 // sm3partw1 v4.4s, v0.4s, v3.4s
++ ret
++.size _armv8_sm3_probe,.-_armv8_sm3_probe
++
+ .globl OPENSSL_cleanse
+ .type OPENSSL_cleanse,%function
+ .align 5
+diff --git a/crypto/arm_arch.h b/crypto/arm_arch.h
+index a815a5c72b..c8b501f34c 100644
+--- a/crypto/arm_arch.h
++++ b/crypto/arm_arch.h
+@@ -83,6 +83,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
+ # define ARMV8_PMULL (1<<5)
+ # define ARMV8_SHA512 (1<<6)
+ # define ARMV8_CPUID (1<<7)
++# define ARMV8_RNG (1<<8)
++# define ARMV8_SM3 (1<<9)
+
+ /*
+ * MIDR_EL1 system register
+diff --git a/crypto/armcap.c b/crypto/armcap.c
+index c021330e32..365a48df45 100644
+--- a/crypto/armcap.c
++++ b/crypto/armcap.c
+@@ -52,6 +52,7 @@ void _armv8_sha1_probe(void);
+ void _armv8_sha256_probe(void);
+ void _armv8_pmull_probe(void);
+ # ifdef __aarch64__
++void _armv8_sm3_probe(void);
+ void _armv8_sha512_probe(void);
+ unsigned int _armv8_cpuid_probe(void);
+ # endif
+@@ -137,6 +138,7 @@ static unsigned long getauxval(unsigned long key)
+ # define HWCAP_CE_SHA1 (1 << 5)
+ # define HWCAP_CE_SHA256 (1 << 6)
+ # define HWCAP_CPUID (1 << 11)
++# define HWCAP_CE_SM3 (1 << 18)
+ # define HWCAP_CE_SHA512 (1 << 21)
+ # endif
+
+@@ -210,6 +212,9 @@ void OPENSSL_cpuid_setup(void)
+
+ if (hwcap & HWCAP_CPUID)
+ OPENSSL_armcap_P |= ARMV8_CPUID;
++
++ if (hwcap & HWCAP_CE_SM3)
++ OPENSSL_armcap_P |= ARMV8_SM3;
+ # endif
+ }
+ # endif
+@@ -253,6 +258,11 @@ void OPENSSL_cpuid_setup(void)
+ _armv8_sha512_probe();
+ OPENSSL_armcap_P |= ARMV8_SHA512;
+ }
++
++ if (sigsetjmp(ill_jmp, 1) == 0) {
++ _armv8_sm3_probe();
++ OPENSSL_armcap_P |= ARMV8_SM3;
++ }
+ # endif
+ }
+ # endif
+diff --git a/crypto/sm3/asm/sm3-armv8.pl b/crypto/sm3/asm/sm3-armv8.pl
+new file mode 100644
+index 0000000000..bb71b2eade
+--- /dev/null
++++ b/crypto/sm3/asm/sm3-armv8.pl
+@@ -0,0 +1,282 @@
++#! /usr/bin/env perl
++# Copyright 2021 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License"). You may not use
++# this file except in compliance with the License. You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++#
++# This module implements support for Armv8 SM3 instructions
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
++die "can't locate arm-xlate.pl";
++
++open OUT,"| \"$^X\" $xlate $flavour \"$output\""
++ or die "can't call $xlate: $!";
++*STDOUT=*OUT;
++
++# Message expanding:
++# Wj <- P1(W[j-16]^W[j-9]^(W[j-3]<<<15))^(W[j-13]<<<7)^W[j-6]
++# Input: s0, s1, s2, s3
++# s0 = w0 | w1 | w2 | w3
++# s1 = w4 | w5 | w6 | w7
++# s2 = w8 | w9 | w10 | w11
++# s3 = w12 | w13 | w14 | w15
++# Output: s4
++sub msg_exp () {
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++$code.=<<___;
++ // s4 = w7 | w8 | w9 | w10
++ ext $s4.16b, $s1.16b, $s2.16b, #12
++ // vtmp1 = w3 | w4 | w5 | w6
++ ext $vtmp1.16b, $s0.16b, $s1.16b, #12
++ // vtmp2 = w10 | w11 | w12 | w13
++ ext $vtmp2.16b, $s2.16b, $s3.16b, #8
++ sm3partw1 $s4.4s, $s0.4s, $s3.4s
++ sm3partw2 $s4.4s, $vtmp2.4s, $vtmp1.4s
++___
++}
++
++# A round of compresson function
++# Input:
++# ab - choose instruction among sm3tt1a, sm3tt1b, sm3tt2a, sm3tt2b
++# vstate0 - vstate1, store digest status(A - H)
++# vconst0 - vconst1, interleaved used to store Tj <<< j
++# vtmp - temporary register
++# vw - for sm3tt1ab, vw = s0 eor s1
++# s0 - for sm3tt2ab, just be s0
++# i, choose wj' or wj from vw
++sub round () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp = shift;
++my $vw = shift;
++my $s0 = shift;
++my $i = shift;
++$code.=<<___;
++ sm3ss1 $vtmp.4s, $vstate0.4s, $vconst0.4s, $vstate1.4s
++ shl $vconst1.4s, $vconst0.4s, #1
++ sri $vconst1.4s, $vconst0.4s, #31
++ sm3tt1$ab $vstate0.4s, $vtmp.4s, $vw.4s[$i]
++ sm3tt2$ab $vstate1.4s, $vtmp.4s, $s0.4s[$i]
++___
++}
++
++sub qround () {
++my $ab = shift;
++my $vstate0 = shift;
++my $vstate1 = shift;
++my $vconst0 = shift;
++my $vconst1 = shift;
++my $vtmp1 = shift;
++my $vtmp2 = shift;
++my $s0 = shift;
++my $s1 = shift;
++my $s2 = shift;
++my $s3 = shift;
++my $s4 = shift;
++ if($s4) {
++ &msg_exp($s0, $s1, $s2, $s3, $s4, $vtmp1, $vtmp2);
++ }
++$code.=<<___;
++ eor $vtmp1.16b, $s0.16b, $s1.16b
++___
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 0);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 1);
++ &round($ab, $vstate0, $vstate1, $vconst0, $vconst1, $vtmp2,
++ $vtmp1, $s0, 2);
++ &round($ab, $vstate0, $vstate1, $vconst1, $vconst0, $vtmp2,
++ $vtmp1, $s0, 3);
++}
++
++$code=<<___;
++#include "arm_arch.h"
++.arch armv8.2-a+sm4
++.text
++___
++
++{{{
++my ($pstate,$pdata,$num)=("x0","x1","w2");
++my ($state1,$state2)=("v5","v6");
++my ($sconst1, $sconst2)=("s16","s17");
++my ($vconst1, $vconst2)=("v16","v17");
++my ($s0,$s1,$s2,$s3,$s4)=map("v$_",(0..4));
++my ($bkstate1,$bkstate2)=("v18","v19");
++my ($vconst_tmp1,$vconst_tmp2)=("v20","v21");
++my ($vtmp1,$vtmp2)=("v22","v23");
++my $constaddr="x8";
++# void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num)
++$code.=<<___;
++.globl ossl_hwsm3_block_data_order
++.type ossl_hwsm3_block_data_order,%function
++.align 5
++ossl_hwsm3_block_data_order:
++ AARCH64_VALID_CALL_TARGET
++ // load state
++ ld1 {$state1.4s-$state2.4s}, [$pstate]
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++
++ adr $constaddr, .Tj
++ ldp $sconst1, $sconst2, [$constaddr]
++
++.Loop:
++ // load input
++ ld1 {$s0.16b-$s3.16b}, [$pdata], #64
++ sub $num, $num, #1
++
++ mov $bkstate1.16b, $state1.16b
++ mov $bkstate2.16b, $state2.16b
++
++#ifndef __ARMEB__
++ rev32 $s0.16b, $s0.16b
++ rev32 $s1.16b, $s1.16b
++ rev32 $s2.16b, $s2.16b
++ rev32 $s3.16b, $s3.16b
++#endif
++
++ ext $vconst_tmp1.16b, $vconst1.16b, $vconst1.16b, #4
++___
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("a",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++
++$code.=<<___;
++ ext $vconst_tmp1.16b, $vconst2.16b, $vconst2.16b, #4
++___
++
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4,$s0,$s1,$s2);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0,$s1,$s2,$s3);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1,$s2,$s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s1,$s2,$s3,$s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s2,$s3,$s4,$s0,$s1);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s3,$s4);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s4,$s0);
++ &qround("b",$state1,$state2,$vconst_tmp1,$vconst_tmp2,$vtmp1,$vtmp2,
++ $s0,$s1);
++
++$code.=<<___;
++ eor $state1.16b, $state1.16b, $bkstate1.16b
++ eor $state2.16b, $state2.16b, $bkstate2.16b
++
++ // any remained blocks?
++ cbnz $num, .Loop
++
++ // save state
++ rev64 $state1.4s, $state1.4s
++ rev64 $state2.4s, $state2.4s
++ ext $state1.16b, $state1.16b, $state1.16b, #8
++ ext $state2.16b, $state2.16b, $state2.16b, #8
++ st1 {$state1.4s-$state2.4s}, [$pstate]
++ ret
++.size ossl_hwsm3_block_data_order,.-ossl_hwsm3_block_data_order
++
++.align 3
++.Tj:
++.word 0x79cc4519, 0x9d8a7a87
++___
++}}}
++
++#########################################
++my %sm3partopcode = (
++ "sm3partw1" => 0xce60C000,
++ "sm3partw2" => 0xce60C400);
++
+my %sm3ss1opcode = (
+ "sm3ss1" => 0xce400000);
+
+my %sm3ttopcode = (
+ "sm3tt1a" => 0xce408000,
+ "sm3tt1b" => 0xce408400,
+ "sm3tt2a" => 0xce408800,
+ "sm3tt2b" => 0xce408C00);
+
+sub unsm3part {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3partopcode{$mnemonic}|$1|($2<<5)|($3<<16),
+ $mnemonic,$arg;
+}
+
+sub unsm3ss1 {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,
+ \s*[qv](\d+)/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ss1opcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<10),
+ $mnemonic,$arg;
+}
+
+sub unsm3tt {
+ my ($mnemonic,$arg)=@_;
+
+ $arg=~ m/[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*,\s*[qv](\d+)[^,]*\[([0-3])\]/o
+ &&
+ sprintf ".inst\t0x%08x\t//%s %s",
+ $sm3ttopcode{$mnemonic}|$1|($2<<5)|($3<<16)|($4<<12),
+ $mnemonic,$arg;
+}
+
+open SELF,$0;
+while(<SELF>) {
+ next if (/^#!/);
+ last if (!s/^#/\/\// and !/^$/);
+ print;
+}
+close SELF;
+
+foreach(split("\n",$code)) {
+ s/\`([^\`]*)\`/eval($1)/ge;
+
+ s/\b(sm3partw[1-2])\s+([qv].*)/unsm3part($1,$2)/ge;
+ s/\b(sm3ss1)\s+([qv].*)/unsm3ss1($1,$2)/ge;
++ s/\b(sm3tt[1-2][a-b])\s+([qv].*)/unsm3tt($1,$2)/ge;
++ print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/sm3/build.info b/crypto/sm3/build.info
+index eca68216f2..2fa54a4a8b 100644
+--- a/crypto/sm3/build.info
++++ b/crypto/sm3/build.info
+@@ -1,5 +1,22 @@
+ LIBS=../../libcrypto
+
+ IF[{- !$disabled{sm3} -}]
+- SOURCE[../../libcrypto]=sm3.c legacy_sm3.c
+-ENDIF
+\ No newline at end of file
++ IF[{- !$disabled{asm} -}]
++ $SM3ASM_aarch64=sm3-armv8.S
++ $SM3DEF_aarch64=OPENSSL_SM3_ASM
++
++ # Now that we have defined all the arch specific variables, use the
++ # appropriate ones, and define the appropriate macros
++ IF[$SM3ASM_{- $target{asm_arch} -}]
++ $SM3ASM=$SM3ASM_{- $target{asm_arch} -}
++ $SM3DEF=$SM3DEF_{- $target{asm_arch} -}
++ ENDIF
++ ENDIF
++
++ SOURCE[../../libcrypto]=sm3.c legacy_sm3.c $SM3ASM
++ DEFINE[../../libcrypto]=$SM3DEF
++
++ GENERATE[sm3-armv8.S]=asm/sm3-armv8.pl
++ INCLUDE[sm3-armv8.o]=..
++ENDIF
++
+diff --git a/crypto/sm3/sm3_local.h b/crypto/sm3/sm3_local.h
+index 6daeb878a8..ac8a2bf768 100644
+--- a/crypto/sm3/sm3_local.h
++++ b/crypto/sm3/sm3_local.h
+@@ -32,7 +32,21 @@
+ ll=(c)->G; (void)HOST_l2c(ll, (s)); \
+ ll=(c)->H; (void)HOST_l2c(ll, (s)); \
+ } while (0)
+-#define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++
++#if defined(OPENSSL_SM3_ASM)
++# if defined(__aarch64__)
++# include "crypto/arm_arch.h"
++# define HWSM3_CAPABLE (OPENSSL_armcap_P & ARMV8_SM3)
++void ossl_hwsm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
++# endif
++#endif
++
++#if defined(HWSM3_CAPABLE)
++# define HASH_BLOCK_DATA_ORDER (HWSM3_CAPABLE ? ossl_hwsm3_block_data_order \
++ : ossl_sm3_block_data_order)
++#else
++# define HASH_BLOCK_DATA_ORDER ossl_sm3_block_data_order
++#endif
+
+ void ossl_sm3_block_data_order(SM3_CTX *c, const void *p, size_t num);
+ void ossl_sm3_transform(SM3_CTX *c, const unsigned char *data);
+--
+2.37.3.windows.1
+