Diffstat (limited to 'Backport-SM4-optimization-for-ARM-by-ASIMD.patch')
-rw-r--r-- | Backport-SM4-optimization-for-ARM-by-ASIMD.patch | 1334
1 file changed, 1334 insertions, 0 deletions
diff --git a/Backport-SM4-optimization-for-ARM-by-ASIMD.patch b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch new file mode 100644 index 0000000..5d58d16 --- /dev/null +++ b/Backport-SM4-optimization-for-ARM-by-ASIMD.patch @@ -0,0 +1,1334 @@ +From ca0b08e39bb619b6e62ef58c80edc784e8f20966 Mon Sep 17 00:00:00 2001 +From: Daniel Hu <Daniel.Hu@arm.com> +Date: Mon, 14 Feb 2022 14:36:34 +0000 +Subject: [PATCH 07/13] SM4 optimization for ARM by ASIMD + +This patch optimizes SM4 for ARM processor using ASIMD instruction + +It will improve performance if both of following conditions are met: +1) Input data equal to or more than 4 blocks +2) Cipher mode allows parallelism, including ECB,CTR,GCM or CBC decryption + +This patch implements SM4 SBOX lookup in vector registers, with the +benefit of constant processing time over existing C implementation. + +It is only enabled for micro-architecture N1/V1. In the ideal scenario, +performance can reach up to 2.7X + +When either of above two conditions is not met, e.g. single block input +or CFB/OFB mode, CBC encryption, performance could drop about 50%. + +The assembly code has been reviewed internally by ARM engineer +Fangming.Fang@arm.com + +Signed-off-by: Daniel Hu <Daniel.Hu@arm.com> + +Reviewed-by: Paul Dale <pauli@openssl.org> +Reviewed-by: Tomas Mraz <tomas@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/17951) +--- + crypto/evp/e_sm4.c | 24 + + crypto/sm4/asm/vpsm4-armv8.pl | 1118 +++++++++++++++++ + crypto/sm4/build.info | 6 +- + include/crypto/sm4_platform.h | 29 + + .../ciphers/cipher_sm4_gcm_hw.c | 7 + + .../implementations/ciphers/cipher_sm4_hw.c | 24 + + 6 files changed, 1206 insertions(+), 2 deletions(-) + create mode 100755 crypto/sm4/asm/vpsm4-armv8.pl + +diff --git a/crypto/evp/e_sm4.c b/crypto/evp/e_sm4.c +index bff79ff197..c8e8cfe9c9 100644 +--- a/crypto/evp/e_sm4.c ++++ b/crypto/evp/e_sm4.c +@@ -76,6 +76,17 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + dat->stream.ecb = (ecb128_f) HWSM4_ecb_encrypt; + # endif + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_decrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_decrypt; ++ dat->stream.cbc = NULL; ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; ++ else if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; ++ } else + #endif + { + dat->block = (block128_f) ossl_sm4_decrypt; +@@ -104,6 +115,19 @@ static int sm4_init_key(EVP_CIPHER_CTX *ctx, const unsigned char *key, + # endif + (void)0; /* terminate potentially open 'else' */ + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, &dat->ks.ks); ++ dat->block = (block128_f) vpsm4_encrypt; ++ dat->stream.cbc = NULL; ++ if (mode == EVP_CIPH_CBC_MODE) ++ dat->stream.cbc = (cbc128_f) vpsm4_cbc_encrypt; ++ else if (mode == EVP_CIPH_ECB_MODE) ++ dat->stream.ecb = (ecb128_f) vpsm4_ecb_encrypt; ++ else if (mode == EVP_CIPH_CTR_MODE) ++ dat->stream.ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; ++ } else + #endif + { + dat->block = (block128_f) ossl_sm4_encrypt; +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +new file mode 100755 +index 0000000000..095d9dae64 +--- /dev/null ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -0,0 +1,1118 @@ ++#! /usr/bin/env perl ++# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). 
You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements SM4 with ASIMD on aarch64 ++# ++# Feb 2022 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="vpsm4"; ++my @vtmp=map("v$_",(0..3)); ++my @data=map("v$_",(4..7)); ++my @datax=map("v$_",(8..11)); ++my ($rk0,$rk1)=("v12","v13"); ++my ($rka,$rkb)=("v14","v15"); ++my @vtmpx=map("v$_",(12..15)); ++my @sbox=map("v$_",(16..31)); ++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); ++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($ptr,$counter)=("x10","w11"); ++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); ++ ++sub rev32() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifndef __ARMEB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub transpose() { ++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; ++ ++$code.=<<___; ++ zip1 $vt0.4s,$dat0.4s,$dat1.4s ++ zip2 $vt1.4s,$dat0.4s,$dat1.4s ++ zip1 $vt2.4s,$dat2.4s,$dat3.4s ++ zip2 $vt3.4s,$dat2.4s,$dat3.4s ++ zip1 $dat0.2d,$vt0.2d,$vt2.2d ++ zip2 $dat1.2d,$vt0.2d,$vt2.2d ++ zip1 $dat2.2d,$vt1.2d,$vt3.2d ++ zip2 $dat3.2d,$vt1.2d,$vt3.2d ++___ ++} ++ ++# sbox operations for 4-lane of words ++sub sbox() { ++ my $dat = shift; ++ ++$code.=<<___; ++ movi @vtmp[0].16b,#64 ++ movi @vtmp[1].16b,#128 ++ movi @vtmp[2].16b,#192 ++ sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b ++ sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b ++ sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b ++ tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d ++ add @vtmp[2].2d,@vtmp[2].2d,$dat.2d ++ add $dat.2d,@vtmp[0].2d,@vtmp[2].2d ++ ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ sli @vtmp[0].4s,$dat.4s,2 ++ ushr @vtmp[2].4s,$dat.4s,32-10 ++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b ++ sli @vtmp[2].4s,$dat.4s,10 ++ eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b ++ ushr @vtmp[0].4s,$dat.4s,32-18 ++ sli @vtmp[0].4s,$dat.4s,18 ++ ushr @vtmp[2].4s,$dat.4s,32-24 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$dat.4s,24 ++ eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b ++___ ++} ++ ++# sbox operation for 8-lane of words ++sub sbox_double() { ++ my $dat = shift; ++ my $datx = shift; ++ ++$code.=<<___; ++ movi @vtmp[3].16b,#64 ++ sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b ++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b ++ tbl 
$dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d ++ add $dat.2d,@vtmp[2].2d,$dat.2d ++ add $dat.2d,@vtmp[1].2d,$dat.2d ++ ++ sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b ++ sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b ++ tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b ++ tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b ++ add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d ++ add $datx.2d,@vtmp[2].2d,$datx.2d ++ add $datx.2d,@vtmp[1].2d,$datx.2d ++ ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ sli @vtmp[0].4s,$dat.4s,2 ++ ushr @vtmp[2].4s,$datx.4s,32-2 ++ eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b ++ sli @vtmp[2].4s,$datx.4s,2 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-10 ++ eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b ++ sli @vtmp[0].4s,$dat.4s,10 ++ ushr @vtmp[2].4s,$datx.4s,32-10 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,10 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-18 ++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b ++ sli @vtmp[0].4s,$dat.4s,18 ++ ushr @vtmp[2].4s,$datx.4s,32-18 ++ eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,18 ++ ++ ushr @vtmp[0].4s,$dat.4s,32-24 ++ eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b ++ sli @vtmp[0].4s,$dat.4s,24 ++ ushr @vtmp[2].4s,$datx.4s,32-24 ++ eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b ++ sli @vtmp[2].4s,$datx.4s,24 ++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b ++___ ++} ++ ++# sbox operation for one single word ++sub sbox_1word () { ++ my $word = shift; ++ ++$code.=<<___; ++ movi @vtmp[1].16b,#64 ++ movi @vtmp[2].16b,#128 ++ movi @vtmp[3].16b,#192 ++ mov @vtmp[0].s[0],$word ++ ++ sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b ++ sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b ++ sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b ++ ++ tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b ++ tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b ++ tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b ++ tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b ++ ++ mov $word,@vtmp[0].s[0] ++ mov $wtmp0,@vtmp[1].s[0] ++ mov $wtmp2,@vtmp[2].s[0] ++ add $wtmp0,$word,$wtmp0 ++ mov $word,@vtmp[3].s[0] ++ add $wtmp0,$wtmp0,$wtmp2 ++ add $wtmp0,$wtmp0,$word ++ ++ eor $word,$wtmp0,$wtmp0,ror #32-2 ++ eor $word,$word,$wtmp0,ror #32-10 ++ eor $word,$word,$wtmp0,ror #32-18 ++ eor $word,$word,$wtmp0,ror #32-24 ++___ ++} ++ ++# sm4 for one block of data, in scalar registers word0/word1/word2/word3 ++sub sm4_1blk () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$wtmp0,$word1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word0,$word0,$tmpw ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$word0,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ ldp 
$wtmp0,$wtmp1,[$kptr],8 ++ eor $word1,$word1,$tmpw ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$wtmp0,$word3 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word2,$word2,$tmpw ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$word2,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word3,$word3,$tmpw ++___ ++} ++ ++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 ++sub sm4_4blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rk0.16b,@data[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk1.16b ++ ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rk0.16b,@data[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk1.16b ++___ ++} ++ ++# sm4 for 8 lanes of data, in neon registers ++# data0/data1/data2/data3 datax0/datax1/datax2/datax3 ++sub sm4_8blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rkb.16b,@datax[2].16b,@datax[3].16b ++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ eor @datax[0].16b,@datax[0].16b,$rk1.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rkb.16b,$rkb.16b,@datax[0].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk0.16b ++ eor @datax[1].16b,@datax[1].16b,$rk1.16b ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rkb.16b,@datax[0].16b,@datax[1].16b ++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ eor @datax[2].16b,@datax[2].16b,$rk1.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rkb.16b,$rkb.16b,@datax[2].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk0.16b ++ eor @datax[3].16b,@datax[3].16b,$rk1.16b ++___ ++} ++ ++sub encrypt_1blk_norev() { ++ my $dat = shift; ++ ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++ mov $word0,$dat.s[0] ++ mov 
$word1,$dat.s[1] ++ mov $word2,$dat.s[2] ++ mov $word3,$dat.s[3] ++10: ++___ ++ &sm4_1blk($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++ mov $dat.s[0],$word3 ++ mov $dat.s[1],$word2 ++ mov $dat.s[2],$word1 ++ mov $dat.s[3],$word0 ++___ ++} ++ ++sub encrypt_1blk() { ++ my $dat = shift; ++ ++ &encrypt_1blk_norev($dat); ++ &rev32($dat,$dat); ++} ++ ++sub encrypt_4blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_4blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++} ++ ++sub encrypt_8blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_8blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++ &rev32(@data[3],@datax[0]); ++ &rev32(@data[2],@datax[1]); ++ &rev32(@data[1],@datax[2]); ++ &rev32(@data[0],@datax[3]); ++} ++ ++sub load_sbox () { ++ my $data = shift; ++ ++$code.=<<___; ++ adr $ptr,.Lsbox ++ ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64 ++ ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64 ++ ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64 ++ ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr] ++___ ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a ++.text ++ ++.type _vpsm4_consts,%object ++.align 7 ++_vpsm4_consts: ++.Lsbox: ++ .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05 ++ .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99 ++ .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62 ++ .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6 ++ .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8 ++ .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35 ++ .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87 ++ .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E ++ .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1 ++ .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3 ++ .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F ++ .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51 ++ .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8 ++ .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0 ++ .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84 ++ .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48 ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 ++.Lshuffles: ++ .dword 
0x0B0A090807060504,0x030201000F0E0D0C ++ ++.size _vpsm4_consts,.-_vpsm4_consts ++___ ++ ++{{{ ++my ($key,$keys,$enc)=("x0","x1","w2"); ++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); ++my ($vkey,$vfk,$vmap)=("v5","v6","v7"); ++$code.=<<___; ++.type _vpsm4_set_key,%function ++.align 4 ++_vpsm4_set_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$vkey.4s},[$key] ++___ ++ &load_sbox(); ++ &rev32($vkey,$vkey); ++$code.=<<___; ++ adr $pointer,.Lshuffles ++ ld1 {$vmap.4s},[$pointer] ++ adr $pointer,.Lfk ++ ld1 {$vfk.4s},[$pointer] ++ eor $vkey.16b,$vkey.16b,$vfk.16b ++ mov $schedules,#32 ++ adr $pointer,.Lck ++ movi @vtmp[0].16b,#64 ++ cbnz $enc,1f ++ add $keys,$keys,124 ++1: ++ mov $wtmp,$vkey.s[1] ++ ldr $roundkey,[$pointer],#4 ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[2] ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[3] ++ eor $roundkey,$roundkey,$wtmp ++ // sbox lookup ++ mov @data[0].s[0],$roundkey ++ tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b ++ sub @data[0].16b,@data[0].16b,@vtmp[0].16b ++ tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b ++ mov $wtmp,@vtmp[1].s[0] ++ eor $roundkey,$wtmp,$wtmp,ror #19 ++ eor $roundkey,$roundkey,$wtmp,ror #9 ++ mov $wtmp,$vkey.s[0] ++ eor $roundkey,$roundkey,$wtmp ++ mov $vkey.s[0],$roundkey ++ cbz $enc,2f ++ str $roundkey,[$keys],#4 ++ b 3f ++2: ++ str $roundkey,[$keys],#-4 ++3: ++ tbl $vkey.16b,{$vkey.16b},$vmap.16b ++ subs $schedules,$schedules,#1 ++ b.ne 1b ++ ret ++.size _vpsm4_set_key,.-_vpsm4_set_key ++___ ++}}} ++ ++ ++{{{ ++$code.=<<___; ++.type _vpsm4_enc_4blks,%function ++.align 4 ++_vpsm4_enc_4blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_4blks(); ++$code.=<<___; ++ ret ++.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks ++___ ++}}} ++ ++{{{ ++$code.=<<___; ++.type _vpsm4_enc_8blks,%function ++.align 4 ++_vpsm4_enc_8blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_8blks(); ++$code.=<<___; ++ ret ++.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks ++___ ++}}} ++ ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,1 ++ bl _vpsm4_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! 
++ mov w2,0 ++ bl _vpsm4_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++ my $dir = shift; ++ my ($inp,$outp,$rk)=map("x$_",(0..2)); ++ ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {@data[0].16b},[$inp] ++___ ++ &load_sbox(); ++ &rev32(@data[0],@data[0]); ++$code.=<<___; ++ mov $rks,x2 ++___ ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].16b},[$outp] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++my ($enc) = ("w4"); ++my @dat=map("v$_",(16..23)); ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ AARCH64_SIGN_LINK_REGISTER ++ // convert length into blocks ++ lsr x2,x2,4 ++ stp d8,d9,[sp,#-80]! ++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++___ ++ &load_sbox(); ++$code.=<<___; ++.Lecb_8_blocks_process: ++ cmp $blocks,#8 ++ b.lt .Lecb_4_blocks_process ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++$code.=<<___; ++ bl _vpsm4_enc_8blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lecb_8_blocks_process ++ b 100f ++.Lecb_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].16b},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].16b},[$outp] ++ b 100f ++1: // process last 2 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 ++ cmp $blocks,#2 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] ++ b 100f ++1: // process last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ 
++}}} ++ ++{{{ ++my ($len,$ivp,$enc)=("x2","x4","w5"); ++my $ivec0=("v3"); ++my $ivec1=("v15"); ++ ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ lsr $len,$len,4 ++___ ++ &load_sbox(); ++$code.=<<___; ++ cbz $enc,.Ldec ++ ld1 {$ivec0.4s},[$ivp] ++.Lcbc_4_blocks_enc: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ eor @data[0].16b,@data[0].16b,$ivec0.16b ++___ ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &encrypt_1blk_norev(@data[0]); ++$code.=<<___; ++ eor @data[1].16b,@data[1].16b,@data[0].16b ++___ ++ &encrypt_1blk_norev(@data[1]); ++ &rev32(@data[0],@data[0]); ++ ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,@data[1].16b ++___ ++ &encrypt_1blk_norev(@data[2]); ++ &rev32(@data[1],@data[1]); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,@data[2].16b ++___ ++ &encrypt_1blk_norev(@data[3]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ orr $ivec0.16b,@data[3].16b,@data[3].16b ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lcbc_4_blocks_enc ++ b 2f ++1: ++ subs $blocks,$blocks,#1 ++ b.lt 2f ++ ld1 {@data[0].4s},[$inp],#16 ++ eor $ivec0.16b,$ivec0.16b,@data[0].16b ++___ ++ &rev32($ivec0,$ivec0); ++ &encrypt_1blk($ivec0); ++$code.=<<___; ++ st1 {$ivec0.16b},[$outp],#16 ++ b 1b ++2: ++ // save back IV ++ st1 {$ivec0.16b},[$ivp] ++ ret ++ ++.Ldec: ++ // decryption mode starts ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! ++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++.Lcbc_8_blocks_dec: ++ cmp $blocks,#8 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++ add $ptr,$inp,#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],$datax[3]); ++$code.=<<___; ++ bl _vpsm4_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ ld1 {$ivec1.16b},[$ivp] ++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ // note ivec1 and vtmpx[3] are resuing the same register ++ // care needs to be taken to avoid conflict ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b ++ // save back IV ++ st1 {$vtmpx[3].16b}, [$ivp] ++ eor @data[0].16b,@data[0].16b,$datax[3].16b ++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b ++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b ++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lcbc_8_blocks_dec ++ b.eq 100f ++1: ++ ld1 {$ivec1.16b},[$ivp] ++.Lcbc_4_blocks_dec: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++$code.=<<___; ++ bl 
_vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ orr $ivec1.16b,@data[3].16b,@data[3].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.gt .Lcbc_4_blocks_dec ++ // save back IV ++ st1 {@vtmp[3].16b}, [$ivp] ++ b 100f ++1: // last block ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++ // save back IV ++ st1 {$data[0].16b}, [$ivp] ++___ ++ &rev32(@datax[0],@data[0]); ++ &encrypt_1blk(@datax[0]); ++$code.=<<___; ++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b ++ st1 {@datax[0].16b},[$outp],#16 ++ b 100f ++1: // last two blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] ++ add $ptr,$inp,#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 ++ subs $blocks,$blocks,1 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save back IV ++ st1 {@data[1].16b}, [$ivp] ++ b 100f ++1: // last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _vpsm4_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save back IV ++ st1 {@data[2].16b}, [$ivp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($ivp)=("x4"); ++my ($ctr)=("w5"); ++my $ivec=("v3"); ++ ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$ivec.4s},[$ivp] ++___ ++ &rev32($ivec,$ivec); ++ &load_sbox(); ++$code.=<<___; ++ cmp $blocks,#1 ++ b.ne 1f ++ // fast processing for one single block without ++ // context saving overhead ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].16b},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].16b},[$outp] ++ ret ++1: ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++ mov $word0,$ivec.s[0] ++ mov $word1,$ivec.s[1] ++ mov $word2,$ivec.s[2] ++ mov $ctr,$ivec.s[3] ++.Lctr32_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $data[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ cmp $blocks,#8 ++ b.ge .Lctr32_8_blocks_process ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++.Lctr32_8_blocks_process: ++ dup @datax[0].4s,$word0 ++ dup @datax[1].4s,$word1 ++ dup @datax[2].4s,$word2 ++ mov @datax[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $datax[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ bl _vpsm4_enc_8blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ eor @data[0].16b,@data[0].16b,@datax[0].16b ++ eor @data[1].16b,@data[1].16b,@datax[1].16b ++ eor @data[2].16b,@data[2].16b,@datax[2].16b ++ eor @data[3].16b,@data[3].16b,@datax[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++1: // last block processing ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ mov $ivec.s[0],$word0 ++ mov $ivec.s[1],$word1 ++ mov $ivec.s[2],$word2 ++ mov $ivec.s[3],$ctr ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].16b},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].16b},[$outp] ++ b 100f ++1: // last 2 blocks processing ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[1],$ctr ++ subs $blocks,$blocks,#1 ++ b.ne 1f ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ b 100f ++1: // last 3 blocks processing ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ bl _vpsm4_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor 
@vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++######################################## ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index e27aa49e67..75a215ab80 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -1,8 +1,8 @@ + LIBS=../../libcrypto + + IF[{- !$disabled{asm} -}] +- $SM4DEF_aarch64=SM4_ASM +- $SM4ASM_aarch64=sm4-armv8.S ++ $SM4DEF_aarch64=SM4_ASM VPSM4_ASM ++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros +@@ -29,4 +29,6 @@ IF[{- !$disabled{module} && !$disabled{shared} -}] + ENDIF + + GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl ++GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl + INCLUDE[sm4-armv8.o]=.. ++INCLUDE[vpsm4-armv8.o]=.. +diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h +index 42c8b44a43..11f9b9d88b 100644 +--- a/include/crypto/sm4_platform.h ++++ b/include/crypto/sm4_platform.h +@@ -15,6 +15,16 @@ + # if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) + # include "arm_arch.h" + # if __ARM_MAX_ARCH__>=8 ++extern unsigned int OPENSSL_arm_midr; ++static inline int vpsm4_capable(void) ++{ ++ return (OPENSSL_armcap_P & ARMV8_CPUID) && ++ (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) || ++ MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1)); ++} ++# if defined(VPSM4_ASM) ++# define VPSM4_CAPABLE vpsm4_capable() ++# endif + # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4) + # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key + # define HWSM4_set_decrypt_key sm4_v8_set_decrypt_key +@@ -45,4 +55,23 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, + const unsigned char ivec[16]); + # endif /* HWSM4_CAPABLE */ + ++#ifdef VPSM4_CAPABLE ++int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key); ++int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key); ++void vpsm4_encrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_decrypt(const unsigned char *in, unsigned char *out, ++ const SM4_KEY *key); ++void vpsm4_cbc_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ unsigned char *ivec, const int enc); ++void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out, ++ size_t length, const SM4_KEY *key, ++ const int enc); ++void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ const unsigned char ivec[16]); ++# endif /* VPSM4_CAPABLE */ ++ ++ + #endif /* OSSL_SM4_PLATFORM_H */ +diff --git a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +index b9633f83ed..db7fe0fe2f 100644 +--- 
a/providers/implementations/ciphers/cipher_sm4_gcm_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_gcm_hw.c +@@ -32,6 +32,13 @@ static int sm4_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, + # endif + } else + # endif /* HWSM4_CAPABLE */ ++# ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, ks); ++ CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f) vpsm4_encrypt); ++ ctx->ctr = (ctr128_f) vpsm4_ctr32_encrypt_blocks; ++ } else ++# endif /* VPSM4_CAPABLE */ + { + ossl_sm4_set_key(key, ks); + CRYPTO_gcm128_init(&ctx->gcm, ks, (block128_f)ossl_sm4_encrypt); +diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c +index 4cd3d3d669..9a2e99f67c 100644 +--- a/providers/implementations/ciphers/cipher_sm4_hw.c ++++ b/providers/implementations/ciphers/cipher_sm4_hw.c +@@ -41,6 +41,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + #endif + (void)0; /* terminate potentially open 'else' */ + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_encrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_encrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; ++ else if (ctx->mode == EVP_CIPH_CTR_MODE) ++ ctx->stream.ctr = (ctr128_f)vpsm4_ctr32_encrypt_blocks; ++ } else + #endif + { + ossl_sm4_set_key(key, ks); +@@ -61,6 +74,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx, + ctx->stream.ecb = (ecb128_f)HWSM4_ecb_encrypt; + #endif + } else ++#endif ++#ifdef VPSM4_CAPABLE ++ if (VPSM4_CAPABLE) { ++ vpsm4_set_decrypt_key(key, ks); ++ ctx->block = (block128_f)vpsm4_decrypt; ++ ctx->stream.cbc = NULL; ++ if (ctx->mode == EVP_CIPH_CBC_MODE) ++ ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt; ++ else if (ctx->mode == EVP_CIPH_ECB_MODE) ++ ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt; ++ } else + #endif + { + ossl_sm4_set_key(key, ks); +-- +2.37.3.windows.1 + |
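Usage note: the optimization is transparent to EVP callers. With the patch applied and running on a Neoverse N1/V1 core, any caller of a parallelizable SM4 mode (ECB, CTR, GCM, or CBC decryption) that supplies four or more blocks per call picks up the vpsm4 path automatically through the capability check in sm4_platform.h. Below is a minimal sketch of such a caller, assuming an OpenSSL build with SM4 enabled; the helper name sm4_ctr_encrypt is illustrative and not part of the patch.

/*
 * Minimal sketch (not part of the patch): encrypt a buffer with SM4-CTR
 * through the standard EVP interface. The vpsm4 ASIMD path, when available,
 * is selected inside EVP_EncryptInit_ex()/EVP_EncryptUpdate() without any
 * change to caller code.
 */
#include <openssl/evp.h>

int sm4_ctr_encrypt(const unsigned char key[16], const unsigned char iv[16],
                    const unsigned char *in, int inlen, unsigned char *out)
{
    EVP_CIPHER_CTX *ctx = EVP_CIPHER_CTX_new();
    int outl = 0, total = 0;

    if (ctx == NULL)
        return -1;
    /* SM4 uses a 128-bit key and a 128-bit counter block. */
    if (EVP_EncryptInit_ex(ctx, EVP_sm4_ctr(), NULL, key, iv) != 1
            || EVP_EncryptUpdate(ctx, out, &outl, in, inlen) != 1) {
        EVP_CIPHER_CTX_free(ctx);
        return -1;
    }
    total = outl;
    /* CTR is a stream mode, so the final call emits no extra bytes. */
    if (EVP_EncryptFinal_ex(ctx, out + total, &outl) != 1) {
        EVP_CIPHER_CTX_free(ctx);
        return -1;
    }
    total += outl;
    EVP_CIPHER_CTX_free(ctx);
    return total;
}

As the commit message notes, passing at least 64 bytes (four blocks) per EVP_EncryptUpdate() call is what allows the 4/8-lane ASIMD code to engage; single-block calls or non-parallel modes (CFB, OFB, CBC encryption) fall back to the slower path.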