Diffstat (limited to 'Backport-SM4-AESE-optimization-for-ARMv8.patch')
-rw-r--r-- | Backport-SM4-AESE-optimization-for-ARMv8.patch | 2322
1 file changed, 2322 insertions, 0 deletions
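The heart of this backport is an SM4-XTS implementation whose per-block tweak is advanced by multiplication by alpha in GF(2^128): the scalar compute_tweak routine in the patch does this with extr/lsl and a conditional XOR of 0x87, and compute_tweak_vec performs the same step on a whole vector register. As a reading aid, here is a minimal C sketch of that doubling step under the IEEE P1619 byte-order convention; the function name xts_double_tweak is illustrative only and not part of the patch (the generated _gb variants additionally bit-reverse the tweak with rbit for the GB/T 17964 convention).

    /*
     * Reference for the tweak update performed by compute_tweak /
     * compute_tweak_vec: one multiplication by alpha in GF(2^128),
     * i.e. a 128-bit left shift reduced by x^128 + x^7 + x^2 + x + 1
     * (the 0x87 constant that appears in the assembly below).
     */
    static void xts_double_tweak(unsigned char t[16])
    {
        unsigned char carry = 0;

        for (int i = 0; i < 16; i++) {
            unsigned char msb = (unsigned char)(t[i] >> 7); /* bit shifted out of this byte */
            t[i] = (unsigned char)((t[i] << 1) | carry);
            carry = msb;
        }
        if (carry)
            t[0] ^= 0x87; /* fold the overflow bit back in */
    }

The 8-block XTS loops below keep eight such tweaks live at once (as @twx scalar register pairs or @tweak vectors), so the GF doubling overlaps with the SM4 rounds.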
diff --git a/Backport-SM4-AESE-optimization-for-ARMv8.patch b/Backport-SM4-AESE-optimization-for-ARMv8.patch new file mode 100644 index 0000000..0866262 --- /dev/null +++ b/Backport-SM4-AESE-optimization-for-ARMv8.patch @@ -0,0 +1,2322 @@ +From 730387aebda57a1bb0af5a74747d4dadc5e033f7 Mon Sep 17 00:00:00 2001 +From: Xu Yizhou <xuyizhou1@huawei.com> +Date: Wed, 18 Jan 2023 09:55:02 +0800 +Subject: [PATCH 12/13] SM4 AESE optimization for ARMv8 + +Signed-off-by: Xu Yizhou <xuyizhou1@huawei.com> + +Reviewed-by: Tomas Mraz <tomas@openssl.org> +Reviewed-by: Paul Dale <pauli@openssl.org> +(Merged from https://github.com/openssl/openssl/pull/19914) +--- + crypto/sm4/asm/vpsm4-armv8.pl | 458 +++++ + crypto/sm4/asm/vpsm4_ex-armv8.pl | 1544 +++++++++++++++++ + crypto/sm4/build.info | 4 +- + include/crypto/sm4_platform.h | 41 +- + .../implementations/ciphers/cipher_sm4_hw.c | 26 +- + .../implementations/ciphers/cipher_sm4_xts.c | 4 +- + .../implementations/ciphers/cipher_sm4_xts.h | 2 +- + .../ciphers/cipher_sm4_xts_hw.c | 33 +- + 8 files changed, 2090 insertions(+), 22 deletions(-) + create mode 100644 crypto/sm4/asm/vpsm4_ex-armv8.pl + +diff --git a/crypto/sm4/asm/vpsm4-armv8.pl b/crypto/sm4/asm/vpsm4-armv8.pl +index 73797af582..e19de30901 100755 +--- a/crypto/sm4/asm/vpsm4-armv8.pl ++++ b/crypto/sm4/asm/vpsm4-armv8.pl +@@ -28,6 +28,7 @@ open OUT,"| \"$^X\" $xlate $flavour \"$output\"" + + $prefix="vpsm4"; + my @vtmp=map("v$_",(0..3)); ++my @qtmp=map("q$_",(0..3)); + my @data=map("v$_",(4..7)); + my @datax=map("v$_",(8..11)); + my ($rk0,$rk1)=("v12","v13"); +@@ -36,6 +37,7 @@ my @vtmpx=map("v$_",(12..15)); + my @sbox=map("v$_",(16..31)); + my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); + my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($xtmp1,$xtmp2)=("x8","x9"); + my ($ptr,$counter)=("x10","w11"); + my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); + +@@ -60,6 +62,51 @@ ___ + } + } + ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ my $std = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ + sub transpose() { + my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; + +@@ -435,6 +482,58 @@ $code.=<<___; + ___ + } + ++ ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++___ ++ &rev32_armeb($desv,$desv); ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $wtmp0,0x87 ++ extr $xtmp2,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and $wtmp1,$wtmp0,$wtmp2,asr#31 ++ eor $des0,$xtmp1,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $std = shift; ++ &rbit(@vtmp[2],$src,$std); ++$code.=<<___; ++ ldr @qtmp[0], 
=0x01010101010101010101010101010187 ++ shl $des.16b, @vtmp[2].16b, #1 ++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ++ ushr @vtmp[1].16b, @vtmp[1].16b, #7 ++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b ++ eor $des.16b, $des.16b, @vtmp[1].16b ++___ ++ &rbit($des,$des,$std); ++} ++ + $code=<<___; + #include "arm_arch.h" + .arch armv8-a +@@ -1101,6 +1200,365 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++ ++{{{ ++my ($blocks,$len)=("x2","x2"); ++my $ivp=("x5"); ++my @twx=map("x$_",(12..27)); ++my ($rks1,$rks2)=("x26","x27"); ++my $lastBlk=("x26"); ++my $enc=("w28"); ++my $remain=("x29"); ++ ++my @tweak=@datax; ++ ++sub gen_xts_cipher() { ++ my $std = shift; ++$code.=<<___; ++.globl ${prefix}_xts_encrypt${std} ++.type ${prefix}_xts_encrypt${std},%function ++.align 5 ++${prefix}_xts_encrypt${std}: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x15, x16, [sp, #-0x10]! ++ stp x17, x18, [sp, #-0x10]! ++ stp x19, x20, [sp, #-0x10]! ++ stp x21, x22, [sp, #-0x10]! ++ stp x23, x24, [sp, #-0x10]! ++ stp x25, x26, [sp, #-0x10]! ++ stp x27, x28, [sp, #-0x10]! ++ stp x29, x30, [sp, #-0x10]! ++ stp d8, d9, [sp, #-0x10]! ++ stp d10, d11, [sp, #-0x10]! ++ stp d12, d13, [sp, #-0x10]! ++ stp d14, d15, [sp, #-0x10]! ++ mov $rks1,x3 ++ mov $rks2,x4 ++ mov $enc,w6 ++ ld1 {@tweak[0].4s}, [$ivp] ++ mov $rks,$rks2 ++___ ++ &load_sbox(); ++ &rev32(@tweak[0],@tweak[0]); ++ &encrypt_1blk(@tweak[0]); ++$code.=<<___; ++ mov $rks,$rks1 ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 ++ b.lt .return${std} ++ ++ cmp $remain,0 ++ // If the encryption/decryption Length is N times of 16, ++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ b.eq .xts_encrypt_blocks${std} ++ ++ // If the encryption/decryption length is not N times of 16, ++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} ++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${std} ++.xts_encrypt_blocks${std}: ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++.Lxts_8_blocks_process${std}: ++ cmp $blocks,#8 ++ b.lt .Lxts_4_blocks_process${std} ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]); ++ &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]); ++$code.=<<___; ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@vtmp[0],@vtmp[0],$std); ++ &rbit(@vtmp[1],@vtmp[1],$std); ++ &rbit(@vtmp[2],@vtmp[2],$std); ++ &rbit(@vtmp[3],@vtmp[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @vtmp[0].16b ++ eor @data[1].16b, @data[1].16b, @vtmp[1].16b ++ eor @data[2].16b, @data[2].16b, @vtmp[2].16b ++ eor @data[3].16b, @data[3].16b, @vtmp[3].16b ++ ld1 
{@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rbit(@vtmpx[0],@vtmpx[0],$std); ++ &rbit(@vtmpx[1],@vtmpx[1],$std); ++ &rbit(@vtmpx[2],@vtmpx[2],$std); ++ &rbit(@vtmpx[3],@vtmpx[3],$std); ++$code.=<<___; ++ eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b ++ eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b ++ eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b ++ eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++ &transpose(@data,@vtmp); ++ &transpose(@datax,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++ ++ &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++ ++ // save the last tweak ++ st1 {@tweak[3].4s},[$ivp] ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lxts_8_blocks_process${std} ++ b 100f ++.Lxts_4_blocks_process${std}: ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++$code.=<<___; ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ st1 
{@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++___ ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]); ++$code.=<<___; ++ // save the last tweak ++ st1 {@tweak[3].4s},[$ivp] ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ st1 {@data[0].4s},[$outp],#16 ++ // save the last tweak ++ st1 {@tweak[0].4s},[$ivp] ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save the last tweak ++ st1 {@tweak[1].4s},[$ivp] ++ b 100f ++1: // process last 3 blocks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save the last tweak ++ st1 {@tweak[2].4s},[$ivp] ++100: ++ cmp $remain,0 ++ b.eq .return${std} ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is larger than 32 ++.last_2blks_tweak${std}: ++ ld1 {@tweak[0].4s},[$ivp] ++___ ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &compute_tweak_vec(@tweak[0],@tweak[1],$std); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$std); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is equal to 32, who only need two tweaks ++.only_2blks_tweak${std}: ++ mov @tweak[1].16b,@tweak[0].16b ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &compute_tweak_vec(@tweak[1],@tweak[2]); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// Determine whether encryption or decryption is required. ++// The last two tweaks need to be swapped for decryption. 
++.check_dec${std}: ++ // encryption:1 decryption:0 ++ cmp $enc,1 ++ b.eq .prcess_last_2blks${std} ++ mov @vtmp[0].16B,@tweak[1].16b ++ mov @tweak[1].16B,@tweak[2].16b ++ mov @tweak[2].16B,@vtmp[0].16b ++ ++.prcess_last_2blks${std}: ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &rev32_armeb(@tweak[2],@tweak[2]); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp],#16 ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++ st1 {@data[0].4s},[$outp],#16 ++ ++ sub $lastBlk,$outp,16 ++ .loop${std}: ++ subs $remain,$remain,1 ++ ldrb $wtmp0,[$lastBlk,$remain] ++ ldrb $wtmp1,[$inp,$remain] ++ strb $wtmp1,[$lastBlk,$remain] ++ strb $wtmp0,[$outp,$remain] ++ b.gt .loop${std} ++ ld1 {@data[0].4s}, [$lastBlk] ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++ st1 {@data[0].4s}, [$lastBlk] ++.return${std}: ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} ++___ ++} # end of gen_xts_cipher ++&gen_xts_cipher("_gb"); ++&gen_xts_cipher(""); ++}}} + ######################################## + open SELF,$0; + while(<SELF>) { +diff --git a/crypto/sm4/asm/vpsm4_ex-armv8.pl b/crypto/sm4/asm/vpsm4_ex-armv8.pl +new file mode 100644 +index 0000000000..3d094aa535 +--- /dev/null ++++ b/crypto/sm4/asm/vpsm4_ex-armv8.pl +@@ -0,0 +1,1544 @@ ++#! /usr/bin/env perl ++# Copyright 2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# This module implements SM4 with ASIMD and AESE on AARCH64 ++# ++# Dec 2022 ++# ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or ++die "can't locate arm-xlate.pl"; ++ ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++*STDOUT=*OUT; ++ ++$prefix="vpsm4_ex"; ++my @vtmp=map("v$_",(0..3)); ++my @qtmp=map("q$_",(0..3)); ++my @data=map("v$_",(4..7)); ++my @datax=map("v$_",(8..11)); ++my ($rk0,$rk1)=("v12","v13"); ++my ($rka,$rkb)=("v14","v15"); ++my @vtmpx=map("v$_",(12..15)); ++my ($vtmp4,$vtmp5)=("v24","v25"); ++my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31"); ++my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31"); ++ ++my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3"); ++my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9"); ++my ($xtmp1,$xtmp2)=("x8","x9"); ++my ($ptr,$counter)=("x10","w11"); ++my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15"); ++ ++sub rev32() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifndef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifndef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rev32_armeb() { ++ my $dst = shift; ++ my $src = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$src.16b ++#else ++ mov $dst.16b,$src.16b ++#endif ++___ ++ } else { ++$code.=<<___; ++#ifdef __AARCH64EB__ ++ rev32 $dst.16b,$dst.16b ++#endif ++___ ++ } ++} ++ ++sub rbit() { ++ my $dst = shift; ++ my $src = shift; ++ my $std = shift; ++ ++ if ($src and ("$src" ne "$dst")) { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } else { ++$code.=<<___; ++ mov $dst.16b,$src.16b ++___ ++ } ++ } else { ++ if ($std eq "_gb") { ++$code.=<<___; ++ rbit $dst.16b,$src.16b ++___ ++ } ++ } ++} ++ ++sub transpose() { ++ my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_; ++ ++$code.=<<___; ++ zip1 $vt0.4s,$dat0.4s,$dat1.4s ++ zip2 $vt1.4s,$dat0.4s,$dat1.4s ++ zip1 $vt2.4s,$dat2.4s,$dat3.4s ++ zip2 $vt3.4s,$dat2.4s,$dat3.4s ++ zip1 $dat0.2d,$vt0.2d,$vt2.2d ++ zip2 $dat1.2d,$vt0.2d,$vt2.2d ++ zip1 $dat2.2d,$vt1.2d,$vt3.2d ++ zip2 $dat3.2d,$vt1.2d,$vt3.2d ++___ ++} ++ ++# matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x) ++sub mul_matrix() { ++ my $x = shift; ++ my $higherMat = shift; ++ my $lowerMat = shift; ++ my $tmp = shift; ++$code.=<<___; ++ ushr $tmp.16b, $x.16b, 4 ++ and $x.16b, $x.16b, $ANDMaskV.16b ++ tbl $x.16b, {$lowerMat.16b}, $x.16b ++ tbl $tmp.16b, {$higherMat.16b}, $tmp.16b ++ eor $x.16b, $x.16b, $tmp.16b ++___ ++} ++ ++# sbox operations for 4-lane of words ++# sbox operation for 4-lane of words ++sub sbox() { ++ my $dat = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli 
@vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++___ ++} ++ ++# sbox operation for 8-lane of words ++sub sbox_double() { ++ my $dat = shift; ++ my $datx = shift; ++ ++$code.=<<___; ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b ++ tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4); ++ &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4); ++$code.=<<___; ++ eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b ++ aese @vtmp[0].16b,$vtmp5.16b ++ aese @vtmp[1].16b,$vtmp5.16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4); ++ &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4); ++$code.=<<___; ++ mov $dat.16b,@vtmp[0].16b ++ mov $datx.16b,@vtmp[1].16b ++ ++ // linear transformation ++ ushr @vtmp[0].4s,$dat.4s,32-2 ++ ushr $vtmp5.4s,$datx.4s,32-2 ++ ushr @vtmp[1].4s,$dat.4s,32-10 ++ ushr @vtmp[2].4s,$dat.4s,32-18 ++ ushr @vtmp[3].4s,$dat.4s,32-24 ++ sli @vtmp[0].4s,$dat.4s,2 ++ sli $vtmp5.4s,$datx.4s,2 ++ sli @vtmp[1].4s,$dat.4s,10 ++ sli @vtmp[2].4s,$dat.4s,18 ++ sli @vtmp[3].4s,$dat.4s,24 ++ eor $vtmp4.16b,@vtmp[0].16b,$dat.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $dat.16b,$dat.16b,$vtmp4.16b ++ ushr @vtmp[1].4s,$datx.4s,32-10 ++ ushr @vtmp[2].4s,$datx.4s,32-18 ++ ushr @vtmp[3].4s,$datx.4s,32-24 ++ sli @vtmp[1].4s,$datx.4s,10 ++ sli @vtmp[2].4s,$datx.4s,18 ++ sli @vtmp[3].4s,$datx.4s,24 ++ eor $vtmp4.16b,$vtmp5.16b,$datx.16b ++ eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b ++ eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b ++ eor $datx.16b,$datx.16b,$vtmp4.16b ++___ ++} ++ ++# sbox operation for one single word ++sub sbox_1word () { ++ my $word = shift; ++ ++$code.=<<___; ++ mov @vtmp[3].s[0],$word ++ // optimize sbox using AESE instruction ++ tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ ++ mov $wtmp0,@vtmp[0].s[0] ++ eor $word,$wtmp0,$wtmp0,ror #32-2 ++ eor $word,$word,$wtmp0,ror #32-10 ++ eor $word,$word,$wtmp0,ror #32-18 ++ eor $word,$word,$wtmp0,ror #32-24 ++___ ++} ++ ++# sm4 for one block of data, in scalar registers word0/word1/word2/word3 ++sub sm4_1blk () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$wtmp0,$word1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word0,$word0,$tmpw ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $tmpw,$word2,$word3 ++ eor $wtmp2,$word0,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor $word1,$word1,$tmpw ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$wtmp0,$word3 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word2,$word2,$tmpw ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $tmpw,$word0,$word1 ++ eor $wtmp2,$word2,$wtmp1 ++ eor $tmpw,$tmpw,$wtmp2 ++___ ++ &sbox_1word($tmpw); ++$code.=<<___; ++ eor $word3,$word3,$tmpw ++___ ++} ++ ++# sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3 ++sub sm4_4blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ dup 
$rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rk0.16b,@data[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk1.16b ++ ++ dup $rk0.4s,$wtmp0 ++ dup $rk1.4s,$wtmp1 ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rk0.16b,@data[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,$rk0.16b ++___ ++ &sbox($rk0); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rk1.16b,$rka.16b,$rk1.16b ++___ ++ &sbox($rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk1.16b ++___ ++} ++ ++# sm4 for 8 lanes of data, in neon registers ++# data0/data1/data2/data3 datax0/datax1/datax2/datax3 ++sub sm4_8blks () { ++ my $kptr = shift; ++ ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[2].16b,@data[3].16b ++ eor $rkb.16b,@datax[2].16b,@datax[3].16b ++ eor @vtmp[0].16b,@data[1].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[1].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[0].16b,@data[0].16b,$rk0.16b ++ eor @datax[0].16b,@datax[0].16b,$rk1.16b ++ ++ // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[0].16b ++ eor $rkb.16b,$rkb.16b,@datax[0].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ ldp $wtmp0,$wtmp1,[$kptr],8 ++ eor @data[1].16b,@data[1].16b,$rk0.16b ++ eor @datax[1].16b,@datax[1].16b,$rk1.16b ++ ++ // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) ++ dup $rk0.4s,$wtmp0 ++ eor $rka.16b,@data[0].16b,@data[1].16b ++ eor $rkb.16b,@datax[0].16b,@datax[1].16b ++ eor @vtmp[0].16b,@data[3].16b,$rk0.16b ++ eor @vtmp[1].16b,@datax[3].16b,$rk0.16b ++ eor $rk0.16b,$rka.16b,@vtmp[0].16b ++ eor $rk1.16b,$rkb.16b,@vtmp[1].16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,$rk0.16b ++ eor @datax[2].16b,@datax[2].16b,$rk1.16b ++ ++ // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) ++ dup $rk1.4s,$wtmp1 ++ eor $rka.16b,$rka.16b,@data[2].16b ++ eor $rkb.16b,$rkb.16b,@datax[2].16b ++ eor $rk0.16b,$rka.16b,$rk1.16b ++ eor $rk1.16b,$rkb.16b,$rk1.16b ++___ ++ &sbox_double($rk0,$rk1); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,$rk0.16b ++ eor @datax[3].16b,@datax[3].16b,$rk1.16b ++___ ++} ++ ++sub encrypt_1blk_norev() { ++ my $dat = shift; ++ ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++ mov $word0,$dat.s[0] ++ mov $word1,$dat.s[1] ++ mov $word2,$dat.s[2] ++ mov $word3,$dat.s[3] ++10: ++___ ++ &sm4_1blk($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++ mov $dat.s[0],$word3 ++ mov $dat.s[1],$word2 ++ mov $dat.s[2],$word1 ++ mov $dat.s[3],$word0 ++___ ++} ++ ++sub encrypt_1blk() { ++ my $dat = shift; ++ ++ &encrypt_1blk_norev($dat); ++ &rev32($dat,$dat); ++} ++ ++sub encrypt_4blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_4blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ 
&rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++} ++ ++sub encrypt_8blks() { ++$code.=<<___; ++ mov $ptr,$rks ++ mov $counter,#8 ++10: ++___ ++ &sm4_8blks($ptr); ++$code.=<<___; ++ subs $counter,$counter,#1 ++ b.ne 10b ++___ ++ &rev32(@vtmp[3],@data[0]); ++ &rev32(@vtmp[2],@data[1]); ++ &rev32(@vtmp[1],@data[2]); ++ &rev32(@vtmp[0],@data[3]); ++ &rev32(@data[3],@datax[0]); ++ &rev32(@data[2],@datax[1]); ++ &rev32(@data[1],@datax[2]); ++ &rev32(@data[0],@datax[3]); ++} ++ ++sub load_sbox () { ++ my $data = shift; ++ ++$code.=<<___; ++ ldr $MaskQ, =0x0306090c0f0205080b0e0104070a0d00 ++ ldr $TAHMatQ, =0x22581a6002783a4062185a2042387a00 ++ ldr $TALMatQ, =0xc10bb67c4a803df715df62a89e54e923 ++ ldr $ATAHMatQ, =0x1407c6d56c7fbeadb9aa6b78c1d21300 ++ ldr $ATALMatQ, =0xe383c1a1fe9edcbc6404462679195b3b ++ ldr $ANDMaskQ, =0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++___ ++} ++ ++sub mov_reg_to_vec() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $desv = shift; ++$code.=<<___; ++ mov $desv.d[0],$src0 ++ mov $desv.d[1],$src1 ++___ ++ &rev32_armeb($desv,$desv); ++} ++ ++sub mov_vec_to_reg() { ++ my $srcv = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $des0,$srcv.d[0] ++ mov $des1,$srcv.d[1] ++___ ++} ++ ++sub compute_tweak() { ++ my $src0 = shift; ++ my $src1 = shift; ++ my $des0 = shift; ++ my $des1 = shift; ++$code.=<<___; ++ mov $wtmp0,0x87 ++ extr $xtmp2,$src1,$src1,#32 ++ extr $des1,$src1,$src0,#63 ++ and $wtmp1,$wtmp0,$wtmp2,asr#31 ++ eor $des0,$xtmp1,$src0,lsl#1 ++___ ++} ++ ++sub compute_tweak_vec() { ++ my $src = shift; ++ my $des = shift; ++ my $std = shift; ++ &rbit(@vtmp[2],$src,$std); ++$code.=<<___; ++ ldr @qtmp[0], =0x01010101010101010101010101010187 ++ shl $des.16b, @vtmp[2].16b, #1 ++ ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15 ++ ushr @vtmp[1].16b, @vtmp[1].16b, #7 ++ mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b ++ eor $des.16b, $des.16b, @vtmp[1].16b ++___ ++ &rbit($des,$des,$std); ++} ++ ++$code=<<___; ++#include "arm_arch.h" ++.arch armv8-a+crypto ++.text ++ ++.type _${prefix}_consts,%object ++.align 7 ++_${prefix}_consts: ++.Lck: ++ .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 ++ .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 ++ .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 ++ .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 ++ .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 ++ .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 ++ .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 ++ .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 ++.Lfk: ++ .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197 ++.Lshuffles: ++ .dword 0x0B0A090807060504,0x030201000F0E0D0C ++ ++.size _${prefix}_consts,.-_${prefix}_consts ++___ ++ ++{{{ ++my ($key,$keys,$enc)=("x0","x1","w2"); ++my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8"); ++my ($vkey,$vfk,$vmap)=("v5","v6","v7"); ++$code.=<<___; ++.type _${prefix}_set_key,%function ++.align 4 ++_${prefix}_set_key: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$vkey.4s},[$key] ++___ ++ &load_sbox(); ++ &rev32($vkey,$vkey); ++$code.=<<___; ++ adr $pointer,.Lshuffles ++ ld1 {$vmap.2d},[$pointer] ++ adr $pointer,.Lfk ++ ld1 {$vfk.2d},[$pointer] ++ eor $vkey.16b,$vkey.16b,$vfk.16b ++ mov $schedules,#32 ++ adr $pointer,.Lck ++ movi @vtmp[0].16b,#64 ++ cbnz $enc,1f ++ add $keys,$keys,124 ++1: ++ mov $wtmp,$vkey.s[1] ++ ldr $roundkey,[$pointer],#4 ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[2] ++ eor $roundkey,$roundkey,$wtmp ++ mov $wtmp,$vkey.s[3] ++ eor $roundkey,$roundkey,$wtmp ++ // 
optimize sbox using AESE instruction ++ mov @data[0].s[0],$roundkey ++ tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b ++___ ++ &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]); ++$code.=<<___; ++ eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b ++ aese @vtmp[0].16b,@vtmp[1].16b ++___ ++ &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]); ++$code.=<<___; ++ mov $wtmp,@vtmp[0].s[0] ++ eor $roundkey,$wtmp,$wtmp,ror #19 ++ eor $roundkey,$roundkey,$wtmp,ror #9 ++ mov $wtmp,$vkey.s[0] ++ eor $roundkey,$roundkey,$wtmp ++ mov $vkey.s[0],$roundkey ++ cbz $enc,2f ++ str $roundkey,[$keys],#4 ++ b 3f ++2: ++ str $roundkey,[$keys],#-4 ++3: ++ tbl $vkey.16b,{$vkey.16b},$vmap.16b ++ subs $schedules,$schedules,#1 ++ b.ne 1b ++ ret ++.size _${prefix}_set_key,.-_${prefix}_set_key ++___ ++}}} ++ ++ ++{{{ ++$code.=<<___; ++.type _${prefix}_enc_4blks,%function ++.align 4 ++_${prefix}_enc_4blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_4blks(); ++$code.=<<___; ++ ret ++.size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks ++___ ++}}} ++ ++{{{ ++$code.=<<___; ++.type _${prefix}_enc_8blks,%function ++.align 4 ++_${prefix}_enc_8blks: ++ AARCH64_VALID_CALL_TARGET ++___ ++ &encrypt_8blks(); ++$code.=<<___; ++ ret ++.size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks ++___ ++}}} ++ ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_encrypt_key ++.type ${prefix}_set_encrypt_key,%function ++.align 5 ++${prefix}_set_encrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,1 ++ bl _${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key ++___ ++}}} ++ ++{{{ ++my ($key,$keys)=("x0","x1"); ++$code.=<<___; ++.globl ${prefix}_set_decrypt_key ++.type ${prefix}_set_decrypt_key,%function ++.align 5 ++${prefix}_set_decrypt_key: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x29,x30,[sp,#-16]! ++ mov w2,0 ++ bl _${prefix}_set_key ++ ldp x29,x30,[sp],#16 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key ++___ ++}}} ++ ++{{{ ++sub gen_block () { ++ my $dir = shift; ++ my ($inp,$outp,$rk)=map("x$_",(0..2)); ++ ++$code.=<<___; ++.globl ${prefix}_${dir}crypt ++.type ${prefix}_${dir}crypt,%function ++.align 5 ++${prefix}_${dir}crypt: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {@data[0].4s},[$inp] ++___ ++ &load_sbox(); ++ &rev32(@data[0],@data[0]); ++$code.=<<___; ++ mov $rks,$rk ++___ ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].4s},[$outp] ++ ret ++.size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt ++___ ++} ++&gen_block("en"); ++&gen_block("de"); ++}}} ++ ++{{{ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++ AARCH64_SIGN_LINK_REGISTER ++ // convert length into blocks ++ lsr x2,x2,4 ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++___ ++ &load_sbox(); ++$code.=<<___; ++.Lecb_8_blocks_process: ++ cmp $blocks,#8 ++ b.lt .Lecb_4_blocks_process ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lecb_8_blocks_process ++ b 100f ++.Lecb_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ st1 {@data[0].4s},[$outp] ++ b 100f ++1: // process last 2 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16 ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16 ++ cmp $blocks,#2 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp] ++ b 100f ++1: // process last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16 ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} ++ ++{{{ ++my ($len,$ivp,$enc)=("x2","x4","w5"); ++my $ivec0=("v3"); ++my $ivec1=("v15"); ++ ++$code.=<<___; ++.globl ${prefix}_cbc_encrypt ++.type ${prefix}_cbc_encrypt,%function ++.align 5 ++${prefix}_cbc_encrypt: ++ AARCH64_VALID_CALL_TARGET ++ lsr $len,$len,4 ++___ ++ &load_sbox(); ++$code.=<<___; ++ cbz $enc,.Ldec ++ ld1 {$ivec0.4s},[$ivp] ++.Lcbc_4_blocks_enc: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++ eor @data[0].16b,@data[0].16b,$ivec0.16b ++___ ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &encrypt_1blk_norev(@data[0]); ++$code.=<<___; ++ eor @data[1].16b,@data[1].16b,@data[0].16b ++___ ++ &encrypt_1blk_norev(@data[1]); ++ &rev32(@data[0],@data[0]); ++ ++$code.=<<___; ++ eor @data[2].16b,@data[2].16b,@data[1].16b ++___ ++ &encrypt_1blk_norev(@data[2]); ++ &rev32(@data[1],@data[1]); ++$code.=<<___; ++ eor @data[3].16b,@data[3].16b,@data[2].16b 
++___ ++ &encrypt_1blk_norev(@data[3]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ orr $ivec0.16b,@data[3].16b,@data[3].16b ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lcbc_4_blocks_enc ++ b 2f ++1: ++ subs $blocks,$blocks,#1 ++ b.lt 2f ++ ld1 {@data[0].4s},[$inp],#16 ++ eor $ivec0.16b,$ivec0.16b,@data[0].16b ++___ ++ &rev32($ivec0,$ivec0); ++ &encrypt_1blk($ivec0); ++$code.=<<___; ++ st1 {$ivec0.4s},[$outp],#16 ++ b 1b ++2: ++ // save back IV ++ st1 {$ivec0.4s},[$ivp] ++ ret ++ ++.Ldec: ++ // decryption mode starts ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! ++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++.Lcbc_8_blocks_dec: ++ cmp $blocks,#8 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++ add $ptr,$inp,#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],$datax[3]); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ ld1 {$ivec1.4s},[$ivp] ++ ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ // note ivec1 and vtmpx[3] are resuing the same register ++ // care needs to be taken to avoid conflict ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b ++ // save back IV ++ st1 {$vtmpx[3].4s}, [$ivp] ++ eor @data[0].16b,@data[0].16b,$datax[3].16b ++ eor @data[1].16b,@data[1].16b,@vtmpx[0].16b ++ eor @data[2].16b,@data[2].16b,@vtmpx[1].16b ++ eor @data[3].16b,$data[3].16b,@vtmpx[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lcbc_8_blocks_dec ++ b.eq 100f ++1: ++ ld1 {$ivec1.4s},[$ivp] ++.Lcbc_4_blocks_dec: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],$data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ orr $ivec1.16b,@data[3].16b,@data[3].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.gt .Lcbc_4_blocks_dec ++ // save back IV ++ st1 {@data[3].4s}, [$ivp] ++ b 100f ++1: // last block ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++ // save back IV ++ st1 {$data[0].4s}, [$ivp] ++___ ++ &rev32(@datax[0],@data[0]); ++ &encrypt_1blk(@datax[0]); ++$code.=<<___; ++ eor @datax[0].16b,@datax[0].16b,$ivec1.16b ++ st1 {@datax[0].4s},[$outp],#16 ++ b 100f ++1: // last two blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp] ++ add $ptr,$inp,#16 ++ ld4 
{@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16 ++ subs $blocks,$blocks,1 ++ b.gt 1f ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save back IV ++ st1 {@data[1].4s}, [$ivp] ++ b 100f ++1: // last 3 blocks ++ ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr] ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &transpose(@vtmp,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save back IV ++ st1 {@data[2].4s}, [$ivp] ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ++___ ++}}} ++ ++{{{ ++my ($ivp)=("x4"); ++my ($ctr)=("w5"); ++my $ivec=("v3"); ++ ++$code.=<<___; ++.globl ${prefix}_ctr32_encrypt_blocks ++.type ${prefix}_ctr32_encrypt_blocks,%function ++.align 5 ++${prefix}_ctr32_encrypt_blocks: ++ AARCH64_VALID_CALL_TARGET ++ ld1 {$ivec.4s},[$ivp] ++___ ++ &rev32($ivec,$ivec); ++ &load_sbox(); ++$code.=<<___; ++ cmp $blocks,#1 ++ b.ne 1f ++ // fast processing for one single block without ++ // context saving overhead ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].4s},[$outp] ++ ret ++1: ++ AARCH64_SIGN_LINK_REGISTER ++ stp d8,d9,[sp,#-80]! 
++ stp d10,d11,[sp,#16] ++ stp d12,d13,[sp,#32] ++ stp d14,d15,[sp,#48] ++ stp x29,x30,[sp,#64] ++ mov $word0,$ivec.s[0] ++ mov $word1,$ivec.s[1] ++ mov $word2,$ivec.s[2] ++ mov $ctr,$ivec.s[3] ++.Lctr32_4_blocks_process: ++ cmp $blocks,#4 ++ b.lt 1f ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $data[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ cmp $blocks,#8 ++ b.ge .Lctr32_8_blocks_process ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#4 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++.Lctr32_8_blocks_process: ++ dup @datax[0].4s,$word0 ++ dup @datax[1].4s,$word1 ++ dup @datax[2].4s,$word2 ++ mov @datax[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov $datax[3].s[1],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[2],$ctr ++ add $ctr,$ctr,#1 ++ mov @datax[3].s[3],$ctr ++ add $ctr,$ctr,#1 ++ bl _${prefix}_enc_8blks ++ ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64 ++ ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ eor @data[0].16b,@data[0].16b,@datax[0].16b ++ eor @data[1].16b,@data[1].16b,@datax[1].16b ++ eor @data[2].16b,@data[2].16b,@datax[2].16b ++ eor @data[3].16b,@data[3].16b,@datax[3].16b ++ st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.ne .Lctr32_4_blocks_process ++ b 100f ++1: // last block processing ++ subs $blocks,$blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ mov $ivec.s[0],$word0 ++ mov $ivec.s[1],$word1 ++ mov $ivec.s[2],$word2 ++ mov $ivec.s[3],$ctr ++___ ++ &encrypt_1blk($ivec); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp] ++ eor @data[0].16b,@data[0].16b,$ivec.16b ++ st1 {@data[0].4s},[$outp] ++ b 100f ++1: // last 2 blocks processing ++ dup @data[0].4s,$word0 ++ dup @data[1].4s,$word1 ++ dup @data[2].4s,$word2 ++ mov @data[3].s[0],$ctr ++ add $ctr,$ctr,#1 ++ mov @data[3].s[1],$ctr ++ subs $blocks,$blocks,#1 ++ b.ne 1f ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ b 100f ++1: // last 3 blocks processing ++ add $ctr,$ctr,#1 ++ mov @data[3].s[2],$ctr ++ bl _${prefix}_enc_4blks ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16 ++ ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16 ++ eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b ++ eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b ++ eor 
@vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b ++ eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16 ++ st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16 ++100: ++ ldp d10,d11,[sp,#16] ++ ldp d12,d13,[sp,#32] ++ ldp d14,d15,[sp,#48] ++ ldp x29,x30,[sp,#64] ++ ldp d8,d9,[sp],#80 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks ++___ ++}}} ++ ++ ++{{{ ++my ($blocks,$len)=("x2","x2"); ++my $ivp=("x5"); ++my @twx=map("x$_",(12..27)); ++my ($rks1,$rks2)=("x26","x27"); ++my $lastBlk=("x26"); ++my $enc=("w28"); ++my $remain=("x29"); ++ ++my @tweak=map("v$_",(16..23)); ++my $lastTweak=("v25"); ++ ++sub gen_xts_cipher() { ++ my $std = shift; ++$code.=<<___; ++.globl ${prefix}_xts_encrypt${std} ++.type ${prefix}_xts_encrypt${std},%function ++.align 5 ++${prefix}_xts_encrypt${std}: ++ AARCH64_SIGN_LINK_REGISTER ++ stp x15, x16, [sp, #-0x10]! ++ stp x17, x18, [sp, #-0x10]! ++ stp x19, x20, [sp, #-0x10]! ++ stp x21, x22, [sp, #-0x10]! ++ stp x23, x24, [sp, #-0x10]! ++ stp x25, x26, [sp, #-0x10]! ++ stp x27, x28, [sp, #-0x10]! ++ stp x29, x30, [sp, #-0x10]! ++ stp d8, d9, [sp, #-0x10]! ++ stp d10, d11, [sp, #-0x10]! ++ stp d12, d13, [sp, #-0x10]! ++ stp d14, d15, [sp, #-0x10]! ++ mov $rks1,x3 ++ mov $rks2,x4 ++ mov $enc,w6 ++ ld1 {@tweak[0].4s}, [$ivp] ++ mov $rks,$rks2 ++___ ++ &load_sbox(); ++ &rev32(@tweak[0],@tweak[0]); ++ &encrypt_1blk(@tweak[0]); ++$code.=<<___; ++ mov $rks,$rks1 ++ and $remain,$len,#0x0F ++ // convert length into blocks ++ lsr $blocks,$len,4 ++ cmp $blocks,#1 ++ b.lt .return${std} ++ ++ cmp $remain,0 ++ // If the encryption/decryption Length is N times of 16, ++ // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ b.eq .xts_encrypt_blocks${std} ++ ++ // If the encryption/decryption length is not N times of 16, ++ // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std} ++ // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std} ++ subs $blocks,$blocks,#1 ++ b.eq .only_2blks_tweak${std} ++.xts_encrypt_blocks${std}: ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rev32_armeb(@tweak[0],@tweak[0]); ++ &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++.Lxts_8_blocks_process${std}: ++ cmp $blocks,#8 ++___ ++ &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]); ++ &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]); ++ &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]); ++ &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]); ++ &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]); ++ &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]); ++ &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]); ++ &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]); ++ &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]); ++ &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]); ++ &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]); ++ &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]); ++ &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]); ++ &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]); ++ 
&mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]); ++ &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]); ++$code.=<<___; ++ b.lt .Lxts_4_blocks_process${std} ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++ ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[4],@tweak[4],$std); ++ &rbit(@tweak[5],@tweak[5],$std); ++ &rbit(@tweak[6],@tweak[6],$std); ++ &rbit(@tweak[7],@tweak[7],$std); ++$code.=<<___; ++ eor @datax[0].16b, @datax[0].16b, @tweak[4].16b ++ eor @datax[1].16b, @datax[1].16b, @tweak[5].16b ++ eor @datax[2].16b, @datax[2].16b, @tweak[6].16b ++ eor @datax[3].16b, @datax[3].16b, @tweak[7].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &rev32(@datax[0],@datax[0]); ++ &rev32(@datax[1],@datax[1]); ++ &rev32(@datax[2],@datax[2]); ++ &rev32(@datax[3],@datax[3]); ++ &transpose(@data,@vtmp); ++ &transpose(@datax,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_8blks ++___ ++ &transpose(@vtmp,@datax); ++ &transpose(@data,@datax); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ eor @data[0].16b, @data[0].16b, @tweak[4].16b ++ eor @data[1].16b, @data[1].16b, @tweak[5].16b ++ eor @data[2].16b, @data[2].16b, @tweak[6].16b ++ eor @data[3].16b, @data[3].16b, @tweak[7].16b ++ ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[7].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64 ++ subs $blocks,$blocks,#8 ++ b.gt .Lxts_8_blocks_process${std} ++ b 100f ++.Lxts_4_blocks_process${std}: ++ cmp $blocks,#4 ++ b.lt 1f ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++ &rbit(@tweak[3],@tweak[3],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++ eor @data[3].16b, @data[3].16b, @tweak[3].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &rev32(@data[3],@data[3]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64 ++ sub $blocks,$blocks,#4 ++ mov @tweak[0].16b,@tweak[4].16b ++ mov @tweak[1].16b,@tweak[5].16b ++ mov @tweak[2].16b,@tweak[6].16b ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[3].16b ++1: ++ // process last block ++ cmp $blocks,#1 ++ b.lt 100f ++ b.gt 1f ++ ld1 {@data[0].4s},[$inp],#16 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++___ ++ 
&rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ st1 {@data[0].4s},[$outp],#16 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[0].16b ++ b 100f ++1: // process last 2 blocks ++ cmp $blocks,#2 ++ b.gt 1f ++ ld1 {@data[0].4s,@data[1].4s},[$inp],#32 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[1].16b ++ b 100f ++1: // process last 3 blocks ++ ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48 ++___ ++ &rbit(@tweak[0],@tweak[0],$std); ++ &rbit(@tweak[1],@tweak[1],$std); ++ &rbit(@tweak[2],@tweak[2],$std); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[0].16b ++ eor @data[1].16b, @data[1].16b, @tweak[1].16b ++ eor @data[2].16b, @data[2].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &rev32(@data[1],@data[1]); ++ &rev32(@data[2],@data[2]); ++ &transpose(@data,@vtmp); ++$code.=<<___; ++ bl _${prefix}_enc_4blks ++___ ++ &transpose(@vtmp,@data); ++$code.=<<___; ++ eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b ++ eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b ++ eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b ++ st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48 ++ // save the last tweak ++ mov $lastTweak.16b,@tweak[2].16b ++100: ++ cmp $remain,0 ++ b.eq .return${std} ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is larger than 32 ++.last_2blks_tweak${std}: ++___ ++ &rev32_armeb($lastTweak,$lastTweak); ++ &compute_tweak_vec($lastTweak,@tweak[1],$std); ++ &compute_tweak_vec(@tweak[1],@tweak[2],$std); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// This brance calculates the last two tweaks, ++// while the encryption/decryption length is equal to 32, who only need two tweaks ++.only_2blks_tweak${std}: ++ mov @tweak[1].16b,@tweak[0].16b ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &compute_tweak_vec(@tweak[1],@tweak[2]); ++$code.=<<___; ++ b .check_dec${std} ++ ++ ++// Determine whether encryption or decryption is required. ++// The last two tweaks need to be swapped for decryption. 
++.check_dec${std}: ++ // encryption:1 decryption:0 ++ cmp $enc,1 ++ b.eq .prcess_last_2blks${std} ++ mov @vtmp[0].16B,@tweak[1].16b ++ mov @tweak[1].16B,@tweak[2].16b ++ mov @tweak[2].16B,@vtmp[0].16b ++ ++.prcess_last_2blks${std}: ++___ ++ &rev32_armeb(@tweak[1],@tweak[1]); ++ &rev32_armeb(@tweak[2],@tweak[2]); ++$code.=<<___; ++ ld1 {@data[0].4s},[$inp],#16 ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[1].16b ++ st1 {@data[0].4s},[$outp],#16 ++ ++ sub $lastBlk,$outp,16 ++ .loop${std}: ++ subs $remain,$remain,1 ++ ldrb $wtmp0,[$lastBlk,$remain] ++ ldrb $wtmp1,[$inp,$remain] ++ strb $wtmp1,[$lastBlk,$remain] ++ strb $wtmp0,[$outp,$remain] ++ b.gt .loop${std} ++ ld1 {@data[0].4s}, [$lastBlk] ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++___ ++ &rev32(@data[0],@data[0]); ++ &encrypt_1blk(@data[0]); ++$code.=<<___; ++ eor @data[0].16b, @data[0].16b, @tweak[2].16b ++ st1 {@data[0].4s}, [$lastBlk] ++.return${std}: ++ ldp d14, d15, [sp], #0x10 ++ ldp d12, d13, [sp], #0x10 ++ ldp d10, d11, [sp], #0x10 ++ ldp d8, d9, [sp], #0x10 ++ ldp x29, x30, [sp], #0x10 ++ ldp x27, x28, [sp], #0x10 ++ ldp x25, x26, [sp], #0x10 ++ ldp x23, x24, [sp], #0x10 ++ ldp x21, x22, [sp], #0x10 ++ ldp x19, x20, [sp], #0x10 ++ ldp x17, x18, [sp], #0x10 ++ ldp x15, x16, [sp], #0x10 ++ AARCH64_VALIDATE_LINK_REGISTER ++ ret ++.size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std} ++___ ++} # end of gen_xts_cipher ++&gen_xts_cipher("_gb"); ++&gen_xts_cipher(""); ++}}} ++ ++######################################## ++open SELF,$0; ++while(<SELF>) { ++ next if (/^#!/); ++ last if (!s/^#/\/\// and !/^$/); ++ print; ++} ++close SELF; ++ ++foreach(split("\n",$code)) { ++ s/\`([^\`]*)\`/eval($1)/ge; ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/sm4/build.info b/crypto/sm4/build.info +index 75a215ab80..73ffe5ea09 100644 +--- a/crypto/sm4/build.info ++++ b/crypto/sm4/build.info +@@ -2,7 +2,7 @@ LIBS=../../libcrypto + + IF[{- !$disabled{asm} -}] + $SM4DEF_aarch64=SM4_ASM VPSM4_ASM +- $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S ++ $SM4ASM_aarch64=sm4-armv8.S vpsm4-armv8.S vpsm4_ex-armv8.S + + # Now that we have defined all the arch specific variables, use the + # appropriate one, and define the appropriate macros +@@ -30,5 +30,7 @@ ENDIF + + GENERATE[sm4-armv8.S]=asm/sm4-armv8.pl + GENERATE[vpsm4-armv8.S]=asm/vpsm4-armv8.pl ++GENERATE[vpsm4_ex-armv8.S]=asm/vpsm4_ex-armv8.pl + INCLUDE[sm4-armv8.o]=.. + INCLUDE[vpsm4-armv8.o]=.. ++INCLUDE[vpsm4_ex-armv8.o]=.. 
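The remaining hunks wire the new code into OpenSSL's runtime dispatch: vpsm4_ex_capable() selects the AESE-based vpsm4_ex functions by CPU model (HiSilicon Kunpeng 920), while vpsm4_capable() keeps the plain vector-permute path for Arm Neoverse V1/N1. A hedged sketch of how a caller uses the two macros follows — the wrapper function itself is hypothetical, but the capability macros and vpsm4* prototypes are the ones declared in the sm4_platform.h hunk below:

    /* Hypothetical wrapper; only the macros and the vpsm4* prototypes
     * come from sm4_platform.h in this patch. */
    #include "crypto/sm4_platform.h"

    static void sm4_ecb_cipher(const unsigned char *in, unsigned char *out,
                               size_t len, const SM4_KEY *key, int enc)
    {
    #ifdef VPSM4_EX_CAPABLE
        if (VPSM4_EX_CAPABLE) {        /* AESE-optimized path (Kunpeng 920) */
            vpsm4_ex_ecb_encrypt(in, out, len, key, enc);
            return;
        }
    #endif
    #ifdef VPSM4_CAPABLE
        if (VPSM4_CAPABLE) {           /* vector-permute path (Neoverse V1/N1) */
            vpsm4_ecb_encrypt(in, out, len, key, enc);
            return;
        }
    #endif
        /* otherwise fall back to the generic C implementation */
    }

This is the same capability-macro pattern the patch extends in the (truncated) cipher_sm4_hw.c hunk, where each cipher mode picks a vpsm4_ex_*, vpsm4_* or HWSM4_* entry point at init time.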
+diff --git a/include/crypto/sm4_platform.h b/include/crypto/sm4_platform.h
+index 15d8abbcb1..8b9cd10f97 100644
+--- a/include/crypto/sm4_platform.h
++++ b/include/crypto/sm4_platform.h
+@@ -20,11 +20,16 @@ static inline int vpsm4_capable(void)
+ {
+     return (OPENSSL_armcap_P & ARMV8_CPUID) &&
+            (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1) ||
+-            MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1) ||
+-            MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
++            MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_N1));
++}
++static inline int vpsm4_ex_capable(void)
++{
++    return (OPENSSL_armcap_P & ARMV8_CPUID) &&
++           (MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, HISI_CPU_IMP, HISI_CPU_PART_KP920));
+ }
+ # if defined(VPSM4_ASM)
+ #  define VPSM4_CAPABLE vpsm4_capable()
++#  define VPSM4_EX_CAPABLE vpsm4_ex_capable()
+ # endif
+ # define HWSM4_CAPABLE (OPENSSL_armcap_P & ARMV8_SM4)
+ # define HWSM4_set_encrypt_key sm4_v8_set_encrypt_key
+@@ -56,7 +61,7 @@ void HWSM4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                 const unsigned char ivec[16]);
+ # endif /* HWSM4_CAPABLE */
+ 
+-#ifdef VPSM4_CAPABLE
++# ifdef VPSM4_CAPABLE
+ int vpsm4_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ int vpsm4_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
+ void vpsm4_encrypt(const unsigned char *in, unsigned char *out,
+@@ -72,7 +77,37 @@ void vpsm4_ecb_encrypt(const unsigned char *in, unsigned char *out,
+ void vpsm4_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
+                                 size_t len, const void *key,
+                                 const unsigned char ivec[16]);
++void vpsm4_xts_encrypt(const unsigned char *in, unsigned char *out,
++                       size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++                       const unsigned char ivec[16], const int enc);
++void vpsm4_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++                          size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++                          const unsigned char ivec[16], const int enc);
+ # endif /* VPSM4_CAPABLE */
+ 
++# ifdef VPSM4_EX_CAPABLE
++int vpsm4_ex_set_encrypt_key(const unsigned char *userKey, SM4_KEY *key);
++int vpsm4_ex_set_decrypt_key(const unsigned char *userKey, SM4_KEY *key);
++void vpsm4_ex_encrypt(const unsigned char *in, unsigned char *out,
++                      const SM4_KEY *key);
++void vpsm4_ex_decrypt(const unsigned char *in, unsigned char *out,
++                      const SM4_KEY *key);
++void vpsm4_ex_cbc_encrypt(const unsigned char *in, unsigned char *out,
++                          size_t length, const SM4_KEY *key,
++                          unsigned char *ivec, const int enc);
++void vpsm4_ex_ecb_encrypt(const unsigned char *in, unsigned char *out,
++                          size_t length, const SM4_KEY *key,
++                          const int enc);
++void vpsm4_ex_ctr32_encrypt_blocks(const unsigned char *in, unsigned char *out,
++                                   size_t len, const void *key,
++                                   const unsigned char ivec[16]);
++void vpsm4_ex_xts_encrypt(const unsigned char *in, unsigned char *out,
++                          size_t len, const SM4_KEY *key1, const SM4_KEY *key2,
++                          const unsigned char ivec[16], const int enc);
++void vpsm4_ex_xts_encrypt_gb(const unsigned char *in, unsigned char *out,
++                             size_t len, const SM4_KEY *key1,
++                             const SM4_KEY *key2, const unsigned char ivec[16],
++                             const int enc);
++# endif /* VPSM4_EX_CAPABLE */
+ 
+ #endif /* OSSL_SM4_PLATFORM_H */
+diff --git a/providers/implementations/ciphers/cipher_sm4_hw.c b/providers/implementations/ciphers/cipher_sm4_hw.c
+index 9a2e99f67c..8cabd78266 100644
+--- a/providers/implementations/ciphers/cipher_sm4_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_hw.c
+@@ -42,6 +42,19 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+             (void)0; /* terminate potentially open 'else' */
+         } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++        if (VPSM4_EX_CAPABLE) {
++            vpsm4_ex_set_encrypt_key(key, ks);
++            ctx->block = (block128_f)vpsm4_ex_encrypt;
++            ctx->stream.cbc = NULL;
++            if (ctx->mode == EVP_CIPH_CBC_MODE)
++                ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++            else if (ctx->mode == EVP_CIPH_ECB_MODE)
++                ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++            else if (ctx->mode == EVP_CIPH_CTR_MODE)
++                ctx->stream.ctr = (ctr128_f)vpsm4_ex_ctr32_encrypt_blocks;
++        } else
++#endif
+ #ifdef VPSM4_CAPABLE
+         if (VPSM4_CAPABLE) {
+             vpsm4_set_encrypt_key(key, ks);
+@@ -75,6 +88,17 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+ #endif
+         } else
+ #endif
++#ifdef VPSM4_EX_CAPABLE
++        if (VPSM4_EX_CAPABLE) {
++            vpsm4_ex_set_decrypt_key(key, ks);
++            ctx->block = (block128_f)vpsm4_ex_decrypt;
++            ctx->stream.cbc = NULL;
++            if (ctx->mode == EVP_CIPH_CBC_MODE)
++                ctx->stream.cbc = (cbc128_f)vpsm4_ex_cbc_encrypt;
++            else if (ctx->mode == EVP_CIPH_ECB_MODE)
++                ctx->stream.ecb = (ecb128_f)vpsm4_ex_ecb_encrypt;
++        } else
++#endif
+ #ifdef VPSM4_CAPABLE
+         if (VPSM4_CAPABLE) {
+             vpsm4_set_decrypt_key(key, ks);
+@@ -82,7 +106,7 @@ static int cipher_hw_sm4_initkey(PROV_CIPHER_CTX *ctx,
+             ctx->stream.cbc = NULL;
+             if (ctx->mode == EVP_CIPH_CBC_MODE)
+                 ctx->stream.cbc = (cbc128_f)vpsm4_cbc_encrypt;
+-            else if (ctx->mode == EVP_CIPH_ECB_MODE) 
++            else if (ctx->mode == EVP_CIPH_ECB_MODE)
+                 ctx->stream.ecb = (ecb128_f)vpsm4_ecb_encrypt;
+         } else
+ #endif
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.c b/providers/implementations/ciphers/cipher_sm4_xts.c
+index 3c568d4d18..037055fce8 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts.c
+@@ -145,14 +145,14 @@ static int sm4_xts_cipher(void *vctx, unsigned char *out, size_t *outl,
+     if (ctx->xts_standard) {
+         if (ctx->stream != NULL)
+             (*ctx->stream)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+-                           ctx->base.iv);
++                           ctx->base.iv, ctx->base.enc);
+         else if (CRYPTO_xts128_encrypt(&ctx->xts, ctx->base.iv, in, out, inl,
+                                        ctx->base.enc))
+             return 0;
+     } else {
+         if (ctx->stream_gb != NULL)
+             (*ctx->stream_gb)(in, out, inl, ctx->xts.key1, ctx->xts.key2,
+-                              ctx->base.iv);
++                              ctx->base.iv, ctx->base.enc);
+         else if (ossl_crypto_xts128gb_encrypt(&ctx->xts, ctx->base.iv, in, out,
+                                               inl, ctx->base.enc))
+             return 0;
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts.h b/providers/implementations/ciphers/cipher_sm4_xts.h
+index 4c369183e2..cfca596979 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts.h
++++ b/providers/implementations/ciphers/cipher_sm4_xts.h
+@@ -14,7 +14,7 @@
+ PROV_CIPHER_FUNC(void, xts_stream,
+                  (const unsigned char *in, unsigned char *out, size_t len,
+                   const SM4_KEY *key1, const SM4_KEY *key2,
+-                  const unsigned char iv[16]));
++                  const unsigned char iv[16], const int enc));
+ 
+ typedef struct prov_sm4_xts_ctx_st {
+     /* Must be first */
+diff --git a/providers/implementations/ciphers/cipher_sm4_xts_hw.c b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+index 403eb879b1..67a9923d94 100644
+--- a/providers/implementations/ciphers/cipher_sm4_xts_hw.c
++++ b/providers/implementations/ciphers/cipher_sm4_xts_hw.c
+@@ -11,8 +11,7 @@
+ 
+ #define XTS_SET_KEY_FN(fn_set_enc_key, fn_set_dec_key, \
+                        fn_block_enc, fn_block_dec, \
+-                       fn_stream_enc, fn_stream_dec, \
+-                       fn_stream_gb_enc, fn_stream_gb_dec) { \
++                       fn_stream, fn_stream_gb) { \
+     size_t bytes = keylen / 2; \
+ \
+     if (ctx->enc) { \
+@@ -26,8 +25,8 @@
+         xctx->xts.block2 = (block128_f)fn_block_enc; \
+         xctx->xts.key1 = &xctx->ks1; \
+         xctx->xts.key2 = &xctx->ks2; \
+-        xctx->stream = ctx->enc ? fn_stream_enc : fn_stream_dec; \
+-        xctx->stream_gb = ctx->enc ? fn_stream_gb_enc : fn_stream_gb_dec; \
++        xctx->stream = fn_stream; \
++        xctx->stream_gb = fn_stream_gb; \
+     }
+ 
+ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+@@ -35,23 +34,30 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+                                              size_t keylen)
+ {
+     PROV_SM4_XTS_CTX *xctx = (PROV_SM4_XTS_CTX *)ctx;
+-    OSSL_xts_stream_fn stream_enc = NULL;
+-    OSSL_xts_stream_fn stream_dec = NULL;
+-    OSSL_xts_stream_fn stream_gb_enc = NULL;
+-    OSSL_xts_stream_fn stream_gb_dec = NULL;
++    OSSL_xts_stream_fn stream = NULL;
++    OSSL_xts_stream_fn stream_gb = NULL;
+ #ifdef HWSM4_CAPABLE
+     if (HWSM4_CAPABLE) {
+         XTS_SET_KEY_FN(HWSM4_set_encrypt_key, HWSM4_set_decrypt_key,
+-                       HWSM4_encrypt, HWSM4_decrypt, stream_enc, stream_dec,
+-                       stream_gb_enc, stream_gb_dec);
++                       HWSM4_encrypt, HWSM4_decrypt, stream, stream_gb);
+         return 1;
+     } else
+ #endif /* HWSM4_CAPABLE */
++#ifdef VPSM4_EX_CAPABLE
++    if (VPSM4_EX_CAPABLE) {
++        stream = vpsm4_ex_xts_encrypt;
++        stream_gb = vpsm4_ex_xts_encrypt_gb;
++        XTS_SET_KEY_FN(vpsm4_ex_set_encrypt_key, vpsm4_ex_set_decrypt_key,
++                       vpsm4_ex_encrypt, vpsm4_ex_decrypt, stream, stream_gb);
++        return 1;
++    } else
++#endif /* VPSM4_EX_CAPABLE */
+ #ifdef VPSM4_CAPABLE
+     if (VPSM4_CAPABLE) {
++        stream = vpsm4_xts_encrypt;
++        stream_gb = vpsm4_xts_encrypt_gb;
+         XTS_SET_KEY_FN(vpsm4_set_encrypt_key, vpsm4_set_decrypt_key,
+-                       vpsm4_encrypt, vpsm4_decrypt, stream_enc, stream_dec,
+-                       stream_gb_enc, stream_gb_dec);
++                       vpsm4_encrypt, vpsm4_decrypt, stream, stream_gb);
+         return 1;
+     } else
+ #endif /* VPSM4_CAPABLE */
+@@ -60,8 +66,7 @@ static int cipher_hw_sm4_xts_generic_initkey(PROV_CIPHER_CTX *ctx,
+     }
+     {
+         XTS_SET_KEY_FN(ossl_sm4_set_key, ossl_sm4_set_key, ossl_sm4_encrypt,
+-                       ossl_sm4_decrypt, stream_enc, stream_dec, stream_gb_enc,
+-                       stream_gb_dec);
++                       ossl_sm4_decrypt, stream, stream_gb);
+     }
+     return 1;
+ }
+-- 
+2.37.3.windows.1
+
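A note on the provider-side change above: OSSL_xts_stream_fn now takes a
trailing "const int enc", so a single assembly entry point serves both
directions and the separate *_enc/*_dec stream pointers disappear from
XTS_SET_KEY_FN. The direction-specific work (swapping the last two tweaks
for decryption, the .check_dec branch in the assembly) happens inside the
routine. A sketch of the resulting call shape; the typedef mirrors the
cipher_sm4_xts.h change, while run_xts() and the void key pointers are
invented here for illustration:

    #include <stddef.h>

    /* Widened callback shape after this patch (SM4_KEY elided to void
     * here so the sketch stands alone). */
    typedef void (*xts_stream_fn)(const unsigned char *in, unsigned char *out,
                                  size_t len, const void *key1,
                                  const void *key2,
                                  const unsigned char iv[16], const int enc);

    /* Hypothetical caller: one function pointer covers both directions,
     * so the provider simply forwards its direction flag, as
     * sm4_xts_cipher() now does with ctx->base.enc. */
    static void run_xts(xts_stream_fn stream,
                        const unsigned char *in, unsigned char *out,
                        size_t len, const void *key1, const void *key2,
                        const unsigned char iv[16], int enc)
    {
        stream(in, out, len, key1, key2, iv, enc);
    }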