author | CoprDistGit <infra@openeuler.org> | 2024-08-01 14:09:41 +0000 |
---|---|---|
committer | CoprDistGit <infra@openeuler.org> | 2024-08-01 14:09:41 +0000 |
commit | d8253271bee23ac54145c0ba1d15f48e6620bb5e (patch) | |
tree | 470a23e7c4bb3541b4ff08932ce1fbd46fe59c9c /openssl-1.1.1-arm-update.patch | |
parent | 17146a82118c2690c851cbb31077186594ba86d6 (diff) | |
automatic import of compat-openssl11 (openeuler24.03_LTS, openeuler23.09)
Diffstat (limited to 'openssl-1.1.1-arm-update.patch')
-rw-r--r-- | openssl-1.1.1-arm-update.patch | 3706 |
1 file changed, 3706 insertions, 0 deletions
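The comment blocks in the patch below describe the unrolling policy behind the new ECB path: process five blocks per iteration while at least five remain, fall back to a three-block iteration, and interleave the final one or two blocks as a tail (a lone 16-byte input takes a separate fast path with no frame setup). A minimal C sketch of that dispatch — the helper names are hypothetical stand-ins; only the partitioning mirrors the patch's comments:

```c
#include <stdio.h>
#include <stddef.h>

/* Hypothetical stand-ins for the 5x-, 3x- and tail-interleaved AES paths;
 * here they only record how the input was partitioned. */
static void blocks5(size_t i)           { printf("5-block iteration at offset %zu\n", i * 16); }
static void blocks3(size_t i)           { printf("3-block iteration at offset %zu\n", i * 16); }
static void tail(size_t i, size_t n)    { printf("%zu-block tail at offset %zu\n", n, i * 16); }

/* Dispatch over a dsize-byte input (a multiple of 16): 5-block iterations
 * while >= 5 blocks remain, then a 3-block iteration, then a 1- or 2-block
 * interleaved tail. (The asm also special-cases dsize == 16 entirely.) */
static void ecb_dispatch(size_t dsize)
{
    size_t blk = dsize / 16, i = 0;
    while (blk - i >= 5) { blocks5(i); i += 5; }
    while (blk - i >= 3) { blocks3(i); i += 3; }
    if (blk - i)         tail(i, blk - i);
}

int main(void) { ecb_dispatch(13 * 16); return 0; }  /* 5 + 5 + 3 blocks */
```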
diff --git a/openssl-1.1.1-arm-update.patch b/openssl-1.1.1-arm-update.patch new file mode 100644 index 0000000..2b8c549 --- /dev/null +++ b/openssl-1.1.1-arm-update.patch @@ -0,0 +1,3706 @@ +diff -up openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl +--- openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl 2020-12-09 10:39:50.645705385 +0100 +@@ -27,44 +27,72 @@ + # CBC encrypt case. On Cortex-A57 parallelizable mode performance + # seems to be limited by sheer amount of NEON instructions... + # ++# April 2019 ++# ++# Key to performance of parallelize-able modes is round instruction ++# interleaving. But which factor to use? There is optimal one for ++# each combination of instruction latency and issue rate, beyond ++# which increasing interleave factor doesn't pay off. While on cons ++# side we have code size increase and resource waste on platforms for ++# which interleave factor is too high. In other words you want it to ++# be just right. So far interleave factor of 3x was serving well all ++# platforms. But for ThunderX2 optimal interleave factor was measured ++# to be 5x... ++# + # Performance in cycles per byte processed with 128-bit key: + # + # CBC enc CBC dec CTR + # Apple A7 2.39 1.20 1.20 +-# Cortex-A53 1.32 1.29 1.46 +-# Cortex-A57(*) 1.95 0.85 0.93 +-# Denver 1.96 0.86 0.80 +-# Mongoose 1.33 1.20 1.20 +-# Kryo 1.26 0.94 1.00 ++# Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46 ++# Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93 ++# Cortex-A72 1.33 0.85/0.88 0.92/0.96 ++# Denver 1.96 0.65/0.86 0.76/0.80 ++# Mongoose 1.33 1.23/1.20 1.30/1.20 ++# Kryo 1.26 0.87/0.94 1.00/1.00 ++# ThunderX2 5.95 1.25 1.30 + # + # (*) original 3.64/1.34/1.32 results were for r0p0 revision + # and are still same even for updated module; ++# (**) numbers after slash are for 32-bit code, which is 3x- ++# interleaved; + +-$flavour = shift; +-$output = shift; ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + +-open OUT,"| \"$^X\" $xlate $flavour $output"; ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; + *STDOUT=*OUT; + + $prefix="aes_v8"; + ++$_byte = ($flavour =~ /win/ ? "DCB" : ".byte"); ++ + $code=<<___; + #include "arm_arch.h" + + #if __ARM_MAX_ARCH__>=7 +-.text + ___ +-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); ++$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); + $code.=<<___ if ($flavour !~ /64/); + .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-) + .fpu neon ++#ifdef __thumb2__ ++.syntax unified ++.thumb ++# define INST(a,b,c,d) $_byte c,d|0xc,a,b ++#else + .code 32 +-#undef __thumb2__ ++# define INST(a,b,c,d) $_byte a,b,c,d ++#endif ++ ++.text + ___ + + # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, +@@ -361,6 +389,836 @@ ___ + &gen_block("en"); + &gen_block("de"); + }}} ++ ++# Performance in cycles per byte. ++# Processed with AES-ECB different key size. 
++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-ECB AES-192-ECB AES-256-ECB ++# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 ++# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, ++# every loop lsize -=3*16. ++# If lsize < 3*16 bytes, treat them as the tail, interleave the ++# two blocks AES instructions. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store, just looks like what the original ECB implementation does. ++ ++{{{ ++my ($inp,$out,$len,$key)=map("x$_",(0..3)); ++my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++### q7 last round key ++### q10-q15 q7 Last 7 round keys ++### q8-q9 preloaded round keys except last 7 keys for big size ++### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ subs $len,$len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lecb_big_size ++ vld1.8 {$dat0},[$inp] ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ vld1.32 {q5-q6},[$key],#32 // load key schedule... ++ ++ b.eq .Lecb_small_dec ++ aese $dat0,q5 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... ++ aese $dat0,q6 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing ++ b.eq .Lecb_128_enc ++.Lecb_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_round_loop ++.Lecb_128_enc: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_small_dec: ++ aesd $dat0,q5 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... 
++ aesd $dat0,q6 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lecb_128_dec ++.Lecb_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_dec_round_loop ++.Lecb_128_dec: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp x29,x30,[sp,#-16]! ++ add x29,sp,#0 ++___ ++$code.=<<___ if ($flavour !~ /64/); ++ mov ip,sp ++ stmdb sp!,{r4-r8,lr} ++ vstmdb sp!,{d8-d15} @ ABI specification says so ++ ldmia ip,{r4-r5} @ load remaining args ++ subs $len,$len,#16 ++___ ++$code.=<<___; ++ mov $step,#16 ++ b.lo .Lecb_done ++ cclr $step,eq ++ ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ and $len,$len,#-16 ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key] // load key schedule... ++ sub $rounds,$rounds,#6 ++ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys ++ sub $rounds,$rounds,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key,#32 ++ mov $cnt,$rounds ++ b.eq .Lecb_dec ++ ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_enc_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_enc ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_enc_tail4x ++ sub $len,$len,#0x50 ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop 
$dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_enc ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_enc_tail ++ ++ b .Loop3x_ecb_enc ++ ++.align 4 ++.Lecb_enc_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.32 
{q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr $dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_enc ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_enc_tail: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_enc_tail ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lecb_enc_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_enc_one: ++ veor $tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ b .Lecb_done ++___ ++ ++$code.=<<___; ++.align 5 ++.Lecb_dec: ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_dec_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_dec ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_tail4x ++ sub $len,$len,#0x50 ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc 
$dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_dec ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_dec_tail ++ ++ b .Loop3x_ecb_dec ++ ++.align 4 ++.Lecb_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr 
$dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_dec ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_dec_tail: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_dec_tail ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lecb_dec_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_dec_one: ++ veor $tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ ++.Lecb_done: ++___ ++} ++$code.=<<___ if ($flavour !~ /64/); ++ vldmia sp!,{d8-d15} ++ ldmia sp!,{r4-r8,pc} ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ ldr x29,[sp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++.Lecb_Final_abort: ++ ret ++___ ++$code.=<<___; ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} + {{{ + my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; + my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +@@ -519,6 +1377,13 @@ $code.=<<___; + ___ + { + my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ + $code.=<<___; + .align 5 + .Lcbc_dec: +@@ -535,7 +1400,196 @@ $code.=<<___; + vorr $in0,$dat,$dat + vorr $in1,$dat1,$dat1 + vorr $in2,$dat2,$dat2 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_cbc_dec ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ vorr $in3,$dat3,$dat3 ++ vorr $in4,$dat4,$dat4 ++ ++.Loop5x_cbc_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_cbc_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lcbc_tail4x ++ sub $len,$len,#0x50 ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way 
that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lcbc_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 + ++ veor $tmp0,$ivec,$rndlast ++ aesd $dat0,q15 ++ veor $tmp1,$in0,$rndlast ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ veor $tmp2,$in1,$rndlast ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ veor $tmp3,$in2,$rndlast ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ veor $tmp4,$in3,$rndlast ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ vorr $ivec,$in4,$in4 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lcbc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$tmp1,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$tmp3,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_cbc_dec ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lcbc_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $in0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $in1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ vorr $in2,$in4,$in4 ++ b.lo .Lcbc_dec_tail ++ ++ b .Loop3x_cbc_dec ++ ++.align 4 ++.Lcbc_tail4x: ++ veor $tmp1,$tmp0,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lcbc_done ++.align 4 ++___ ++$code.=<<___; + .Loop3x_cbc_dec: + aesd $dat0,q8 + aesimc $dat0,$dat0 +@@ -696,6 +1750,9 @@ my $step="x12"; # aliases with $tctr2 + my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); + my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + ++# used only in 64-bit mode... 
++my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23)); ++ + my ($dat,$tmp)=($dat0,$tmp0); + + ### q8-q15 preloaded key schedule +@@ -751,6 +1808,175 @@ $code.=<<___; + vmov.32 ${ivec}[3],$tctr2 + sub $len,$len,#3 // bias + vorr $dat2,$ivec,$ivec ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#2 ++ b.lo .Loop3x_ctr32 ++ ++ add w13,$ctr,#1 ++ add w14,$ctr,#2 ++ vorr $dat3,$dat0,$dat0 ++ rev w13,w13 ++ vorr $dat4,$dat0,$dat0 ++ rev w14,w14 ++ vmov.32 ${dat3}[3],w13 ++ sub $len,$len,#2 // bias ++ vmov.32 ${dat4}[3],w14 ++ add $ctr,$ctr,#2 ++ b .Loop5x_ctr32 ++ ++.align 4 ++.Loop5x_ctr32: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ctr32 ++ ++ mov $key_,$key ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ add $tctr0,$ctr,#1 ++ add $tctr1,$ctr,#2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ add $tctr2,$ctr,#3 ++ add w13,$ctr,#4 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ add w14,$ctr,#5 ++ rev $tctr0,$tctr0 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ rev $tctr1,$tctr1 ++ rev $tctr2,$tctr2 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ rev w13,w13 ++ rev w14,w14 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ vld1.8 {$in4},[$inp],#16 ++ ++ aese $dat0,q15 ++ veor $in0,$in0,$rndlast ++ aese $dat1,q15 ++ veor $in1,$in1,$rndlast ++ aese $dat2,q15 ++ veor $in2,$in2,$rndlast ++ aese $dat3,q15 ++ veor $in3,$in3,$rndlast ++ aese $dat4,q15 ++ veor $in4,$in4,$rndlast ++ ++ veor $in0,$in0,$dat0 ++ vorr $dat0,$ivec,$ivec ++ veor $in1,$in1,$dat1 ++ vorr $dat1,$ivec,$ivec ++ veor $in2,$in2,$dat2 ++ vorr $dat2,$ivec,$ivec ++ veor $in3,$in3,$dat3 ++ vorr $dat3,$ivec,$ivec ++ veor $in4,$in4,$dat4 ++ vorr $dat4,$ivec,$ivec ++ ++ vst1.8 {$in0},[$out],#16 ++ vmov.32 ${dat0}[3],$tctr0 ++ vst1.8 {$in1},[$out],#16 ++ vmov.32 ${dat1}[3],$tctr1 ++ vst1.8 {$in2},[$out],#16 ++ vmov.32 ${dat2}[3],$tctr2 ++ vst1.8 {$in3},[$out],#16 ++ vmov.32 ${dat3}[3],w13 ++ vst1.8 {$in4},[$out],#16 ++ vmov.32 ${dat4}[3],w14 ++ ++ mov $cnt,$rounds ++ cbz $len,.Lctr32_done ++ ++ add $ctr,$ctr,#5 ++ subs $len,$len,#5 ++ b.hs .Loop5x_ctr32 ++ ++ add $len,$len,#5 ++ sub $ctr,$ctr,#5 ++ ++ cmp $len,#2 ++ mov $step,#16 ++ cclr $step,lo ++ b.ls .Lctr32_tail ++ ++ sub $len,$len,#3 // bias ++ add 
$ctr,$ctr,#3 ++___ ++$code.=<<___; + b .Loop3x_ctr32 + + .align 4 +@@ -905,6 +2131,1432 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++# Performance in cycles per byte. ++# Processed with AES-XTS different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-XTS AES-256-XTS ++# Cortex-A57 3.36/1.09 4.02/1.37 ++# Cortex-A72 3.03/1.02 3.28/1.33 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes ++# will be processed specially, which be integrated into the 5*16 bytes ++# loop to improve the efficiency. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store. ++# Encryption will process the (length -tailcnt) bytes as mentioned ++# previously, then encrypt the composite block as last second ++# cipher block. ++# Decryption will process the (length -tailcnt -1) bytes as mentioned ++# previously, then decrypt the last second cipher block to get the ++# last plain block(tail), decrypt the composite block as last second ++# plain text block. ++ ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($tmpin)=("v26.16b"); ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_encrypt ++.type ${prefix}_xts_encrypt,%function ++.align 5 ++${prefix}_xts_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_enc_big_size ++ // Encrypt the iv with key2, as the first XEX iv. 
++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_enc_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_enc_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aese $dat0,q20 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aese $dat0,q21 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing ++ b.eq .Lxts_128_enc ++.Lxts_enc_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_enc_round_loop ++.Lxts_128_enc: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$dat0,$iv0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_enc_final_abort ++ ++.align 4 ++.Lxts_enc_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ // tailcnt store the tail value of length%16. ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_abort ++ csel $step,xzr,$step,eq ++ ++ // Firstly, encrypt the iv with key2, as the first iv of XEX. ++ ldr $rounds,[$key2,#240] ++ vld1.32 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key2],#16 ++ ++.Loop_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // next starting point ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... 
++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ ++ // Encryption ++.Lxts_enc: ++ vld1.8 {$dat2},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_enc_tail ++ veor $dat,$dat,$iv0 // before encryption, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // the third block ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_enc_tail ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ vld1.8 {$dat3},[$inp],#16 ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_enc ++ ++.align 4 ++.Loop5x_xts_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_xts_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese 
$dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aese $dat0,q15 ++ // The iv for first block of one iteration ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_enc ++ ++ ++ // If left 4 blocks, borrow the five block's processing. 
++ cmn $len,#0x10 ++ b.ne .Loop5x_enc_after ++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_enc ++ ++.Loop5x_enc_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_enc_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_enc_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_enc_tail ++ ++.align 4 ++.Lxts_enc_tail4x: ++ add $inp,$inp,#16 ++ veor $tmp1,$dat1,$tmp1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_enc_done ++.align 4 ++.Lxts_outer_enc_tail: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_enc_tail ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ //mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset ++ mov $key_,$key1 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ add $rounds,$rounds0,#2 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ cmn $len,#0x30 ++ b.eq .Lxts_enc_done ++.Lxts_encxor_one: ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_enc_tail: ++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_enc_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_enc_tail_loop: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_enc_tail_loop ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese 
$dat2,q13 ++ aesmc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lxts_enc_one ++ veor $tmp1,$tmp1,$dat1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vst1.8 {$tmp2},[$out],#16 ++ fmov $ivl,$ivd10 ++ fmov $ivh,$ivd11 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++ ++.Lxts_enc_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv0,$iv0 ++ vst1.8 {$tmp1},[$out],#16 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++.align 5 ++.Lxts_enc_done: ++ // Process the tail block with cipher stealing. ++ tst $tailcnt,#0xf ++ b.eq .Lxts_abort ++ ++ mov $tmpinp,$inp ++ mov $tmpoutp,$out ++ sub $out,$out,#16 ++.composite_enc_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_enc_loop ++.Lxts_enc_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Encrypt the composite block to get the last second encrypted text block ++ ldr $rounds,[$key1,#240] // load key schedule... ++ vld1.8 {$dat},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key1],#16 // load key schedule... ++.Loop_final_enc: ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 ++ subs $rounds,$rounds,#2 ++ aese $tmpin,$dat1 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 ++ b.gt .Loop_final_enc ++ ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aese $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++.Lxts_enc_final_abort: ++ ret ++.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ++___ ++ ++}}} ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl 
${prefix}_xts_decrypt ++.type ${prefix}_xts_decrypt,%function ++.align 5 ++${prefix}_xts_decrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_dec_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_small_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_small_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aesd $dat0,q20 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aesd $dat0,q21 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lxts_128_dec ++.Lxts_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_dec_round_loop ++.Lxts_128_dec: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$iv0,$dat0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_dec_final_abort ++.Lxts_dec_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_dec_abort ++ ++ // Encrypt the iv with key2, as the first XEX iv ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // load rounds number ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... 
++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 // load key schedule... ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ b .Lxts_dec ++ ++ // Decryption ++.align 5 ++.Lxts_dec: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_begin ++ subs $len,$len,#16 ++ csel $step,xzr,$step,eq ++ vld1.8 {$dat},[$inp],#16 ++ b.lo .Lxts_done ++ sub $inp,$inp,#16 ++.Lxts_dec_begin: ++ vld1.8 {$dat},[$inp],$step ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_dec_tail ++ veor $dat,$dat,$iv0 // before decryt, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // third block xox with third iv ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_dec_tail ++ ++ vld1.8 {$dat3},[$inp],#16 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_dec ++ ++.align 4 ++.Loop5x_xts_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // load key schedule... 
++ b.gt .Loop5x_xts_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aesd $dat0,q15 ++ // The iv for first block of next iteration. 
++ extr $midnumx,$ivh,$ivh,#32
++ extr $ivh,$ivh,$ivl,#63
++ and $tmpmw,$constnum,$midnum,asr #31
++ eor $ivl,$tmpmx,$ivl,lsl #1
++ fmov $ivd00,$ivl
++ fmov $ivd01,$ivh
++ veor $tmp1,$rndlast,$iv1
++ vld1.8 {$in0},[$inp],#16
++ aesd $dat1,q15
++ // The iv for second block
++ extr $midnumx,$ivh,$ivh,#32
++ extr $ivh,$ivh,$ivl,#63
++ and $tmpmw,$constnum,$midnum,asr #31
++ eor $ivl,$tmpmx,$ivl,lsl #1
++ fmov $ivd10,$ivl
++ fmov $ivd11,$ivh
++ veor $tmp2,$rndlast,$iv2
++ vld1.8 {$in1},[$inp],#16
++ aesd $dat2,q15
++ // The iv for third block
++ extr $midnumx,$ivh,$ivh,#32
++ extr $ivh,$ivh,$ivl,#63
++ and $tmpmw,$constnum,$midnum,asr #31
++ eor $ivl,$tmpmx,$ivl,lsl #1
++ fmov $ivd20,$ivl
++ fmov $ivd21,$ivh
++ veor $tmp3,$rndlast,$iv3
++ vld1.8 {$in2},[$inp],#16
++ aesd $dat3,q15
++ // The iv for fourth block
++ extr $midnumx,$ivh,$ivh,#32
++ extr $ivh,$ivh,$ivl,#63
++ and $tmpmw,$constnum,$midnum,asr #31
++ eor $ivl,$tmpmx,$ivl,lsl #1
++ fmov $ivd30,$ivl
++ fmov $ivd31,$ivh
++ veor $tmp4,$rndlast,$iv4
++ vld1.8 {$in3},[$inp],#16
++ aesd $dat4,q15
++
++ // The iv for fifth block
++ extr $midnumx,$ivh,$ivh,#32
++ extr $ivh,$ivh,$ivl,#63
++ and $tmpmw,$constnum,$midnum,asr #31
++ eor $ivl,$tmpmx,$ivl,lsl #1
++ fmov $ivd40,$ivl
++ fmov $ivd41,$ivh
++
++ vld1.8 {$in4},[$inp],#16
++ cbz $xoffset,.Lxts_dec_tail4x
++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
++ veor $tmp0,$tmp0,$dat0
++ veor $dat0,$in0,$iv0
++ veor $tmp1,$tmp1,$dat1
++ veor $dat1,$in1,$iv1
++ veor $tmp2,$tmp2,$dat2
++ veor $dat2,$in2,$iv2
++ veor $tmp3,$tmp3,$dat3
++ veor $dat3,$in3,$iv3
++ veor $tmp4,$tmp4,$dat4
++ vst1.8 {$tmp0},[$out],#16
++ veor $dat4,$in4,$iv4
++ vst1.8 {$tmp1},[$out],#16
++ mov $rounds,$rounds0
++ vst1.8 {$tmp2},[$out],#16
++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
++ vst1.8 {$tmp3},[$out],#16
++ vst1.8 {$tmp4},[$out],#16
++ b.hs .Loop5x_xts_dec
++
++ cmn $len,#0x10
++ b.ne .Loop5x_dec_after
++ // If x2($len) equals -0x10, there are 4 blocks left.
++ // After this special processing, reuse the five-block path again.
++ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_dec ++ ++.Loop5x_dec_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_dec_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_dec_tail ++ ++.align 4 ++.Lxts_dec_tail4x: ++ add $inp,$inp,#16 ++ vld1.32 {$dat0},[$inp],#16 ++ veor $tmp1,$dat1,$tmp0 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_done ++.align 4 ++.Lxts_outer_dec_tail: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_dec_tail ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset // $inp is adjusted to the last data ++ ++ mov $key_,$key1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $rounds,$rounds0,#2 ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ ++ cmn $len,#0x30 ++ add $len,$len,#0x30 ++ b.eq .Lxts_done ++ sub $len,$len,#0x30 ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_dec_tail: ++ // $len == -0x10 means two blocks left. 
++ cmn $len,#0x10
++ veor $dat1,$in3,$iv0
++ veor $dat2,$in4,$iv1
++ b.eq .Lxts_dec_tail_loop
++ veor $dat2,$in4,$iv0
++.Lxts_dec_tail_loop:
++ aesd $dat1,q8
++ aesimc $dat1,$dat1
++ aesd $dat2,q8
++ aesimc $dat2,$dat2
++ vld1.32 {q8},[$key_],#16
++ subs $rounds,$rounds,#2
++ aesd $dat1,q9
++ aesimc $dat1,$dat1
++ aesd $dat2,q9
++ aesimc $dat2,$dat2
++ vld1.32 {q9},[$key_],#16
++ b.gt .Lxts_dec_tail_loop
++
++ aesd $dat1,q8
++ aesimc $dat1,$dat1
++ aesd $dat2,q8
++ aesimc $dat2,$dat2
++ aesd $dat1,q9
++ aesimc $dat1,$dat1
++ aesd $dat2,q9
++ aesimc $dat2,$dat2
++ aesd $dat1,q12
++ aesimc $dat1,$dat1
++ aesd $dat2,q12
++ aesimc $dat2,$dat2
++ cmn $len,#0x20
++ aesd $dat1,q13
++ aesimc $dat1,$dat1
++ aesd $dat2,q13
++ aesimc $dat2,$dat2
++ veor $tmp1,$iv0,$rndlast
++ aesd $dat1,q14
++ aesimc $dat1,$dat1
++ aesd $dat2,q14
++ aesimc $dat2,$dat2
++ veor $tmp2,$iv1,$rndlast
++ aesd $dat1,q15
++ aesd $dat2,q15
++ b.eq .Lxts_dec_one
++ veor $tmp1,$tmp1,$dat1
++ veor $tmp2,$tmp2,$dat2
++ vorr $iv0,$iv2,$iv2
++ vorr $iv1,$iv3,$iv3
++ vst1.8 {$tmp1},[$out],#16
++ vst1.8 {$tmp2},[$out],#16
++ add $len,$len,#16
++ b .Lxts_done
++
++.Lxts_dec_one:
++ veor $tmp1,$tmp1,$dat2
++ vorr $iv0,$iv1,$iv1
++ vorr $iv1,$iv2,$iv2
++ vst1.8 {$tmp1},[$out],#16
++ add $len,$len,#32
++
++.Lxts_done:
++ tst $tailcnt,#0xf
++ b.eq .Lxts_dec_abort
++ // Process the last two blocks with cipher stealing.
++ mov x7,x3
++ cbnz x2,.Lxts_dec_1st_done
++ vld1.32 {$dat0},[$inp],#16
++
++ // Decrypt the second-to-last block to get the last plaintext block
++.Lxts_dec_1st_done:
++ eor $tmpin,$dat0,$iv1
++ ldr $rounds,[$key1,#240]
++ vld1.32 {$dat0},[$key1],#16
++ sub $rounds,$rounds,#2
++ vld1.32 {$dat1},[$key1],#16
++.Loop_final_2nd_dec:
++ aesd $tmpin,$dat0
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat0},[$key1],#16 // load key schedule...
++ subs $rounds,$rounds,#2
++ aesd $tmpin,$dat1
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat1},[$key1],#16 // load key schedule...
++ b.gt .Loop_final_2nd_dec
++
++ aesd $tmpin,$dat0
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat0},[$key1]
++ aesd $tmpin,$dat1
++ veor $tmpin,$tmpin,$dat0
++ veor $tmpin,$tmpin,$iv1
++ vst1.8 {$tmpin},[$out]
++
++ mov $tmpinp,$inp
++ add $tmpoutp,$out,#16
++
++ // Splice the tailcnt unaligned trailing bytes into the second-to-last
++ // plaintext block to reconstruct the composite ciphertext block.
++.composite_dec_loop:
++ subs $tailcnt,$tailcnt,#1
++ ldrb $l2outp,[$out,$tailcnt]
++ ldrb $loutp,[$tmpinp,$tailcnt]
++ strb $l2outp,[$tmpoutp,$tailcnt]
++ strb $loutp,[$out,$tailcnt]
++ b.gt .composite_dec_loop
++.Lxts_dec_load_done:
++ vld1.8 {$tmpin},[$out]
++ veor $tmpin,$tmpin,$iv0
++
++ // Decrypt the composite block to get the second-to-last plaintext block
++ ldr $rounds,[$key_,#240]
++ vld1.8 {$dat},[$key_],#16
++ sub $rounds,$rounds,#2
++ vld1.8 {$dat1},[$key_],#16
++.Loop_final_dec:
++ aesd $tmpin,$dat0
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat0},[$key_],#16 // load key schedule...
++ subs $rounds,$rounds,#2
++ aesd $tmpin,$dat1
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat1},[$key_],#16 // load key schedule...
++ b.gt .Loop_final_dec
++
++ aesd $tmpin,$dat0
++ aesimc $tmpin,$tmpin
++ vld1.32 {$dat0},[$key_]
++ aesd $tmpin,$dat1
++ veor $tmpin,$tmpin,$dat0
++ veor $tmpin,$tmpin,$iv0
++ vst1.8 {$tmpin},[$out]
++
++.Lxts_dec_abort:
++ ldp $tailcnt,$midnumx,[sp,#48]
++ ldp $ivd10,$ivd20,[sp,#32]
++ ldp $ivd30,$ivd40,[sp,#16]
++ ldp $constnumx,$tmpinp,[sp],#64
++
++.Lxts_dec_final_abort:
++ ret
++.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
++___
++}
++}}}
+ $code.=<<___;
+ #endif
+ ___
+@@ -963,7 +3615,7 @@ if ($flavour =~ /64/) { ######## 64-bi
+ # since ARMv7 instructions are always encoded little-endian.
+ # correct solution is to use .inst directive, but older
+ # assemblers don't implement it:-(
+- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
++ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
+ $mnemonic,$arg;
+@@ -1004,14 +3656,17 @@ if ($flavour =~ /64/) { ######## 64-bi
+ s/\],#[0-9]+/]!/o;
+
+ s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
+- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
++ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
+ s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
+ s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
+ s/^(\s+)b\./$1b/o or
+- s/^(\s+)mov\./$1mov/o or
+ s/^(\s+)ret/$1bx\tlr/o;
+
++ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
++ print " it $2\n";
++ }
++
+ print $_,"\n";
+ }
+ }
+diff -up openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl
+--- openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
++++ openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl 2020-12-09 10:37:38.405558929 +0100
+@@ -30,6 +30,7 @@
+ # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
+ # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
+ # Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
++# ThunderX2(***) 39.4(**) 33.8/48.6(**)
+ #
+ # (*) ECB denotes approximate result for parallelizable modes
+ # such as CBC decrypt, CTR, etc.;
+diff -up openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl
+--- openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
++++ openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl 2020-12-09 10:40:57.922288627 +0100
+@@ -18,32 +18,44 @@
+ #
+ # ChaCha20 for ARMv8.
+ #
++# April 2019
++#
++# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the fastest
++# option on most(*), but not all, processors, yet 6+2 is retained.
++# This is because the penalties are considered tolerable in comparison to
++# the improvement on processors where 6+2 helps. Most notably +37% on
++# ThunderX2. It's a server-oriented processor which will have to serve
++# as many requests as possible. The others are mostly clients, where
++# performance doesn't have to be absolute top-notch, just fast enough,
++# as the majority of time is spent "entertaining" a relatively slow human.
++#
+ # Performance in cycles per byte out of large buffer.
+ # +-# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU ++# IALU/gcc-4.9 4xNEON+1xIALU 6xNEON+2xIALU + # +-# Apple A7 5.50/+49% 3.33 1.70 +-# Cortex-A53 8.40/+80% 4.72 4.72(*) +-# Cortex-A57 8.06/+43% 4.90 4.43(**) +-# Denver 4.50/+82% 2.63 2.67(*) +-# X-Gene 9.50/+46% 8.82 8.89(*) +-# Mongoose 8.00/+44% 3.64 3.25 +-# Kryo 8.17/+50% 4.83 4.65 ++# Apple A7 5.50/+49% 2.72 1.60 ++# Cortex-A53 8.40/+80% 4.06 4.45(*) ++# Cortex-A57 8.06/+43% 4.15 4.40(*) ++# Denver 4.50/+82% 2.30 2.70(*) ++# X-Gene 9.50/+46% 8.20 8.90(*) ++# Mongoose 8.00/+44% 2.74 3.12(*) ++# Kryo 8.17/+50% 4.47 4.65(*) ++# ThunderX2 7.22/+48% 5.64 4.10 + # +-# (*) it's expected that doubling interleave factor doesn't help +-# all processors, only those with higher NEON latency and +-# higher instruction issue rate; +-# (**) expected improvement was actually higher; ++# (*) slower than 4+1:-( + +-$flavour=shift; +-$output=shift; ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; + + $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; + ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or + ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or + die "can't locate arm-xlate.pl"; + +-open OUT,"| \"$^X\" $xlate $flavour $output"; ++open OUT,"| \"$^X\" $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; + *STDOUT=*OUT; + + sub AUTOLOAD() # thunk [simplified] x86-style perlasm +@@ -120,42 +132,37 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) + } + + $code.=<<___; +-#include "arm_arch.h" +- +-.text +- ++#ifndef __KERNEL__ ++# include "arm_arch.h" + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P ++#endif ++ ++.text + + .align 5 + .Lsigma: + .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral + .Lone: +-.long 1,0,0,0 +-.LOPENSSL_armcap_P: +-#ifdef __ILP32__ +-.long OPENSSL_armcap_P-. +-#else +-.quad OPENSSL_armcap_P-. 
+-#endif +-.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>" ++.long 1,2,3,4 ++.Lrot24: ++.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f ++.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm" + + .globl ChaCha20_ctr32 + .type ChaCha20_ctr32,%function + .align 5 + ChaCha20_ctr32: + cbz $len,.Labort +- adr @x[0],.LOPENSSL_armcap_P + cmp $len,#192 + b.lo .Lshort +-#ifdef __ILP32__ +- ldrsw @x[1],[@x[0]] +-#else +- ldr @x[1],[@x[0]] +-#endif +- ldr w17,[@x[1],@x[0]] ++ ++#ifndef __KERNEL__ ++ adrp x17,OPENSSL_armcap_P ++ ldr w17,[x17,#:lo12:OPENSSL_armcap_P] + tst w17,#ARMV7_NEON +- b.ne ChaCha20_neon ++ b.ne .LChaCha20_neon ++#endif + + .Lshort: + .inst 0xd503233f // paciasp +@@ -174,7 +181,7 @@ ChaCha20_ctr32: + ldp @d[2],@d[3],[$key] // load key + ldp @d[4],@d[5],[$key,#16] + ldp @d[6],@d[7],[$ctr] // load counter +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 + ror @d[4],@d[4],#32 +@@ -243,7 +250,7 @@ $code.=<<___; + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -300,7 +307,7 @@ $code.=<<___; + add @x[10],@x[10],@x[11],lsl#32 + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -341,46 +348,91 @@ $code.=<<___; + ___ + + {{{ +-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) = +- map("v$_.4s",(0..7,16..23)); +-my (@K)=map("v$_.4s",(24..30)); +-my $ONE="v31.4s"; ++my @K = map("v$_.4s",(0..3)); ++my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9)); ++my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31)); ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X; + +-sub NEONROUND { +-my $odd = pop; +-my ($a,$b,$c,$d,$t)=@_; ++sub NEON_lane_ROUND { ++my ($a0,$b0,$c0,$d0)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my @x=map("'$_'",@X); + + ( +- "&add ('$a','$a','$b')", +- "&eor ('$d','$d','$a')", +- "&rev32_16 ('$d','$d')", # vrot ($d,16) +- +- "&add ('$c','$c','$d')", +- "&eor ('$t','$b','$c')", +- "&ushr ('$b','$t',20)", +- "&sli ('$b','$t',12)", +- +- "&add ('$a','$a','$b')", +- "&eor ('$t','$d','$a')", +- "&ushr ('$d','$t',24)", +- "&sli ('$d','$t',8)", +- +- "&add ('$c','$c','$d')", +- "&eor ('$t','$b','$c')", +- "&ushr ('$b','$t',25)", +- "&sli ('$b','$t',7)", +- +- "&ext ('$c','$c','$c',8)", +- "&ext ('$d','$d','$d',$odd?4:12)", +- "&ext ('$b','$b','$b',$odd?12:4)" ++ "&add (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&add (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&add (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&add (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&eor (@x[$d0],@x[$d0],@x[$a0])", ++ "&eor (@x[$d1],@x[$d1],@x[$a1])", ++ "&eor (@x[$d2],@x[$d2],@x[$a2])", ++ "&eor (@x[$d3],@x[$d3],@x[$a3])", ++ "&rev32_16 (@x[$d0],@x[$d0])", ++ "&rev32_16 (@x[$d1],@x[$d1])", ++ "&rev32_16 (@x[$d2],@x[$d2])", ++ "&rev32_16 (@x[$d3],@x[$d3])", ++ ++ "&add (@x[$c0],@x[$c0],@x[$d0])", ++ "&add (@x[$c1],@x[$c1],@x[$d1])", ++ "&add (@x[$c2],@x[$c2],@x[$d2])", ++ "&add (@x[$c3],@x[$c3],@x[$d3])", ++ "&eor ('$xt0',@x[$b0],@x[$c0])", ++ "&eor ('$xt1',@x[$b1],@x[$c1])", ++ "&eor ('$xt2',@x[$b2],@x[$c2])", ++ "&eor ('$xt3',@x[$b3],@x[$c3])", ++ "&ushr (@x[$b0],'$xt0',20)", ++ "&ushr 
(@x[$b1],'$xt1',20)", ++ "&ushr (@x[$b2],'$xt2',20)", ++ "&ushr (@x[$b3],'$xt3',20)", ++ "&sli (@x[$b0],'$xt0',12)", ++ "&sli (@x[$b1],'$xt1',12)", ++ "&sli (@x[$b2],'$xt2',12)", ++ "&sli (@x[$b3],'$xt3',12)", ++ ++ "&add (@x[$a0],@x[$a0],@x[$b0])", ++ "&add (@x[$a1],@x[$a1],@x[$b1])", ++ "&add (@x[$a2],@x[$a2],@x[$b2])", ++ "&add (@x[$a3],@x[$a3],@x[$b3])", ++ "&eor ('$xt0',@x[$d0],@x[$a0])", ++ "&eor ('$xt1',@x[$d1],@x[$a1])", ++ "&eor ('$xt2',@x[$d2],@x[$a2])", ++ "&eor ('$xt3',@x[$d3],@x[$a3])", ++ "&tbl (@x[$d0],'{$xt0}','$ROT24')", ++ "&tbl (@x[$d1],'{$xt1}','$ROT24')", ++ "&tbl (@x[$d2],'{$xt2}','$ROT24')", ++ "&tbl (@x[$d3],'{$xt3}','$ROT24')", ++ ++ "&add (@x[$c0],@x[$c0],@x[$d0])", ++ "&add (@x[$c1],@x[$c1],@x[$d1])", ++ "&add (@x[$c2],@x[$c2],@x[$d2])", ++ "&add (@x[$c3],@x[$c3],@x[$d3])", ++ "&eor ('$xt0',@x[$b0],@x[$c0])", ++ "&eor ('$xt1',@x[$b1],@x[$c1])", ++ "&eor ('$xt2',@x[$b2],@x[$c2])", ++ "&eor ('$xt3',@x[$b3],@x[$c3])", ++ "&ushr (@x[$b0],'$xt0',25)", ++ "&ushr (@x[$b1],'$xt1',25)", ++ "&ushr (@x[$b2],'$xt2',25)", ++ "&ushr (@x[$b3],'$xt3',25)", ++ "&sli (@x[$b0],'$xt0',7)", ++ "&sli (@x[$b1],'$xt1',7)", ++ "&sli (@x[$b2],'$xt2',7)", ++ "&sli (@x[$b3],'$xt3',7)" + ); + } + + $code.=<<___; + ++#ifdef __KERNEL__ ++.globl ChaCha20_neon ++#endif + .type ChaCha20_neon,%function + .align 5 + ChaCha20_neon: ++.LChaCha20_neon: + .inst 0xd503233f // paciasp + stp x29,x30,[sp,#-96]! + add x29,sp,#0 +@@ -403,8 +455,9 @@ ChaCha20_neon: + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] +- ld1 {$ONE},[@x[0]] +-#ifdef __ARMEB__ ++ stp d8,d9,[sp] // meet ABI requirements ++ ld1 {$CTR,$ROT24},[@x[0]] ++#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 +@@ -413,115 +466,129 @@ ChaCha20_neon: + ror @d[6],@d[6],#32 + ror @d[7],@d[7],#32 + #endif +- add @K[3],@K[3],$ONE // += 1 +- add @K[4],@K[3],$ONE +- add @K[5],@K[4],$ONE +- shl $ONE,$ONE,#2 // 1 -> 4 + + .Loop_outer_neon: +- mov.32 @x[0],@d[0] // unpack key block +- lsr @x[1],@d[0],#32 +- mov $A0,@K[0] +- mov.32 @x[2],@d[1] +- lsr @x[3],@d[1],#32 +- mov $A1,@K[0] +- mov.32 @x[4],@d[2] +- lsr @x[5],@d[2],#32 +- mov $A2,@K[0] +- mov.32 @x[6],@d[3] +- mov $B0,@K[1] +- lsr @x[7],@d[3],#32 +- mov $B1,@K[1] +- mov.32 @x[8],@d[4] +- mov $B2,@K[1] +- lsr @x[9],@d[4],#32 +- mov $D0,@K[3] +- mov.32 @x[10],@d[5] +- mov $D1,@K[4] +- lsr @x[11],@d[5],#32 +- mov $D2,@K[5] +- mov.32 @x[12],@d[6] +- mov $C0,@K[2] +- lsr @x[13],@d[6],#32 +- mov $C1,@K[2] +- mov.32 @x[14],@d[7] +- mov $C2,@K[2] +- lsr @x[15],@d[7],#32 ++ dup $xa0,@{K[0]}[0] // unpack key block ++ mov.32 @x[0],@d[0] ++ dup $xa1,@{K[0]}[1] ++ lsr @x[1],@d[0],#32 ++ dup $xa2,@{K[0]}[2] ++ mov.32 @x[2],@d[1] ++ dup $xa3,@{K[0]}[3] ++ lsr @x[3],@d[1],#32 ++ dup $xb0,@{K[1]}[0] ++ mov.32 @x[4],@d[2] ++ dup $xb1,@{K[1]}[1] ++ lsr @x[5],@d[2],#32 ++ dup $xb2,@{K[1]}[2] ++ mov.32 @x[6],@d[3] ++ dup $xb3,@{K[1]}[3] ++ lsr @x[7],@d[3],#32 ++ dup $xd0,@{K[3]}[0] ++ mov.32 @x[8],@d[4] ++ dup $xd1,@{K[3]}[1] ++ lsr @x[9],@d[4],#32 ++ dup $xd2,@{K[3]}[2] ++ mov.32 @x[10],@d[5] ++ dup $xd3,@{K[3]}[3] ++ lsr @x[11],@d[5],#32 ++ add $xd0,$xd0,$CTR ++ mov.32 @x[12],@d[6] ++ dup $xc0,@{K[2]}[0] ++ lsr @x[13],@d[6],#32 ++ dup $xc1,@{K[2]}[1] ++ mov.32 @x[14],@d[7] ++ dup $xc2,@{K[2]}[2] ++ lsr @x[15],@d[7],#32 ++ dup $xc3,@{K[2]}[3] + + mov $ctr,#10 +- subs $len,$len,#256 ++ subs $len,$len,#320 + .Loop_neon: + sub $ctr,$ctr,#1 + ___ +- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0); +- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0); +- 
my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0); +- my @thread3=&ROUND(0,4,8,12); +- +- foreach (@thread0) { +- eval; eval(shift(@thread3)); +- eval(shift(@thread1)); eval(shift(@thread3)); +- eval(shift(@thread2)); eval(shift(@thread3)); +- } +- +- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1); +- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1); +- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1); +- @thread3=&ROUND(0,5,10,15); ++ my @plus_one=&ROUND(0,4,8,12); ++ foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); } + +- foreach (@thread0) { +- eval; eval(shift(@thread3)); +- eval(shift(@thread1)); eval(shift(@thread3)); +- eval(shift(@thread2)); eval(shift(@thread3)); +- } ++ @plus_one=&ROUND(0,5,10,15); ++ foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); } + $code.=<<___; + cbnz $ctr,.Loop_neon + +- add.32 @x[0],@x[0],@d[0] // accumulate key block +- add $A0,$A0,@K[0] +- add @x[1],@x[1],@d[0],lsr#32 +- add $A1,$A1,@K[0] +- add.32 @x[2],@x[2],@d[1] +- add $A2,$A2,@K[0] +- add @x[3],@x[3],@d[1],lsr#32 +- add $C0,$C0,@K[2] +- add.32 @x[4],@x[4],@d[2] +- add $C1,$C1,@K[2] +- add @x[5],@x[5],@d[2],lsr#32 +- add $C2,$C2,@K[2] +- add.32 @x[6],@x[6],@d[3] +- add $D0,$D0,@K[3] +- add @x[7],@x[7],@d[3],lsr#32 +- add.32 @x[8],@x[8],@d[4] +- add $D1,$D1,@K[4] +- add @x[9],@x[9],@d[4],lsr#32 +- add.32 @x[10],@x[10],@d[5] +- add $D2,$D2,@K[5] +- add @x[11],@x[11],@d[5],lsr#32 +- add.32 @x[12],@x[12],@d[6] +- add $B0,$B0,@K[1] +- add @x[13],@x[13],@d[6],lsr#32 +- add.32 @x[14],@x[14],@d[7] +- add $B1,$B1,@K[1] +- add @x[15],@x[15],@d[7],lsr#32 +- add $B2,$B2,@K[1] ++ add $xd0,$xd0,$CTR ++ ++ zip1 $xt0,$xa0,$xa1 // transpose data ++ zip1 $xt1,$xa2,$xa3 ++ zip2 $xt2,$xa0,$xa1 ++ zip2 $xt3,$xa2,$xa3 ++ zip1.64 $xa0,$xt0,$xt1 ++ zip2.64 $xa1,$xt0,$xt1 ++ zip1.64 $xa2,$xt2,$xt3 ++ zip2.64 $xa3,$xt2,$xt3 ++ ++ zip1 $xt0,$xb0,$xb1 ++ zip1 $xt1,$xb2,$xb3 ++ zip2 $xt2,$xb0,$xb1 ++ zip2 $xt3,$xb2,$xb3 ++ zip1.64 $xb0,$xt0,$xt1 ++ zip2.64 $xb1,$xt0,$xt1 ++ zip1.64 $xb2,$xt2,$xt3 ++ zip2.64 $xb3,$xt2,$xt3 ++ ++ zip1 $xt0,$xc0,$xc1 ++ add.32 @x[0],@x[0],@d[0] // accumulate key block ++ zip1 $xt1,$xc2,$xc3 ++ add @x[1],@x[1],@d[0],lsr#32 ++ zip2 $xt2,$xc0,$xc1 ++ add.32 @x[2],@x[2],@d[1] ++ zip2 $xt3,$xc2,$xc3 ++ add @x[3],@x[3],@d[1],lsr#32 ++ zip1.64 $xc0,$xt0,$xt1 ++ add.32 @x[4],@x[4],@d[2] ++ zip2.64 $xc1,$xt0,$xt1 ++ add @x[5],@x[5],@d[2],lsr#32 ++ zip1.64 $xc2,$xt2,$xt3 ++ add.32 @x[6],@x[6],@d[3] ++ zip2.64 $xc3,$xt2,$xt3 ++ add @x[7],@x[7],@d[3],lsr#32 ++ ++ zip1 $xt0,$xd0,$xd1 ++ add.32 @x[8],@x[8],@d[4] ++ zip1 $xt1,$xd2,$xd3 ++ add @x[9],@x[9],@d[4],lsr#32 ++ zip2 $xt2,$xd0,$xd1 ++ add.32 @x[10],@x[10],@d[5] ++ zip2 $xt3,$xd2,$xd3 ++ add @x[11],@x[11],@d[5],lsr#32 ++ zip1.64 $xd0,$xt0,$xt1 ++ add.32 @x[12],@x[12],@d[6] ++ zip2.64 $xd1,$xt0,$xt1 ++ add @x[13],@x[13],@d[6],lsr#32 ++ zip1.64 $xd2,$xt2,$xt3 ++ add.32 @x[14],@x[14],@d[7] ++ zip2.64 $xd3,$xt2,$xt3 ++ add @x[15],@x[15],@d[7],lsr#32 + + b.lo .Ltail_neon + + add @x[0],@x[0],@x[1],lsl#32 // pack + add @x[2],@x[2],@x[3],lsl#32 + ldp @x[1],@x[3],[$inp,#0] // load input ++ add $xa0,$xa0,@K[0] // accumulate key block + add @x[4],@x[4],@x[5],lsl#32 + add @x[6],@x[6],@x[7],lsl#32 + ldp @x[5],@x[7],[$inp,#16] ++ add $xb0,$xb0,@K[1] + add @x[8],@x[8],@x[9],lsl#32 + add @x[10],@x[10],@x[11],lsl#32 + ldp @x[9],@x[11],[$inp,#32] ++ add $xc0,$xc0,@K[2] + add @x[12],@x[12],@x[13],lsl#32 + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] ++ add $xd0,$xd0,@K[3] + add $inp,$inp,#64 +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + 
rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -531,48 +598,68 @@ $code.=<<___; + rev @x[12],@x[12] + rev @x[14],@x[14] + #endif +- ld1.8 {$T0-$T3},[$inp],#64 ++ ld1.8 {$xt0-$xt3},[$inp],#64 + eor @x[0],@x[0],@x[1] ++ add $xa1,$xa1,@K[0] + eor @x[2],@x[2],@x[3] ++ add $xb1,$xb1,@K[1] + eor @x[4],@x[4],@x[5] ++ add $xc1,$xc1,@K[2] + eor @x[6],@x[6],@x[7] ++ add $xd1,$xd1,@K[3] + eor @x[8],@x[8],@x[9] +- eor $A0,$A0,$T0 ++ eor $xa0,$xa0,$xt0 ++ movi $xt0,#5 + eor @x[10],@x[10],@x[11] +- eor $B0,$B0,$T1 ++ eor $xb0,$xb0,$xt1 + eor @x[12],@x[12],@x[13] +- eor $C0,$C0,$T2 ++ eor $xc0,$xc0,$xt2 + eor @x[14],@x[14],@x[15] +- eor $D0,$D0,$T3 +- ld1.8 {$T0-$T3},[$inp],#64 ++ eor $xd0,$xd0,$xt3 ++ add $CTR,$CTR,$xt0 // += 5 ++ ld1.8 {$xt0-$xt3},[$inp],#64 + + stp @x[0],@x[2],[$out,#0] // store output +- add @d[6],@d[6],#4 // increment counter ++ add @d[6],@d[6],#5 // increment counter + stp @x[4],@x[6],[$out,#16] +- add @K[3],@K[3],$ONE // += 4 + stp @x[8],@x[10],[$out,#32] +- add @K[4],@K[4],$ONE + stp @x[12],@x[14],[$out,#48] +- add @K[5],@K[5],$ONE + add $out,$out,#64 + +- st1.8 {$A0-$D0},[$out],#64 +- ld1.8 {$A0-$D0},[$inp],#64 +- +- eor $A1,$A1,$T0 +- eor $B1,$B1,$T1 +- eor $C1,$C1,$T2 +- eor $D1,$D1,$T3 +- st1.8 {$A1-$D1},[$out],#64 +- +- eor $A2,$A2,$A0 +- eor $B2,$B2,$B0 +- eor $C2,$C2,$C0 +- eor $D2,$D2,$D0 +- st1.8 {$A2-$D2},[$out],#64 ++ st1.8 {$xa0-$xd0},[$out],#64 ++ add $xa2,$xa2,@K[0] ++ add $xb2,$xb2,@K[1] ++ add $xc2,$xc2,@K[2] ++ add $xd2,$xd2,@K[3] ++ ld1.8 {$xa0-$xd0},[$inp],#64 ++ ++ eor $xa1,$xa1,$xt0 ++ eor $xb1,$xb1,$xt1 ++ eor $xc1,$xc1,$xt2 ++ eor $xd1,$xd1,$xt3 ++ st1.8 {$xa1-$xd1},[$out],#64 ++ add $xa3,$xa3,@K[0] ++ add $xb3,$xb3,@K[1] ++ add $xc3,$xc3,@K[2] ++ add $xd3,$xd3,@K[3] ++ ld1.8 {$xa1-$xd1},[$inp],#64 ++ ++ eor $xa2,$xa2,$xa0 ++ eor $xb2,$xb2,$xb0 ++ eor $xc2,$xc2,$xc0 ++ eor $xd2,$xd2,$xd0 ++ st1.8 {$xa2-$xd2},[$out],#64 ++ ++ eor $xa3,$xa3,$xa1 ++ eor $xb3,$xb3,$xb1 ++ eor $xc3,$xc3,$xc1 ++ eor $xd3,$xd3,$xd1 ++ st1.8 {$xa3-$xd3},[$out],#64 + + b.hi .Loop_outer_neon + ++ ldp d8,d9,[sp] // meet ABI requirements ++ + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] +@@ -583,8 +670,10 @@ $code.=<<___; + .inst 0xd50323bf // autiasp + ret + ++.align 4 + .Ltail_neon: +- add $len,$len,#256 ++ add $len,$len,#320 ++ ldp d8,d9,[sp] // meet ABI requirements + cmp $len,#64 + b.lo .Less_than_64 + +@@ -601,7 +690,7 @@ $code.=<<___; + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -621,48 +710,68 @@ $code.=<<___; + eor @x[14],@x[14],@x[15] + + stp @x[0],@x[2],[$out,#0] // store output +- add @d[6],@d[6],#4 // increment counter ++ add $xa0,$xa0,@K[0] // accumulate key block + stp @x[4],@x[6],[$out,#16] ++ add $xb0,$xb0,@K[1] + stp @x[8],@x[10],[$out,#32] ++ add $xc0,$xc0,@K[2] + stp @x[12],@x[14],[$out,#48] ++ add $xd0,$xd0,@K[3] + add $out,$out,#64 + b.eq .Ldone_neon + sub $len,$len,#64 + cmp $len,#64 +- b.lo .Less_than_128 ++ b.lo .Last_neon + +- ld1.8 {$T0-$T3},[$inp],#64 +- eor $A0,$A0,$T0 +- eor $B0,$B0,$T1 +- eor $C0,$C0,$T2 +- eor $D0,$D0,$T3 +- st1.8 {$A0-$D0},[$out],#64 ++ ld1.8 {$xt0-$xt3},[$inp],#64 ++ eor $xa0,$xa0,$xt0 ++ eor $xb0,$xb0,$xt1 ++ eor $xc0,$xc0,$xt2 ++ eor $xd0,$xd0,$xt3 ++ st1.8 {$xa0-$xd0},[$out],#64 + b.eq .Ldone_neon ++ ++ add $xa0,$xa1,@K[0] ++ add $xb0,$xb1,@K[1] + sub $len,$len,#64 ++ add $xc0,$xc1,@K[2] + cmp $len,#64 +- b.lo .Less_than_192 ++ add $xd0,$xd1,@K[3] ++ b.lo .Last_neon + 
+- ld1.8 {$T0-$T3},[$inp],#64 +- eor $A1,$A1,$T0 +- eor $B1,$B1,$T1 +- eor $C1,$C1,$T2 +- eor $D1,$D1,$T3 +- st1.8 {$A1-$D1},[$out],#64 ++ ld1.8 {$xt0-$xt3},[$inp],#64 ++ eor $xa1,$xa0,$xt0 ++ eor $xb1,$xb0,$xt1 ++ eor $xc1,$xc0,$xt2 ++ eor $xd1,$xd0,$xt3 ++ st1.8 {$xa1-$xd1},[$out],#64 + b.eq .Ldone_neon ++ ++ add $xa0,$xa2,@K[0] ++ add $xb0,$xb2,@K[1] + sub $len,$len,#64 ++ add $xc0,$xc2,@K[2] ++ cmp $len,#64 ++ add $xd0,$xd2,@K[3] ++ b.lo .Last_neon + +- st1.8 {$A2-$D2},[sp] +- b .Last_neon ++ ld1.8 {$xt0-$xt3},[$inp],#64 ++ eor $xa2,$xa0,$xt0 ++ eor $xb2,$xb0,$xt1 ++ eor $xc2,$xc0,$xt2 ++ eor $xd2,$xd0,$xt3 ++ st1.8 {$xa2-$xd2},[$out],#64 ++ b.eq .Ldone_neon + +-.Less_than_128: +- st1.8 {$A0-$D0},[sp] +- b .Last_neon +-.Less_than_192: +- st1.8 {$A1-$D1},[sp] +- b .Last_neon ++ add $xa0,$xa3,@K[0] ++ add $xb0,$xb3,@K[1] ++ add $xc0,$xc3,@K[2] ++ add $xd0,$xd3,@K[3] ++ sub $len,$len,#64 + +-.align 4 + .Last_neon: ++ st1.8 {$xa0-$xd0},[sp] ++ + sub $out,$out,#1 + add $inp,$inp,$len + add $out,$out,$len +@@ -695,9 +804,41 @@ $code.=<<___; + .size ChaCha20_neon,.-ChaCha20_neon + ___ + { ++my @K = map("v$_.4s",(0..6)); + my ($T0,$T1,$T2,$T3,$T4,$T5)=@K; + my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2, +- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23)); ++ $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31)); ++my $rot24 = @K[6]; ++my $ONE = "v7.4s"; ++ ++sub NEONROUND { ++my $odd = pop; ++my ($a,$b,$c,$d,$t)=@_; ++ ++ ( ++ "&add ('$a','$a','$b')", ++ "&eor ('$d','$d','$a')", ++ "&rev32_16 ('$d','$d')", # vrot ($d,16) ++ ++ "&add ('$c','$c','$d')", ++ "&eor ('$t','$b','$c')", ++ "&ushr ('$b','$t',20)", ++ "&sli ('$b','$t',12)", ++ ++ "&add ('$a','$a','$b')", ++ "&eor ('$d','$d','$a')", ++ "&tbl ('$d','{$d}','$rot24')", ++ ++ "&add ('$c','$c','$d')", ++ "&eor ('$t','$b','$c')", ++ "&ushr ('$b','$t',25)", ++ "&sli ('$b','$t',7)", ++ ++ "&ext ('$c','$c','$c',8)", ++ "&ext ('$d','$d','$d',$odd?4:12)", ++ "&ext ('$b','$b','$b',$odd?12:4)" ++ ); ++} + + $code.=<<___; + .type ChaCha20_512_neon,%function +@@ -717,6 +858,7 @@ ChaCha20_512_neon: + .L512_or_more_neon: + sub sp,sp,#128+64 + ++ eor $ONE,$ONE,$ONE + ldp @d[0],@d[1],[@x[0]] // load sigma + ld1 {@K[0]},[@x[0]],#16 + ldp @d[2],@d[3],[$key] // load key +@@ -724,8 +866,9 @@ ChaCha20_512_neon: + ld1 {@K[1],@K[2]},[$key] + ldp @d[6],@d[7],[$ctr] // load counter + ld1 {@K[3]},[$ctr] +- ld1 {$ONE},[@x[0]] +-#ifdef __ARMEB__ ++ ld1 {$ONE}[0],[@x[0]] ++ add $key,@x[0],#16 // .Lrot24 ++#ifdef __AARCH64EB__ + rev64 @K[0],@K[0] + ror @d[2],@d[2],#32 + ror @d[3],@d[3],#32 +@@ -792,9 +935,10 @@ ChaCha20_512_neon: + mov $C4,@K[2] + stp @K[3],@K[4],[sp,#48] // off-load key block, variable part + mov $C5,@K[2] +- str @K[5],[sp,#80] ++ stp @K[5],@K[6],[sp,#80] + + mov $ctr,#5 ++ ld1 {$rot24},[$key] + subs $len,$len,#512 + .Loop_upper_neon: + sub $ctr,$ctr,#1 +@@ -867,7 +1011,7 @@ $code.=<<___; + add @x[14],@x[14],@x[15],lsl#32 + ldp @x[13],@x[15],[$inp,#48] + add $inp,$inp,#64 +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -956,6 +1100,7 @@ $code.=<<___; + add.32 @x[2],@x[2],@d[1] + ldp @K[4],@K[5],[sp,#64] + add @x[3],@x[3],@d[1],lsr#32 ++ ldr @K[6],[sp,#96] + add $A0,$A0,@K[0] + add.32 @x[4],@x[4],@d[2] + add $A1,$A1,@K[0] +@@ -1008,7 +1153,7 @@ $code.=<<___; + add $inp,$inp,#64 + add $B5,$B5,@K[1] + +-#ifdef __ARMEB__ ++#ifdef __AARCH64EB__ + rev @x[0],@x[0] + rev @x[2],@x[2] + rev @x[4],@x[4] +@@ -1086,26 +1231,26 @@ $code.=<<___; + b.hs 
.Loop_outer_512_neon + + adds $len,$len,#512 +- ushr $A0,$ONE,#2 // 4 -> 1 ++ ushr $ONE,$ONE,#1 // 4 -> 2 + +- ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + +- stp @K[0],$ONE,[sp,#0] // wipe off-load area +- stp @K[0],$ONE,[sp,#32] +- stp @K[0],$ONE,[sp,#64] ++ stp @K[0],@K[0],[sp,#0] // wipe off-load area ++ stp @K[0],@K[0],[sp,#32] ++ stp @K[0],@K[0],[sp,#64] + + b.eq .Ldone_512_neon + ++ sub $key,$key,#16 // .Lone + cmp $len,#192 +- sub @K[3],@K[3],$A0 // -= 1 +- sub @K[4],@K[4],$A0 +- sub @K[5],@K[5],$A0 + add sp,sp,#128 ++ sub @K[3],@K[3],$ONE // -= 2 ++ ld1 {$CTR,$ROT24},[$key] + b.hs .Loop_outer_neon + ++ ldp d8,d9,[sp,#0] // meet ABI requirements + eor @K[1],@K[1],@K[1] + eor @K[2],@K[2],@K[2] + eor @K[3],@K[3],@K[3] +@@ -1115,6 +1260,7 @@ $code.=<<___; + b .Loop_outer + + .Ldone_512_neon: ++ ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] +@@ -1133,9 +1279,11 @@ foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval $1/geo; + + (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or +- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or ++ (m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1)) or + (s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or + (m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or ++ (m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1)) or ++ (s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1)) or + (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); + + #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; +diff -up openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl +--- openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl 2020-12-09 10:37:38.408558954 +0100 +@@ -42,6 +42,7 @@ + # Denver 0.51 0.65 6.02 + # Mongoose 0.65 1.10 8.06 + # Kryo 0.76 1.16 8.00 ++# ThunderX2 1.05 + # + # (*) presented for reference/comparison purposes; + +diff -up openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl +--- openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl 2020-12-09 10:37:38.408558954 +0100 +@@ -29,6 +29,7 @@ + # X-Gene 2.13/+68% 2.27 + # Mongoose 1.77/+75% 1.12 + # Kryo 2.70/+55% 1.13 ++# ThunderX2 1.17/+95% 1.36 + # + # (*) estimate based on resources availability is less than 1.0, + # i.e. measured result is worse than expected, presumably binary +diff -up openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl 2020-12-09 10:37:38.408558954 +0100 +@@ -51,6 +51,7 @@ + # Kryo 12 + # Denver 7.8 + # Apple A7 7.2 ++# ThunderX2 9.7 + # + # (*) Corresponds to SHA3-256. No improvement coefficients are listed + # because they vary too much from compiler to compiler. 
Newer
+diff -up openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl
+--- openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
++++ openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl 2020-12-09 10:37:38.408558954 +0100
+@@ -27,6 +27,7 @@
+ # X-Gene 8.80 (+200%)
+ # Mongoose 2.05 6.50 (+160%)
+ # Kryo 1.88 8.00 (+90%)
++# ThunderX2 2.64 6.36 (+150%)
+ #
+ # (*) Software results are presented mostly for reference purposes.
+ # (**) Keep in mind that Denver relies on binary translation, which
+diff -up openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl
+--- openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
++++ openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl 2020-12-09 10:37:38.408558954 +0100
+@@ -28,6 +28,7 @@
+ # X-Gene 20.0 (+100%) 12.8 (+300%(***))
+ # Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
+ # Kryo 1.92 17.4 (+30%) 11.2 (+8%)
++# ThunderX2 2.54 13.2 (+40%) 8.40 (+18%)
+ #
+ # (*) Software SHA256 results are of lesser relevance, presented
+ # mostly for informational purposes.
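
A note on the tweak arithmetic in the XTS hunks above: the recurring extr/and/eor/fmov sequences (each "The iv for ... block" step) advance the XTS tweak by multiplying it by x in GF(2^128), folding the bit shifted out of the top back in with the reduction constant 0x87 held in $constnum. A minimal C sketch of that update, assuming ivl/ivh are the low/high 64-bit halves of the tweak (an illustration, not part of the patch):

#include <stdint.h>

/* Multiply the 128-bit XTS tweak (ivh:ivl) by x in GF(2^128), reducing
 * modulo x^128 + x^7 + x^2 + x + 1 -- the same job as the patch's
 * extr/and/eor sequence with $constnum = 0x87. */
static void xts_tweak_times_x(uint64_t *ivl, uint64_t *ivh)
{
    /* 0x87 if the top tweak bit is about to shift out, else 0; the asm
     * derives this mask with "and $tmpmw,$constnum,$midnum,asr #31". */
    uint64_t carry = (*ivh >> 63) ? 0x87u : 0u;

    *ivh = (*ivh << 1) | (*ivl >> 63); /* extr $ivh,$ivh,$ivl,#63 */
    *ivl = (*ivl << 1) ^ carry;        /* eor $ivl,$tmpmx,$ivl,lsl #1 */
}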
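The .Lxts_done/.composite_dec_loop path implements standard XTS ciphertext stealing for a message whose length is not a multiple of 16. A hedged C sketch of the decryption order it follows is below; xts_cts_decrypt_tail, aes_dec_fn, tw_prev and tw_last are hypothetical names (the patch itself works in place with the aesd/aesimc round loops), so treat this as a sketch of the technique, not the patch's API:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Caller-supplied raw one-block AES decryption (key schedule applied
 * inside); stands in for the aesd/aesimc round loops in the patch. */
typedef void (*aes_dec_fn)(const uint8_t in[16], uint8_t out[16]);

/* Decrypt the final full ciphertext block c_prev plus `tail` (1..15)
 * stolen bytes c_tail, using the last two tweaks tw_prev and tw_last. */
static void xts_cts_decrypt_tail(aes_dec_fn dec,
                                 const uint8_t c_prev[16],
                                 const uint8_t *c_tail, size_t tail,
                                 const uint8_t tw_prev[16],
                                 const uint8_t tw_last[16],
                                 uint8_t *p_prev, uint8_t *p_tail)
{
    uint8_t buf[16], blk[16];
    size_t i;

    /* Decrypt the second-to-last ciphertext block with the *last* tweak;
     * its first `tail` bytes are the final partial plaintext block. */
    for (i = 0; i < 16; i++) blk[i] = c_prev[i] ^ tw_last[i];
    dec(blk, buf);
    for (i = 0; i < 16; i++) buf[i] ^= tw_last[i];
    memcpy(p_tail, buf, tail);

    /* Splice the stolen ciphertext bytes over the consumed part -- the
     * byte swap that .composite_dec_loop performs in place -- then
     * decrypt the composite block with the previous tweak. */
    memcpy(buf, c_tail, tail);
    for (i = 0; i < 16; i++) blk[i] = buf[i] ^ tw_prev[i];
    dec(blk, buf);
    for (i = 0; i < 16; i++) p_prev[i] = buf[i] ^ tw_prev[i];
}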
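Finally, the NEON_lane_ROUND and NEONROUND generators in the chacha-armv8.pl hunks both emit the standard ChaCha20 quarter-round; the 16/12/8/7-bit rotations show up as rev32 (on .8h lanes), ushr+sli pairs, and a tbl byte shuffle against the .Lrot24 table. A scalar reference of the round (standard ChaCha20, shown for orientation only):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, unsigned n)
{
    return (v << n) | (v >> (32 - n));
}

/* One ChaCha20 quarter-round; in the 4+1 path each NEON register lane
 * runs this on one of four independent state blocks. */
static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = rotl32(*d, 16); /* rev32 on .8h lanes      */
    *c += *d; *b ^= *c; *b = rotl32(*b, 12); /* ushr #20 + sli #12      */
    *a += *b; *d ^= *a; *d = rotl32(*d, 8);  /* tbl with .Lrot24 table  */
    *c += *d; *b ^= *c; *b = rotl32(*b, 7);  /* ushr #25 + sli #7       */
}

With four blocks kept in NEON lanes plus a fifth in general-purpose registers, each outer iteration of the 4+1 path processes five blocks, which is why the counter in the hunks above now advances by 5 per iteration ("add @d[6],@d[6],#5").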