diff options
author | CoprDistGit <infra@openeuler.org> | 2024-08-03 06:28:41 +0000 |
---|---|---|
committer | CoprDistGit <infra@openeuler.org> | 2024-08-03 06:28:41 +0000 |
commit | d20db0561a6a36f914fde030512503b114ef9a0c (patch) | |
tree | d4e5e3494d95c269a1cee6195f11bf3201bcadbf /3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch | |
parent | 016343d99b1b269d7246ef1e143d4b54914433d4 (diff) |
automatic import of glibcopeneuler22.03_LTS_SP4openeuler22.03_LTS_SP3openeuler20.03
Diffstat (limited to '3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch')
-rw-r--r-- | 3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch | 80 |
1 files changed, 80 insertions, 0 deletions
diff --git a/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch b/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch new file mode 100644 index 0000000..7ba3516 --- /dev/null +++ b/3-5-AArch64-Improve-A64FX-memset-for-remaining-bytes.patch @@ -0,0 +1,80 @@ +From 186092c6ba8825598ffdbf15dbf0823c771f560d Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra <wdijkstr@arm.com> +Date: Tue, 10 Aug 2021 13:42:07 +0100 +Subject: [PATCH] [3/5] AArch64: Improve A64FX memset for remaining bytes + +Simplify handling of remaining bytes. Avoid lots of taken branches and complex +whilelo computations, instead unconditionally write vectors from the end. + +Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com> +--- + sysdeps/aarch64/multiarch/memset_a64fx.S | 46 +++++++++----------------------- + 1 file changed, 13 insertions(+), 33 deletions(-) + +diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S +index 75cf43a..337c86b 100644 +--- a/sysdeps/aarch64/multiarch/memset_a64fx.S ++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S +@@ -130,38 +130,19 @@ L(unroll8): + b 1b + + L(last): +- whilelo p0.b, xzr, rest +- whilelo p1.b, vector_length, rest +- b.last 1f +- st1b z0.b, p0, [dst, #0, mul vl] +- st1b z0.b, p1, [dst, #1, mul vl] +- ret +-1: lsl tmp1, vector_length, 1 // vector_length * 2 +- whilelo p2.b, tmp1, rest +- incb tmp1 +- whilelo p3.b, tmp1, rest +- b.last 1f +- st1b z0.b, p0, [dst, #0, mul vl] +- st1b z0.b, p1, [dst, #1, mul vl] +- st1b z0.b, p2, [dst, #2, mul vl] +- st1b z0.b, p3, [dst, #3, mul vl] +- ret +-1: lsl tmp1, vector_length, 2 // vector_length * 4 +- whilelo p4.b, tmp1, rest +- incb tmp1 +- whilelo p5.b, tmp1, rest +- incb tmp1 +- whilelo p6.b, tmp1, rest +- incb tmp1 +- whilelo p7.b, tmp1, rest +- st1b z0.b, p0, [dst, #0, mul vl] +- st1b z0.b, p1, [dst, #1, mul vl] +- st1b z0.b, p2, [dst, #2, mul vl] +- st1b z0.b, p3, [dst, #3, mul vl] +- st1b z0.b, p4, [dst, #4, mul vl] +- st1b z0.b, p5, [dst, #5, mul vl] +- st1b z0.b, p6, [dst, #6, mul vl] +- st1b z0.b, p7, [dst, #7, mul vl] ++ cmp count, vector_length, lsl 1 ++ b.ls 2f ++ add tmp2, vector_length, vector_length, lsl 2 ++ cmp count, tmp2 ++ b.ls 5f ++ st1b z0.b, p0, [dstend, -8, mul vl] ++ st1b z0.b, p0, [dstend, -7, mul vl] ++ st1b z0.b, p0, [dstend, -6, mul vl] ++5: st1b z0.b, p0, [dstend, -5, mul vl] ++ st1b z0.b, p0, [dstend, -4, mul vl] ++ st1b z0.b, p0, [dstend, -3, mul vl] ++2: st1b z0.b, p0, [dstend, -2, mul vl] ++ st1b z0.b, p0, [dstend, -1, mul vl] + ret + + L(L1_prefetch): // if rest >= L1_SIZE +@@ -199,7 +180,6 @@ L(L2): + subs count, count, CACHE_LINE_SIZE + b.hi 1b + add count, count, CACHE_LINE_SIZE +- add dst, dst, CACHE_LINE_SIZE + b L(last) + + END (MEMSET) +-- +1.8.3.1 + |