path: root/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
Diffstat (limited to '2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch')
-rw-r--r--  2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch  131
1 file changed, 131 insertions, 0 deletions
diff --git a/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch b/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
new file mode 100644
index 0000000..81cdbe0
--- /dev/null
+++ b/2-5-AArch64-Improve-A64FX-memset-for-large-sizes.patch
@@ -0,0 +1,131 @@
+From 9bc2ed8f46d80859a5596789cc9e8cc2de84b0e7 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra <wdijkstr@arm.com>
+Date: Tue, 10 Aug 2021 13:39:37 +0100
+Subject: [PATCH] [2/5] AArch64: Improve A64FX memset for large sizes
+
+Improve performance of large memsets. Simplify alignment code. For zero memset
+use DC ZVA, which almost doubles performance. For non-zero memsets use the
+unroll8 loop which is about 10% faster.
+
+Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 85 ++++++++++----------------------
+ 1 file changed, 25 insertions(+), 60 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index cf3d402..75cf43a 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -27,14 +27,11 @@
+ */
+
+ #define L1_SIZE (64*1024) // L1 64KB
+-#define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
++#define L2_SIZE (8*1024*1024) // L2 8MB
+ #define CACHE_LINE_SIZE 256
+ #define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
+-#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
+-#define rest x8
++#define rest x2
+ #define vector_length x9
+-#define vl_remainder x10 // vector_length remainder
+-#define cl_remainder x11 // CACHE_LINE_SIZE remainder
+
+ #if HAVE_AARCH64_SVE_ASM
+ # if IS_IN (libc)
+@@ -42,14 +39,6 @@
+
+ .arch armv8.2-a+sve
+
+- .macro dc_zva times
+- dc zva, tmp1
+- add tmp1, tmp1, CACHE_LINE_SIZE
+- .if \times-1
+- dc_zva "(\times-1)"
+- .endif
+- .endm
+-
+ .macro st1b_unroll first=0, last=7
+ st1b z0.b, p0, [dst, \first, mul vl]
+ .if \last-\first
+@@ -188,54 +177,30 @@ L(L1_prefetch): // if rest >= L1_SIZE
+ cbnz rest, L(unroll32)
+ ret
+
+-L(L2):
+- // align dst address at vector_length byte boundary
+- sub tmp1, vector_length, 1
+- ands tmp2, dst, tmp1
+- // if vl_remainder == 0
+- b.eq 1f
+- sub vl_remainder, vector_length, tmp2
+- // process remainder until the first vector_length boundary
+- whilelt p2.b, xzr, vl_remainder
+- st1b z0.b, p2, [dst]
+- add dst, dst, vl_remainder
+- sub rest, rest, vl_remainder
+- // align dstin address at CACHE_LINE_SIZE byte boundary
+-1: mov tmp1, CACHE_LINE_SIZE
+- ands tmp2, dst, CACHE_LINE_SIZE - 1
+- // if cl_remainder == 0
+- b.eq L(L2_dc_zva)
+- sub cl_remainder, tmp1, tmp2
+- // process remainder until the first CACHE_LINE_SIZE boundary
+- mov tmp1, xzr // index
+-2: whilelt p2.b, tmp1, cl_remainder
+- st1b z0.b, p2, [dst, tmp1]
+- incb tmp1
+- cmp tmp1, cl_remainder
+- b.lo 2b
+- add dst, dst, cl_remainder
+- sub rest, rest, cl_remainder
+-
+-L(L2_dc_zva):
+- // zero fill
+- mov tmp1, dst
+- dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
+- mov zva_len, ZF_DIST
+- add tmp1, zva_len, CACHE_LINE_SIZE * 2
+- // unroll
++ // count >= L2_SIZE
+ .p2align 3
+-1: st1b_unroll 0, 3
+- add tmp2, dst, zva_len
+- dc zva, tmp2
+- st1b_unroll 4, 7
+- add tmp2, tmp2, CACHE_LINE_SIZE
+- dc zva, tmp2
+- add dst, dst, CACHE_LINE_SIZE * 2
+- sub rest, rest, CACHE_LINE_SIZE * 2
+- cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2
+- b.ge 1b
+- cbnz rest, L(unroll8)
+- ret
++L(L2):
++ tst valw, 255
++ b.ne L(unroll8)
++ // align dst to CACHE_LINE_SIZE byte boundary
++ and tmp2, dst, CACHE_LINE_SIZE - 1
++ st1b z0.b, p0, [dst, 0, mul vl]
++ st1b z0.b, p0, [dst, 1, mul vl]
++ st1b z0.b, p0, [dst, 2, mul vl]
++ st1b z0.b, p0, [dst, 3, mul vl]
++ sub dst, dst, tmp2
++ add count, count, tmp2
++
++ // clear cachelines using DC ZVA
++ sub count, count, CACHE_LINE_SIZE * 2
++ .p2align 4
++1: add dst, dst, CACHE_LINE_SIZE
++ dc zva, dst
++ subs count, count, CACHE_LINE_SIZE
++ b.hi 1b
++ add count, count, CACHE_LINE_SIZE
++ add dst, dst, CACHE_LINE_SIZE
++ b L(last)
+
+ END (MEMSET)
+ libc_hidden_builtin_def (MEMSET)
+--
+1.8.3.1
+
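
For context, the zero-fill fast path added by the patch above can be sketched in C. This is an illustrative sketch only, not the glibc code: the names zero_fill_dc_zva and CACHE_LINE_SIZE here are my own, and it assumes the DC ZVA block size equals the 256-byte A64FX cache line. The real implementation is the SVE assembly in the patch, which also overlaps the four head st1b stores with the aligned body, and it only takes this path when the fill value is zero (tst valw, 255); non-zero fills fall back to the unroll8 SVE store loop.

/* Illustrative sketch, not the glibc implementation: zero a large buffer by
   aligning the pointer to the cache line and clearing whole lines with DC ZVA.
   Assumes AArch64 with the ZVA block size equal to CACHE_LINE_SIZE and with
   DC ZVA enabled for userspace (a real implementation would read DCZID_EL0).  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define CACHE_LINE_SIZE 256

static void
zero_fill_dc_zva (void *buf, size_t count)
{
  char *dst = buf;

  /* Small sizes: plain stores, DC ZVA is not worth it.  */
  if (count < 2 * CACHE_LINE_SIZE)
    {
      memset (dst, 0, count);
      return;
    }

  /* Head: store up to the first cache-line boundary.  */
  size_t head = -(uintptr_t) dst & (CACHE_LINE_SIZE - 1);
  memset (dst, 0, head);
  dst += head;
  count -= head;

  /* Body: DC ZVA zeroes one full cache line per instruction, with no data
     being read from or written to the register file.  */
  while (count >= CACHE_LINE_SIZE)
    {
      __asm__ volatile ("dc zva, %0" : : "r" (dst) : "memory");
      dst += CACHE_LINE_SIZE;
      count -= CACHE_LINE_SIZE;
    }

  /* Tail: remaining bytes after the last full line.  */
  memset (dst, 0, count);
}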