From 9bc2ed8f46d80859a5596789cc9e8cc2de84b0e7 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Tue, 10 Aug 2021 13:39:37 +0100
Subject: [PATCH] [2/5] AArch64: Improve A64FX memset for large sizes

Improve performance of large memsets. Simplify alignment code. For zero
memsets use DC ZVA, which almost doubles performance. For non-zero memsets
use the unroll8 loop which is about 10% faster.
Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
---
 sysdeps/aarch64/multiarch/memset_a64fx.S | 85 ++++++++++----------------------
 1 file changed, 25 insertions(+), 60 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index cf3d402..75cf43a 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -27,14 +27,11 @@
*/
#define L1_SIZE (64*1024) // L1 64KB
-#define L2_SIZE (8*1024*1024) // L2 8MB - 1MB
+#define L2_SIZE (8*1024*1024) // L2 8MB
#define CACHE_LINE_SIZE 256
#define PF_DIST_L1 (CACHE_LINE_SIZE * 16) // Prefetch distance L1
-#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
-#define rest x8
+#define rest x2
#define vector_length x9
-#define vl_remainder x10 // vector_length remainder
-#define cl_remainder x11 // CACHE_LINE_SIZE remainder
#if HAVE_AARCH64_SVE_ASM
# if IS_IN (libc)
@@ -42,14 +39,6 @@
.arch armv8.2-a+sve
- .macro dc_zva times
- dc zva, tmp1
- add tmp1, tmp1, CACHE_LINE_SIZE
- .if \times-1
- dc_zva "(\times-1)"
- .endif
- .endm
-
.macro st1b_unroll first=0, last=7
st1b z0.b, p0, [dst, \first, mul vl]
.if \last-\first
@@ -188,54 +177,30 @@ L(L1_prefetch): // if rest >= L1_SIZE
cbnz rest, L(unroll32)
ret
-L(L2):
- // align dst address at vector_length byte boundary
- sub tmp1, vector_length, 1
- ands tmp2, dst, tmp1
- // if vl_remainder == 0
- b.eq 1f
- sub vl_remainder, vector_length, tmp2
- // process remainder until the first vector_length boundary
- whilelt p2.b, xzr, vl_remainder
- st1b z0.b, p2, [dst]
- add dst, dst, vl_remainder
- sub rest, rest, vl_remainder
- // align dstin address at CACHE_LINE_SIZE byte boundary
-1: mov tmp1, CACHE_LINE_SIZE
- ands tmp2, dst, CACHE_LINE_SIZE - 1
- // if cl_remainder == 0
- b.eq L(L2_dc_zva)
- sub cl_remainder, tmp1, tmp2
- // process remainder until the first CACHE_LINE_SIZE boundary
- mov tmp1, xzr // index
-2: whilelt p2.b, tmp1, cl_remainder
- st1b z0.b, p2, [dst, tmp1]
- incb tmp1
- cmp tmp1, cl_remainder
- b.lo 2b
- add dst, dst, cl_remainder
- sub rest, rest, cl_remainder
-
-L(L2_dc_zva):
- // zero fill
- mov tmp1, dst
- dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
- mov zva_len, ZF_DIST
- add tmp1, zva_len, CACHE_LINE_SIZE * 2
- // unroll
+ // count >= L2_SIZE
.p2align 3
-1: st1b_unroll 0, 3
- add tmp2, dst, zva_len
- dc zva, tmp2
- st1b_unroll 4, 7
- add tmp2, tmp2, CACHE_LINE_SIZE
- dc zva, tmp2
- add dst, dst, CACHE_LINE_SIZE * 2
- sub rest, rest, CACHE_LINE_SIZE * 2
- cmp rest, tmp1 // ZF_DIST + CACHE_LINE_SIZE * 2
- b.ge 1b
- cbnz rest, L(unroll8)
- ret
+L(L2):
+ tst valw, 255
+ b.ne L(unroll8)
+ // align dst to CACHE_LINE_SIZE byte boundary
+ and tmp2, dst, CACHE_LINE_SIZE - 1
+ st1b z0.b, p0, [dst, 0, mul vl]
+ st1b z0.b, p0, [dst, 1, mul vl]
+ st1b z0.b, p0, [dst, 2, mul vl]
+ st1b z0.b, p0, [dst, 3, mul vl]
+ sub dst, dst, tmp2
+ add count, count, tmp2
+
+ // clear cachelines using DC ZVA
+ sub count, count, CACHE_LINE_SIZE * 2
+ .p2align 4
+1: add dst, dst, CACHE_LINE_SIZE
+ dc zva, dst
+ subs count, count, CACHE_LINE_SIZE
+ b.hi 1b
+ add count, count, CACHE_LINE_SIZE
+ add dst, dst, CACHE_LINE_SIZE
+ b L(last)
END (MEMSET)
libc_hidden_builtin_def (MEMSET)
--
1.8.3.1