1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
|
From 07b427296b8d59f439144029d9a948f6c1ce0a31 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Tue, 10 Aug 2021 13:30:27 +0100
Subject: [PATCH] [1/5] AArch64: Improve A64FX memset for small sizes
Improve performance of small memsets by reducing instruction counts and
improving code alignment. Bench-memset shows 35-45% performance gain for
small sizes.
Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
---
sysdeps/aarch64/multiarch/memset_a64fx.S | 96 ++++++++++++--------------------
1 file changed, 36 insertions(+), 60 deletions(-)
diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
index ce54e54..cf3d402 100644
--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
@@ -51,78 +51,54 @@
.endm
.macro st1b_unroll first=0, last=7
- st1b z0.b, p0, [dst, #\first, mul vl]
+ st1b z0.b, p0, [dst, \first, mul vl]
.if \last-\first
st1b_unroll "(\first+1)", \last
.endif
.endm
- .macro shortcut_for_small_size exit
- // if rest <= vector_length * 2
- whilelo p0.b, xzr, count
- whilelo p1.b, vector_length, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- ret
-1: // if rest > vector_length * 8
- cmp count, vector_length, lsl 3 // vector_length * 8
- b.hi \exit
- // if rest <= vector_length * 4
- lsl tmp1, vector_length, 1 // vector_length * 2
- whilelo p2.b, tmp1, count
- incb tmp1
- whilelo p3.b, tmp1, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- ret
-1: // if rest <= vector_length * 8
- lsl tmp1, vector_length, 2 // vector_length * 4
- whilelo p4.b, tmp1, count
- incb tmp1
- whilelo p5.b, tmp1, count
- b.last 1f
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- st1b z0.b, p4, [dstin, #4, mul vl]
- st1b z0.b, p5, [dstin, #5, mul vl]
- ret
-1: lsl tmp1, vector_length, 2 // vector_length * 4
- incb tmp1 // vector_length * 5
- incb tmp1 // vector_length * 6
- whilelo p6.b, tmp1, count
- incb tmp1
- whilelo p7.b, tmp1, count
- st1b z0.b, p0, [dstin, #0, mul vl]
- st1b z0.b, p1, [dstin, #1, mul vl]
- st1b z0.b, p2, [dstin, #2, mul vl]
- st1b z0.b, p3, [dstin, #3, mul vl]
- st1b z0.b, p4, [dstin, #4, mul vl]
- st1b z0.b, p5, [dstin, #5, mul vl]
- st1b z0.b, p6, [dstin, #6, mul vl]
- st1b z0.b, p7, [dstin, #7, mul vl]
- ret
- .endm
-ENTRY (MEMSET)
+#undef BTI_C
+#define BTI_C
+ENTRY (MEMSET)
PTR_ARG (0)
SIZE_ARG (2)
- cbnz count, 1f
- ret
-1: dup z0.b, valw
cntb vector_length
- // shortcut for less than vector_length * 8
- // gives a free ptrue to p0.b for n >= vector_length
- shortcut_for_small_size L(vl_agnostic)
- // end of shortcut
+ dup z0.b, valw
+ whilelo p0.b, vector_length, count
+ b.last 1f
+ whilelo p1.b, xzr, count
+ st1b z0.b, p1, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ ret
+
+ // count >= vector_length * 2
+1: cmp count, vector_length, lsl 2
+ add dstend, dstin, count
+ b.hi 1f
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ st1b z0.b, p0, [dstend, -2, mul vl]
+ st1b z0.b, p0, [dstend, -1, mul vl]
+ ret
+
+ // count > vector_length * 4
+1: lsl tmp1, vector_length, 3
+ cmp count, tmp1
+ b.hi L(vl_agnostic)
+ st1b z0.b, p0, [dstin, 0, mul vl]
+ st1b z0.b, p0, [dstin, 1, mul vl]
+ st1b z0.b, p0, [dstin, 2, mul vl]
+ st1b z0.b, p0, [dstin, 3, mul vl]
+ st1b z0.b, p0, [dstend, -4, mul vl]
+ st1b z0.b, p0, [dstend, -3, mul vl]
+ st1b z0.b, p0, [dstend, -2, mul vl]
+ st1b z0.b, p0, [dstend, -1, mul vl]
+ ret
+ .p2align 4
L(vl_agnostic): // VL Agnostic
mov rest, count
mov dst, dstin
--
1.8.3.1
|