Diffstat (limited to '1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch')
-rw-r--r--  1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch | 136
1 file changed, 136 insertions(+), 0 deletions(-)
diff --git a/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch b/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch
new file mode 100644
index 0000000..2db0045
--- /dev/null
+++ b/1-5-AArch64-Improve-A64FX-memset-for-small-sizes.patch
@@ -0,0 +1,136 @@
+From 07b427296b8d59f439144029d9a948f6c1ce0a31 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra <wdijkstr@arm.com>
+Date: Tue, 10 Aug 2021 13:30:27 +0100
+Subject: [PATCH] [1/5] AArch64: Improve A64FX memset for small sizes
+
+Improve performance of small memsets by reducing instruction counts and
+improving code alignment. Bench-memset shows 35-45% performance gain for
+small sizes.
+
+Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 96 ++++++++++++--------------------
+ 1 file changed, 36 insertions(+), 60 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index ce54e54..cf3d402 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -51,78 +51,54 @@
+ .endm
+
+ .macro st1b_unroll first=0, last=7
+- st1b z0.b, p0, [dst, #\first, mul vl]
++ st1b z0.b, p0, [dst, \first, mul vl]
+ .if \last-\first
+ st1b_unroll "(\first+1)", \last
+ .endif
+ .endm
+
+- .macro shortcut_for_small_size exit
+- // if rest <= vector_length * 2
+- whilelo p0.b, xzr, count
+- whilelo p1.b, vector_length, count
+- b.last 1f
+- st1b z0.b, p0, [dstin, #0, mul vl]
+- st1b z0.b, p1, [dstin, #1, mul vl]
+- ret
+-1: // if rest > vector_length * 8
+- cmp count, vector_length, lsl 3 // vector_length * 8
+- b.hi \exit
+- // if rest <= vector_length * 4
+- lsl tmp1, vector_length, 1 // vector_length * 2
+- whilelo p2.b, tmp1, count
+- incb tmp1
+- whilelo p3.b, tmp1, count
+- b.last 1f
+- st1b z0.b, p0, [dstin, #0, mul vl]
+- st1b z0.b, p1, [dstin, #1, mul vl]
+- st1b z0.b, p2, [dstin, #2, mul vl]
+- st1b z0.b, p3, [dstin, #3, mul vl]
+- ret
+-1: // if rest <= vector_length * 8
+- lsl tmp1, vector_length, 2 // vector_length * 4
+- whilelo p4.b, tmp1, count
+- incb tmp1
+- whilelo p5.b, tmp1, count
+- b.last 1f
+- st1b z0.b, p0, [dstin, #0, mul vl]
+- st1b z0.b, p1, [dstin, #1, mul vl]
+- st1b z0.b, p2, [dstin, #2, mul vl]
+- st1b z0.b, p3, [dstin, #3, mul vl]
+- st1b z0.b, p4, [dstin, #4, mul vl]
+- st1b z0.b, p5, [dstin, #5, mul vl]
+- ret
+-1: lsl tmp1, vector_length, 2 // vector_length * 4
+- incb tmp1 // vector_length * 5
+- incb tmp1 // vector_length * 6
+- whilelo p6.b, tmp1, count
+- incb tmp1
+- whilelo p7.b, tmp1, count
+- st1b z0.b, p0, [dstin, #0, mul vl]
+- st1b z0.b, p1, [dstin, #1, mul vl]
+- st1b z0.b, p2, [dstin, #2, mul vl]
+- st1b z0.b, p3, [dstin, #3, mul vl]
+- st1b z0.b, p4, [dstin, #4, mul vl]
+- st1b z0.b, p5, [dstin, #5, mul vl]
+- st1b z0.b, p6, [dstin, #6, mul vl]
+- st1b z0.b, p7, [dstin, #7, mul vl]
+- ret
+- .endm
+
+-ENTRY (MEMSET)
++#undef BTI_C
++#define BTI_C
+
++ENTRY (MEMSET)
+ PTR_ARG (0)
+ SIZE_ARG (2)
+
+- cbnz count, 1f
+- ret
+-1: dup z0.b, valw
+ cntb vector_length
+- // shortcut for less than vector_length * 8
+- // gives a free ptrue to p0.b for n >= vector_length
+- shortcut_for_small_size L(vl_agnostic)
+- // end of shortcut
++ dup z0.b, valw
++ whilelo p0.b, vector_length, count
++ b.last 1f
++ whilelo p1.b, xzr, count
++ st1b z0.b, p1, [dstin, 0, mul vl]
++ st1b z0.b, p0, [dstin, 1, mul vl]
++ ret
++
++ // count >= vector_length * 2
++1: cmp count, vector_length, lsl 2
++ add dstend, dstin, count
++ b.hi 1f
++ st1b z0.b, p0, [dstin, 0, mul vl]
++ st1b z0.b, p0, [dstin, 1, mul vl]
++ st1b z0.b, p0, [dstend, -2, mul vl]
++ st1b z0.b, p0, [dstend, -1, mul vl]
++ ret
++
++ // count > vector_length * 4
++1: lsl tmp1, vector_length, 3
++ cmp count, tmp1
++ b.hi L(vl_agnostic)
++ st1b z0.b, p0, [dstin, 0, mul vl]
++ st1b z0.b, p0, [dstin, 1, mul vl]
++ st1b z0.b, p0, [dstin, 2, mul vl]
++ st1b z0.b, p0, [dstin, 3, mul vl]
++ st1b z0.b, p0, [dstend, -4, mul vl]
++ st1b z0.b, p0, [dstend, -3, mul vl]
++ st1b z0.b, p0, [dstend, -2, mul vl]
++ st1b z0.b, p0, [dstend, -1, mul vl]
++ ret
+
++ .p2align 4
+ L(vl_agnostic): // VL Agnostic
+ mov rest, count
+ mov dst, dstin
+--
+1.8.3.1
+
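
Note on the new small-size path (an illustration, not part of the patch itself): the rewritten entry handles counts up to roughly 2*VL with two WHILELO-predicated stores, counts up to 4*VL with two full-vector stores from each end of the buffer, and counts up to 8*VL with four from each end, before falling through to L(vl_agnostic). Letting the stores from the front and the back overlap in the middle avoids computing an exact tail and keeps the instruction count low, which is what the commit message refers to. Below is a rough C sketch of that dispatch structure under those assumptions; set_chunk() and vl are hypothetical stand-ins for a predicated st1b store and the SVE vector length from cntb, and this is not the glibc code.

    #include <stddef.h>
    #include <string.h>

    /* Stand-in for one full-vector store ("st1b z0.b, p0, [...]" in the asm). */
    static void set_chunk(unsigned char *p, int c, size_t vl)
    {
        memset(p, c, vl);
    }

    void *a64fx_style_memset_small(void *dst, int c, size_t n, size_t vl)
    {
        unsigned char *d = dst;
        unsigned char *end = d + n;

        if (n <= 2 * vl) {
            /* In the assembly, two WHILELO predicates cover 0..n exactly,
               so nothing more than two predicated stores is needed
               (a zero count simply produces empty predicates).  */
            memset(d, c, n);
        } else if (n <= 4 * vl) {
            /* Two stores from the start and two from the end; the middle
               overlaps, which is cheaper than computing an exact tail.  */
            set_chunk(d, c, vl);
            set_chunk(d + vl, c, vl);
            set_chunk(end - 2 * vl, c, vl);
            set_chunk(end - vl, c, vl);
        } else if (n <= 8 * vl) {
            /* Four stores from each end, again overlapping in the middle.  */
            for (int i = 0; i < 4; i++) {
                set_chunk(d + (size_t)i * vl, c, vl);
                set_chunk(end - (size_t)(4 - i) * vl, c, vl);
            }
        } else {
            /* Larger sizes fall through to the vector-length-agnostic
               loop at L(vl_agnostic) in the real code.  */
            memset(d, c, n);
        }
        return dst;
    }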