author     CoprDistGit <infra@openeuler.org>  2024-08-03 06:28:41 +0000
committer  CoprDistGit <infra@openeuler.org>  2024-08-03 06:28:41 +0000
commit     d20db0561a6a36f914fde030512503b114ef9a0c (patch)
tree       d4e5e3494d95c269a1cee6195f11bf3201bcadbf /4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
parent     016343d99b1b269d7246ef1e143d4b54914433d4 (diff)
Diffstat (limited to '4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch')
-rw-r--r--  4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch  51
1 file changed, 51 insertions, 0 deletions
diff --git a/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
new file mode 100644
index 0000000..fd17671
--- /dev/null
+++ b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch
@@ -0,0 +1,51 @@
+From e69d9981f858a38e19304e6ff5ebdf89f2cb0ba0 Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra <wdijkstr@arm.com>
+Date: Tue, 10 Aug 2021 13:44:27 +0100
+Subject: [PATCH] [4/5] AArch64: Improve A64FX memset by removing unroll32
+
+Remove unroll32 code since it doesn't improve performance.
+
+Reviewed-by: Naohiro Tamura <naohirot@fujitsu.com>
+---
+ sysdeps/aarch64/multiarch/memset_a64fx.S | 18 +-----------------
+ 1 file changed, 1 insertion(+), 17 deletions(-)
+
+diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S
+index 337c86b..ef03156 100644
+--- a/sysdeps/aarch64/multiarch/memset_a64fx.S
++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S
+@@ -102,22 +102,6 @@ L(vl_agnostic): // VL Agnostic
+ ccmp vector_length, tmp1, 0, cs
+ b.eq L(L1_prefetch)
+
+-L(unroll32):
+- lsl tmp1, vector_length, 3 // vector_length * 8
+- lsl tmp2, vector_length, 5 // vector_length * 32
+- .p2align 3
+-1: cmp rest, tmp2
+- b.cc L(unroll8)
+- st1b_unroll
+- add dst, dst, tmp1
+- st1b_unroll
+- add dst, dst, tmp1
+- st1b_unroll
+- add dst, dst, tmp1
+- st1b_unroll
+- add dst, dst, tmp1
+- sub rest, rest, tmp2
+- b 1b
+
+ L(unroll8):
+ lsl tmp1, vector_length, 3
+@@ -155,7 +139,7 @@ L(L1_prefetch): // if rest >= L1_SIZE
+ sub rest, rest, CACHE_LINE_SIZE * 2
+ cmp rest, L1_SIZE
+ b.ge 1b
+- cbnz rest, L(unroll32)
++ cbnz rest, L(unroll8)
+ ret
+
+ // count >= L2_SIZE
+--
+1.8.3.1
+
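
Note on the change above: the removed L(unroll32) block wrote vector_length * 32 bytes per iteration (four st1b_unroll groups) while rest >= vector_length * 32, then fell through to L(unroll8), which writes vector_length * 8 bytes per iteration. After the patch, sizes below the L1 prefetch threshold go straight to the 8-vector loop. The following C sketch only illustrates the remaining control flow; store_vectors(), memset_tail_sketch(), and the vl parameter are placeholders chosen here, not glibc identifiers (the real code is SVE assembly).

#include <stddef.h>
#include <string.h>

/* Placeholder for a group of eight SVE stores (the st1b_unroll macro in
   memset_a64fx.S); plain memset stands in for the vector stores here. */
static void store_vectors(unsigned char *dst, unsigned char c, size_t n)
{
    memset(dst, c, n);
}

/* Sketch of the loop structure left after this patch: only the 8-vector
   path remains.  Before the patch, an extra L(unroll32) loop first wrote
   vl * 32 bytes per iteration (four 8-vector groups) while rest >= vl * 32,
   then fell through to this loop. */
static void memset_tail_sketch(unsigned char *dst, unsigned char c,
                               size_t rest, size_t vl)
{
    size_t step = vl * 8;               /* bytes per unroll8 iteration */

    while (rest >= step) {
        store_vectors(dst, c, step);
        dst += step;
        rest -= step;
    }
    if (rest)
        store_vectors(dst, c, rest);    /* leftover partial block */
}

int main(void)
{
    unsigned char buf[4096];

    /* 64 bytes is the SVE vector length on A64FX (512-bit SVE). */
    memset_tail_sketch(buf, 0xab, sizeof buf, 64);
    return 0;
}

The upstream rationale is only that the deeper 32-vector unroll showed no measurable gain; the sketch shows what remains once it is gone.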