summary refs log tree commit diff
path: root/0008-AArch64-Add-memset_zva64.patch
diff options
context:
space:
mode:
Diffstat (limited to '0008-AArch64-Add-memset_zva64.patch')
-rw-r--r-- 0008-AArch64-Add-memset_zva64.patch 228
1 files changed, 228 insertions, 0 deletions
diff --git a/0008-AArch64-Add-memset_zva64.patch b/0008-AArch64-Add-memset_zva64.patch
new file mode 100644
index 0000000..5225816
--- /dev/null
+++ b/0008-AArch64-Add-memset_zva64.patch
@@ -0,0 +1,228 @@
+From 156e44845f4137d6d3ea6c2824dd459652a7efda Mon Sep 17 00:00:00 2001
+From: Wilco Dijkstra <wilco.dijkstra@arm.com>
+Date: Thu, 26 Oct 2023 17:07:21 +0100
+Subject: [PATCH 08/26] AArch64: Add memset_zva64
+
+Add a specialized memset for the common ZVA size of 64 to avoid the
+overhead of reading the ZVA size. Since the code is identical to
+__memset_falkor, remove the latter.
+
+Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+(cherry picked from commit 3d7090f14b13312320e425b27dcf0fe72de026fd)
+---
+ sysdeps/aarch64/memset.S | 10 ++--
+ sysdeps/aarch64/multiarch/Makefile | 2 +-
+ sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 +-
+ sysdeps/aarch64/multiarch/memset.c | 9 ++--
+ sysdeps/aarch64/multiarch/memset_falkor.S | 54 ---------------------
+ sysdeps/aarch64/multiarch/memset_zva64.S | 27 +++++++++++
+ 6 files changed, 38 insertions(+), 68 deletions(-)
+ delete mode 100644 sysdeps/aarch64/multiarch/memset_falkor.S
+ create mode 100644 sysdeps/aarch64/multiarch/memset_zva64.S
+
+diff --git a/sysdeps/aarch64/memset.S b/sysdeps/aarch64/memset.S
+index bf3cf85c8a..bbfb7184c3 100644
+--- a/sysdeps/aarch64/memset.S
++++ b/sysdeps/aarch64/memset.S
+@@ -101,19 +101,19 @@ L(tail64):
+ ret
+
+ L(try_zva):
+-#ifdef ZVA_MACRO
+- zva_macro
+-#else
++#ifndef ZVA64_ONLY
+ .p2align 3
+ mrs tmp1, dczid_el0
+ tbnz tmp1w, 4, L(no_zva)
+ and tmp1w, tmp1w, 15
+ cmp tmp1w, 4 /* ZVA size is 64 bytes. */
+ b.ne L(zva_128)
+-
++ nop
++#endif
+ /* Write the first and last 64 byte aligned block using stp rather
+ than using DC ZVA. This is faster on some cores.
+ */
++ .p2align 4
+ L(zva_64):
+ str q0, [dst, 16]
+ stp q0, q0, [dst, 32]
+@@ -123,7 +123,6 @@ L(zva_64):
+ sub count, dstend, dst /* Count is now 128 too large. */
+ sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+ add dst, dst, 128
+- nop
+ 1: dc zva, dst
+ add dst, dst, 64
+ subs count, count, 64
+@@ -134,6 +133,7 @@ L(zva_64):
+ stp q0, q0, [dstend, -32]
+ ret
+
++#ifndef ZVA64_ONLY
+ .p2align 3
+ L(zva_128):
+ cmp tmp1w, 5 /* ZVA size is 128 bytes. */
+diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
+index a1a4de3cd9..171ca5e4cf 100644
+--- a/sysdeps/aarch64/multiarch/Makefile
++++ b/sysdeps/aarch64/multiarch/Makefile
+@@ -12,10 +12,10 @@ sysdep_routines += \
+ memmove_mops \
+ memset_a64fx \
+ memset_emag \
+- memset_falkor \
+ memset_generic \
+ memset_kunpeng \
+ memset_mops \
++ memset_zva64 \
+ strlen_asimd \
+ strlen_generic \
+ # sysdep_routines
+diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+index 3596d3c8d3..fdd9ea9246 100644
+--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+@@ -54,9 +54,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ IFUNC_IMPL_ADD (array, i, memmove, mops, __memmove_mops)
+ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_generic))
+ IFUNC_IMPL (i, name, memset,
+- /* Enable this on non-falkor processors too so that other cores
+- can do a comparative analysis with __memset_generic. */
+- IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_falkor)
++ IFUNC_IMPL_ADD (array, i, memset, (zva_size == 64), __memset_zva64)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_emag)
+ IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
+ #if HAVE_AARCH64_SVE_ASM
+diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
+index 9193b197dd..6deb6865e5 100644
+--- a/sysdeps/aarch64/multiarch/memset.c
++++ b/sysdeps/aarch64/multiarch/memset.c
+@@ -28,7 +28,7 @@
+
+ extern __typeof (__redirect_memset) __libc_memset;
+
+-extern __typeof (__redirect_memset) __memset_falkor attribute_hidden;
++extern __typeof (__redirect_memset) __memset_zva64 attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_emag attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_kunpeng attribute_hidden;
+ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
+@@ -47,18 +47,17 @@ select_memset_ifunc (void)
+ {
+ if (IS_A64FX (midr) && zva_size == 256)
+ return __memset_a64fx;
+- return __memset_generic;
+ }
+
+ if (IS_KUNPENG920 (midr))
+ return __memset_kunpeng;
+
+- if ((IS_FALKOR (midr) || IS_PHECDA (midr)) && zva_size == 64)
+- return __memset_falkor;
+-
+ if (IS_EMAG (midr))
+ return __memset_emag;
+
++ if (zva_size == 64)
++ return __memset_zva64;
++
+ return __memset_generic;
+ }
+
+diff --git a/sysdeps/aarch64/multiarch/memset_falkor.S b/sysdeps/aarch64/multiarch/memset_falkor.S
+deleted file mode 100644
+index c6946a8072..0000000000
+--- a/sysdeps/aarch64/multiarch/memset_falkor.S
++++ /dev/null
+@@ -1,54 +0,0 @@
+-/* Memset for falkor.
+- Copyright (C) 2017-2023 Free Software Foundation, Inc.
+-
+- This file is part of the GNU C Library.
+-
+- The GNU C Library is free software; you can redistribute it and/or
+- modify it under the terms of the GNU Lesser General Public
+- License as published by the Free Software Foundation; either
+- version 2.1 of the License, or (at your option) any later version.
+-
+- The GNU C Library is distributed in the hope that it will be useful,
+- but WITHOUT ANY WARRANTY; without even the implied warranty of
+- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+- Lesser General Public License for more details.
+-
+- You should have received a copy of the GNU Lesser General Public
+- License along with the GNU C Library. If not, see
+- <https://www.gnu.org/licenses/>. */
+-
+-#include <sysdep.h>
+-#include <memset-reg.h>
+-
+-/* Reading dczid_el0 is expensive on falkor so move it into the ifunc
+- resolver and assume ZVA size of 64 bytes. The IFUNC resolver takes care to
+- use this function only when ZVA is enabled. */
+-
+-#if IS_IN (libc)
+-.macro zva_macro
+- .p2align 4
+- /* Write the first and last 64 byte aligned block using stp rather
+- than using DC ZVA. This is faster on some cores. */
+- str q0, [dst, 16]
+- stp q0, q0, [dst, 32]
+- bic dst, dst, 63
+- stp q0, q0, [dst, 64]
+- stp q0, q0, [dst, 96]
+- sub count, dstend, dst /* Count is now 128 too large. */
+- sub count, count, 128+64+64 /* Adjust count and bias for loop. */
+- add dst, dst, 128
+-1: dc zva, dst
+- add dst, dst, 64
+- subs count, count, 64
+- b.hi 1b
+- stp q0, q0, [dst, 0]
+- stp q0, q0, [dst, 32]
+- stp q0, q0, [dstend, -64]
+- stp q0, q0, [dstend, -32]
+- ret
+-.endm
+-
+-# define ZVA_MACRO zva_macro
+-# define MEMSET __memset_falkor
+-# include <sysdeps/aarch64/memset.S>
+-#endif
+diff --git a/sysdeps/aarch64/multiarch/memset_zva64.S b/sysdeps/aarch64/multiarch/memset_zva64.S
+new file mode 100644
+index 0000000000..13f45fd3d8
+--- /dev/null
++++ b/sysdeps/aarch64/multiarch/memset_zva64.S
+@@ -0,0 +1,27 @@
++/* Optimized memset for zva size = 64.
++ Copyright (C) 2023 Free Software Foundation, Inc.
++
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++#define ZVA64_ONLY 1
++#define MEMSET __memset_zva64
++#undef libc_hidden_builtin_def
++#define libc_hidden_builtin_def(X)
++
++#include "../memset.S"
+--
+2.33.0
+