From d20db0561a6a36f914fde030512503b114ef9a0c Mon Sep 17 00:00:00 2001 From: CoprDistGit Date: Sat, 3 Aug 2024 06:28:41 +0000 Subject: automatic import of glibc --- ...-Improve-A64FX-memset-by-removing-unroll3.patch | 51 ++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch (limited to '4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch') diff --git a/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch new file mode 100644 index 0000000..fd17671 --- /dev/null +++ b/4-5-AArch64-Improve-A64FX-memset-by-removing-unroll3.patch @@ -0,0 +1,51 @@ +From e69d9981f858a38e19304e6ff5ebdf89f2cb0ba0 Mon Sep 17 00:00:00 2001 +From: Wilco Dijkstra +Date: Tue, 10 Aug 2021 13:44:27 +0100 +Subject: [PATCH] [4/5] AArch64: Improve A64FX memset by removing unroll32 + +Remove unroll32 code since it doesn't improve performance. + +Reviewed-by: Naohiro Tamura +--- + sysdeps/aarch64/multiarch/memset_a64fx.S | 18 +----------------- + 1 file changed, 1 insertion(+), 17 deletions(-) + +diff --git a/sysdeps/aarch64/multiarch/memset_a64fx.S b/sysdeps/aarch64/multiarch/memset_a64fx.S +index 337c86b..ef03156 100644 +--- a/sysdeps/aarch64/multiarch/memset_a64fx.S ++++ b/sysdeps/aarch64/multiarch/memset_a64fx.S +@@ -102,22 +102,6 @@ L(vl_agnostic): // VL Agnostic + ccmp vector_length, tmp1, 0, cs + b.eq L(L1_prefetch) + +-L(unroll32): +- lsl tmp1, vector_length, 3 // vector_length * 8 +- lsl tmp2, vector_length, 5 // vector_length * 32 +- .p2align 3 +-1: cmp rest, tmp2 +- b.cc L(unroll8) +- st1b_unroll +- add dst, dst, tmp1 +- st1b_unroll +- add dst, dst, tmp1 +- st1b_unroll +- add dst, dst, tmp1 +- st1b_unroll +- add dst, dst, tmp1 +- sub rest, rest, tmp2 +- b 1b + + L(unroll8): + lsl tmp1, vector_length, 3 +@@ -155,7 +139,7 @@ L(L1_prefetch): // if rest >= L1_SIZE + sub rest, rest, CACHE_LINE_SIZE * 2 + cmp rest, L1_SIZE + b.ge 1b +- cbnz rest, L(unroll32) ++ cbnz rest, L(unroll8) + ret + + // count >= L2_SIZE +-- +1.8.3.1 + -- cgit v1.2.3