Diffstat (limited to 'LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch')
-rw-r--r-- | LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch | 946 |
1 file changed, 946 insertions, 0 deletions
diff --git a/LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch b/LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch
new file mode 100644
index 0000000..72c26d0
--- /dev/null
+++ b/LoongArch-Add-ifunc-support-for-memcmp-aligned-lsx-l.patch
@@ -0,0 +1,946 @@
+From 60f4bbd1eec528ba8df044ae6b3091f6337a7fcc Mon Sep 17 00:00:00 2001
+From: dengjianbo <dengjianbo@loongson.cn>
+Date: Mon, 28 Aug 2023 10:08:39 +0800
+Subject: [PATCH 18/29] LoongArch: Add ifunc support for memcmp{aligned, lsx,
+ lasx}
+
+According to glibc memcmp microbenchmark test results(Add generic
+memcmp), this implementation have performance improvement
+except the length is less than 3, details as below:
+
+Name             Percent of time reduced
+memcmp-lasx      16%-74%
+memcmp-lsx       20%-50%
+memcmp-aligned   5%-20%
+
+Signed-off-by: Peng Fan <fanpeng@loongson.cn>
+Signed-off-by: ticat_fp <fanpeng@loongson.cn>
+---
+ sysdeps/loongarch/lp64/multiarch/Makefile     |   3 +
+ .../lp64/multiarch/ifunc-impl-list.c          |   7 +
+ .../loongarch/lp64/multiarch/ifunc-memcmp.h   |  40 +++
+ .../loongarch/lp64/multiarch/memcmp-aligned.S | 292 ++++++++++++++++++
+ .../loongarch/lp64/multiarch/memcmp-lasx.S    | 207 +++++++++++++
+ sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S | 269 ++++++++++++++++
+ sysdeps/loongarch/lp64/multiarch/memcmp.c     |  43 +++
+ 7 files changed, 861 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
+ create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
+ create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S
+ create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S
+ create mode 100644 sysdeps/loongarch/lp64/multiarch/memcmp.c
+
+diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile
+index 216886c5..360a6718 100644
+--- a/sysdeps/loongarch/lp64/multiarch/Makefile
++++ b/sysdeps/loongarch/lp64/multiarch/Makefile
+@@ -34,5 +34,8 @@ sysdep_routines += \
+   memset-unaligned \
+   memset-lsx \
+   memset-lasx \
++  memcmp-aligned \
++  memcmp-lsx \
++  memcmp-lasx \
+ # sysdep_routines
+ endif
+diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+index 37f60dde..e397d58c 100644
+--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c
+@@ -127,5 +127,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+             IFUNC_IMPL_ADD (array, i, memset, 1, __memset_aligned)
+             )
+
++  IFUNC_IMPL (i, name, memcmp,
++#if !defined __loongarch_soft_float
++              IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LASX, __memcmp_lasx)
++              IFUNC_IMPL_ADD (array, i, memcmp, SUPPORT_LSX, __memcmp_lsx)
++#endif
++              IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_aligned)
++              )
+   return i;
+ }
+diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
+new file mode 100644
+index 00000000..04adc2e5
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/multiarch/ifunc-memcmp.h
+@@ -0,0 +1,40 @@
++/* Common definition for memcmp ifunc selections.
++   All versions must be listed in ifunc-impl-list.c.
++   Copyright (C) 2023 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <ldsodefs.h>
++#include <ifunc-init.h>
++
++#if !defined __loongarch_soft_float
++extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden;
++#endif
++extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden;
++
++static inline void *
++IFUNC_SELECTOR (void)
++{
++#if !defined __loongarch_soft_float
++  if (SUPPORT_LASX)
++    return OPTIMIZE (lasx);
++  else if (SUPPORT_LSX)
++    return OPTIMIZE (lsx);
++  else
++#endif
++    return OPTIMIZE (aligned);
++}
+diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
+new file mode 100644
+index 00000000..14a7caa9
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-aligned.S
+@@ -0,0 +1,292 @@
++/* Optimized memcmp implementation using basic LoongArch instructions.
++   Copyright (C) 2023 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library.  If not, see
++   <https://www.gnu.org/licenses/>.
*/ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) ++# define MEMCMP_NAME __memcmp_aligned ++#else ++# define MEMCMP_NAME memcmp ++#endif ++ ++LEAF(MEMCMP_NAME, 6) ++ beqz a2, L(ret) ++ andi a4, a1, 0x7 ++ andi a3, a0, 0x7 ++ sltu a5, a4, a3 ++ ++ xor t0, a0, a1 ++ li.w t8, 8 ++ maskeqz t0, t0, a5 ++ li.w t7, -1 ++ ++ xor a0, a0, t0 ++ xor a1, a1, t0 ++ andi a3, a0, 0x7 ++ andi a4, a1, 0x7 ++ ++ xor a0, a0, a3 ++ xor a1, a1, a4 ++ ld.d t2, a0, 0 ++ ld.d t1, a1, 0 ++ ++ slli.d t3, a3, 3 ++ slli.d t4, a4, 3 ++ sub.d a6, t3, t4 ++ srl.d t1, t1, t4 ++ ++ srl.d t0, t2, t3 ++ srl.d t5, t7, t4 ++ sub.d t6, t0, t1 ++ and t6, t6, t5 ++ ++ sub.d t5, t8, a4 ++ bnez t6, L(first_out) ++ bgeu t5, a2, L(ret) ++ sub.d a2, a2, t5 ++ ++ bnez a6, L(unaligned) ++ blt a2, t8, L(al_less_8bytes) ++ andi t1, a2, 31 ++ beq t1, a2, L(al_less_32bytes) ++ ++ sub.d t2, a2, t1 ++ add.d a4, a0, t2 ++ move a2, t1 ++ ++L(al_loop): ++ ld.d t0, a0, 8 ++ ++ ld.d t1, a1, 8 ++ ld.d t2, a0, 16 ++ ld.d t3, a1, 16 ++ ld.d t4, a0, 24 ++ ++ ld.d t5, a1, 24 ++ ld.d t6, a0, 32 ++ ld.d t7, a1, 32 ++ addi.d a0, a0, 32 ++ ++ addi.d a1, a1, 32 ++ bne t0, t1, L(out1) ++ bne t2, t3, L(out2) ++ bne t4, t5, L(out3) ++ ++ bne t6, t7, L(out4) ++ bne a0, a4, L(al_loop) ++ ++L(al_less_32bytes): ++ srai.d a4, a2, 4 ++ beqz a4, L(al_less_16bytes) ++ ++ ld.d t0, a0, 8 ++ ld.d t1, a1, 8 ++ ld.d t2, a0, 16 ++ ld.d t3, a1, 16 ++ ++ addi.d a0, a0, 16 ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ bne t0, t1, L(out1) ++ ++ bne t2, t3, L(out2) ++ ++L(al_less_16bytes): ++ srai.d a4, a2, 3 ++ beqz a4, L(al_less_8bytes) ++ ld.d t0, a0, 8 ++ ++ ld.d t1, a1, 8 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ bne t0, t1, L(out1) ++ ++L(al_less_8bytes): ++ beqz a2, L(ret) ++ ld.d t0, a0, 8 ++ ld.d t1, a1, 8 ++ ++ li.d t7, -1 ++ slli.d t2, a2, 3 ++ sll.d t2, t7, t2 ++ sub.d t3, t0, t1 ++ ++ andn t6, t3, t2 ++ bnez t6, L(count_diff) ++ ++L(ret): ++ move a0, zero ++ jr ra ++ ++L(out4): ++ move t0, t6 ++ move t1, t7 ++ sub.d t6, t6, t7 ++ b L(count_diff) ++ ++L(out3): ++ move t0, t4 ++ move t1, t5 ++ sub.d t6, t4, t5 ++ b L(count_diff) ++ ++L(out2): ++ move t0, t2 ++ move t1, t3 ++L(out1): ++ sub.d t6, t0, t1 ++ b L(count_diff) ++ ++L(first_out): ++ slli.d t4, a2, 3 ++ slt t3, a2, t5 ++ sll.d t4, t7, t4 ++ maskeqz t4, t4, t3 ++ ++ andn t6, t6, t4 ++ ++L(count_diff): ++ ctz.d t2, t6 ++ bstrins.d t2, zero, 2, 0 ++ srl.d t0, t0, t2 ++ ++ srl.d t1, t1, t2 ++ andi t0, t0, 0xff ++ andi t1, t1, 0xff ++ sub.d t2, t0, t1 ++ ++ sub.d t3, t1, t0 ++ masknez t2, t2, a5 ++ maskeqz t3, t3, a5 ++ or a0, t2, t3 ++ ++ jr ra ++ ++L(unaligned): ++ sub.d a7, zero, a6 ++ srl.d t0, t2, a6 ++ blt a2, t8, L(un_less_8bytes) ++ ++ andi t1, a2, 31 ++ beq t1, a2, L(un_less_32bytes) ++ sub.d t2, a2, t1 ++ add.d a4, a0, t2 ++ ++ move a2, t1 ++ ++L(un_loop): ++ ld.d t2, a0, 8 ++ ld.d t1, a1, 8 ++ ld.d t4, a0, 16 ++ ++ ld.d t3, a1, 16 ++ ld.d t6, a0, 24 ++ ld.d t5, a1, 24 ++ ld.d t8, a0, 32 ++ ++ ld.d t7, a1, 32 ++ addi.d a0, a0, 32 ++ addi.d a1, a1, 32 ++ sll.d a3, t2, a7 ++ ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ srl.d t0, t2, a6 ++ sll.d a3, t4, a7 ++ ++ or t2, a3, t0 ++ bne t2, t3, L(out2) ++ srl.d t0, t4, a6 ++ sll.d a3, t6, a7 ++ ++ or t4, a3, t0 ++ bne t4, t5, L(out3) ++ srl.d t0, t6, a6 ++ sll.d a3, t8, a7 ++ ++ or t6, t0, a3 ++ bne t6, t7, L(out4) ++ srl.d t0, t8, a6 ++ bne a0, a4, L(un_loop) ++ ++L(un_less_32bytes): ++ srai.d a4, a2, 4 ++ beqz a4, L(un_less_16bytes) ++ ld.d t2, a0, 8 ++ ld.d t1, a1, 8 ++ ++ ld.d t4, a0, 
16 ++ ld.d t3, a1, 16 ++ addi.d a0, a0, 16 ++ addi.d a1, a1, 16 ++ ++ addi.d a2, a2, -16 ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ ++ srl.d t0, t2, a6 ++ sll.d a3, t4, a7 ++ or t2, a3, t0 ++ bne t2, t3, L(out2) ++ ++ srl.d t0, t4, a6 ++ ++L(un_less_16bytes): ++ srai.d a4, a2, 3 ++ beqz a4, L(un_less_8bytes) ++ ld.d t2, a0, 8 ++ ++ ld.d t1, a1, 8 ++ addi.d a0, a0, 8 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ bne t0, t1, L(out1) ++ srl.d t0, t2, a6 ++ ++L(un_less_8bytes): ++ beqz a2, L(ret) ++ andi a7, a7, 63 ++ slli.d a4, a2, 3 ++ bgeu a7, a4, L(last_cmp) ++ ++ ld.d t2, a0, 8 ++ sll.d a3, t2, a7 ++ or t0, a3, t0 ++ ++L(last_cmp): ++ ld.d t1, a1, 8 ++ ++ li.d t7, -1 ++ sll.d t2, t7, a4 ++ sub.d t3, t0, t1 ++ andn t6, t3, t2 ++ ++ bnez t6, L(count_diff) ++ move a0, zero ++ jr ra ++END(MEMCMP_NAME) ++ ++libc_hidden_builtin_def (MEMCMP_NAME) +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +new file mode 100644 +index 00000000..3151a179 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lasx.S +@@ -0,0 +1,207 @@ ++/* Optimized memcmp implementation using LoongArch LASX instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) && !defined __loongarch_soft_float ++ ++# define MEMCMP __memcmp_lasx ++ ++LEAF(MEMCMP, 6) ++ li.d t2, 32 ++ add.d a3, a0, a2 ++ add.d a4, a1, a2 ++ bgeu t2, a2, L(less32) ++ ++ li.d t1, 160 ++ bgeu a2, t1, L(make_aligned) ++L(loop32): ++ xvld xr0, a0, 0 ++ xvld xr1, a1, 0 ++ ++ addi.d a0, a0, 32 ++ addi.d a1, a1, 32 ++ addi.d a2, a2, -32 ++ xvseq.b xr2, xr0, xr1 ++ ++ xvsetanyeqz.b fcc0, xr2 ++ bcnez fcc0, L(end) ++L(last_bytes): ++ bltu t2, a2, L(loop32) ++ xvld xr0, a3, -32 ++ ++ ++ xvld xr1, a4, -32 ++ xvseq.b xr2, xr0, xr1 ++L(end): ++ xvmsknz.b xr2, xr2 ++ xvpermi.q xr4, xr0, 1 ++ ++ xvpickve.w xr3, xr2, 4 ++ xvpermi.q xr5, xr1, 1 ++ vilvl.h vr2, vr3, vr2 ++ movfr2gr.s t0, fa2 ++ ++ cto.w t0, t0 ++ vreplgr2vr.b vr2, t0 ++ vshuf.b vr0, vr4, vr0, vr2 ++ vshuf.b vr1, vr5, vr1, vr2 ++ ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 ++ sub.d a0, t0, t1 ++ jr ra ++ ++ ++L(less32): ++ srli.d t0, a2, 4 ++ beqz t0, L(less16) ++ vld vr0, a0, 0 ++ vld vr1, a1, 0 ++ ++ vld vr2, a3, -16 ++ vld vr3, a4, -16 ++L(short_ret): ++ vseq.b vr4, vr0, vr1 ++ vseq.b vr5, vr2, vr3 ++ ++ vmsknz.b vr4, vr4 ++ vmsknz.b vr5, vr5 ++ vilvl.h vr4, vr5, vr4 ++ movfr2gr.s t0, fa4 ++ ++ cto.w t0, t0 ++ vreplgr2vr.b vr4, t0 ++ vshuf.b vr0, vr2, vr0, vr4 ++ vshuf.b vr1, vr3, vr1, vr4 ++ ++ ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 ++ sub.d a0, t0, t1 ++ jr ra ++ ++L(less16): ++ srli.d t0, a2, 3 ++ beqz t0, L(less8) ++ vldrepl.d vr0, a0, 0 ++ vldrepl.d vr1, a1, 0 ++ ++ vldrepl.d vr2, a3, -8 ++ vldrepl.d vr3, a4, -8 ++ b L(short_ret) ++ nop ++ ++L(less8): ++ srli.d t0, a2, 2 ++ beqz t0, L(less4) ++ vldrepl.w vr0, a0, 0 ++ vldrepl.w vr1, a1, 0 ++ ++ ++ vldrepl.w vr2, a3, -4 ++ vldrepl.w vr3, a4, -4 ++ b L(short_ret) ++ nop ++ ++L(less4): ++ srli.d t0, a2, 1 ++ beqz t0, L(less2) ++ vldrepl.h vr0, a0, 0 ++ vldrepl.h vr1, a1, 0 ++ ++ vldrepl.h vr2, a3, -2 ++ vldrepl.h vr3, a4, -2 ++ b L(short_ret) ++ nop ++ ++L(less2): ++ beqz a2, L(ret0) ++ ld.bu t0, a0, 0 ++ ld.bu t1, a1, 0 ++ sub.d a0, t0, t1 ++ ++ jr ra ++L(ret0): ++ move a0, zero ++ jr ra ++ ++L(make_aligned): ++ xvld xr0, a0, 0 ++ ++ xvld xr1, a1, 0 ++ xvseq.b xr2, xr0, xr1 ++ xvsetanyeqz.b fcc0, xr2 ++ bcnez fcc0, L(end) ++ ++ andi t0, a0, 0x1f ++ sub.d t0, t2, t0 ++ sub.d t1, a2, t0 ++ add.d a0, a0, t0 ++ ++ add.d a1, a1, t0 ++ andi a2, t1, 0x3f ++ sub.d t0, t1, a2 ++ add.d a5, a0, t0 ++ ++ ++L(loop_align): ++ xvld xr0, a0, 0 ++ xvld xr1, a1, 0 ++ xvld xr2, a0, 32 ++ xvld xr3, a1, 32 ++ ++ xvseq.b xr0, xr0, xr1 ++ xvseq.b xr1, xr2, xr3 ++ xvmin.bu xr2, xr1, xr0 ++ xvsetanyeqz.b fcc0, xr2 ++ ++ bcnez fcc0, L(pair_end) ++ addi.d a0, a0, 64 ++ addi.d a1, a1, 64 ++ bne a0, a5, L(loop_align) ++ ++ bnez a2, L(last_bytes) ++ move a0, zero ++ jr ra ++ nop ++ ++ ++L(pair_end): ++ xvmsknz.b xr0, xr0 ++ xvmsknz.b xr1, xr1 ++ xvpickve.w xr2, xr0, 4 ++ xvpickve.w xr3, xr1, 4 ++ ++ vilvl.h vr0, vr2, vr0 ++ vilvl.h vr1, vr3, vr1 ++ vilvl.w vr0, vr1, vr0 ++ movfr2gr.d t0, fa0 ++ ++ cto.d t0, t0 ++ ldx.bu t1, a0, t0 ++ ldx.bu t2, a1, t0 ++ sub.d a0, t1, t2 ++ ++ jr ra ++END(MEMCMP) ++ ++libc_hidden_builtin_def (MEMCMP) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +new file mode 100644 +index 00000000..38a50a4c +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp-lsx.S +@@ -0,0 +1,269 @@ ++/* Optimized memcmp implementation using LoongArch LSX instructions. 
++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) && !defined __loongarch_soft_float ++ ++#define MEMCMP __memcmp_lsx ++ ++LEAF(MEMCMP, 6) ++ beqz a2, L(out) ++ pcalau12i t0, %pc_hi20(L(INDEX)) ++ andi a3, a0, 0xf ++ vld vr5, t0, %pc_lo12(L(INDEX)) ++ ++ andi a4, a1, 0xf ++ bne a3, a4, L(unaligned) ++ bstrins.d a0, zero, 3, 0 ++ xor a1, a1, a4 ++ ++ vld vr0, a0, 0 ++ vld vr1, a1, 0 ++ li.d t0, 16 ++ vreplgr2vr.b vr3, a3 ++ ++ sub.d t1, t0, a3 ++ vadd.b vr3, vr3, vr5 ++ vshuf.b vr0, vr3, vr0, vr3 ++ vshuf.b vr1, vr3, vr1, vr3 ++ ++ ++ vseq.b vr4, vr0, vr1 ++ bgeu t1, a2, L(al_end) ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, L(al_found) ++ ++ sub.d t1, a2, t1 ++ andi a2, t1, 31 ++ beq a2, t1, L(al_less_32bytes) ++ sub.d t2, t1, a2 ++ ++ add.d a4, a0, t2 ++L(al_loop): ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 ++ vld vr2, a0, 32 ++ ++ vld vr3, a1, 32 ++ addi.d a0, a0, 32 ++ addi.d a1, a1, 32 ++ vseq.b vr4, vr0, vr1 ++ ++ ++ vseq.b vr6, vr2, vr3 ++ vand.v vr6, vr4, vr6 ++ vsetanyeqz.b fcc0, vr6 ++ bcnez fcc0, L(al_pair_end) ++ ++ bne a0, a4, L(al_loop) ++L(al_less_32bytes): ++ bgeu t0, a2, L(al_less_16bytes) ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 ++ ++ vld vr2, a0, 32 ++ vld vr3, a1, 32 ++ addi.d a2, a2, -16 ++ vreplgr2vr.b vr6, a2 ++ ++ vslt.b vr5, vr5, vr6 ++ vseq.b vr4, vr0, vr1 ++ vseq.b vr6, vr2, vr3 ++ vorn.v vr6, vr6, vr5 ++ ++ ++L(al_pair_end): ++ vsetanyeqz.b fcc0, vr4 ++ bcnez fcc0, L(al_found) ++ vnori.b vr4, vr6, 0 ++ vfrstpi.b vr4, vr4, 0 ++ ++ vshuf.b vr0, vr2, vr2, vr4 ++ vshuf.b vr1, vr3, vr3, vr4 ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 ++ ++ sub.d a0, t0, t1 ++ jr ra ++ nop ++ nop ++ ++L(al_less_16bytes): ++ beqz a2, L(out) ++ vld vr0, a0, 16 ++ vld vr1, a1, 16 ++ vseq.b vr4, vr0, vr1 ++ ++ ++L(al_end): ++ vreplgr2vr.b vr6, a2 ++ vslt.b vr5, vr5, vr6 ++ vorn.v vr4, vr4, vr5 ++ nop ++ ++L(al_found): ++ vnori.b vr4, vr4, 0 ++ vfrstpi.b vr4, vr4, 0 ++ vshuf.b vr0, vr0, vr0, vr4 ++ vshuf.b vr1, vr1, vr1, vr4 ++ ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 ++ sub.d a0, t0, t1 ++ jr ra ++ ++L(out): ++ move a0, zero ++ jr ra ++ nop ++ nop ++ ++ ++L(unaligned): ++ xor t2, a0, a1 ++ sltu a5, a3, a4 ++ masknez t2, t2, a5 ++ xor a0, a0, t2 ++ ++ xor a1, a1, t2 ++ andi a3, a0, 0xf ++ andi a4, a1, 0xf ++ bstrins.d a0, zero, 3, 0 ++ ++ xor a1, a1, a4 ++ vld vr4, a0, 0 ++ vld vr1, a1, 0 ++ li.d t0, 16 ++ ++ vreplgr2vr.b vr2, a4 ++ sub.d a6, a4, a3 ++ sub.d t1, t0, a4 ++ sub.d t2, t0, a6 ++ ++ ++ vadd.b vr2, vr2, vr5 ++ vreplgr2vr.b vr6, t2 ++ vadd.b vr6, vr6, vr5 ++ vshuf.b vr0, vr4, vr4, vr6 ++ ++ vshuf.b vr1, vr2, vr1, vr2 ++ vshuf.b vr0, vr2, vr0, vr2 ++ vseq.b vr7, vr0, vr1 ++ bgeu t1, a2, L(un_end) ++ ++ vsetanyeqz.b fcc0, vr7 ++ bcnez fcc0, 
L(un_found) ++ sub.d a2, a2, t1 ++ andi t1, a2, 31 ++ ++ beq a2, t1, L(un_less_32bytes) ++ sub.d t2, a2, t1 ++ move a2, t1 ++ add.d a4, a1, t2 ++ ++ ++L(un_loop): ++ vld vr2, a0, 16 ++ vld vr1, a1, 16 ++ vld vr3, a1, 32 ++ addi.d a1, a1, 32 ++ ++ addi.d a0, a0, 32 ++ vshuf.b vr0, vr2, vr4, vr6 ++ vld vr4, a0, 0 ++ vseq.b vr7, vr0, vr1 ++ ++ vshuf.b vr2, vr4, vr2, vr6 ++ vseq.b vr8, vr2, vr3 ++ vand.v vr8, vr7, vr8 ++ vsetanyeqz.b fcc0, vr8 ++ ++ bcnez fcc0, L(un_pair_end) ++ bne a1, a4, L(un_loop) ++ ++L(un_less_32bytes): ++ bltu a2, t0, L(un_less_16bytes) ++ vld vr2, a0, 16 ++ vld vr1, a1, 16 ++ addi.d a0, a0, 16 ++ ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ vshuf.b vr0, vr2, vr4, vr6 ++ vor.v vr4, vr2, vr2 ++ ++ vseq.b vr7, vr0, vr1 ++ vsetanyeqz.b fcc0, vr7 ++ bcnez fcc0, L(un_found) ++L(un_less_16bytes): ++ beqz a2, L(out) ++ vld vr1, a1, 16 ++ bgeu a6, a2, 1f ++ ++ vld vr2, a0, 16 ++1: ++ vshuf.b vr0, vr2, vr4, vr6 ++ vseq.b vr7, vr0, vr1 ++L(un_end): ++ vreplgr2vr.b vr3, a2 ++ ++ ++ vslt.b vr3, vr5, vr3 ++ vorn.v vr7, vr7, vr3 ++ ++L(un_found): ++ vnori.b vr7, vr7, 0 ++ vfrstpi.b vr7, vr7, 0 ++ ++ vshuf.b vr0, vr0, vr0, vr7 ++ vshuf.b vr1, vr1, vr1, vr7 ++L(calc_result): ++ vpickve2gr.bu t0, vr0, 0 ++ vpickve2gr.bu t1, vr1, 0 ++ ++ sub.d t2, t0, t1 ++ sub.d t3, t1, t0 ++ masknez t0, t3, a5 ++ maskeqz t1, t2, a5 ++ ++ or a0, t0, t1 ++ jr ra ++L(un_pair_end): ++ vsetanyeqz.b fcc0, vr7 ++ bcnez fcc0, L(un_found) ++ ++ ++ vnori.b vr7, vr8, 0 ++ vfrstpi.b vr7, vr7, 0 ++ vshuf.b vr0, vr2, vr2, vr7 ++ vshuf.b vr1, vr3, vr3, vr7 ++ ++ b L(calc_result) ++END(MEMCMP) ++ ++ .section .rodata.cst16,"M",@progbits,16 ++ .align 4 ++L(INDEX): ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 ++ ++libc_hidden_builtin_def (MEMCMP) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memcmp.c b/sysdeps/loongarch/lp64/multiarch/memcmp.c +new file mode 100644 +index 00000000..32eccac2 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcmp.c +@@ -0,0 +1,43 @@ ++/* Multiple versions of memcmp. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define memcmp __redirect_memcmp ++# include <string.h> ++# undef memcmp ++ ++# define SYMBOL_NAME memcmp ++# include "ifunc-memcmp.h" ++ ++libc_ifunc_redirected (__redirect_memcmp, memcmp, ++ IFUNC_SELECTOR ()); ++# undef bcmp ++weak_alias (memcmp, bcmp) ++ ++# undef __memcmpeq ++strong_alias (memcmp, __memcmpeq) ++libc_hidden_def (__memcmpeq) ++ ++# ifdef SHARED ++__hidden_ver1 (memcmp, __GI_memcmp, __redirect_memcmp) ++ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcmp); ++# endif ++ ++#endif +-- +2.33.0 + |
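
The patch above routes memcmp through glibc's GNU ifunc machinery: ifunc-memcmp.h picks __memcmp_lasx, __memcmp_lsx or __memcmp_aligned from the hwcap bits once, when the symbol is first bound. Below is a minimal, self-contained sketch of that mechanism, not taken from the patch; the names my_memcmp, memcmp_generic and resolve_my_memcmp are invented, and GCC/binutils ifunc support on an ELF target is assumed.

/* Hypothetical sketch of a GNU indirect function (ifunc), analogous in
   spirit to IFUNC_SELECTOR in ifunc-memcmp.h above.  */
#include <stddef.h>

/* Baseline implementation; the real patch also has LSX/LASX variants.  */
static int
memcmp_generic (const void *a, const void *b, size_t n)
{
  const unsigned char *x = a, *y = b;
  for (size_t i = 0; i < n; i++)
    if (x[i] != y[i])
      return x[i] < y[i] ? -1 : 1;
  return 0;
}

/* The resolver runs once, when my_memcmp is first bound.  The real
   selector tests SUPPORT_LASX / SUPPORT_LSX (hwcap bits); this sketch
   always falls back to the generic routine.  */
static int (*resolve_my_memcmp (void)) (const void *, const void *, size_t)
{
  return memcmp_generic;
}

int my_memcmp (const void *a, const void *b, size_t n)
  __attribute__ ((ifunc ("resolve_my_memcmp")));

int
main (void)
{
  /* The call below is dispatched through whatever the resolver returned.  */
  return my_memcmp ("abc", "abd", 3) < 0 ? 0 : 1;
}

Building this with gcc -O2 and inspecting the result with readelf -r typically shows an IRELATIVE relocation for my_memcmp, which is how the loader knows to run the resolver at startup.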
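Also not part of the patch: a small harness one might run after applying it, to check whichever memcmp implementation the ifunc selected against a byte-wise reference over assorted lengths and mis-alignments. Only the sign of the result is compared, which is all that memcmp guarantees.

#include <assert.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Byte-wise reference with the same result convention as memcmp.  */
static int
ref_memcmp (const void *a, const void *b, size_t n)
{
  const unsigned char *x = a, *y = b;
  for (size_t i = 0; i < n; i++)
    if (x[i] != y[i])
      return x[i] < y[i] ? -1 : 1;
  return 0;
}

static int
sign (int v)
{
  return (v > 0) - (v < 0);
}

int
main (void)
{
  enum { BUF = 512 };
  static unsigned char a[BUF + 64], b[BUF + 64];

  srand (1);
  for (size_t n = 0; n <= BUF; n++)
    for (size_t off_a = 0; off_a < 8; off_a++)
      for (size_t off_b = 0; off_b < 8; off_b++)
        {
          for (size_t i = 0; i < n; i++)
            a[off_a + i] = b[off_b + i] = (unsigned char) rand ();
          if (n && rand () % 2)   /* sometimes force a difference */
            b[off_b + rand () % n] ^= 0xa5;
          assert (sign (memcmp (a + off_a, b + off_b, n))
                  == sign (ref_memcmp (a + off_a, b + off_b, n)));
        }
  puts ("memcmp matches byte-wise reference");
  return 0;
}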