Diffstat (limited to 'Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch')
-rw-r--r-- | Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch | 2570 |
1 file changed, 2570 insertions, 0 deletions
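The patch below adds aligned, unaligned, LSX and LASX variants of memcpy and memmove plus an ifunc selector that picks one of them at load time from the CPU features the kernel reports. As a rough illustration of what the SUPPORT_UAL / SUPPORT_LSX / SUPPORT_LASX checks in the new ifunc-lasx.h boil down to, here is a minimal standalone sketch that reads the LoongArch hwcap bits with getauxval(); the HWCAP_LOONGARCH_* bit positions are assumed from the Linux uapi hwcap layout, not defined by this patch.

/* Minimal sketch: the run-time feature tests behind the ifunc selection.
   The bit positions below follow the Linux uapi hwcap layout and are an
   assumption here; verify against your kernel's <asm/hwcap.h>.  */
#include <stdio.h>
#include <sys/auxv.h>

#ifndef HWCAP_LOONGARCH_UAL
# define HWCAP_LOONGARCH_UAL  (1 << 2)   /* hardware unaligned access */
# define HWCAP_LOONGARCH_LSX  (1 << 4)   /* 128-bit SIMD */
# define HWCAP_LOONGARCH_LASX (1 << 5)   /* 256-bit SIMD */
#endif

int
main (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);

  /* Same preference order as IFUNC_SELECTOR in ifunc-lasx.h below:
     LASX, then LSX, then the unaligned copy, else the aligned copy.  */
  if (hwcap & HWCAP_LOONGARCH_LASX)
    puts ("__memcpy_lasx / __memmove_lasx");
  else if (hwcap & HWCAP_LOONGARCH_LSX)
    puts ("__memcpy_lsx / __memmove_lsx");
  else if (hwcap & HWCAP_LOONGARCH_UAL)
    puts ("__memcpy_unaligned / __memmove_unaligned");
  else
    puts ("__memcpy_aligned / __memmove_aligned");
  return 0;
}

A soft-float build never considers the LASX/LSX branches, which matches the !defined __loongarch_soft_float guards used throughout the patch. The raw patch follows.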
diff --git a/Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch b/Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch new file mode 100644 index 0000000..2bbf367 --- /dev/null +++ b/Loongarch-Add-ifunc-support-for-memcpy-aligned-unali.patch @@ -0,0 +1,2570 @@ +From 9c522272146423c1ef9fb9e071737a8ad26e844e Mon Sep 17 00:00:00 2001 +From: dengjianbo <dengjianbo@loongson.cn> +Date: Tue, 15 Aug 2023 09:11:53 +0800 +Subject: [PATCH 07/29] Loongarch: Add ifunc support for memcpy{aligned, + unaligned, lsx, lasx} and memmove{aligned, unaligned, lsx, lasx} + +These implementations improve the time to copy data in the glibc +microbenchmark as below: +memcpy-lasx reduces the runtime about 8%-76% +memcpy-lsx reduces the runtime about 8%-72% +memcpy-unaligned reduces the runtime of unaligned data copying up to 40% +memcpy-aligned reduece the runtime of unaligned data copying up to 25% +memmove-lasx reduces the runtime about 20%-73% +memmove-lsx reduces the runtime about 50% +memmove-unaligned reduces the runtime of unaligned data moving up to 40% +memmove-aligned reduces the runtime of unaligned data moving up to 25% + +Signed-off-by: Peng Fan <fanpeng@loongson.cn> +Signed-off-by: ticat_fp <fanpeng@loongson.cn> +--- + sysdeps/loongarch/lp64/multiarch/Makefile | 5 + + .../lp64/multiarch/ifunc-impl-list.c | 19 + + sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h | 45 + + .../loongarch/lp64/multiarch/memcpy-aligned.S | 783 ++++++++++++++++++ + .../loongarch/lp64/multiarch/memcpy-lasx.S | 20 + + sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S | 20 + + .../lp64/multiarch/memcpy-unaligned.S | 247 ++++++ + sysdeps/loongarch/lp64/multiarch/memcpy.c | 37 + + .../lp64/multiarch/memmove-aligned.S | 20 + + .../loongarch/lp64/multiarch/memmove-lasx.S | 287 +++++++ + .../loongarch/lp64/multiarch/memmove-lsx.S | 534 ++++++++++++ + .../lp64/multiarch/memmove-unaligned.S | 380 +++++++++ + sysdeps/loongarch/lp64/multiarch/memmove.c | 38 + + 13 files changed, 2435 insertions(+) + create mode 100644 sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h + create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memcpy.c + create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-aligned.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-lasx.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-lsx.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S + create mode 100644 sysdeps/loongarch/lp64/multiarch/memmove.c + +diff --git a/sysdeps/loongarch/lp64/multiarch/Makefile b/sysdeps/loongarch/lp64/multiarch/Makefile +index 110a8c5c..afa51041 100644 +--- a/sysdeps/loongarch/lp64/multiarch/Makefile ++++ b/sysdeps/loongarch/lp64/multiarch/Makefile +@@ -9,5 +9,10 @@ sysdep_routines += \ + strchrnul-aligned \ + strchrnul-lsx \ + strchrnul-lasx \ ++ memcpy-aligned \ ++ memcpy-unaligned \ ++ memmove-unaligned \ ++ memmove-lsx \ ++ memmove-lasx \ + # sysdep_routines + endif +diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c +index c7164b45..25eb96b0 100644 +--- a/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/loongarch/lp64/multiarch/ifunc-impl-list.c +@@ -53,5 +53,24 @@ __libc_ifunc_impl_list (const char *name, struct 
libc_ifunc_impl *array, + #endif + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_aligned) + ) ++ ++ IFUNC_IMPL (i, name, memcpy, ++#if !defined __loongarch_soft_float ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LASX, __memcpy_lasx) ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_LSX, __memcpy_lsx) ++#endif ++ IFUNC_IMPL_ADD (array, i, memcpy, SUPPORT_UAL, __memcpy_unaligned) ++ IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_aligned) ++ ) ++ ++ IFUNC_IMPL (i, name, memmove, ++#if !defined __loongarch_soft_float ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LASX, __memmove_lasx) ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_LSX, __memmove_lsx) ++#endif ++ IFUNC_IMPL_ADD (array, i, memmove, SUPPORT_UAL, __memmove_unaligned) ++ IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_aligned) ++ ) ++ + return i; + } +diff --git a/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h +new file mode 100644 +index 00000000..3be67da6 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/ifunc-lasx.h +@@ -0,0 +1,45 @@ ++/* Common definition for ifunc selection implementation. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#include <ldsodefs.h> ++#include <ifunc-init.h> ++ ++#if !defined __loongarch_soft_float ++extern __typeof (REDIRECT_NAME) OPTIMIZE (lasx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (lsx) attribute_hidden; ++#endif ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (aligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (unaligned) attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++#if !defined __loongarch_soft_float ++ if (SUPPORT_LASX) ++ return OPTIMIZE (lasx); ++ else if (SUPPORT_LSX) ++ return OPTIMIZE (lsx); ++ else ++#endif ++ if (SUPPORT_UAL) ++ return OPTIMIZE (unaligned); ++ else ++ return OPTIMIZE (aligned); ++} +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S +new file mode 100644 +index 00000000..299dd49c +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-aligned.S +@@ -0,0 +1,783 @@ ++/* Optimized memcpy_aligned implementation using basic Loongarch instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) ++# define MEMCPY_NAME __memcpy_aligned ++# define MEMMOVE_NAME __memmove_aligned ++#else ++# define MEMCPY_NAME memcpy ++# define MEMMOVE_NAME memmove ++#endif ++ ++#define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n + 8; \ ++ ld.d t2, reg, n + 16; \ ++ ld.d t3, reg, n + 24; \ ++ ld.d t4, reg, n + 32; \ ++ ld.d t5, reg, n + 40; \ ++ ld.d t6, reg, n + 48; \ ++ ld.d t7, reg, n + 56; ++ ++#define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n + 8; \ ++ st.d t2, reg, n + 16; \ ++ st.d t3, reg, n + 24; \ ++ st.d t4, reg, n + 32; \ ++ st.d t5, reg, n + 40; \ ++ st.d t6, reg, n + 48; \ ++ st.d t7, reg, n + 56; ++ ++LEAF(MEMMOVE_NAME, 6) ++ sub.d t0, a0, a1 ++ bltu t0, a2, L(copy_back) ++END(MEMMOVE_NAME) ++ ++LEAF_NO_ALIGN(MEMCPY_NAME) ++ srai.d a3, a2, 4 ++ beqz a3, L(short_data) ++ ++ move a4, a0 ++ andi a5, a0, 0x7 ++ andi a6, a1, 0x7 ++ li.d t8, 8 ++ beqz a5, L(check_align) ++ ++ sub.d t2, t8, a5 ++ sub.d a2, a2, t2 ++ pcaddi t1, 20 ++ slli.d t3, t2, 3 ++ ++ add.d a1, a1, t2 ++ sub.d t1, t1, t3 ++ add.d a4, a4, t2 ++ jr t1 ++ ++L(al7): ++ ld.b t0, a1, -7 ++ st.b t0, a4, -7 ++L(al6): ++ ld.b t0, a1, -6 ++ st.b t0, a4, -6 ++L(al5): ++ ld.b t0, a1, -5 ++ st.b t0, a4, -5 ++L(al4): ++ ld.b t0, a1, -4 ++ st.b t0, a4, -4 ++L(al3): ++ ld.b t0, a1, -3 ++ st.b t0, a4, -3 ++L(al2): ++ ld.b t0, a1, -2 ++ st.b t0, a4, -2 ++L(al1): ++ ld.b t0, a1, -1 ++ st.b t0, a4, -1 ++ ++L(check_align): ++ bne a5, a6, L(unalign) ++ srai.d a3, a2, 4 ++ beqz a3, L(al_less_16bytes) ++ andi a3, a2, 0x3f ++ ++ beq a3, a2, L(al_less_64bytes) ++ sub.d t0, a2, a3 ++ move a2, a3 ++ add.d a5, a1, t0 ++ ++L(loop_64bytes): ++ LD_64(a1, 0) ++ addi.d a1, a1, 64 ++ ST_64(a4, 0) ++ ++ addi.d a4, a4, 64 ++ bne a1, a5, L(loop_64bytes) ++ ++L(al_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(al_less_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ addi.d a1, a1, 32 ++ addi.d a2, a2, -32 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a4, a4, 32 ++ ++L(al_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(al_less_16bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ addi.d a4, a4, 16 ++ ++L(al_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(al_less_8bytes) ++ ++ ld.d t0, a1, 0 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ st.d t0, a4, 0 ++ addi.d a4, a4, 8 ++ ++L(al_less_8bytes): ++ srai.d a3, a2, 2 ++ beqz a3, L(al_less_4bytes) ++ ++ ld.w t0, a1, 0 ++ addi.d a1, a1, 4 ++ addi.d a2, a2, -4 ++ st.w t0, a4, 0 ++ addi.d a4, a4, 4 ++ ++L(al_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(al_less_2bytes) ++ ++ ld.h t0, a1, 0 ++ addi.d a1, a1, 2 ++ addi.d a2, a2, -2 ++ st.h t0, a4, 0 ++ addi.d a4, a4, 2 ++ ++L(al_less_2bytes): ++ beqz a2, L(al_less_1byte) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a4, 0 ++ ++L(al_less_1byte): ++ jr ra ++ ++L(unalign): ++ andi a5, a1, 0x7 ++ bstrins.d a1, zero, 2, 0 ++ sub.d t8, t8, a5 ++ slli.d a5, a5, 3 ++ ++ ld.d t0, a1, 0 ++ addi.d a1, a1, 8 ++ slli.d a6, t8, 3 ++ srl.d a7, t0, a5 ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(un_less_16bytes) ++ andi a3, a2, 0x3f ++ beq a3, a2, L(un_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move 
a2, a3 ++ add.d a3, a1, t0 ++ ++L(un_long_bytes): ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ srl.d t4, t0, a5 ++ sll.d t0, t0, a6 ++ srl.d t5, t1, a5 ++ sll.d t1, t1, a6 ++ ++ srl.d t6, t2, a5 ++ sll.d t2, t2, a6 ++ srl.d t7, t3, a5 ++ sll.d t3, t3, a6 ++ ++ or t0, a7, t0 ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ ld.d t4, a1, 32 ++ ld.d t5, a1, 40 ++ ld.d t6, a1, 48 ++ ld.d a7, a1, 56 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a1, a1, 64 ++ ++ srl.d t0, t4, a5 ++ sll.d t4, t4, a6 ++ srl.d t1, t5, a5 ++ sll.d t5, t5, a6 ++ ++ srl.d t2, t6, a5 ++ sll.d t6, t6, a6 ++ sll.d t3, a7, a6 ++ srl.d a7, a7, a5 ++ ++ or t4, t7, t4 ++ or t5, t0, t5 ++ or t6, t1, t6 ++ or t3, t2, t3 ++ ++ st.d t4, a4, 32 ++ st.d t5, a4, 40 ++ st.d t6, a4, 48 ++ st.d t3, a4, 56 ++ ++ addi.d a4, a4, 64 ++ bne a3, a1, L(un_long_bytes) ++ ++L(un_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(un_less_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ addi.d a1, a1, 32 ++ addi.d a2, a2, -32 ++ ++ srl.d t4, t0, a5 ++ sll.d t0, t0, a6 ++ srl.d t5, t1, a5 ++ sll.d t1, t1, a6 ++ ++ srl.d t6, t2, a5 ++ sll.d t2, t2, a6 ++ or t0, a7, t0 ++ srl.d a7, t3, a5 ++ sll.d t3, t3, a6 ++ ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ st.d t0, a4, 0 ++ st.d t1, a4, 8 ++ st.d t2, a4, 16 ++ st.d t3, a4, 24 ++ ++ addi.d a4, a4, 32 ++ ++L(un_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(un_less_16bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ addi.d a1, a1, 16 ++ addi.d a2, a2, -16 ++ ++ srl.d t2, t0, a5 ++ sll.d t3, t0, a6 ++ sll.d t4, t1, a6 ++ or t3, a7, t3 ++ or t4, t2, t4 ++ ++ srl.d a7, t1, a5 ++ st.d t3, a4, 0 ++ st.d t4, a4, 8 ++ addi.d a4, a4, 16 ++ ++L(un_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(un_less_8bytes) ++ ++ ld.d t0, a1, 0 ++ addi.d a1, a1, 8 ++ addi.d a2, a2, -8 ++ sll.d t1, t0, a6 ++ ++ or t2, a7, t1 ++ srl.d a7, t0, a5 ++ st.d t2, a4, 0 ++ addi.d a4, a4, 8 ++ ++L(un_less_8bytes): ++ beqz a2, L(un_less_1byte) ++ bge t8, a2, 1f ++ ++ ld.d t0, a1, 0 ++ sll.d t0, t0, a6 ++ or a7, a7, t0 ++ ++1: ++ srai.d a3, a2, 2 ++ beqz a3, L(un_less_4bytes) ++ ++ addi.d a2, a2, -4 ++ st.w a7, a4, 0 ++ addi.d a4, a4, 4 ++ srai.d a7, a7, 32 ++ ++L(un_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(un_less_2bytes) ++ ++ addi.d a2, a2, -2 ++ st.h a7, a4, 0 ++ addi.d a4, a4, 2 ++ srai.d a7, a7, 16 ++ ++L(un_less_2bytes): ++ beqz a2, L(un_less_1byte) ++ st.b a7, a4, 0 ++ ++L(un_less_1byte): ++ jr ra ++ ++L(short_data): ++ pcaddi t1, 36 ++ slli.d t2, a2, 3 ++ add.d a4, a0, a2 ++ sub.d t1, t1, t2 ++ add.d a1, a1, a2 ++ jr t1 ++ ++L(short_15_bytes): ++ ld.b t0, a1, -15 ++ st.b t0, a4, -15 ++L(short_14_bytes): ++ ld.b t0, a1, -14 ++ st.b t0, a4, -14 ++L(short_13_bytes): ++ ld.b t0, a1, -13 ++ st.b t0, a4, -13 ++L(short_12_bytes): ++ ld.b t0, a1, -12 ++ st.b t0, a4, -12 ++L(short_11_bytes): ++ ld.b t0, a1, -11 ++ st.b t0, a4, -11 ++L(short_10_bytes): ++ ld.b t0, a1, -10 ++ st.b t0, a4, -10 ++L(short_9_bytes): ++ ld.b t0, a1, -9 ++ st.b t0, a4, -9 ++L(short_8_bytes): ++ ld.b t0, a1, -8 ++ st.b t0, a4, -8 ++L(short_7_bytes): ++ ld.b t0, a1, -7 ++ st.b t0, a4, -7 ++L(short_6_bytes): ++ ld.b t0, a1, -6 ++ st.b t0, a4, -6 ++L(short_5_bytes): ++ ld.b t0, a1, -5 ++ st.b t0, a4, -5 ++L(short_4_bytes): ++ ld.b t0, a1, -4 ++ st.b t0, a4, -4 ++L(short_3_bytes): ++ ld.b t0, a1, -3 ++ st.b t0, a4, -3 ++L(short_2_bytes): ++ ld.b t0, a1, -2 ++ st.b t0, a4, -2 ++L(short_1_bytes): ++ ld.b t0, a1, -1 ++ 
st.b t0, a4, -1 ++ jr ra ++ ++L(copy_back): ++ srai.d a3, a2, 4 ++ beqz a3, L(back_short_data) ++ ++ add.d a4, a0, a2 ++ add.d a1, a1, a2 ++ ++ andi a5, a4, 0x7 ++ andi a6, a1, 0x7 ++ beqz a5, L(back_check_align) ++ ++ sub.d a2, a2, a5 ++ sub.d a1, a1, a5 ++ sub.d a4, a4, a5 ++ ++ pcaddi t1, 18 ++ slli.d t3, a5, 3 ++ sub.d t1, t1, t3 ++ jr t1 ++ ++ ld.b t0, a1, 6 ++ st.b t0, a4, 6 ++ ld.b t0, a1, 5 ++ st.b t0, a4, 5 ++ ld.b t0, a1, 4 ++ st.b t0, a4, 4 ++ ld.b t0, a1, 3 ++ st.b t0, a4, 3 ++ ld.b t0, a1, 2 ++ st.b t0, a4, 2 ++ ld.b t0, a1, 1 ++ st.b t0, a4, 1 ++ ld.b t0, a1, 0 ++ st.b t0, a4, 0 ++ ++L(back_check_align): ++ bne a5, a6, L(back_unalign) ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(back_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(back_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ sub.d a5, a1, t0 ++ ++L(back_loop_64bytes): ++ LD_64(a1, -64) ++ addi.d a1, a1, -64 ++ ST_64(a4, -64) ++ ++ addi.d a4, a4, -64 ++ bne a1, a5, L(back_loop_64bytes) ++ ++L(back_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz a3, L(back_less_32bytes) ++ ++ ld.d t0, a1, -32 ++ ld.d t1, a1, -24 ++ ld.d t2, a1, -16 ++ ld.d t3, a1, -8 ++ ++ addi.d a1, a1, -32 ++ addi.d a2, a2, -32 ++ ++ st.d t0, a4, -32 ++ st.d t1, a4, -24 ++ st.d t2, a4, -16 ++ st.d t3, a4, -8 ++ ++ addi.d a4, a4, -32 ++ ++L(back_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(back_less_16bytes) ++ ++ ld.d t0, a1, -16 ++ ld.d t1, a1, -8 ++ ++ addi.d a2, a2, -16 ++ addi.d a1, a1, -16 ++ ++ st.d t0, a4, -16 ++ st.d t1, a4, -8 ++ addi.d a4, a4, -16 ++ ++L(back_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(back_less_8bytes) ++ ++ ld.d t0, a1, -8 ++ addi.d a2, a2, -8 ++ addi.d a1, a1, -8 ++ ++ st.d t0, a4, -8 ++ addi.d a4, a4, -8 ++ ++L(back_less_8bytes): ++ srai.d a3, a2, 2 ++ beqz a3, L(back_less_4bytes) ++ ++ ld.w t0, a1, -4 ++ addi.d a2, a2, -4 ++ addi.d a1, a1, -4 ++ ++ st.w t0, a4, -4 ++ addi.d a4, a4, -4 ++ ++L(back_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(back_less_2bytes) ++ ++ ld.h t0, a1, -2 ++ addi.d a2, a2, -2 ++ addi.d a1, a1, -2 ++ ++ st.h t0, a4, -2 ++ addi.d a4, a4, -2 ++ ++L(back_less_2bytes): ++ beqz a2, L(back_less_1byte) ++ ++ ld.b t0, a1, -1 ++ st.b t0, a4, -1 ++ ++L(back_less_1byte): ++ jr ra ++ ++L(back_unalign): ++ andi t8, a1, 0x7 ++ bstrins.d a1, zero, 2, 0 ++ ++ sub.d a6, zero, t8 ++ ++ ld.d t0, a1, 0 ++ slli.d a6, a6, 3 ++ slli.d a5, t8, 3 ++ sll.d a7, t0, a6 ++ ++ srai.d a3, a2, 4 ++ beqz a3, L(back_un_less_16bytes) ++ ++ andi a3, a2, 0x3f ++ beq a3, a2, L(back_un_less_64bytes) ++ ++ sub.d t0, a2, a3 ++ move a2, a3 ++ sub.d a3, a1, t0 ++ ++L(back_un_long_bytes): ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ld.d t2, a1, -24 ++ ld.d t3, a1, -32 ++ ++ sll.d t4, t0, a6 ++ srl.d t0, t0, a5 ++ ++ sll.d t5, t1, a6 ++ srl.d t1, t1, a5 ++ ++ sll.d t6, t2, a6 ++ srl.d t2, t2, a5 ++ ++ sll.d t7, t3, a6 ++ srl.d t3, t3, a5 ++ ++ or t0, t0, a7 ++ or t1, t1, t4 ++ or t2, t2, t5 ++ or t3, t3, t6 ++ ++ ld.d t4, a1, -40 ++ ld.d t5, a1, -48 ++ ld.d t6, a1, -56 ++ ld.d a7, a1, -64 ++ st.d t0, a4, -8 ++ st.d t1, a4, -16 ++ st.d t2, a4, -24 ++ st.d t3, a4, -32 ++ ++ addi.d a1, a1, -64 ++ ++ sll.d t0, t4, a6 ++ srl.d t4, t4, a5 ++ ++ sll.d t1, t5, a6 ++ srl.d t5, t5, a5 ++ ++ sll.d t2, t6, a6 ++ srl.d t6, t6, a5 ++ ++ srl.d t3, a7, a5 ++ sll.d a7, a7, a6 ++ ++ or t4, t7, t4 ++ or t5, t0, t5 ++ or t6, t1, t6 ++ or t3, t2, t3 ++ ++ st.d t4, a4, -40 ++ st.d t5, a4, -48 ++ st.d t6, a4, -56 ++ st.d t3, a4, -64 ++ ++ addi.d a4, a4, -64 ++ bne a3, a1, L(back_un_long_bytes) ++ ++L(back_un_less_64bytes): ++ srai.d a3, a2, 5 ++ beqz 
a3, L(back_un_less_32bytes) ++ ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ld.d t2, a1, -24 ++ ld.d t3, a1, -32 ++ ++ addi.d a1, a1, -32 ++ addi.d a2, a2, -32 ++ ++ sll.d t4, t0, a6 ++ srl.d t0, t0, a5 ++ ++ sll.d t5, t1, a6 ++ srl.d t1, t1, a5 ++ ++ sll.d t6, t2, a6 ++ srl.d t2, t2, a5 ++ ++ or t0, a7, t0 ++ ++ sll.d a7, t3, a6 ++ srl.d t3, t3, a5 ++ ++ or t1, t4, t1 ++ or t2, t5, t2 ++ or t3, t6, t3 ++ ++ st.d t0, a4, -8 ++ st.d t1, a4, -16 ++ st.d t2, a4, -24 ++ st.d t3, a4, -32 ++ ++ addi.d a4, a4, -32 ++ ++L(back_un_less_32bytes): ++ srai.d a3, a2, 4 ++ beqz a3, L(back_un_less_16bytes) ++ ++ ld.d t0, a1, -8 ++ ld.d t1, a1, -16 ++ ++ addi.d a1, a1, -16 ++ addi.d a2, a2, -16 ++ ++ sll.d t2, t0, a6 ++ srl.d t3, t0, a5 ++ ++ srl.d t4, t1, a5 ++ or t3, a7, t3 ++ or t4, t2, t4 ++ sll.d a7, t1, a6 ++ ++ st.d t3, a4, -8 ++ st.d t4, a4, -16 ++ ++ addi.d a4, a4, -16 ++ ++L(back_un_less_16bytes): ++ srai.d a3, a2, 3 ++ beqz a3, L(back_un_less_8bytes) ++ ++ ld.d t0, a1, -8 ++ ++ addi.d a1, a1, -8 ++ addi.d a2, a2, -8 ++ ++ srl.d t1, t0, a5 ++ or t2, a7, t1 ++ sll.d a7, t0, a6 ++ ++ st.d t2, a4, -8 ++ addi.d a4, a4, -8 ++ ++L(back_un_less_8bytes): ++ beqz a2, L(back_end) ++ bge t8, a2, 1f ++ ++ ld.d t0, a1, -8 ++ srl.d t0, t0, a5 ++ or a7, a7, t0 ++ ++1: ++ srai.d a3, a2, 2 ++ beqz a3, L(back_un_less_4bytes) ++ ++ srai.d t0, a7, 32 ++ addi.d a2, a2, -4 ++ st.w t0, a4, -4 ++ addi.d a4, a4, -4 ++ slli.d a7, a7, 32 ++ ++L(back_un_less_4bytes): ++ srai.d a3, a2, 1 ++ beqz a3, L(back_un_less_2bytes) ++ srai.d t0, a7, 48 ++ addi.d a2, a2, -2 ++ st.h t0, a4, -2 ++ addi.d a4, a4, -2 ++ slli.d a7, a7, 16 ++L(back_un_less_2bytes): ++ beqz a2, L(back_un_less_1byte) ++ srai.d t0, a7, 56 ++ st.b t0, a4, -1 ++L(back_un_less_1byte): ++ jr ra ++ ++L(back_short_data): ++ pcaddi t1, 34 ++ slli.d t2, a2, 3 ++ sub.d t1, t1, t2 ++ jr t1 ++ ++ ld.b t0, a1, 14 ++ st.b t0, a0, 14 ++ ld.b t0, a1, 13 ++ st.b t0, a0, 13 ++ ld.b t0, a1, 12 ++ st.b t0, a0, 12 ++ ld.b t0, a1, 11 ++ st.b t0, a0, 11 ++ ld.b t0, a1, 10 ++ st.b t0, a0, 10 ++ ld.b t0, a1, 9 ++ st.b t0, a0, 9 ++ ld.b t0, a1, 8 ++ st.b t0, a0, 8 ++ ld.b t0, a1, 7 ++ st.b t0, a0, 7 ++ ld.b t0, a1, 6 ++ st.b t0, a0, 6 ++ ld.b t0, a1, 5 ++ st.b t0, a0, 5 ++ ld.b t0, a1, 4 ++ st.b t0, a0, 4 ++ ld.b t0, a1, 3 ++ st.b t0, a0, 3 ++ ld.b t0, a1, 2 ++ st.b t0, a0, 2 ++ ld.b t0, a1, 1 ++ st.b t0, a0, 1 ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++L(back_end): ++ jr ra ++ ++END(MEMCPY_NAME) ++ ++libc_hidden_builtin_def (MEMMOVE_NAME) ++libc_hidden_builtin_def (MEMCPY_NAME) +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S +new file mode 100644 +index 00000000..4aae5bf8 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lasx.S +@@ -0,0 +1,20 @@ ++/* Optimized memcpy implementation using Loongarch LASX instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* memcpy is part of memmove.S */ +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S +new file mode 100644 +index 00000000..6ebbe7a2 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-lsx.S +@@ -0,0 +1,20 @@ ++/* Optimized memcpy implementation using Loongarch LSX instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* memcpy is part of memmove.S */ +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +new file mode 100644 +index 00000000..8e60a22d +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy-unaligned.S +@@ -0,0 +1,247 @@ ++/* Optimized unaligned memcpy implementation using basic Loongarch instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) ++ ++# define MEMCPY_NAME __memcpy_unaligned ++ ++# define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n + 8; \ ++ ld.d t2, reg, n + 16; \ ++ ld.d t3, reg, n + 24; \ ++ ld.d t4, reg, n + 32; \ ++ ld.d t5, reg, n + 40; \ ++ ld.d t6, reg, n + 48; \ ++ ld.d t7, reg, n + 56; ++ ++# define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n + 8; \ ++ st.d t2, reg, n + 16; \ ++ st.d t3, reg, n + 24; \ ++ st.d t4, reg, n + 32; \ ++ st.d t5, reg, n + 40; \ ++ st.d t6, reg, n + 48; \ ++ st.d t7, reg, n + 56; ++ ++LEAF(MEMCPY_NAME, 3) ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ li.w a6, 16 ++ bge a6, a2, L(less_16bytes) ++ ++ li.w a6, 128 ++ blt a6, a2, L(long_bytes) ++ li.w a6, 64 ++ blt a6, a2, L(more_64bytes) ++ ++ li.w a6, 32 ++ blt a6, a2, L(more_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a4, -16 ++ ld.d t3, a4, -8 ++ ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a3, -16 ++ st.d t3, a3, -8 ++ jr ra ++ ++L(more_64bytes): ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ addi.d t8, t8, 0x8 ++ sub.d a7, a0, t8 ++ ++ ld.d t0, a1, 0 ++ sub.d a1, a1, a7 ++ st.d t0, a0, 0 ++ add.d a7, a7, a2 ++ addi.d a7, a7, -0x20 ++ ++L(loop_32): ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ st.d t0, t8, 0 ++ st.d t1, t8, 8 ++ st.d t2, t8, 16 ++ st.d t3, t8, 24 ++ ++ addi.d t8, t8, 0x20 ++ addi.d a1, a1, 0x20 ++ addi.d a7, a7, -0x20 ++ blt zero, a7, L(loop_32) ++ ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ ++ jr ra ++ ++L(more_32bytes): ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a0, 16 ++ st.d t3, a0, 24 ++ ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ ++ jr ra ++ ++L(less_16bytes): ++ srai.d a6, a2, 3 ++ beqz a6, L(less_8bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a3, -8 ++ ++ jr ra ++ ++L(less_8bytes): ++ srai.d a6, a2, 2 ++ beqz a6, L(less_4bytes) ++ ++ ld.w t0, a1, 0 ++ ld.w t1, a4, -4 ++ st.w t0, a0, 0 ++ st.w t1, a3, -4 ++ ++ jr ra ++ ++L(less_4bytes): ++ srai.d a6, a2, 1 ++ beqz a6, L(less_2bytes) ++ ++ ld.h t0, a1, 0 ++ ld.h t1, a4, -2 ++ st.h t0, a0, 0 ++ st.h t1, a3, -2 ++ ++ jr ra ++ ++L(less_2bytes): ++ beqz a2, L(less_1bytes) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++ jr ra ++ ++L(less_1bytes): ++ jr ra ++ ++L(long_bytes): ++ srli.d t8, a0, 3 ++ slli.d t8, t8, 3 ++ beq a0, t8, L(start) ++ ld.d t0, a1, 0 ++ ++ addi.d t8, t8, 0x8 ++ st.d t0, a0, 0 ++ sub.d a7, a0, t8 ++ sub.d a1, a1, a7 ++ ++L(start): ++ addi.d a5, a3, -0x80 ++ blt a5, t8, L(align_end_proc) ++ ++L(loop_128): ++ LD_64(a1, 0) ++ ST_64(t8, 0) ++ LD_64(a1, 64) ++ addi.d a1, a1, 0x80 ++ ST_64(t8, 64) ++ addi.d t8, t8, 0x80 ++ bge a5, t8, L(loop_128) ++ ++L(align_end_proc): ++ sub.d a2, a3, t8 ++ pcaddi t1, 34 ++ andi t2, a2, 0x78 ++ sub.d t1, t1, t2 ++ jr t1 ++ ++ ld.d t0, a1, 112 ++ st.d t0, t8, 112 ++ ld.d t0, a1, 104 ++ st.d t0, t8, 104 ++ ld.d t0, a1, 96 ++ st.d t0, t8, 96 ++ ld.d t0, a1, 88 ++ st.d t0, t8, 88 ++ ld.d t0, a1, 80 ++ st.d t0, t8, 80 ++ ld.d t0, a1, 72 ++ st.d t0, t8, 72 ++ ld.d t0, a1, 64 ++ st.d t0, t8, 64 ++ ld.d t0, a1, 56 ++ st.d t0, t8, 56 ++ ld.d t0, a1, 48 ++ st.d t0, t8, 48 ++ ld.d t0, a1, 40 ++ st.d t0, t8, 40 ++ 
ld.d t0, a1, 32 ++ st.d t0, t8, 32 ++ ld.d t0, a1, 24 ++ st.d t0, t8, 24 ++ ld.d t0, a1, 16 ++ st.d t0, t8, 16 ++ ld.d t0, a1, 8 ++ st.d t0, t8, 8 ++ ld.d t0, a1, 0 ++ st.d t0, t8, 0 ++ ld.d t0, a4, -8 ++ st.d t0, a3, -8 ++ ++ jr ra ++END(MEMCPY_NAME) ++ ++libc_hidden_builtin_def (MEMCPY_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memcpy.c b/sysdeps/loongarch/lp64/multiarch/memcpy.c +new file mode 100644 +index 00000000..93b238ce +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memcpy.c +@@ -0,0 +1,37 @@ ++/* Multiple versions of memcpy. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define memcpy __redirect_memcpy ++# include <string.h> ++# undef memcpy ++ ++# define SYMBOL_NAME memcpy ++# include "ifunc-lasx.h" ++ ++libc_ifunc_redirected (__redirect_memcpy, memcpy, ++ IFUNC_SELECTOR ()); ++ ++# ifdef SHARED ++__hidden_ver1 (memcpy, __GI_memcpy, __redirect_memcpy) ++ __attribute__ ((visibility ("hidden"))) __attribute_copy__ (memcmp); ++# endif ++ ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S +new file mode 100644 +index 00000000..5354f383 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-aligned.S +@@ -0,0 +1,20 @@ ++/* Optimized memmove_aligned implementation using basic Loongarch instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* memmove_aligned is part of memcpy_aligned, see memcpy-aligned.S. */ +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +new file mode 100644 +index 00000000..ff68e7a2 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lasx.S +@@ -0,0 +1,287 @@ ++/* Optimized memmove implementation using Loongarch LASX instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) && !defined __loongarch_soft_float ++ ++#ifndef MEMCPY_NAME ++# define MEMCPY_NAME __memcpy_lasx ++#endif ++ ++#ifndef MEMMOVE_NAME ++# define MEMMOVE_NAME __memmove_lasx ++#endif ++ ++LEAF(MEMCPY_NAME, 6) ++ li.d t0, 32 ++ add.d a3, a0, a2 ++ add.d a4, a1, a2 ++ bgeu t0, a2, L(less_32bytes) ++ ++ li.d t1, 64 ++ bltu t1, a2, L(copy_long) ++ xvld xr0, a1, 0 ++ xvld xr1, a4, -32 ++ ++ xvst xr0, a0, 0 ++ xvst xr1, a3, -32 ++ jr ra ++L(less_32bytes): ++ srli.d t0, a2, 4 ++ ++ beqz t0, L(less_16bytes) ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 ++ ++ ++ vst vr1, a3, -16 ++ jr ra ++L(less_16bytes): ++ srli.d t0, a2, 3 ++ beqz t0, L(less_8bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a3, -8 ++ ++ jr ra ++L(less_8bytes): ++ srli.d t0, a2, 2 ++ beqz t0, L(less_4bytes) ++ ld.w t0, a1, 0 ++ ++ ld.w t1, a4, -4 ++ st.w t0, a0, 0 ++ st.w t1, a3, -4 ++ jr ra ++ ++ ++L(less_4bytes): ++ srli.d t0, a2, 1 ++ beqz t0, L(less_2bytes) ++ ld.h t0, a1, 0 ++ ld.h t1, a4, -2 ++ ++ st.h t0, a0, 0 ++ st.h t1, a3, -2 ++ jr ra ++L(less_2bytes): ++ beqz a2, L(less_1bytes) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++L(less_1bytes): ++ jr ra ++END(MEMCPY_NAME) ++ ++LEAF(MEMMOVE_NAME, 6) ++ ++ li.d t0, 32 ++ add.d a3, a0, a2 ++ add.d a4, a1, a2 ++ bgeu t0, a2, L(less_32bytes) ++ ++ li.d t1, 64 ++ bltu t1, a2, L(move_long) ++ xvld xr0, a1, 0 ++ xvld xr1, a4, -32 ++ ++ xvst xr0, a0, 0 ++ xvst xr1, a3, -32 ++ jr ra ++L(move_long): ++ sub.d t2, a0, a1 ++ ++ bltu t2, a2, L(copy_back) ++L(copy_long): ++ andi t2, a0, 0x1f ++ addi.d a2, a2, -1 ++ sub.d t2, t0, t2 ++ ++ ++ xvld xr8, a1, 0 ++ xvld xr9, a4, -32 ++ sub.d t3, a2, t2 ++ add.d a5, a0, t2 ++ ++ andi a2, t3, 0xff ++ add.d a1, a1, t2 ++ beq a2, t3, L(lt256) ++ sub.d a6, a4, a2 ++ ++ addi.d a6, a6, -1 ++L(loop_256): ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 ++ xvld xr2, a1, 64 ++ ++ xvld xr3, a1, 96 ++ xvld xr4, a1, 128 ++ xvld xr5, a1, 160 ++ xvld xr6, a1, 192 ++ ++ ++ xvld xr7, a1, 224 ++ addi.d a1, a1, 256 ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 ++ ++ xvst xr2, a5, 64 ++ xvst xr3, a5, 96 ++ xvst xr4, a5, 128 ++ xvst xr5, a5, 160 ++ ++ xvst xr6, a5, 192 ++ xvst xr7, a5, 224 ++ addi.d a5, a5, 256 ++ bne a1, a6, L(loop_256) ++ ++L(lt256): ++ srli.d t2, a2, 7 ++ beqz t2, L(lt128) ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 ++ ++ ++ xvld xr2, a1, 64 ++ xvld xr3, a1, 96 ++ addi.d a1, a1, 128 ++ addi.d a2, a2, -128 ++ ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 ++ xvst xr2, a5, 64 ++ xvst xr3, a5, 96 ++ ++ addi.d a5, a5, 128 ++L(lt128): ++ bltu a2, t1, L(lt64) ++ xvld xr0, a1, 0 ++ xvld xr1, a1, 32 ++ ++ addi.d a1, a1, 64 ++ addi.d a2, a2, -64 ++ xvst xr0, a5, 0 ++ xvst xr1, a5, 32 ++ ++ ++ addi.d a5, a5, 64 ++L(lt64): ++ bltu a2, t0, L(lt32) ++ xvld xr0, a1, 0 ++ xvst xr0, a5, 0 
++ ++L(lt32): ++ xvst xr8, a0, 0 ++ xvst xr9, a3, -32 ++ jr ra ++ nop ++ ++L(copy_back): ++ addi.d a3, a3, -1 ++ addi.d a2, a2, -2 ++ andi t2, a3, 0x1f ++ xvld xr8, a1, 0 ++ ++ xvld xr9, a4, -32 ++ sub.d t3, a2, t2 ++ sub.d a5, a3, t2 ++ sub.d a4, a4, t2 ++ ++ ++ andi a2, t3, 0xff ++ beq a2, t3, L(back_lt256) ++ add.d a6, a1, a2 ++ addi.d a6, a6, 2 ++ ++L(back_loop_256): ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 ++ xvld xr2, a4, -97 ++ xvld xr3, a4, -129 ++ ++ xvld xr4, a4, -161 ++ xvld xr5, a4, -193 ++ xvld xr6, a4, -225 ++ xvld xr7, a4, -257 ++ ++ addi.d a4, a4, -256 ++ xvst xr0, a5, -32 ++ xvst xr1, a5, -64 ++ xvst xr2, a5, -96 ++ ++ ++ xvst xr3, a5, -128 ++ xvst xr4, a5, -160 ++ xvst xr5, a5, -192 ++ xvst xr6, a5, -224 ++ ++ xvst xr7, a5, -256 ++ addi.d a5, a5, -256 ++ bne a4, a6, L(back_loop_256) ++L(back_lt256): ++ srli.d t2, a2, 7 ++ ++ beqz t2, L(back_lt128) ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 ++ xvld xr2, a4, -97 ++ ++ xvld xr3, a4, -129 ++ addi.d a2, a2, -128 ++ addi.d a4, a4, -128 ++ xvst xr0, a5, -32 ++ ++ ++ xvst xr1, a5, -64 ++ xvst xr2, a5, -96 ++ xvst xr3, a5, -128 ++ addi.d a5, a5, -128 ++ ++L(back_lt128): ++ blt a2, t1, L(back_lt64) ++ xvld xr0, a4, -33 ++ xvld xr1, a4, -65 ++ addi.d a2, a2, -64 ++ ++ addi.d a4, a4, -64 ++ xvst xr0, a5, -32 ++ xvst xr1, a5, -64 ++ addi.d a5, a5, -64 ++ ++L(back_lt64): ++ bltu a2, t0, L(back_lt32) ++ xvld xr0, a4, -33 ++ xvst xr0, a5, -32 ++L(back_lt32): ++ xvst xr8, a0, 0 ++ ++ ++ xvst xr9, a3, -31 ++ jr ra ++END(MEMMOVE_NAME) ++ ++libc_hidden_builtin_def (MEMCPY_NAME) ++libc_hidden_builtin_def (MEMMOVE_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +new file mode 100644 +index 00000000..9e1502a7 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-lsx.S +@@ -0,0 +1,534 @@ ++/* Optimized memmove implementation using Loongarch LSX instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) && !defined __loongarch_soft_float ++ ++# define MEMCPY_NAME __memcpy_lsx ++# define MEMMOVE_NAME __memmove_lsx ++ ++LEAF(MEMCPY_NAME, 6) ++ li.d t6, 16 ++ add.d a3, a0, a2 ++ add.d a4, a1, a2 ++ bgeu t6, a2, L(less_16bytes) ++ ++ li.d t8, 64 ++ li.d t7, 32 ++ bltu t8, a2, L(copy_long) ++ bltu t7, a2, L(more_32bytes) ++ ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a3, -16 ++ ++ jr ra ++L(more_32bytes): ++ vld vr0, a1, 0 ++ vld vr1, a1, 16 ++ vld vr2, a4, -32 ++ ++ ++ vld vr3, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a0, 16 ++ vst vr2, a3, -32 ++ ++ vst vr3, a3, -16 ++ jr ra ++L(less_16bytes): ++ srli.d t0, a2, 3 ++ beqz t0, L(less_8bytes) ++ ++ vldrepl.d vr0, a1, 0 ++ vldrepl.d vr1, a4, -8 ++ vstelm.d vr0, a0, 0, 0 ++ vstelm.d vr1, a3, -8, 0 ++ ++ jr ra ++L(less_8bytes): ++ srli.d t0, a2, 2 ++ beqz t0, L(less_4bytes) ++ vldrepl.w vr0, a1, 0 ++ ++ ++ vldrepl.w vr1, a4, -4 ++ vstelm.w vr0, a0, 0, 0 ++ vstelm.w vr1, a3, -4, 0 ++ jr ra ++ ++L(less_4bytes): ++ srli.d t0, a2, 1 ++ beqz t0, L(less_2bytes) ++ vldrepl.h vr0, a1, 0 ++ vldrepl.h vr1, a4, -2 ++ ++ vstelm.h vr0, a0, 0, 0 ++ vstelm.h vr1, a3, -2, 0 ++ jr ra ++L(less_2bytes): ++ beqz a2, L(less_1bytes) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++L(less_1bytes): ++ jr ra ++ nop ++END(MEMCPY_NAME) ++ ++LEAF(MEMMOVE_NAME, 6) ++ li.d t6, 16 ++ add.d a3, a0, a2 ++ add.d a4, a1, a2 ++ bgeu t6, a2, L(less_16bytes) ++ ++ li.d t8, 64 ++ li.d t7, 32 ++ bltu t8, a2, L(move_long) ++ bltu t7, a2, L(more_32bytes) ++ ++ vld vr0, a1, 0 ++ vld vr1, a4, -16 ++ vst vr0, a0, 0 ++ vst vr1, a3, -16 ++ ++ jr ra ++ nop ++L(move_long): ++ sub.d t0, a0, a1 ++ bltu t0, a2, L(copy_back) ++ ++ ++L(copy_long): ++ vld vr2, a1, 0 ++ andi t0, a0, 0xf ++ sub.d t0, t6, t0 ++ add.d a1, a1, t0 ++ ++ sub.d a2, a2, t0 ++ andi t1, a1, 0xf ++ bnez t1, L(unaligned) ++ vld vr0, a1, 0 ++ ++ addi.d a2, a2, -16 ++ vst vr2, a0, 0 ++ andi t2, a2, 0x7f ++ add.d a5, a0, t0 ++ ++ beq a2, t2, L(al_less_128) ++ sub.d t3, a2, t2 ++ move a2, t2 ++ add.d a6, a1, t3 ++ ++ ++L(al_loop): ++ vld vr1, a1, 16 ++ vld vr2, a1, 32 ++ vld vr3, a1, 48 ++ vld vr4, a1, 64 ++ ++ vld vr5, a1, 80 ++ vld vr6, a1, 96 ++ vld vr7, a1, 112 ++ vst vr0, a5, 0 ++ ++ vld vr0, a1, 128 ++ addi.d a1, a1, 128 ++ vst vr1, a5, 16 ++ vst vr2, a5, 32 ++ ++ vst vr3, a5, 48 ++ vst vr4, a5, 64 ++ vst vr5, a5, 80 ++ vst vr6, a5, 96 ++ ++ ++ vst vr7, a5, 112 ++ addi.d a5, a5, 128 ++ bne a1, a6, L(al_loop) ++L(al_less_128): ++ blt a2, t8, L(al_less_64) ++ ++ vld vr1, a1, 16 ++ vld vr2, a1, 32 ++ vld vr3, a1, 48 ++ addi.d a2, a2, -64 ++ ++ vst vr0, a5, 0 ++ vld vr0, a1, 64 ++ addi.d a1, a1, 64 ++ vst vr1, a5, 16 ++ ++ vst vr2, a5, 32 ++ vst vr3, a5, 48 ++ addi.d a5, a5, 64 ++L(al_less_64): ++ blt a2, t7, L(al_less_32) ++ ++ ++ vld vr1, a1, 16 ++ addi.d a2, a2, -32 ++ vst vr0, a5, 0 ++ vld vr0, a1, 32 ++ ++ addi.d a1, a1, 32 ++ vst vr1, a5, 16 ++ addi.d a5, a5, 32 ++L(al_less_32): ++ blt a2, t6, L(al_less_16) ++ ++ vst vr0, a5, 0 ++ vld vr0, a1, 16 ++ addi.d a5, a5, 16 ++L(al_less_16): ++ vld vr1, a4, -16 ++ ++ vst vr0, a5, 0 ++ vst vr1, a3, -16 ++ jr ra ++ nop ++ ++ ++L(magic_num): ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 ++L(unaligned): ++ pcaddi t2, -4 ++ bstrins.d a1, zero, 3, 0 ++ vld vr8, t2, 0 ++ vld vr0, a1, 0 ++ ++ vld vr1, a1, 16 ++ addi.d a2, a2, -16 ++ vst vr2, a0, 0 ++ add.d a5, a0, t0 ++ ++ vreplgr2vr.b vr9, t1 ++ andi t2, a2, 0x7f ++ vadd.b vr9, vr9, vr8 ++ addi.d a1, a1, 32 ++ ++ ++ beq 
t2, a2, L(un_less_128) ++ sub.d t3, a2, t2 ++ move a2, t2 ++ add.d a6, a1, t3 ++ ++L(un_loop): ++ vld vr2, a1, 0 ++ vld vr3, a1, 16 ++ vld vr4, a1, 32 ++ vld vr5, a1, 48 ++ ++ vld vr6, a1, 64 ++ vld vr7, a1, 80 ++ vshuf.b vr8, vr1, vr0, vr9 ++ vld vr0, a1, 96 ++ ++ vst vr8, a5, 0 ++ vshuf.b vr8, vr2, vr1, vr9 ++ vld vr1, a1, 112 ++ vst vr8, a5, 16 ++ ++ ++ addi.d a1, a1, 128 ++ vshuf.b vr2, vr3, vr2, vr9 ++ vshuf.b vr3, vr4, vr3, vr9 ++ vst vr2, a5, 32 ++ ++ vshuf.b vr4, vr5, vr4, vr9 ++ vst vr3, a5, 48 ++ vshuf.b vr5, vr6, vr5, vr9 ++ vst vr4, a5, 64 ++ ++ vshuf.b vr6, vr7, vr6, vr9 ++ vst vr5, a5, 80 ++ vshuf.b vr7, vr0, vr7, vr9 ++ vst vr6, a5, 96 ++ ++ vst vr7, a5, 112 ++ addi.d a5, a5, 128 ++ bne a1, a6, L(un_loop) ++L(un_less_128): ++ blt a2, t8, L(un_less_64) ++ ++ ++ vld vr2, a1, 0 ++ vld vr3, a1, 16 ++ vshuf.b vr4, vr1, vr0, vr9 ++ vld vr0, a1, 32 ++ ++ vst vr4, a5, 0 ++ addi.d a2, a2, -64 ++ vshuf.b vr4, vr2, vr1, vr9 ++ vld vr1, a1, 48 ++ ++ addi.d a1, a1, 64 ++ vst vr4, a5, 16 ++ vshuf.b vr2, vr3, vr2, vr9 ++ vshuf.b vr3, vr0, vr3, vr9 ++ ++ vst vr2, a5, 32 ++ vst vr3, a5, 48 ++ addi.d a5, a5, 64 ++L(un_less_64): ++ blt a2, t7, L(un_less_32) ++ ++ ++ vshuf.b vr3, vr1, vr0, vr9 ++ vld vr0, a1, 0 ++ vst vr3, a5, 0 ++ addi.d a2, a2, -32 ++ ++ vshuf.b vr3, vr0, vr1, vr9 ++ vld vr1, a1, 16 ++ addi.d a1, a1, 32 ++ vst vr3, a5, 16 ++ ++ addi.d a5, a5, 32 ++L(un_less_32): ++ blt a2, t6, L(un_less_16) ++ vshuf.b vr2, vr1, vr0, vr9 ++ vor.v vr0, vr1, vr1 ++ ++ vld vr1, a1, 0 ++ vst vr2, a5, 0 ++ addi.d a5, a5, 16 ++L(un_less_16): ++ vld vr2, a4, -16 ++ ++ ++ vshuf.b vr0, vr1, vr0, vr9 ++ vst vr0, a5, 0 ++ vst vr2, a3, -16 ++ jr ra ++ ++L(copy_back): ++ addi.d t0, a3, -1 ++ vld vr2, a4, -16 ++ andi t0, t0, 0xf ++ addi.d t0, t0, 1 ++ ++ sub.d a4, a4, t0 ++ sub.d a2, a2, t0 ++ andi t1, a4, 0xf ++ bnez t1, L(back_unaligned) ++ ++ vld vr0, a4, -16 ++ addi.d a2, a2, -16 ++ vst vr2, a3, -16 ++ andi t2, a2, 0x7f ++ ++ ++ sub.d a3, a3, t0 ++ beq t2, a2, L(back_al_less_128) ++ sub.d t3, a2, t2 ++ move a2, t2 ++ ++ sub.d a6, a4, t3 ++L(back_al_loop): ++ vld vr1, a4, -32 ++ vld vr2, a4, -48 ++ vld vr3, a4, -64 ++ ++ vld vr4, a4, -80 ++ vld vr5, a4, -96 ++ vld vr6, a4, -112 ++ vld vr7, a4, -128 ++ ++ vst vr0, a3, -16 ++ vld vr0, a4, -144 ++ addi.d a4, a4, -128 ++ vst vr1, a3, -32 ++ ++ ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 ++ vst vr4, a3, -80 ++ vst vr5, a3, -96 ++ ++ vst vr6, a3, -112 ++ vst vr7, a3, -128 ++ addi.d a3, a3, -128 ++ bne a4, a6, L(back_al_loop) ++ ++L(back_al_less_128): ++ blt a2, t8, L(back_al_less_64) ++ vld vr1, a4, -32 ++ vld vr2, a4, -48 ++ vld vr3, a4, -64 ++ ++ addi.d a2, a2, -64 ++ vst vr0, a3, -16 ++ vld vr0, a4, -80 ++ addi.d a4, a4, -64 ++ ++ ++ vst vr1, a3, -32 ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 ++ addi.d a3, a3, -64 ++ ++L(back_al_less_64): ++ blt a2, t7, L(back_al_less_32) ++ vld vr1, a4, -32 ++ addi.d a2, a2, -32 ++ vst vr0, a3, -16 ++ ++ vld vr0, a4, -48 ++ vst vr1, a3, -32 ++ addi.d a3, a3, -32 ++ addi.d a4, a4, -32 ++ ++L(back_al_less_32): ++ blt a2, t6, L(back_al_less_16) ++ vst vr0, a3, -16 ++ vld vr0, a4, -32 ++ addi.d a3, a3, -16 ++ ++ ++L(back_al_less_16): ++ vld vr1, a1, 0 ++ vst vr0, a3, -16 ++ vst vr1, a0, 0 ++ jr ra ++ ++L(magic_num_2): ++ .dword 0x0706050403020100 ++ .dword 0x0f0e0d0c0b0a0908 ++L(back_unaligned): ++ pcaddi t2, -4 ++ bstrins.d a4, zero, 3, 0 ++ vld vr8, t2, 0 ++ vld vr0, a4, 0 ++ ++ vld vr1, a4, -16 ++ addi.d a2, a2, -16 ++ vst vr2, a3, -16 ++ sub.d a3, a3, t0 ++ ++ ++ vreplgr2vr.b vr9, t1 ++ andi t2, a2, 0x7f ++ vadd.b vr9, vr9, vr8 
++ addi.d a4, a4, -16 ++ ++ beq t2, a2, L(back_un_less_128) ++ sub.d t3, a2, t2 ++ move a2, t2 ++ sub.d a6, a4, t3 ++ ++L(back_un_loop): ++ vld vr2, a4, -16 ++ vld vr3, a4, -32 ++ vld vr4, a4, -48 ++ ++ vld vr5, a4, -64 ++ vld vr6, a4, -80 ++ vld vr7, a4, -96 ++ vshuf.b vr8, vr0, vr1, vr9 ++ ++ ++ vld vr0, a4, -112 ++ vst vr8, a3, -16 ++ vshuf.b vr8, vr1, vr2, vr9 ++ vld vr1, a4, -128 ++ ++ vst vr8, a3, -32 ++ addi.d a4, a4, -128 ++ vshuf.b vr2, vr2, vr3, vr9 ++ vshuf.b vr3, vr3, vr4, vr9 ++ ++ vst vr2, a3, -48 ++ vshuf.b vr4, vr4, vr5, vr9 ++ vst vr3, a3, -64 ++ vshuf.b vr5, vr5, vr6, vr9 ++ ++ vst vr4, a3, -80 ++ vshuf.b vr6, vr6, vr7, vr9 ++ vst vr5, a3, -96 ++ vshuf.b vr7, vr7, vr0, vr9 ++ ++ ++ vst vr6, a3, -112 ++ vst vr7, a3, -128 ++ addi.d a3, a3, -128 ++ bne a4, a6, L(back_un_loop) ++ ++L(back_un_less_128): ++ blt a2, t8, L(back_un_less_64) ++ vld vr2, a4, -16 ++ vld vr3, a4, -32 ++ vshuf.b vr4, vr0, vr1, vr9 ++ ++ vld vr0, a4, -48 ++ vst vr4, a3, -16 ++ addi.d a2, a2, -64 ++ vshuf.b vr4, vr1, vr2, vr9 ++ ++ vld vr1, a4, -64 ++ addi.d a4, a4, -64 ++ vst vr4, a3, -32 ++ vshuf.b vr2, vr2, vr3, vr9 ++ ++ ++ vshuf.b vr3, vr3, vr0, vr9 ++ vst vr2, a3, -48 ++ vst vr3, a3, -64 ++ addi.d a3, a3, -64 ++ ++L(back_un_less_64): ++ blt a2, t7, L(back_un_less_32) ++ vshuf.b vr3, vr0, vr1, vr9 ++ vld vr0, a4, -16 ++ vst vr3, a3, -16 ++ ++ addi.d a2, a2, -32 ++ vshuf.b vr3, vr1, vr0, vr9 ++ vld vr1, a4, -32 ++ addi.d a4, a4, -32 ++ ++ vst vr3, a3, -32 ++ addi.d a3, a3, -32 ++L(back_un_less_32): ++ blt a2, t6, L(back_un_less_16) ++ vshuf.b vr2, vr0, vr1, vr9 ++ ++ ++ vor.v vr0, vr1, vr1 ++ vld vr1, a4, -16 ++ vst vr2, a3, -16 ++ addi.d a3, a3, -16 ++ ++L(back_un_less_16): ++ vld vr2, a1, 0 ++ vshuf.b vr0, vr0, vr1, vr9 ++ vst vr0, a3, -16 ++ vst vr2, a0, 0 ++ ++ jr ra ++END(MEMMOVE_NAME) ++ ++libc_hidden_builtin_def (MEMCPY_NAME) ++libc_hidden_builtin_def (MEMMOVE_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +new file mode 100644 +index 00000000..90a64b6b +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memmove-unaligned.S +@@ -0,0 +1,380 @@ ++/* Optimized memmove_unaligned implementation using basic Loongarch instructions. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sysdep.h> ++#include <sys/regdef.h> ++#include <sys/asm.h> ++ ++#if IS_IN (libc) ++ ++# define MEMMOVE_NAME __memmove_unaligned ++ ++# define LD_64(reg, n) \ ++ ld.d t0, reg, n; \ ++ ld.d t1, reg, n + 8; \ ++ ld.d t2, reg, n + 16; \ ++ ld.d t3, reg, n + 24; \ ++ ld.d t4, reg, n + 32; \ ++ ld.d t5, reg, n + 40; \ ++ ld.d t6, reg, n + 48; \ ++ ld.d t7, reg, n + 56; ++ ++# define ST_64(reg, n) \ ++ st.d t0, reg, n; \ ++ st.d t1, reg, n + 8; \ ++ st.d t2, reg, n + 16; \ ++ st.d t3, reg, n + 24; \ ++ st.d t4, reg, n + 32; \ ++ st.d t5, reg, n + 40; \ ++ st.d t6, reg, n + 48; \ ++ st.d t7, reg, n + 56; ++ ++LEAF(MEMMOVE_NAME, 3) ++ add.d a4, a1, a2 ++ add.d a3, a0, a2 ++ beq a1, a0, L(less_1bytes) ++ move t8, a0 ++ ++ srai.d a6, a2, 4 ++ beqz a6, L(less_16bytes) ++ srai.d a6, a2, 6 ++ bnez a6, L(more_64bytes) ++ srai.d a6, a2, 5 ++ beqz a6, L(less_32bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a1, 16 ++ ld.d t3, a1, 24 ++ ++ ld.d t4, a4, -32 ++ ld.d t5, a4, -24 ++ ld.d t6, a4, -16 ++ ld.d t7, a4, -8 ++ ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a0, 16 ++ st.d t3, a0, 24 ++ ++ st.d t4, a3, -32 ++ st.d t5, a3, -24 ++ st.d t6, a3, -16 ++ st.d t7, a3, -8 ++ ++ jr ra ++ ++L(less_32bytes): ++ ld.d t0, a1, 0 ++ ld.d t1, a1, 8 ++ ld.d t2, a4, -16 ++ ld.d t3, a4, -8 ++ ++ st.d t0, a0, 0 ++ st.d t1, a0, 8 ++ st.d t2, a3, -16 ++ st.d t3, a3, -8 ++ ++ jr ra ++ ++L(less_16bytes): ++ srai.d a6, a2, 3 ++ beqz a6, L(less_8bytes) ++ ++ ld.d t0, a1, 0 ++ ld.d t1, a4, -8 ++ st.d t0, a0, 0 ++ st.d t1, a3, -8 ++ ++ jr ra ++ ++L(less_8bytes): ++ srai.d a6, a2, 2 ++ beqz a6, L(less_4bytes) ++ ++ ld.w t0, a1, 0 ++ ld.w t1, a4, -4 ++ st.w t0, a0, 0 ++ st.w t1, a3, -4 ++ ++ jr ra ++ ++L(less_4bytes): ++ srai.d a6, a2, 1 ++ beqz a6, L(less_2bytes) ++ ++ ld.h t0, a1, 0 ++ ld.h t1, a4, -2 ++ st.h t0, a0, 0 ++ st.h t1, a3, -2 ++ ++ jr ra ++ ++L(less_2bytes): ++ beqz a2, L(less_1bytes) ++ ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++ ++ jr ra ++ ++L(less_1bytes): ++ jr ra ++ ++L(more_64bytes): ++ sub.d a7, a0, a1 ++ bltu a7, a2, L(copy_backward) ++ ++L(copy_forward): ++ srli.d a0, a0, 3 ++ slli.d a0, a0, 3 ++ beq a0, t8, L(all_align) ++ addi.d a0, a0, 0x8 ++ sub.d a7, t8, a0 ++ sub.d a1, a1, a7 ++ add.d a2, a7, a2 ++ ++L(start_unalign_proc): ++ pcaddi t1, 18 ++ slli.d a6, a7, 3 ++ add.d t1, t1, a6 ++ jr t1 ++ ++ ld.b t0, a1, -7 ++ st.b t0, a0, -7 ++ ld.b t0, a1, -6 ++ st.b t0, a0, -6 ++ ld.b t0, a1, -5 ++ st.b t0, a0, -5 ++ ld.b t0, a1, -4 ++ st.b t0, a0, -4 ++ ld.b t0, a1, -3 ++ st.b t0, a0, -3 ++ ld.b t0, a1, -2 ++ st.b t0, a0, -2 ++ ld.b t0, a1, -1 ++ st.b t0, a0, -1 ++L(start_over): ++ ++ addi.d a2, a2, -0x80 ++ blt a2, zero, L(end_unalign_proc) ++ ++L(loop_less): ++ LD_64(a1, 0) ++ ST_64(a0, 0) ++ LD_64(a1, 64) ++ ST_64(a0, 64) ++ ++ addi.d a0, a0, 0x80 ++ addi.d a1, a1, 0x80 ++ addi.d a2, a2, -0x80 ++ bge a2, zero, L(loop_less) ++ ++L(end_unalign_proc): ++ addi.d a2, a2, 0x80 ++ ++ pcaddi t1, 36 ++ andi t2, a2, 0x78 ++ add.d a1, a1, t2 ++ add.d a0, a0, t2 ++ sub.d t1, t1, t2 ++ jr t1 ++ ++ ld.d t0, a1, -120 ++ st.d t0, a0, -120 ++ ld.d t0, a1, -112 ++ st.d t0, a0, -112 ++ ld.d t0, a1, -104 ++ st.d t0, a0, -104 ++ ld.d t0, a1, -96 ++ st.d t0, a0, -96 ++ ld.d t0, a1, -88 ++ st.d t0, a0, -88 ++ ld.d t0, a1, -80 ++ st.d t0, a0, -80 ++ ld.d t0, a1, -72 ++ st.d t0, a0, -72 ++ ld.d t0, a1, -64 ++ st.d t0, a0, -64 ++ ld.d t0, a1, -56 ++ st.d t0, a0, -56 ++ ld.d t0, a1, -48 ++ st.d t0, a0, -48 ++ ld.d t0, a1, -40 ++ st.d t0, a0, -40 ++ ld.d t0, a1, -32 ++ st.d t0, a0, -32 ++ ld.d t0, a1, -24 ++ st.d 
t0, a0, -24 ++ ld.d t0, a1, -16 ++ st.d t0, a0, -16 ++ ld.d t0, a1, -8 ++ st.d t0, a0, -8 ++ ++ andi a2, a2, 0x7 ++ pcaddi t1, 18 ++ slli.d a2, a2, 3 ++ sub.d t1, t1, a2 ++ jr t1 ++ ++ ld.b t0, a4, -7 ++ st.b t0, a3, -7 ++ ld.b t0, a4, -6 ++ st.b t0, a3, -6 ++ ld.b t0, a4, -5 ++ st.b t0, a3, -5 ++ ld.b t0, a4, -4 ++ st.b t0, a3, -4 ++ ld.b t0, a4, -3 ++ st.b t0, a3, -3 ++ ld.b t0, a4, -2 ++ st.b t0, a3, -2 ++ ld.b t0, a4, -1 ++ st.b t0, a3, -1 ++L(end): ++ move a0, t8 ++ jr ra ++ ++L(all_align): ++ addi.d a1, a1, 0x8 ++ addi.d a0, a0, 0x8 ++ ld.d t0, a1, -8 ++ st.d t0, a0, -8 ++ addi.d a2, a2, -8 ++ b L(start_over) ++ ++L(all_align_back): ++ addi.d a4, a4, -0x8 ++ addi.d a3, a3, -0x8 ++ ld.d t0, a4, 0 ++ st.d t0, a3, 0 ++ addi.d a2, a2, -8 ++ b L(start_over_back) ++ ++L(copy_backward): ++ move a5, a3 ++ srli.d a3, a3, 3 ++ slli.d a3, a3, 3 ++ beq a3, a5, L(all_align_back) ++ sub.d a7, a3, a5 ++ add.d a4, a4, a7 ++ add.d a2, a7, a2 ++ ++ pcaddi t1, 18 ++ slli.d a6, a7, 3 ++ add.d t1, t1, a6 ++ jr t1 ++ ++ ld.b t0, a4, 6 ++ st.b t0, a3, 6 ++ ld.b t0, a4, 5 ++ st.b t0, a3, 5 ++ ld.b t0, a4, 4 ++ st.b t0, a3, 4 ++ ld.b t0, a4, 3 ++ st.b t0, a3, 3 ++ ld.b t0, a4, 2 ++ st.b t0, a3, 2 ++ ld.b t0, a4, 1 ++ st.b t0, a3, 1 ++ ld.b t0, a4, 0 ++ st.b t0, a3, 0 ++L(start_over_back): ++ addi.d a2, a2, -0x80 ++ blt a2, zero, L(end_unalign_proc_back) ++ ++L(loop_less_back): ++ LD_64(a4, -64) ++ ST_64(a3, -64) ++ LD_64(a4, -128) ++ ST_64(a3, -128) ++ ++ addi.d a4, a4, -0x80 ++ addi.d a3, a3, -0x80 ++ addi.d a2, a2, -0x80 ++ bge a2, zero, L(loop_less_back) ++ ++L(end_unalign_proc_back): ++ addi.d a2, a2, 0x80 ++ ++ pcaddi t1, 36 ++ andi t2, a2, 0x78 ++ sub.d a4, a4, t2 ++ sub.d a3, a3, t2 ++ sub.d t1, t1, t2 ++ jr t1 ++ ++ ld.d t0, a4, 112 ++ st.d t0, a3, 112 ++ ld.d t0, a4, 104 ++ st.d t0, a3, 104 ++ ld.d t0, a4, 96 ++ st.d t0, a3, 96 ++ ld.d t0, a4, 88 ++ st.d t0, a3, 88 ++ ld.d t0, a4, 80 ++ st.d t0, a3, 80 ++ ld.d t0, a4, 72 ++ st.d t0, a3, 72 ++ ld.d t0, a4, 64 ++ st.d t0, a3, 64 ++ ld.d t0, a4, 56 ++ st.d t0, a3, 56 ++ ld.d t0, a4, 48 ++ st.d t0, a3, 48 ++ ld.d t0, a4, 40 ++ st.d t0, a3, 40 ++ ld.d t0, a4, 32 ++ st.d t0, a3, 32 ++ ld.d t0, a4, 24 ++ st.d t0, a3, 24 ++ ld.d t0, a4, 16 ++ st.d t0, a3, 16 ++ ld.d t0, a4, 8 ++ st.d t0, a3, 8 ++ ld.d t0, a4, 0 ++ st.d t0, a3, 0 ++ ++ andi a2, a2, 0x7 ++ pcaddi t1, 18 ++ slli.d a2, a2, 3 ++ sub.d t1, t1, a2 ++ jr t1 ++ ++ ld.b t0, a1, 6 ++ st.b t0, a0, 6 ++ ld.b t0, a1, 5 ++ st.b t0, a0, 5 ++ ld.b t0, a1, 4 ++ st.b t0, a0, 4 ++ ld.b t0, a1, 3 ++ st.b t0, a0, 3 ++ ld.b t0, a1, 2 ++ st.b t0, a0, 2 ++ ld.b t0, a1, 1 ++ st.b t0, a0, 1 ++ ld.b t0, a1, 0 ++ st.b t0, a0, 0 ++ ++ move a0, t8 ++ jr ra ++END(MEMMOVE_NAME) ++ ++libc_hidden_builtin_def (MEMMOVE_NAME) ++#endif +diff --git a/sysdeps/loongarch/lp64/multiarch/memmove.c b/sysdeps/loongarch/lp64/multiarch/memmove.c +new file mode 100644 +index 00000000..7e3ca4c4 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/multiarch/memmove.c +@@ -0,0 +1,38 @@ ++/* Multiple versions of memmove. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2023 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define memmove __redirect_memmove ++# include <string.h> ++# undef memmove ++ ++# define SYMBOL_NAME memmove ++# include "ifunc-lasx.h" ++ ++libc_ifunc_redirected (__redirect_memmove, __libc_memmove, ++ IFUNC_SELECTOR ()); ++strong_alias (__libc_memmove, memmove); ++ ++# ifdef SHARED ++__hidden_ver1 (__libc_memmove, __GI_memmove, __redirect_memmove) ++ __attribute__ ((visibility ("hidden"))); ++# endif ++ ++#endif +-- +2.33.0 + |
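The memcpy.c and memmove.c wrappers above hook the selector in through glibc's ifunc machinery (libc_ifunc_redirected): the dynamic linker runs the resolver once during relocation and binds the symbol to whichever implementation it returns, so ordinary calls to memcpy/memmove pay no per-call dispatch cost. Below is a minimal sketch of that pattern using GCC's ifunc attribute outside glibc; all names are hypothetical stand-ins, not symbols from the patch.

/* Sketch of the ifunc pattern the patch relies on (GCC/Linux).  */
#include <stddef.h>
#include <string.h>
#include <sys/auxv.h>

static void *
copy_scalar (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);   /* stand-in for a plain aligned copy */
}

static void *
copy_vector (void *dst, const void *src, size_t n)
{
  return memcpy (dst, src, n);   /* stand-in for an LSX/LASX copy */
}

/* Resolver: executed by the dynamic linker at relocation time, before main.
   Real glibc resolvers avoid arbitrary library calls and use precomputed
   CPU-feature data; calling getauxval here is a simplification, and the
   "1UL << 4" LSX bit is an assumed Linux hwcap position.  */
static void *(*resolve_my_copy (void)) (void *, const void *, size_t)
{
  return (getauxval (AT_HWCAP) & (1UL << 4)) ? copy_vector : copy_scalar;
}

/* Callers see one symbol; the linker binds it to the resolver's choice.  */
void *my_copy (void *dst, const void *src, size_t n)
  __attribute__ ((ifunc ("resolve_my_copy")));

The ifunc-impl-list.c additions in the patch serve a related purpose: they let the string tests and benchtests enumerate every candidate implementation rather than only the one the resolver would pick on the current machine.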