Diffstat (limited to '1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch'):
 1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch | 693 ++++++++++
 1 file changed, 693 insertions(+), 0 deletions(-)
diff --git a/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
new file mode 100644
index 0000000..5413394
--- /dev/null
+++ b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
@@ -0,0 +1,693 @@
+From 939b5ed88b61d03bae6d20bf97ad0f77f9b110bb Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:20:26 +0800
+Subject: [PATCH 1/6] LoongArch: Optimize string functions memcpy, memmove.
+
+Change-Id: Ib0e78d062082a657d5bf572403f19bf5bfe0a28d
+---
+ sysdeps/loongarch/lp64/memcpy.S  | 259 ++++++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S | 406 +++++++++++++++++++++++++++++++
+ 2 files changed, 665 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5d850123
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,259 @@
++/* Optimized memcpy implementation for LoongArch.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library.  If not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired.  */
++#ifndef MEMCPY_NAME
++#define MEMCPY_NAME memcpy
++#endif
++
++#define LD_64(reg, n) \
++    ld.d t0, reg, n; \
++    ld.d t1, reg, n+8; \
++    ld.d t2, reg, n+16; \
++    ld.d t3, reg, n+24; \
++    ld.d t4, reg, n+32; \
++    ld.d t5, reg, n+40; \
++    ld.d t6, reg, n+48; \
++    ld.d t7, reg, n+56;
++
++#define ST_64(reg, n) \
++    st.d t0, reg, n; \
++    st.d t1, reg, n+8; \
++    st.d t2, reg, n+16; \
++    st.d t3, reg, n+24; \
++    st.d t4, reg, n+32; \
++    st.d t5, reg, n+40; \
++    st.d t6, reg, n+48; \
++    st.d t7, reg, n+56;
++
++LEAF(MEMCPY_NAME)
++//1st var: dst ptr: void *a1 $r4 a0
++//2nd var: src ptr: void *a2 $r5 a1
++//3rd var: size_t len $r6 a2
++//t0~t9 registers as temp
++
++    add.d a4, a1, a2
++    add.d a3, a0, a2
++    li.w a6, 16
++    bge a6, a2, less_16bytes
++    li.w a6, 128
++    blt a6, a2, long_bytes
++    li.w a6, 64
++    blt a6, a2, more_64bytes
++    li.w a6, 32
++    blt a6, a2, more_32bytes
++
++    /* 17...32 */
++    ld.d t0, a1, 0
++    ld.d t1, a1, 8
++    ld.d t2, a4, -16
++    ld.d t3, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a0, 8
++    st.d t2, a3, -16
++    st.d t3, a3, -8
++    jr ra
++
++more_64bytes:
++    srli.d t8, a0, 3
++    slli.d t8, t8, 3
++    addi.d t8, t8, 0x8
++    sub.d a7, a0, t8
++    ld.d t0, a1, 0
++    sub.d a1, a1, a7
++    st.d t0, a0, 0
++
++    add.d a7, a7, a2
++    addi.d a7, a7, -0x20
++loop_32:
++    ld.d t0, a1, 0
++    ld.d t1, a1, 8
++    ld.d t2, a1, 16
++    ld.d t3, a1, 24
++    st.d t0, t8, 0
++    st.d t1, t8, 8
++    st.d t2, t8, 16
++    st.d t3, t8, 24
++
++    addi.d t8, t8, 0x20
++    addi.d a1, a1, 0x20
++    addi.d a7, a7, -0x20
++    blt zero, a7, loop_32
++
++    ld.d t4, a4, -32
++    ld.d t5, a4, -24
++    ld.d t6, a4, -16
++    ld.d t7, a4, -8
++    st.d t4, a3, -32
++    st.d t5, a3, -24
++    st.d t6, a3, -16
++    st.d t7, a3, -8
++
++    jr ra
++
++more_32bytes:
++    /* 33...64 */
++    ld.d t0, a1, 0
++    ld.d t1, a1, 8
++    ld.d t2, a1, 16
++    ld.d t3, a1, 24
++    ld.d t4, a4, -32
++    ld.d t5, a4, -24
++    ld.d t6, a4, -16
++    ld.d t7, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a0, 8
++    st.d t2, a0, 16
++    st.d t3, a0, 24
++    st.d t4, a3, -32
++    st.d t5, a3, -24
++    st.d t6, a3, -16
++    st.d t7, a3, -8
++    jr ra
++
++less_16bytes:
++    srai.d a6, a2, 3
++    beqz a6, less_8bytes
++
++    /* 8...16 */
++    ld.d t0, a1, 0
++    ld.d t1, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a3, -8
++
++    jr ra
++
++less_8bytes:
++    srai.d a6, a2, 2
++    beqz a6, less_4bytes
++
++    /* 4...7 */
++    ld.w t0, a1, 0
++    ld.w t1, a4, -4
++    st.w t0, a0, 0
++    st.w t1, a3, -4
++    jr ra
++
++less_4bytes:
++    srai.d a6, a2, 1
++    beqz a6, less_2bytes
++
++    /* 2...3 */
++    ld.h t0, a1, 0
++    ld.h t1, a4, -2
++    st.h t0, a0, 0
++    st.h t1, a3, -2
++    jr ra
++
++less_2bytes:
++    beqz a2, less_1bytes
++
++    ld.b t0, a1, 0
++    st.b t0, a0, 0
++    jr ra
++
++less_1bytes:
++    jr ra
++
++long_bytes:
++    srli.d t8, a0, 3
++    slli.d t8, t8, 3
++    beq a0, t8, start
++
++    ld.d t0, a1, 0
++    addi.d t8, t8, 0x8
++    st.d t0, a0, 0
++    sub.d a7, a0, t8
++    sub.d a1, a1, a7
++
++start:
++    addi.d a5, a3, -0x80
++    blt a5, t8, align_end_proc
++
++loop_128:
++    LD_64(a1, 0)
++    ST_64(t8, 0)
++    LD_64(a1, 64)
++    addi.d a1, a1, 0x80
++    ST_64(t8, 64)
++    addi.d t8, t8, 0x80
++    bge a5, t8, loop_128
++
++align_end_proc:
++    sub.d a2, a3, t8
++
++    pcaddi t1, 34
++    andi t2, a2, 0x78
++    sub.d t1, t1, t2
++    jirl zero, t1, 0
++
++end_120_128_unalign:
++    ld.d t0, a1, 112
++    st.d t0, t8, 112
++end_112_120_unalign:
++    ld.d t0, a1, 104
++    st.d t0, t8, 104
++end_104_112_unalign:
++    ld.d t0, a1, 96
++    st.d t0, t8, 96
++end_96_104_unalign:
++    ld.d t0, a1, 88
++    st.d t0, t8, 88
++end_88_96_unalign:
++    ld.d t0, a1, 80
++    st.d t0, t8, 80
++end_80_88_unalign:
++    ld.d t0, a1, 72
++    st.d t0, t8, 72
++end_72_80_unalign:
++    ld.d t0, a1, 64
++    st.d t0, t8, 64
++end_64_72_unalign:
++    ld.d t0, a1, 56
++    st.d t0, t8, 56
++end_56_64_unalign:
++    ld.d t0, a1, 48
++    st.d t0, t8, 48
++end_48_56_unalign:
++    ld.d t0, a1, 40
++    st.d t0, t8, 40
++end_40_48_unalign:
++    ld.d t0, a1, 32
++    st.d t0, t8, 32
++end_32_40_unalign:
++    ld.d t0, a1, 24
++    st.d t0, t8, 24
++end_24_32_unalign:
++    ld.d t0, a1, 16
++    st.d t0, t8, 16
++end_16_24_unalign:
++    ld.d t0, a1, 8
++    st.d t0, t8, 8
++end_8_16_unalign:
++    ld.d t0, a1, 0
++    st.d t0, t8, 0
++end_0_8_unalign:
++    ld.d t0, a4, -8
++    st.d t0, a3, -8
++
++    jr ra
++
++END(MEMCPY_NAME)
++libc_hidden_builtin_def (MEMCPY_NAME)
+diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
+new file mode 100644
+index 00000000..edd9cf3d
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memmove.S
+@@ -0,0 +1,406 @@
++/* Optimized memmove implementation for LoongArch.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library.  If not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired.  */
++#ifndef MEMMOVE_NAME
++#define MEMMOVE_NAME memmove
++#endif
++
++#define LD_64(reg, n) \
++    ld.d t0, reg, n; \
++    ld.d t1, reg, n+8; \
++    ld.d t2, reg, n+16; \
++    ld.d t3, reg, n+24; \
++    ld.d t4, reg, n+32; \
++    ld.d t5, reg, n+40; \
++    ld.d t6, reg, n+48; \
++    ld.d t7, reg, n+56;
++
++
++#define ST_64(reg, n) \
++    st.d t0, reg, n; \
++    st.d t1, reg, n+8; \
++    st.d t2, reg, n+16; \
++    st.d t3, reg, n+24; \
++    st.d t4, reg, n+32; \
++    st.d t5, reg, n+40; \
++    st.d t6, reg, n+48; \
++    st.d t7, reg, n+56;
++
++/* memmove (const void *dst, const void *src, size_t n) */
++LEAF(MEMMOVE_NAME)
++    add.d a4, a1, a2
++    add.d a3, a0, a2
++    beq a1, a0, less_1bytes
++    move t8, a0
++    srai.d a6, a2, 4 #num/16
++    beqz a6, less_16bytes #num<16
++    srai.d a6, a2, 6 #num/64
++    bnez a6, more_64bytes #num>64
++    srai.d a6, a2, 5
++    beqz a6, less_32bytes #num<32
++
++    ld.d t0, a1, 0 #32<num<64
++    ld.d t1, a1, 8
++    ld.d t2, a1, 16
++    ld.d t3, a1, 24
++    ld.d t4, a4, -32
++    ld.d t5, a4, -24
++    ld.d t6, a4, -16
++    ld.d t7, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a0, 8
++    st.d t2, a0, 16
++    st.d t3, a0, 24
++    st.d t4, a3, -32
++    st.d t5, a3, -24
++    st.d t6, a3, -16
++    st.d t7, a3, -8
++
++    jr ra
++
++less_32bytes:
++    ld.d t0, a1, 0
++    ld.d t1, a1, 8
++    ld.d t2, a4, -16
++    ld.d t3, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a0, 8
++    st.d t2, a3, -16
++    st.d t3, a3, -8
++
++    jr ra
++
++less_16bytes:
++    srai.d a6, a2, 3 #num/8
++    beqz a6, less_8bytes
++
++    ld.d t0, a1, 0
++    ld.d t1, a4, -8
++    st.d t0, a0, 0
++    st.d t1, a3, -8
++
++    jr ra
++
++less_8bytes:
++    srai.d a6, a2, 2
++    beqz a6, less_4bytes
++
++    ld.w t0, a1, 0
++    ld.w t1, a4, -4
++    st.w t0, a0, 0
++    st.w t1, a3, -4
++
++    jr ra
++
++less_4bytes:
++    srai.d a6, a2, 1
++    beqz a6, less_2bytes
++
++    ld.h t0, a1, 0
++    ld.h t1, a4, -2
++    st.h t0, a0, 0
++    st.h t1, a3, -2
++
++    jr ra
++
++less_2bytes:
++    beqz a2, less_1bytes
++
++    ld.b t0, a1, 0
++    st.b t0, a0, 0
++
++    jr ra
++
++less_1bytes:
++    jr ra
++
++more_64bytes:
++    sub.d a7, a0, a1
++    bltu a7, a2, copy_backward
++
++copy_forward:
++    srli.d a0, a0, 3
++    slli.d a0, a0, 3
++    beq a0, t8, all_align
++    addi.d a0, a0, 0x8
++    sub.d a7, t8, a0
++    sub.d a1, a1, a7
++    add.d a2, a7, a2
++
++start_unalign_proc:
++    pcaddi t1, 18
++    slli.d a6, a7, 3
++    add.d t1, t1, a6
++    jirl zero, t1, 0
++
++start_7_unalign:
++    ld.b t0, a1, -7
++    st.b t0, a0, -7
++start_6_unalign:
++    ld.b t0, a1, -6
++    st.b t0, a0, -6
++start_5_unalign:
++    ld.b t0, a1, -5
++    st.b t0, a0, -5
++start_4_unalign:
++    ld.b t0, a1, -4
++    st.b t0, a0, -4
++start_3_unalign:
++    ld.b t0, a1, -3
++    st.b t0, a0, -3
++start_2_unalign:
++    ld.b t0, a1, -2
++    st.b t0, a0, -2
++start_1_unalign:
++    ld.b t0, a1, -1
++    st.b t0, a0, -1
++start_over:
++
++    addi.d a2, a2, -0x80
++    blt a2, zero, end_unalign_proc
++
++loop_less:
++    LD_64(a1, 0)
++    ST_64(a0, 0)
++    LD_64(a1, 64)
++    ST_64(a0, 64)
++
++    addi.d a0, a0, 0x80
++    addi.d a1, a1, 0x80
++    addi.d a2, a2, -0x80
++    bge a2, zero, loop_less
++
++end_unalign_proc:
++    addi.d a2, a2, 0x80
++
++    pcaddi t1, 36
++    andi t2, a2, 0x78
++    add.d a1, a1, t2
++    add.d a0, a0, t2
++    sub.d t1, t1, t2
++    jirl zero, t1, 0
++
++end_120_128_unalign:
++    ld.d t0, a1, -120
++    st.d t0, a0, -120
++end_112_120_unalign:
++    ld.d t0, a1, -112
++    st.d t0, a0, -112
++end_104_112_unalign:
++    ld.d t0, a1, -104
++    st.d t0, a0, -104
++end_96_104_unalign:
++    ld.d t0, a1, -96
++    st.d t0, a0, -96
++end_88_96_unalign:
++    ld.d t0, a1, -88
++    st.d t0, a0, -88
++end_80_88_unalign:
++    ld.d t0, a1, -80
++    st.d t0, a0, -80
++end_72_80_unalign:
++    ld.d t0, a1, -72
++    st.d t0, a0, -72
++end_64_72_unalign:
++    ld.d t0, a1, -64
++    st.d t0, a0, -64
++end_56_64_unalign:
++    ld.d t0, a1, -56
++    st.d t0, a0, -56
++end_48_56_unalign:
++    ld.d t0, a1, -48
++    st.d t0, a0, -48
++end_40_48_unalign:
++    ld.d t0, a1, -40
++    st.d t0, a0, -40
++end_32_40_unalign:
++    ld.d t0, a1, -32
++    st.d t0, a0, -32
++end_24_32_unalign:
++    ld.d t0, a1, -24
++    st.d t0, a0, -24
++end_16_24_unalign:
++    ld.d t0, a1, -16
++    st.d t0, a0, -16
++end_8_16_unalign:
++    ld.d t0, a1, -8
++    st.d t0, a0, -8
++end_0_8_unalign:
++
++    andi a2, a2, 0x7
++    pcaddi t1, 18
++    slli.d a2, a2, 3
++    sub.d t1, t1, a2
++    jirl zero, t1, 0
++
++end_7_unalign:
++    ld.b t0, a4, -7
++    st.b t0, a3, -7
++end_6_unalign:
++    ld.b t0, a4, -6
++    st.b t0, a3, -6
++end_5_unalign:
++    ld.b t0, a4, -5
++    st.b t0, a3, -5
++end_4_unalign:
++    ld.b t0, a4, -4
++    st.b t0, a3, -4
++end_3_unalign:
++    ld.b t0, a4, -3
++    st.b t0, a3, -3
++end_2_unalign:
++    ld.b t0, a4, -2
++    st.b t0, a3, -2
++end_1_unalign:
++    ld.b t0, a4, -1
++    st.b t0, a3, -1
++end:
++
++    move v0, t8
++    jr ra
++
++all_align:
++    addi.d a1, a1, 0x8
++    addi.d a0, a0, 0x8
++    ld.d t0, a1, -8
++    st.d t0, a0, -8
++    addi.d a2, a2, -8
++    b start_over
++
++all_align_back:
++    addi.d a4, a4, -0x8
++    addi.d a3, a3, -0x8
++    ld.d t0, a4, 0
++    st.d t0, a3, 0
++    addi.d a2, a2, -8
++    b start_over_back
++
++copy_backward:
++    move a5, a3
++    srli.d a3, a3, 3
++    slli.d a3, a3, 3
++    beq a3, a5, all_align_back
++    sub.d a7, a3, a5
++    add.d a4, a4, a7
++    add.d a2, a7, a2
++
++    pcaddi t1, 18
++    slli.d a6, a7, 3
++    add.d t1, t1, a6
++    jirl zero, t1, 0
++
++    ld.b t0, a4, 6
++    st.b t0, a3, 6
++    ld.b t0, a4, 5
++    st.b t0, a3, 5
++    ld.b t0, a4, 4
++    st.b t0, a3, 4
++    ld.b t0, a4, 3
++    st.b t0, a3, 3
++    ld.b t0, a4, 2
++    st.b t0, a3, 2
++    ld.b t0, a4, 1
++    st.b t0, a3, 1
++    ld.b t0, a4, 0
++    st.b t0, a3, 0
++start_over_back:
++
++    addi.d a2, a2, -0x80
++    blt a2, zero, end_unalign_proc_back
++
++loop_less_back:
++    LD_64(a4, -64)
++    ST_64(a3, -64)
++    LD_64(a4, -128)
++    ST_64(a3, -128)
++
++    addi.d a4, a4, -0x80
++    addi.d a3, a3, -0x80
++    addi.d a2, a2, -0x80
++    bge a2, zero, loop_less_back
++
++end_unalign_proc_back:
++    addi.d a2, a2, 0x80
++
++    pcaddi t1, 36
++    andi t2, a2, 0x78
++    sub.d a4, a4, t2
++    sub.d a3, a3, t2
++    sub.d t1, t1, t2
++    jirl zero, t1, 0
++
++    ld.d t0, a4, 112
++    st.d t0, a3, 112
++    ld.d t0, a4, 104
++    st.d t0, a3, 104
++    ld.d t0, a4, 96
++    st.d t0, a3, 96
++    ld.d t0, a4, 88
++    st.d t0, a3, 88
++    ld.d t0, a4, 80
++    st.d t0, a3, 80
++    ld.d t0, a4, 72
++    st.d t0, a3, 72
++    ld.d t0, a4, 64
++    st.d t0, a3, 64
++    ld.d t0, a4, 56
++    st.d t0, a3, 56
++    ld.d t0, a4, 48
++    st.d t0, a3, 48
++    ld.d t0, a4, 40
++    st.d t0, a3, 40
++    ld.d t0, a4, 32
++    st.d t0, a3, 32
++    ld.d t0, a4, 24
++    st.d t0, a3, 24
++    ld.d t0, a4, 16
++    st.d t0, a3, 16
++    ld.d t0, a4, 8
++    st.d t0, a3, 8
++    ld.d t0, a4, 0
++    st.d t0, a3, 0
++
++    andi a2, a2, 0x7
++    pcaddi t1, 18
++    slli.d a2, a2, 3
++    sub.d t1, t1, a2
++    jirl zero, t1, 0
++
++    ld.b t0, a1, 6
++    st.b t0, a0, 6
++    ld.b t0, a1, 5
++    st.b t0, a0, 5
++    ld.b t0, a1, 4
++    st.b t0, a0, 4
++    ld.b t0, a1, 3
++    st.b t0, a0, 3
++    ld.b t0, a1, 2
++    st.b t0, a0, 2
++    ld.b t0, a1, 1
++    st.b t0, a0, 1
++    ld.b t0, a1, 0
++    st.b t0, a0, 0
++
++    move v0, t8
++    jr ra
++
++END(MEMMOVE_NAME)
++libc_hidden_builtin_def (MEMMOVE_NAME)
+--
+2.33.0
+
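
Two techniques in the patch above are worth spelling out. First, `more_64bytes` in
memmove.S picks the copy direction with a single unsigned comparison: `sub.d a7, a0, a1`
followed by `bltu a7, a2, copy_backward` branches to the backward path exactly when the
destination starts inside the source region. Second, the small-size paths (for example
the `17...32` block in memcpy.S) issue all loads before any store, so the leading and
trailing chunks may overlap in the middle of the region without reading back freshly
stored data. A minimal C sketch of both ideas follows; it is illustrative only, and the
names `sketch_memmove` and `sketch_copy_17_32` are invented here, not part of the patch:

    /* Illustrative sketch only; not the glibc implementation.  */
    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Direction test from `more_64bytes': (uintptr_t) (d - s) < n holds
       exactly when d lies in [s, s + n), the only case where a forward
       copy would clobber source bytes before they are read.  (Subtracting
       unrelated pointers is technically undefined in ISO C; the assembly
       has no such restriction.)  */
    void *
    sketch_memmove (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;
      if ((uintptr_t) (d - s) < (uintptr_t) n)
        while (n--)                       /* overlap: copy backward */
          d[n] = s[n];
      else
        for (size_t i = 0; i < n; i++)    /* no overlap: copy forward */
          d[i] = s[i];
      return dst;
    }

    /* The `17...32' path of memcpy.S: four loads, then four stores, so the
       first and last 16-byte chunks may overlap for 17 <= n <= 32.  */
    void
    sketch_copy_17_32 (unsigned char *d, const unsigned char *s, size_t n)
    {
      uint64_t w0, w1, w2, w3;
      memcpy (&w0, s, 8);               /* ld.d t0, a1, 0   */
      memcpy (&w1, s + 8, 8);           /* ld.d t1, a1, 8   */
      memcpy (&w2, s + n - 16, 8);      /* ld.d t2, a4, -16 */
      memcpy (&w3, s + n - 8, 8);       /* ld.d t3, a4, -8  */
      memcpy (d, &w0, 8);               /* st.d t0, a0, 0   */
      memcpy (d + 8, &w1, 8);           /* st.d t1, a0, 8   */
      memcpy (d + n - 16, &w2, 8);      /* st.d t2, a3, -16 */
      memcpy (d + n - 8, &w3, 8);       /* st.d t3, a3, -8  */
    }

The `pcaddi`/`jirl` pairs in the tails serve the same purpose without a loop: each
8-byte copy stub is exactly two 4-byte instructions, so the remaining byte count
(masked with `0x78`) doubles as an instruction offset, and the computed jump lands
on exactly as many stubs as there are 8-byte chunks left to copy.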