Diffstat (limited to '1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch')
 1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch | 693 ++++++++
 1 file changed, 693 insertions(+), 0 deletions(-)
diff --git a/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
new file mode 100644
index 0000000..5413394
--- /dev/null
+++ b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
@@ -0,0 +1,693 @@
+From 939b5ed88b61d03bae6d20bf97ad0f77f9b110bb Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:20:26 +0800
+Subject: [PATCH 1/6] LoongArch: Optimize string functions memcpy, memmove.
+
+Change-Id: Ib0e78d062082a657d5bf572403f19bf5bfe0a28d
+---
+ sysdeps/loongarch/lp64/memcpy.S | 259 ++++++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S | 406 +++++++++++++++++++++++++++++++
+ 2 files changed, 665 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5d850123
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,259 @@
++/* Optimized memcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMCPY_NAME
++#define MEMCPY_NAME memcpy
++#endif
++
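++/* LD_64 (resp. ST_64) loads (stores) a 64-byte block at offset n from reg
++   as eight consecutive doublewords staged through t0-t7.  */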
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++LEAF(MEMCPY_NAME)
++/* Arguments:
++     a0 ($r4): void *dst
++     a1 ($r5): const void *src
++     a2 ($r6): size_t len
++   t0-t8 are used as temporaries.  */
++
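++ /* a4 = src + len, a3 = dst + len.  Dispatch on len:
++    <= 16, 17..32 (fall through), 33..64, 65..128, > 128.  */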
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ li.w a6, 16
++ bge a6, a2, less_16bytes
++ li.w a6, 128
++ blt a6, a2, long_bytes
++ li.w a6, 64
++ blt a6, a2, more_64bytes
++ li.w a6, 32
++ blt a6, a2, more_32bytes
++
++ /* 17...32 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++ jr ra
++
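++ /* 65..128 bytes: copy the first 8 bytes, round dst up to an 8-byte
++    boundary (t8) and advance src by the same amount, copy 32 bytes per
++    iteration, then finish with a possibly overlapping copy of the last
++    32 bytes.  */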
++more_64bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ addi.d t8, t8, 0x8
++ sub.d a7, a0, t8
++ ld.d t0, a1, 0
++ sub.d a1, a1, a7
++ st.d t0, a0, 0
++
++ add.d a7, a7, a2
++ addi.d a7, a7, -0x20
++loop_32:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ st.d t0, t8, 0
++ st.d t1, t8, 8
++ st.d t2, t8, 16
++ st.d t3, t8, 24
++
++ addi.d t8, t8, 0x20
++ addi.d a1, a1, 0x20
++ addi.d a7, a7, -0x20
++ blt zero, a7, loop_32
++
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++more_32bytes:
++ /* 33...64 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3
++ beqz a6, less_8bytes
++
++ /* 8...16 */
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ /* 4...7 */
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ /* 2...3 */
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++ jr ra
++
++less_1bytes:
++ jr ra
++
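++ /* More than 128 bytes: copy the unaligned head with one possibly
++    overlapping 8-byte store, round dst up to an 8-byte boundary (t8) and
++    advance src accordingly, then copy 128 bytes per iteration while at
++    least 128 bytes remain before the end of dst.  */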
++long_bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ beq a0, t8, start
++
++ ld.d t0, a1, 0
++ addi.d t8, t8, 0x8
++ st.d t0, a0, 0
++ sub.d a7, a0, t8
++ sub.d a1, a1, a7
++
++start:
++ addi.d a5, a3, -0x80
++ blt a5, t8, align_end_proc
++
++loop_128:
++ LD_64(a1, 0)
++ ST_64(t8, 0)
++ LD_64(a1, 64)
++ addi.d a1, a1, 0x80
++ ST_64(t8, 64)
++ addi.d t8, t8, 0x80
++ bge a5, t8, loop_128
++
++align_end_proc:
++ sub.d a2, a3, t8
++
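++ /* Tail copy in the style of a Duff's device: each ld.d/st.d pair below
++    is 8 bytes of code and copies 8 bytes of data, so jumping to
++    (end of table - (remaining & 0x78)) selects just enough pairs to copy
++    the remaining whole doublewords; the final pair copies the last
++    8 bytes from the end of the buffers to cover any remainder.  */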
++ pcaddi t1, 34
++ andi t2, a2, 0x78
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, 112
++ st.d t0, t8, 112
++end_112_120_unalign:
++ ld.d t0, a1, 104
++ st.d t0, t8, 104
++end_104_112_unalign:
++ ld.d t0, a1, 96
++ st.d t0, t8, 96
++end_96_104_unalign:
++ ld.d t0, a1, 88
++ st.d t0, t8, 88
++end_88_96_unalign:
++ ld.d t0, a1, 80
++ st.d t0, t8, 80
++end_80_88_unalign:
++ ld.d t0, a1, 72
++ st.d t0, t8, 72
++end_72_80_unalign:
++ ld.d t0, a1, 64
++ st.d t0, t8, 64
++end_64_72_unalign:
++ ld.d t0, a1, 56
++ st.d t0, t8, 56
++end_56_64_unalign:
++ ld.d t0, a1, 48
++ st.d t0, t8, 48
++end_48_56_unalign:
++ ld.d t0, a1, 40
++ st.d t0, t8, 40
++end_40_48_unalign:
++ ld.d t0, a1, 32
++ st.d t0, t8, 32
++end_32_40_unalign:
++ ld.d t0, a1, 24
++ st.d t0, t8, 24
++end_24_32_unalign:
++ ld.d t0, a1, 16
++ st.d t0, t8, 16
++end_16_24_unalign:
++ ld.d t0, a1, 8
++ st.d t0, t8, 8
++end_8_16_unalign:
++ ld.d t0, a1, 0
++ st.d t0, t8, 0
++end_0_8_unalign:
++ ld.d t0, a4, -8
++ st.d t0, a3, -8
++
++ jr ra
++
++END(MEMCPY_NAME)
++libc_hidden_builtin_def (MEMCPY_NAME)
+diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
+new file mode 100644
+index 00000000..edd9cf3d
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memmove.S
+@@ -0,0 +1,406 @@
++/* Optimized memmove implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMMOVE_NAME
++#define MEMMOVE_NAME memmove
++#endif
++
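++/* LD_64 (resp. ST_64) loads (stores) a 64-byte block at offset n from reg
++   as eight consecutive doublewords staged through t0-t7.  */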
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++/* void *memmove (void *dst, const void *src, size_t n).  */
++LEAF(MEMMOVE_NAME)
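++ /* a4 = src + len, a3 = dst + len; t8 keeps the original dst, which is
++    also the return value.  */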
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ beq a1, a0, less_1bytes
++ move t8, a0
++ srai.d a6, a2, 4 # len / 16
++ beqz a6, less_16bytes # len < 16
++ srai.d a6, a2, 6 # len / 64
++ bnez a6, more_64bytes # len >= 64
++ srai.d a6, a2, 5 # len / 32
++ beqz a6, less_32bytes # len < 32
++
++ ld.d t0, a1, 0 # 32 <= len < 64
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++less_32bytes:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3 #num/8
++ beqz a6, less_8bytes
++
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++
++ jr ra
++
++less_1bytes:
++ jr ra
++
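++ /* If dst - src, taken as an unsigned value, is smaller than len, dst
++    overlaps the upper part of src and the copy must run backward;
++    otherwise a forward copy is safe.  */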
++more_64bytes:
++ sub.d a7, a0, a1
++ bltu a7, a2, copy_backward
++
++copy_forward:
++ srli.d a0, a0, 3
++ slli.d a0, a0, 3
++ beq a0, t8, all_align
++ addi.d a0, a0, 0x8
++ sub.d a7, t8, a0
++ sub.d a1, a1, a7
++ add.d a2, a7, a2
++
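++ /* Copy the head bytes up to the next 8-byte boundary of dst one byte at
++    a time: each ld.b/st.b pair below is 8 bytes of code, so jumping
++    backwards from start_over by 8 * (number of head bytes) selects just
++    enough pairs.  */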
++start_unalign_proc:
++ pcaddi t1, 18
++ slli.d a6, a7, 3
++ add.d t1, t1, a6
++ jirl zero, t1, 0
++
++start_7_unalign:
++ ld.b t0, a1, -7
++ st.b t0, a0, -7
++start_6_unalign:
++ ld.b t0, a1, -6
++ st.b t0, a0, -6
++start_5_unalign:
++ ld.b t0, a1, -5
++ st.b t0, a0, -5
++start_4_unalign:
++ ld.b t0, a1, -4
++ st.b t0, a0, -4
++start_3_unalign:
++ ld.b t0, a1, -3
++ st.b t0, a0, -3
++start_2_unalign:
++ ld.b t0, a1, -2
++ st.b t0, a0, -2
++start_1_unalign:
++ ld.b t0, a1, -1
++ st.b t0, a0, -1
++start_over:
++
++ addi.d a2, a2, -0x80
++ blt a2, zero, end_unalign_proc
++
++loop_less:
++ LD_64(a1, 0)
++ ST_64(a0, 0)
++ LD_64(a1, 64)
++ ST_64(a0, 64)
++
++ addi.d a0, a0, 0x80
++ addi.d a1, a1, 0x80
++ addi.d a2, a2, -0x80
++ bge a2, zero, loop_less
++
++end_unalign_proc:
++ addi.d a2, a2, 0x80
++
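++ /* Copy the remaining whole doublewords (len & 0x78 bytes): advance src
++    and dst past them and jump backwards into the ld.d/st.d table below,
++    which copies them at negative offsets, one 8-byte instruction pair per
++    8 bytes of data.  */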
++ pcaddi t1, 36
++ andi t2, a2, 0x78
++ add.d a1, a1, t2
++ add.d a0, a0, t2
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, -120
++ st.d t0, a0, -120
++end_112_120_unalign:
++ ld.d t0, a1, -112
++ st.d t0, a0, -112
++end_104_112_unalign:
++ ld.d t0, a1, -104
++ st.d t0, a0, -104
++end_96_104_unalign:
++ ld.d t0, a1, -96
++ st.d t0, a0, -96
++end_88_96_unalign:
++ ld.d t0, a1, -88
++ st.d t0, a0, -88
++end_80_88_unalign:
++ ld.d t0, a1, -80
++ st.d t0, a0, -80
++end_72_80_unalign:
++ ld.d t0, a1, -72
++ st.d t0, a0, -72
++end_64_72_unalign:
++ ld.d t0, a1, -64
++ st.d t0, a0, -64
++end_56_64_unalign:
++ ld.d t0, a1, -56
++ st.d t0, a0, -56
++end_48_56_unalign:
++ ld.d t0, a1, -48
++ st.d t0, a0, -48
++end_40_48_unalign:
++ ld.d t0, a1, -40
++ st.d t0, a0, -40
++end_32_40_unalign:
++ ld.d t0, a1, -32
++ st.d t0, a0, -32
++end_24_32_unalign:
++ ld.d t0, a1, -24
++ st.d t0, a0, -24
++end_16_24_unalign:
++ ld.d t0, a1, -16
++ st.d t0, a0, -16
++end_8_16_unalign:
++ ld.d t0, a1, -8
++ st.d t0, a0, -8
++end_0_8_unalign:
++
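++ /* Copy the last len & 7 bytes from the ends of the buffers (a4/a3) via
++    another computed jump into the byte-copy table below.  */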
++ andi a2, a2, 0x7
++ pcaddi t1, 18
++ slli.d a2, a2, 3
++ sub.d t1, t1, a2
++ jirl zero, t1, 0
++
++end_7_unalign:
++ ld.b t0, a4, -7
++ st.b t0, a3, -7
++end_6_unalign:
++ ld.b t0, a4, -6
++ st.b t0, a3, -6
++end_5_unalign:
++ ld.b t0, a4, -5
++ st.b t0, a3, -5
++end_4_unalign:
++ ld.b t0, a4, -4
++ st.b t0, a3, -4
++end_3_unalign:
++ ld.b t0, a4, -3
++ st.b t0, a3, -3
++end_2_unalign:
++ ld.b t0, a4, -2
++ st.b t0, a3, -2
++end_1_unalign:
++ ld.b t0, a4, -1
++ st.b t0, a3, -1
++end:
++
++ move v0, t8
++ jr ra
++
++all_align:
++ addi.d a1, a1, 0x8
++ addi.d a0, a0, 0x8
++ ld.d t0, a1, -8
++ st.d t0, a0, -8
++ addi.d a2, a2, -8
++ b start_over
++
++all_align_back:
++ addi.d a4, a4, -0x8
++ addi.d a3, a3, -0x8
++ ld.d t0, a4, 0
++ st.d t0, a3, 0
++ addi.d a2, a2, -8
++ b start_over_back
++
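++ /* Backward copy: round dst + len down to an 8-byte boundary (a3), copy
++    the tail bytes above it first, then copy 128 bytes per iteration
++    moving downward.  */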
++copy_backward:
++ move a5, a3
++ srli.d a3, a3, 3
++ slli.d a3, a3, 3
++ beq a3, a5, all_align_back
++ sub.d a7, a3, a5
++ add.d a4, a4, a7
++ add.d a2, a7, a2
++
++ pcaddi t1, 18
++ slli.d a6, a7, 3
++ add.d t1, t1, a6
++ jirl zero, t1, 0
++
++ ld.b t0, a4, 6
++ st.b t0, a3, 6
++ ld.b t0, a4, 5
++ st.b t0, a3, 5
++ ld.b t0, a4, 4
++ st.b t0, a3, 4
++ ld.b t0, a4, 3
++ st.b t0, a3, 3
++ ld.b t0, a4, 2
++ st.b t0, a3, 2
++ ld.b t0, a4, 1
++ st.b t0, a3, 1
++ ld.b t0, a4, 0
++ st.b t0, a3, 0
++start_over_back:
++
++ addi.d a2, a2, -0x80
++ blt a2, zero, end_unalign_proc_back
++
++loop_less_back:
++ LD_64(a4, -64)
++ ST_64(a3, -64)
++ LD_64(a4, -128)
++ ST_64(a3, -128)
++
++ addi.d a4, a4, -0x80
++ addi.d a3, a3, -0x80
++ addi.d a2, a2, -0x80
++ bge a2, zero, loop_less_back
++
++end_unalign_proc_back:
++ addi.d a2, a2, 0x80
++
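++ /* Copy the remaining whole doublewords: move the end pointers down by
++    len & 0x78 and jump into the ld.d/st.d table below, which copies them
++    back at positive offsets.  */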
++ pcaddi t1, 36
++ andi t2, a2, 0x78
++ sub.d a4, a4, t2
++ sub.d a3, a3, t2
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++ ld.d t0, a4, 112
++ st.d t0, a3, 112
++ ld.d t0, a4, 104
++ st.d t0, a3, 104
++ ld.d t0, a4, 96
++ st.d t0, a3, 96
++ ld.d t0, a4, 88
++ st.d t0, a3, 88
++ ld.d t0, a4, 80
++ st.d t0, a3, 80
++ ld.d t0, a4, 72
++ st.d t0, a3, 72
++ ld.d t0, a4, 64
++ st.d t0, a3, 64
++ ld.d t0, a4, 56
++ st.d t0, a3, 56
++ ld.d t0, a4, 48
++ st.d t0, a3, 48
++ ld.d t0, a4, 40
++ st.d t0, a3, 40
++ ld.d t0, a4, 32
++ st.d t0, a3, 32
++ ld.d t0, a4, 24
++ st.d t0, a3, 24
++ ld.d t0, a4, 16
++ st.d t0, a3, 16
++ ld.d t0, a4, 8
++ st.d t0, a3, 8
++ ld.d t0, a4, 0
++ st.d t0, a3, 0
++
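++ /* Copy the first len & 7 bytes of the buffers (a1/a0) via a final
++    computed jump into the byte-copy table below.  */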
++ andi a2, a2, 0x7
++ pcaddi t1, 18
++ slli.d a2, a2, 3
++ sub.d t1, t1, a2
++ jirl zero, t1, 0
++
++ ld.b t0, a1, 6
++ st.b t0, a0, 6
++ ld.b t0, a1, 5
++ st.b t0, a0, 5
++ ld.b t0, a1, 4
++ st.b t0, a0, 4
++ ld.b t0, a1, 3
++ st.b t0, a0, 3
++ ld.b t0, a1, 2
++ st.b t0, a0, 2
++ ld.b t0, a1, 1
++ st.b t0, a0, 1
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++
++ move v0, t8
++ jr ra
++
++END(MEMMOVE_NAME)
++libc_hidden_builtin_def (MEMMOVE_NAME)
+--
+2.33.0
+