From 939b5ed88b61d03bae6d20bf97ad0f77f9b110bb Mon Sep 17 00:00:00 2001 From: Xue Liu Date: Sun, 29 Jan 2023 10:20:26 +0800 Subject: [PATCH 1/6] LoongArch: Optimize string functions memcpy, memmove. Change-Id: Ib0e78d062082a657d5bf572403f19bf5bfe0a28d --- sysdeps/loongarch/lp64/memcpy.S | 259 ++++++++++++++++++++ sysdeps/loongarch/lp64/memmove.S | 406 +++++++++++++++++++++++++++++++ 2 files changed, 665 insertions(+) create mode 100644 sysdeps/loongarch/lp64/memcpy.S create mode 100644 sysdeps/loongarch/lp64/memmove.S diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S new file mode 100644 index 00000000..5d850123 --- /dev/null +++ b/sysdeps/loongarch/lp64/memcpy.S @@ -0,0 +1,259 @@ +/* Optimized memcpy implementation for LoongArch. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Allow the routine to be named something else if desired. 
*/
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+/* LD_64 (reg, n): load the 64 bytes at reg + n ... reg + n + 63 into t0-t7.  */
+#define LD_64(reg, n) \
+	ld.d	t0, reg, n; \
+	ld.d	t1, reg, n+8; \
+	ld.d	t2, reg, n+16; \
+	ld.d	t3, reg, n+24; \
+	ld.d	t4, reg, n+32; \
+	ld.d	t5, reg, n+40; \
+	ld.d	t6, reg, n+48; \
+	ld.d	t7, reg, n+56;
+/* ST_64 (reg, n): store t0-t7 to the 64 bytes at reg + n ... reg + n + 63.  */
+#define ST_64(reg, n) \
+	st.d	t0, reg, n; \
+	st.d	t1, reg, n+8; \
+	st.d	t2, reg, n+16; \
+	st.d	t3, reg, n+24; \
+	st.d	t4, reg, n+32; \
+	st.d	t5, reg, n+40; \
+	st.d	t6, reg, n+48; \
+	st.d	t7, reg, n+56;
+
+LEAF(MEMCPY_NAME)
+// Arguments: a0 ($r4) = dst, a1 ($r5) = src, a2 ($r6) = len.
+// a0 is never written below, so it still holds dst when the routine returns.
+// Most size classes copy a head from the start pointers and a tail from the
+// end pointers (a4 = src + len, a3 = dst + len).  Scratch: t0-t8 and a3-a7.
+
+	add.d	a4, a1, a2		/* a4 = src + len (one past the source).  */
+	add.d	a3, a0, a2		/* a3 = dst + len (one past the destination).  */
+	li.w	a6, 16
+	bge	a6, a2, less_16bytes	/* len <= 16 (signed compare).  */
+	li.w	a6, 128
+	blt	a6, a2, long_bytes	/* len > 128.  */
+	li.w	a6, 64
+	blt	a6, a2, more_64bytes	/* 65 <= len <= 128.  */
+	li.w	a6, 32
+	blt	a6, a2, more_32bytes	/* 33 <= len <= 64.  */
+
+	/* 17...32 bytes: first 16 and last 16 bytes; the halves overlap when len < 32.  */
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a4, -16
+	ld.d	t3, a4, -8
+	st.d	t0, a0, 0
+	st.d	t1, a0, 8
+	st.d	t2, a3, -16
+	st.d	t3, a3, -8
+	jr	ra
+
+more_64bytes:				/* 65...128 bytes.  */
+	srli.d	t8, a0, 3
+	slli.d	t8, t8, 3		/* t8 = dst rounded down to a multiple of 8.  */
+	addi.d	t8, t8, 0x8		/* t8 = first 8-byte boundary above dst.  */
+	sub.d	a7, a0, t8		/* a7 = dst - t8 = -(head length), in [-8, -1].  */
+	ld.d	t0, a1, 0
+	sub.d	a1, a1, a7		/* Advance src by the head length.  */
+	st.d	t0, a0, 0		/* Head: first 8 bytes, stored at dst itself.  */
+
+	add.d	a7, a7, a2		/* a7 = bytes left from t8 to the end.  */
+	addi.d	a7, a7, -0x20		/* Bias by 32: the last 32 bytes are left to the tail copy.  */
+loop_32:				/* 32 bytes per iteration, stores go to the 8-byte-aligned t8.  */
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a1, 16
+	ld.d	t3, a1, 24
+	st.d	t0, t8, 0
+	st.d	t1, t8, 8
+	st.d	t2, t8, 16
+	st.d	t3, t8, 24
+
+	addi.d	t8, t8, 0x20
+	addi.d	a1, a1, 0x20
+	addi.d	a7, a7, -0x20
+	blt	zero, a7, loop_32	/* Repeat while a7 > 0, i.e. more than 32 bytes remain.  */
+
+	ld.d	t4, a4, -32		/* Tail: last 32 bytes, addressed from the end pointers.  */
+	ld.d	t5, a4, -24
+	ld.d	t6, a4, -16
+	ld.d	t7, a4, -8
+	st.d	t4, a3, -32
+	st.d	t5, a3, -24
+	st.d	t6, a3, -16
+	st.d	t7, a3, -8
+
+	jr	ra
+
+more_32bytes:
+	/* 33...64 bytes: first 32 and last 32 bytes; the halves overlap when len < 64.  */
+	ld.d	t0, a1, 0
+	ld.d	t1, a1, 8
+	ld.d	t2, a1, 16
+	ld.d	t3, a1, 24
+	ld.d	t4, a4, -32
+	ld.d	t5, a4, -24
+	ld.d	t6, a4, -16
+	ld.d	t7, a4, -8
+	st.d	t0, a0, 0
+	st.d	t1, a0, 8
+	st.d	t2, a0, 16
+	st.d	t3, a0, 24
+	st.d	t4, a3, -32
+	st.d	t5, a3, -24
+	st.d	t6, a3, -16
+	st.d	t7, a3, -8
+	jr	ra
+
+less_16bytes:				/* len <= 16.  */
+	srai.d	a6, a2, 3		/* a6 = len >> 3.  */
+	beqz	a6, less_8bytes		/* Zero: len < 8.  */
+
+	/* 8...16 bytes: first 8 and last 8 bytes; they overlap when len < 16.  */
+	ld.d	t0, a1, 0
+	ld.d	t1, a4, -8
+	st.d	t0, a0, 0
+	st.d	t1, a3, -8
+
+	jr	ra
+
+less_8bytes:				/* len < 8.  */
+	srai.d	a6, a2, 2		/* a6 = len >> 2.  */
+	beqz	a6, less_4bytes		/* Zero: len < 4.  */
+
+	/* 4...7 bytes: first 4 and last 4 bytes.  */
+	ld.w	t0, a1, 0
+	ld.w	t1, a4, -4
+	st.w	t0, a0, 0
+	st.w	t1, a3, -4
+	jr	ra
+
+less_4bytes:				/* len < 4.  */
+	srai.d	a6, a2, 1		/* a6 = len >> 1.  */
+	beqz	a6, less_2bytes		/* Zero: len < 2.  */
+
+	/* 2...3 bytes: first 2 and last 2 bytes.  */
+	ld.h	t0, a1, 0
+	ld.h	t1, a4, -2
+	st.h	t0, a0, 0
+	st.h	t1, a3, -2
+	jr	ra
+
+less_2bytes:				/* len is 0 or 1.  */
+	beqz	a2, less_1bytes		/* len == 0: nothing to copy.  */
+
+	ld.b	t0, a1, 0		/* len == 1: copy the single byte.  */
+	st.b	t0, a0, 0
+	jr	ra
+
+less_1bytes:				/* len == 0.  */
+	jr	ra
+
+long_bytes:				/* len > 128.  */
+	srli.d	t8, a0, 3
+	slli.d	t8, t8, 3		/* t8 = dst rounded down to a multiple of 8.  */
+	beq	a0, t8, start		/* dst is already 8-byte aligned: no head copy.  */
+
+	ld.d	t0, a1, 0
+	addi.d	t8, t8, 0x8		/* t8 = first 8-byte boundary above dst.  */
+	st.d	t0, a0, 0		/* Head: first 8 bytes, stored at dst itself.  */
+	sub.d	a7, a0, t8		/* a7 = dst - t8 = -(head length).  */
+	sub.d	a1, a1, a7		/* Advance src by the head length.  */
+
+start:
+	addi.d	a5, a3, -0x80		/* a5 = dst + len - 128: highest t8 with 128 bytes still to copy.  */
+	blt	a5, t8, align_end_proc	/* Fewer than 128 bytes left: skip the loop.  */
+
+loop_128:				/* 128 bytes per iteration, as two 64-byte LD_64/ST_64 halves.  */
+	LD_64(a1, 0)
+	ST_64(t8, 0)
+	LD_64(a1, 64)
+	addi.d	a1, a1, 0x80
+	ST_64(t8, 64)
+	addi.d	t8, t8, 0x80
+	bge	a5, t8, loop_128	/* Repeat while at least 128 bytes remain.  */
+
+align_end_proc:
+	sub.d	a2, a3, t8		/* a2 = bytes left, 0...127.  */
+/* Computed jump into the ladder below.  The constant 34 is an instruction count: 3 more instructions here plus 15 two-instruction ladder steps.  Update it if that layout changes.  */
+	pcaddi	t1, 34			/* t1 = address of this pcaddi + 34 * 4 = end_0_8_unalign.  */
+	andi	t2, a2, 0x78		/* t2 = 8 * (whole doublewords left).  */
+	sub.d	t1, t1, t2		/* Back up one ladder step (8 bytes of code) per doubleword.  */
+	jirl	zero, t1, 0
+/* Ladder: end_A_B_unalign is the entry for A <= a2 < B; each step copies one doubleword and falls through to the next.  */
+end_120_128_unalign:
+	ld.d	t0, a1, 112
+	st.d	t0, t8, 112
+end_112_120_unalign:
+	ld.d	t0, a1, 104
+	st.d	t0, t8, 104
+end_104_112_unalign:
+	ld.d	t0, a1, 96
+	st.d	t0, t8, 96
+end_96_104_unalign:
+	ld.d	t0, a1, 88
+	st.d	t0, t8, 88
+end_88_96_unalign:
+	ld.d	t0, a1, 80
+	st.d	t0, t8, 80
+end_80_88_unalign:
+	ld.d	t0, a1, 72
+	st.d	t0, t8, 72
+end_72_80_unalign:
+	ld.d	t0, a1, 64
+	st.d	t0, t8, 64
+end_64_72_unalign:
+	ld.d	t0, a1, 56
+	st.d	t0, t8, 56
+end_56_64_unalign:
+	ld.d	t0, a1, 48
+	st.d	t0, t8, 48
+end_48_56_unalign:
+	ld.d	t0, a1, 40
+	st.d	t0, t8, 40
+end_40_48_unalign:
+	ld.d	t0, a1, 32
+	st.d	t0, t8, 32
+end_32_40_unalign:
+	ld.d	t0, a1, 24
+	st.d	t0, t8, 24
+end_24_32_unalign:
+	ld.d	t0, a1, 16
+	st.d	t0, t8, 16
+end_16_24_unalign:
+	ld.d	t0, a1, 8
+	st.d	t0, t8, 8
+end_8_16_unalign:
+	ld.d	t0, a1, 0
+	st.d	t0, t8, 0
+end_0_8_unalign:
+	ld.d	t0, a4, -8		/* Tail: last 8 bytes, addressed from the end pointers.  */
+	st.d	t0, a3, -8
+
+	jr	ra
+
+END(MEMCPY_NAME)
+libc_hidden_builtin_def (MEMCPY_NAME)
diff --git 
a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S new file mode 100644 index 00000000..edd9cf3d --- /dev/null +++ b/sysdeps/loongarch/lp64/memmove.S @@ -0,0 +1,406 @@ +/* Optimized memmove implementation for LoongArch. + Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library. If not, see + <https://www.gnu.org/licenses/>. */ + +#include <sysdep.h> + +/* Allow the routine to be named something else if desired. */ +#ifndef MEMMOVE_NAME +#define MEMMOVE_NAME memmove +#endif + +#define LD_64(reg, n) \ + ld.d t0, reg, n; \ + ld.d t1, reg, n+8; \ + ld.d t2, reg, n+16; \ + ld.d t3, reg, n+24; \ + ld.d t4, reg, n+32; \ + ld.d t5, reg, n+40; \ + ld.d t6, reg, n+48; \ + ld.d t7, reg, n+56; + + +#define ST_64(reg, n) \ + st.d t0, reg, n; \ + st.d t1, reg, n+8; \ + st.d t2, reg, n+16; \ + st.d t3, reg, n+24; \ + st.d t4, reg, n+32; \ + st.d t5, reg, n+40; \ + st.d t6, reg, n+48; \ + st.d t7, reg, n+56; + +/* void *memmove (void *dst, const void *src, size_t n) */ +LEAF(MEMMOVE_NAME) + add.d a4, a1, a2 + add.d a3, a0, a2 + beq a1, a0, less_1bytes + move t8, a0 + srai.d a6, a2, 4 #num/16 + beqz a6, less_16bytes #num<16 + srai.d a6, a2, 6 #num/64 + bnez a6, more_64bytes #num>64 + srai.d a6, a2, 5 + beqz a6, less_32bytes #num<32 + + ld.d t0, a1, 0 #32