Diffstat (limited to '5_6-LoongArch-Optimize-string-function-strcpy.patch')
-rw-r--r--	5_6-LoongArch-Optimize-string-function-strcpy.patch	195
1 file changed, 195 insertions(+), 0 deletions(-)
diff --git a/5_6-LoongArch-Optimize-string-function-strcpy.patch b/5_6-LoongArch-Optimize-string-function-strcpy.patch
new file mode 100644
index 0000000..d2686d5
--- /dev/null
+++ b/5_6-LoongArch-Optimize-string-function-strcpy.patch
@@ -0,0 +1,195 @@
+From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:25:18 +0800
+Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy.
+
+Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be
+---
+ sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++
+ 1 file changed, 175 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/strcpy.S
+
+diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
+new file mode 100644
+index 00000000..03d9d361
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/strcpy.S
+@@ -0,0 +1,175 @@
++/* Optimized strcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sys/asm.h>
++
++/* Parameters and Results */
++#define dest a0
++#define src a1
++#define result v0
++
++/* Internal variables */
++#define data t0
++#define data1 t1
++#define has_nul t2
++#define diff t3
++#define syndrome t4
++#define zeroones t5
++#define sevenf t6
++#define pos t7
++#define dest_backup t8
++#define tmp1 a4
++#define tmp2 a5
++#define tmp3 a6
++#define dest_off a2
++#define src_off a3
++#define tmp4 a7
++
++/* rd <- if rc then ra else rb.
++   tmp3 is clobbered.  */
++#define CONDITIONSEL(rd, rc, ra, rb)\
++ masknez tmp3, rb, rc;\
++ maskeqz rd, ra, rc;\
++ or rd, rd, tmp3
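++
++/* Example: CONDITIONSEL(t0, t2, a4, a5) yields
++   t0 = (t2 != 0) ? a4 : a5, clobbering tmp3 (a6).  */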
++
++/* char * strcpy (char *s1, const char *s2); */
++LEAF(strcpy)
++ .align 4
++
++ move dest_backup, dest
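++	/* Build the classic zero-byte detection constants:
++	   zeroones = 0x0101010101010101,
++	   sevenf   = 0x7f7f7f7f7f7f7f7f.
++	   A double word x contains a \0 byte iff
++	   (x - zeroones) & ~(x | sevenf) is nonzero.  */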
++ lu12i.w zeroones, 0x01010
++ lu12i.w sevenf, 0x7f7f7
++ ori zeroones, zeroones, 0x101
++ ori sevenf, sevenf, 0xf7f
++ bstrins.d zeroones, zeroones, 63, 32
++ bstrins.d sevenf, sevenf, 63, 32
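++	/* If src is already 8-byte aligned, enter the aligned loop
++	   directly; otherwise handle the unaligned head first.  */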
++ andi src_off, src, 0x7
++ beqz src_off, strcpy_loop_aligned_1
++ b strcpy_mutual_align
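++
++/* Aligned loop: each iteration stores the previously loaded
++   double word and loads the next one, leaving the loop once a
++   loaded double word contains a \0 byte.  */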
++strcpy_loop_aligned:
++ st.d data, dest, 0
++ addi.d dest, dest, 8
++strcpy_loop_aligned_1:
++ ld.d data, src, 0
++ addi.d src, src, 8
++strcpy_start_realigned:
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_loop_aligned
++
++strcpy_end:
++
++	/* pos = the byte index of the first \0 within data, plus 1,
++	   i.e. the number of bytes still to be copied, including
++	   the terminating \0, so pos is in [1, 8].  */
++	ctz.d pos, has_nul
++	srli.d pos, pos, 3
++	addi.d pos, pos, 1
++	/* Copy the remaining pos bytes with 8/4/2/1-byte stores,
++	   selected by the bits of pos.  */
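++	/* E.g. a \0 in byte 2 of data (bytes 0 and 1 nonzero)
++	   gives ctz.d = 23, pos = (23 >> 3) + 1 = 3: a 2-byte
++	   store followed by a 1-byte store.  */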
++strcpy_end_8:
++ andi tmp1, pos, 0x8
++ beqz tmp1, strcpy_end_4
++ st.d data, dest, 0
++ move dest, dest_backup
++ jr ra
++strcpy_end_4:
++ andi tmp1, pos, 0x4
++ beqz tmp1, strcpy_end_2
++ st.w data, dest, 0
++ srli.d data, data, 32
++ addi.d dest, dest, 4
++strcpy_end_2:
++ andi tmp1, pos, 0x2
++ beqz tmp1, strcpy_end_1
++ st.h data, dest, 0
++ srli.d data, data, 16
++ addi.d dest, dest, 2
++strcpy_end_1:
++ andi tmp1, pos, 0x1
++ beqz tmp1, strcpy_end_ret
++ st.b data, dest, 0
++strcpy_end_ret:
++ move result, dest_backup
++ jr ra
++
++
++strcpy_mutual_align:
++	/* Check whether src lies within the last 8 bytes of a 4K
++	   page, in which case the misaligned double word load at
++	   strcpy_page_cross_ok could fault on the following page.
++	   If not, fall through; otherwise take the careful path.
++	   tmp2 holds the comparison constant.  */
++
++ li.w tmp2, 0xff8
++ andi tmp1, src, 0xff8
++ beq tmp1, tmp2, strcpy_page_cross
++
++strcpy_page_cross_ok:
++	/* Load a misaligned double word and check it for \0.
++	   If it has none, do a misaligned double word store.
++	   If it has one, compute the number of valid bytes and
++	   jump to the 8/4/2/1 tail copy.  */
++ ld.d data, src, 0
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ bnez has_nul, strcpy_end
++strcpy_mutual_align_finish:
++	/* Before jumping back to the aligned loop, store the double
++	   word just loaded and advance src and dest by 8 - src_off,
++	   which makes src 8-byte aligned.  A few bytes between this
++	   double word and the next are stored twice, which is
++	   harmless.  */
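++	/* E.g. src_off = 3: after the 8-byte store, src and dest
++	   advance by 5, and dest[5..7] are stored again by the
++	   first iteration of the aligned loop.  */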
++ li.w tmp1, 8
++ st.d data, dest, 0
++ sub.d tmp1, tmp1, src_off
++ add.d src, src, tmp1
++ add.d dest, dest, tmp1
++
++ b strcpy_loop_aligned_1
++
++strcpy_page_cross:
++	/*
++	   Load a double word from the aligned address (src & ~0x7)
++	   and check whether the bytes at and above src contain \0.
++	   If not, go back to strcpy_page_cross_ok: the string must
++	   then cross the page boundary, so the misaligned load
++	   there cannot fault.
++	   If they do, shift data right so that it looks like a
++	   double word loaded directly from src, then go to the
++	   8/4/2/1 tail copy.
++
++	   tmp4 is the all-ones mask 0xffff...ffff.
++	   tmp2 = src_off << 3 is the shift amount in bits.
++	   data = data >> (src_off * 8) | -1 << (64 - src_off * 8),
++	   where -1 << (64 - src_off * 8) is computed as
++	   ~(-1 >> (src_off * 8)); it fills the vacated high bytes
++	   with 0xff so they cannot be mistaken for \0.  */
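++	/* E.g. src_off = 3: load from src - 3, tmp2 = 24;
++	   data >>= 24 discards the 3 bytes below src, and
++	   ~(-1 >> 24) sets the vacated top 3 bytes to 0xff.  */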
++
++ li.w tmp1, 0x7
++ andn tmp3, src, tmp1
++ ld.d data, tmp3, 0
++ li.w tmp4, -1
++ slli.d tmp2, src_off, 3
++ srl.d tmp4, tmp4, tmp2
++ srl.d data, data, tmp2
++ nor tmp4, tmp4, zero
++ or data, data, tmp4
++ sub.d tmp1, data, zeroones
++ or tmp2, data, sevenf
++ andn has_nul, tmp1, tmp2
++ beqz has_nul, strcpy_page_cross_ok
++ b strcpy_end
++END(strcpy)
++libc_hidden_builtin_def (strcpy)
+--
+2.33.0
+