diff options
Diffstat (limited to '5_6-LoongArch-Optimize-string-function-strcpy.patch')
-rw-r--r-- | 5_6-LoongArch-Optimize-string-function-strcpy.patch | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/5_6-LoongArch-Optimize-string-function-strcpy.patch b/5_6-LoongArch-Optimize-string-function-strcpy.patch new file mode 100644 index 0000000..d2686d5 --- /dev/null +++ b/5_6-LoongArch-Optimize-string-function-strcpy.patch @@ -0,0 +1,195 @@ +From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001 +From: Xue Liu <liuxue@loongson.cn> +Date: Sun, 29 Jan 2023 10:25:18 +0800 +Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy. + +Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be +--- + sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++ + 1 file changed, 175 insertions(+) + create mode 100644 sysdeps/loongarch/lp64/strcpy.S + +diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +new file mode 100644 +index 00000000..03d9d361 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcpy.S +@@ -0,0 +1,175 @@ ++/* Optimized strcpy implementation for LoongArch. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sys/asm.h> ++ ++/* Parameters and Results */ ++#define dest a0 ++#define src a1 ++#define result v0 ++ ++/* Internal variable */ ++#define data t0 ++#define data1 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define dest_backup t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define dest_off a2 ++#define src_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ tmp3 will be destroyed. */ ++#define CONDITIONSEL(rd, rc, ra, rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++/* char * strcpy (char *dest, const char *src); */ ++LEAF(strcpy) ++ .align 4 ++ ++ move dest_backup, dest ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi src_off, src, 0x7 ++ beqz src_off, strcpy_loop_aligned_1 ++ b strcpy_mutual_align ++strcpy_loop_aligned: ++ st.d data, dest, 0 ++ addi.d dest, dest, 8 ++strcpy_loop_aligned_1: ++ ld.d data, src, 0 ++ addi.d src, src, 8 ++strcpy_start_realigned: ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_loop_aligned ++ ++strcpy_end: ++ ++ /* 8 4 2 1 */ ++ ctz.d pos, has_nul ++ srli.d pos, pos, 3 ++ addi.d pos, pos, 1 ++ /* Do 8/4/2/1 strcpy based on pos value. ++ pos value is the number of bytes to be copied ++ the bytes include the final \0 so the max length is 8 and the min length is 1. 
*/ ++strcpy_end_8: ++ andi tmp1, pos, 0x8 ++ beqz tmp1, strcpy_end_4 ++ st.d data, dest, 0 ++ move dest, dest_backup ++ jr ra ++strcpy_end_4: ++ andi tmp1, pos, 0x4 ++ beqz tmp1, strcpy_end_2 ++ st.w data, dest, 0 ++ srli.d data, data, 32 ++ addi.d dest, dest, 4 ++strcpy_end_2: ++ andi tmp1, pos, 0x2 ++ beqz tmp1, strcpy_end_1 ++ st.h data, dest, 0 ++ srli.d data, data, 16 ++ addi.d dest, dest, 2 ++strcpy_end_1: ++ andi tmp1, pos, 0x1 ++ beqz tmp1, strcpy_end_ret ++ st.b data, dest, 0 ++strcpy_end_ret: ++ move result, dest_backup ++ jr ra ++ ++ ++strcpy_mutual_align: ++ /* Check if around src page bound. ++ if not go to page cross ok. ++ if it is, do further check. ++ use tmp2 to accelerate. */ ++ ++ li.w tmp2, 0xff8 ++ andi tmp1, src, 0xff8 ++ beq tmp1, tmp2, strcpy_page_cross ++ ++strcpy_page_cross_ok: ++ /* Load a misaligned double word and check if has \0 ++ If no, do a misaligned double word paste. ++ If yes, calculate the number of available bytes, ++ then jump to 4/2/1 end. */ ++ ld.d data, src, 0 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ bnez has_nul, strcpy_end ++strcpy_mutual_align_finish: ++ /* Before jump back to align loop, make dest/src aligned. ++ This will cause a duplicated paste for several bytes between the first double word and the second double word, ++ but should not bring a problem. */ ++ li.w tmp1, 8 ++ st.d data, dest, 0 ++ sub.d tmp1, tmp1, src_off ++ add.d src, src, tmp1 ++ add.d dest, dest, tmp1 ++ ++ b strcpy_loop_aligned_1 ++ ++strcpy_page_cross: ++ /* ++ ld.d from aligned address(src & ~0x7). ++ check if high bytes have \0. ++ if not, go back to page cross ok, ++ since the string is supposed to cross the page bound in such situation. ++ if it is, do a srl for data to make it seem like a direct double word from src, ++ then go to 4/2/1 strcpy end. 
++ ++ tmp4 is 0xffff...ffff mask ++ tmp2 demonstrate the bytes to be masked ++ tmp2 = src_off << 3 ++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) ++ and ++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) */ ++ ++ li.w tmp1, 0x7 ++ andn tmp3, src, tmp1 ++ ld.d data, tmp3, 0 ++ li.w tmp4, -1 ++ slli.d tmp2, src_off, 3 ++ srl.d tmp4, tmp4, tmp2 ++ srl.d data, data, tmp2 ++ nor tmp4, tmp4, zero ++ or data, data, tmp4 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_page_cross_ok ++ b strcpy_end ++END(strcpy) ++libc_hidden_builtin_def (strcpy) +-- +2.33.0 + |