diff options
Diffstat (limited to '3_6-LoongArch-Optimize-string-function-memset.patch')
-rw-r--r-- | 3_6-LoongArch-Optimize-string-function-memset.patch | 190 |
1 files changed, 190 insertions, 0 deletions
diff --git a/3_6-LoongArch-Optimize-string-function-memset.patch b/3_6-LoongArch-Optimize-string-function-memset.patch
new file mode 100644
index 0000000..54b9a2b
--- /dev/null
+++ b/3_6-LoongArch-Optimize-string-function-memset.patch
@@ -0,0 +1,190 @@
+From 603aa93569ec4034aa1d5a310f59504b5d6aad4d Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:23:06 +0800
+Subject: [PATCH 3/6] LoongArch: Optimize string function memset.
+
+Change-Id: I04906c31a2eabd380b19bb3a4cab603128526cd1
+---
+ sysdeps/loongarch/lp64/memset.S | 170 ++++++++++++++++++++++++++++++++
+ 1 file changed, 170 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+
+diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
+new file mode 100644
+index 00000000..261504b1
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memset.S
+@@ -0,0 +1,170 @@
++/* Optimized memset implementation for LoongArch.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library.  If not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++#define ST_128(n) /* 16 x st.d: fill a0+n .. a0+n+127 with the 8 bytes in a1 */ \
++    st.d a1, a0, n; \
++    st.d a1, a0, n+8 ; \
++    st.d a1, a0, n+16 ; \
++    st.d a1, a0, n+24 ; \
++    st.d a1, a0, n+32 ; \
++    st.d a1, a0, n+40 ; \
++    st.d a1, a0, n+48 ; \
++    st.d a1, a0, n+56 ; \
++    st.d a1, a0, n+64 ; \
++    st.d a1, a0, n+72 ; \
++    st.d a1, a0, n+80 ; \
++    st.d a1, a0, n+88 ; \
++    st.d a1, a0, n+96 ; \
++    st.d a1, a0, n+104; \
++    st.d a1, a0, n+112; \
++    st.d a1, a0, n+120; \
++
++/* void *memset(void *s, int c, size_t n);  Used below as a0 = s, a1 = c, a2 = n (called num in the comments).  Short sizes are written with two possibly overlapping head/tail stores; num >= 64 uses 8-byte-aligned stores after an unaligned first store.  */
++LEAF(memset)
++    .align 6                        # 64-byte (2^6) alignment; NOTE(review): this follows LEAF, confirm the entry label is not left ahead of the padding
++
++    bstrins.d a1, a1, 15, 8         # copy fill byte (bits 7..0) into bits 15..8
++    add.d t7, a0, a2                # t7 = s + num, one past the last byte to set
++    bstrins.d a1, a1, 31, 16        # copy low 16 bits into bits 31..16
++    move t0, a0                     # t0 keeps the original s for the return value
++    bstrins.d a1, a1, 63, 32        # a1 now carries the fill byte in all 8 bytes
++    srai.d t8, a2, 4                # num/16
++    beqz t8, less_16bytes           # num < 16
++    srai.d t8, a2, 6                # num/64
++    bnez t8, more_64bytes           # num >= 64
++    srai.d t8, a2, 5                # num/32
++    beqz t8, less_32bytes           # 16 <= num < 32
++    st.d a1, a0, 0                  # 32 <= num < 64: first 32 bytes from s ...
++    st.d a1, a0, 8
++    st.d a1, a0, 16
++    st.d a1, a0, 24
++    st.d a1, t7, -32                # ... and last 32 bytes ending at s + num (the two runs may overlap)
++    st.d a1, t7, -24
++    st.d a1, t7, -16
++    st.d a1, t7, -8
++
++    jr ra                           # a0 still holds s on this path
++
++less_32bytes:                       # 16 <= num < 32
++    st.d a1, a0, 0                  # first 16 bytes
++    st.d a1, a0, 8
++    st.d a1, t7, -16                # last 16 bytes, may overlap the first 16
++    st.d a1, t7, -8
++
++    jr ra
++
++less_16bytes:                       # num < 16
++    srai.d t8, a2, 3                # num/8
++    beqz t8, less_8bytes            # num < 8
++    st.d a1, a0, 0                  # 8 <= num < 16: first 8 bytes
++    st.d a1, t7, -8                 # last 8 bytes, may overlap the first 8
++
++    jr ra
++
++less_8bytes:                        # num < 8
++    srai.d t8, a2, 2                # num/4
++    beqz t8, less_4bytes            # num < 4
++    st.w a1, a0, 0                  # 4 <= num < 8: first 4 bytes
++    st.w a1, t7, -4                 # last 4 bytes, may overlap the first 4
++
++    jr ra
++
++less_4bytes:                        # num < 4
++    srai.d t8, a2, 1                # num/2
++    beqz t8, less_2bytes            # num < 2
++    st.h a1, a0, 0                  # 2 <= num < 4: first 2 bytes
++    st.h a1, t7, -2                 # last 2 bytes, may overlap the first 2
++
++    jr ra
++
++less_2bytes:                        # num is 0 or 1
++    beqz a2, less_1bytes            # num == 0
++    st.b a1, a0, 0                  # num == 1: the single byte
++
++    jr ra
++
++less_1bytes:                        # num == 0: nothing is stored
++    jr ra
++
++more_64bytes:                       # num >= 64
++    srli.d a0, a0, 3                # shift right then left by 3: clear the low 3 address bits
++    slli.d a0, a0, 3
++    addi.d a0, a0, 0x8              # a0 = first 8-byte boundary strictly above s
++    st.d a1, t0, 0                  # first 8 bytes at the original s (t0), covers the gap up to a0
++    sub.d t2, t0, a0                # t2 = s - a0, a value in -8 .. -1
++    add.d a2, t2, a2                # a2 = bytes left from a0 to s + num
++
++    addi.d a2, a2, -0x80            # bias by -128: a2 >= 0 means a full 128-byte chunk remains
++    blt a2, zero, end_unalign_proc  # fewer than 128 bytes left, skip the loop
++
++loop_less:                          # one 128-byte chunk per iteration
++    ST_128(0)
++    addi.d a0, a0, 0x80             # advance the store pointer by 128
++    addi.d a2, a2, -0x80            # 128 fewer bytes remain (value is still biased)
++    bge a2, zero, loop_less         # repeat while another full chunk remains
++
++end_unalign_proc:
++    addi.d a2, a2, 0x80             # undo the bias: a2 = remaining bytes, 0 .. 127
++
++    pcaddi t1, 20                   # t1 = pc + 20*4 = address of end_0_8_unalign; the count 20 must match the instructions in between
++    andi t5, a2, 0x78               # remaining whole 8-byte words, times 8
++    srli.d t5, t5, 1                # words * 4 = bytes of st.d instructions that must run
++    sub.d t1, t1, t5                # back up that many instructions from end_0_8_unalign
++    jirl zero, t1, 0                # jump into the fall-through store ladder below
++
++end_120_128_unalign:                # entry when 120 .. 127 bytes remain; each label below is 8 bytes less
++    st.d a1, a0, 112
++end_112_120_unalign:
++    st.d a1, a0, 104
++end_104_112_unalign:
++    st.d a1, a0, 96
++end_96_104_unalign:
++    st.d a1, a0, 88
++end_88_96_unalign:
++    st.d a1, a0, 80
++end_80_88_unalign:
++    st.d a1, a0, 72
++end_72_80_unalign:
++    st.d a1, a0, 64
++end_64_72_unalign:
++    st.d a1, a0, 56
++end_56_64_unalign:
++    st.d a1, a0, 48
++end_48_56_unalign:
++    st.d a1, a0, 40
++end_40_48_unalign:
++    st.d a1, a0, 32
++end_32_40_unalign:
++    st.d a1, a0, 24
++end_24_32_unalign:
++    st.d a1, a0, 16
++end_16_24_unalign:
++    st.d a1, a0, 8
++end_8_16_unalign:
++    st.d a1, a0, 0
++end_0_8_unalign:
++    st.d a1, t7, -8                 # last 8 bytes ending at s + num: covers the 0 .. 7 byte tail, overlapping earlier stores
++
++    move v0, t0                     # hand back the original s (a0 was advanced above); presumably v0 is the return-register alias, confirm in regdef
++    jr ra
++
++END(memset)
++
++libc_hidden_builtin_def (memset)
+-- 
+2.33.0
+
|