summaryrefslogtreecommitdiff
path: root/3_6-LoongArch-Optimize-string-function-memset.patch
diff options
context:
space:
mode:
Diffstat (limited to '3_6-LoongArch-Optimize-string-function-memset.patch')
-rw-r--r--3_6-LoongArch-Optimize-string-function-memset.patch190
1 files changed, 190 insertions, 0 deletions
diff --git a/3_6-LoongArch-Optimize-string-function-memset.patch b/3_6-LoongArch-Optimize-string-function-memset.patch
new file mode 100644
index 0000000..54b9a2b
--- /dev/null
+++ b/3_6-LoongArch-Optimize-string-function-memset.patch
@@ -0,0 +1,190 @@
+From 603aa93569ec4034aa1d5a310f59504b5d6aad4d Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:23:06 +0800
+Subject: [PATCH 3/6] LoongArch: Optimize string function memset.
+
+Change-Id: I04906c31a2eabd380b19bb3a4cab603128526cd1
+---
+ sysdeps/loongarch/lp64/memset.S | 170 ++++++++++++++++++++++++++++++++
+ 1 file changed, 170 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memset.S
+
+diff --git a/sysdeps/loongarch/lp64/memset.S b/sysdeps/loongarch/lp64/memset.S
+new file mode 100644
+index 00000000..261504b1
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memset.S
+@@ -0,0 +1,170 @@
++/* Optimized memset implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++#define ST_128(n) /* Store a1 to the 16 doublewords at a0+n .. a0+n+127.  */ \
++ st.d a1, a0, n; \
++ st.d a1, a0, n+8; \
++ st.d a1, a0, n+16; \
++ st.d a1, a0, n+24; \
++ st.d a1, a0, n+32; \
++ st.d a1, a0, n+40; \
++ st.d a1, a0, n+48; \
++ st.d a1, a0, n+56; \
++ st.d a1, a0, n+64; \
++ st.d a1, a0, n+72; \
++ st.d a1, a0, n+80; \
++ st.d a1, a0, n+88; \
++ st.d a1, a0, n+96; \
++ st.d a1, a0, n+104; \
++ st.d a1, a0, n+112; \
++ st.d a1, a0, n+120;
++
++/* void *memset(void *s, int c, size_t n): a0 = s, a1 = c, a2 = n; returns s in a0.  */
++LEAF(memset)
++ .align 6 /* NOTE(review): LEAF is defined outside this file; if it emits the memset label, this padding lands after the entry point - confirm.  */
++ /* Replicate the low byte of a1 into all 8 bytes; t7 = s + n, t0 = copy of s.  */
++ bstrins.d a1, a1, 15, 8 /* a1[15:8] = a1[7:0] */
++ add.d t7, a0, a2 /* t7 = s + n, one past the last byte */
++ bstrins.d a1, a1, 31, 16 /* a1[31:16] = a1[15:0] */
++ move t0, a0 /* t0 = s, kept because a0 is reused as a cursor below */
++ bstrins.d a1, a1, 63, 32 /* a1[63:32] = a1[31:0] */
++ srai.d t8, a2, 4 /* t8 = n / 16 */
++ beqz t8, less_16bytes /* n < 16 */
++ srai.d t8, a2, 6 /* t8 = n / 64 */
++ bnez t8, more_64bytes /* n >= 64 */
++ srai.d t8, a2, 5 /* t8 = n / 32 */
++ beqz t8, less_32bytes /* 16 <= n < 32 */
++ st.d a1, a0, 0 /* 32 <= n < 64: write the first 32 bytes ... */
++ st.d a1, a0, 8
++ st.d a1, a0, 16
++ st.d a1, a0, 24
++ st.d a1, t7, -32 /* ... and the last 32 bytes; the two runs may overlap */
++ st.d a1, t7, -24
++ st.d a1, t7, -16
++ st.d a1, t7, -8
++
++ jr ra
++ /* 16 <= n < 32: write the first and the last 16 bytes (they may overlap).  */
++less_32bytes:
++ st.d a1, a0, 0
++ st.d a1, a0, 8
++ st.d a1, t7, -16
++ st.d a1, t7, -8
++
++ jr ra
++
++less_16bytes:
++ srai.d t8, a2, 3 /* t8 = n / 8 */
++ beqz t8, less_8bytes /* n < 8 */
++ st.d a1, a0, 0 /* 8 <= n < 16: first and last doubleword */
++ st.d a1, t7, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d t8, a2, 2 /* t8 = n / 4 */
++ beqz t8, less_4bytes /* n < 4 */
++ st.w a1, a0, 0 /* 4 <= n < 8: first and last word */
++ st.w a1, t7, -4
++
++ jr ra
++
++less_4bytes:
++ srai.d t8, a2, 1 /* t8 = n / 2 */
++ beqz t8, less_2bytes /* n < 2 */
++ st.h a1, a0, 0 /* 2 <= n < 4: first and last halfword */
++ st.h a1, t7, -2
++
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes /* n == 0: nothing to store */
++ st.b a1, a0, 0 /* n == 1 */
++
++ jr ra
++
++less_1bytes:
++ jr ra
++
++more_64bytes:
++ srli.d a0, a0, 3
++ slli.d a0, a0, 3 /* a0 = s rounded down to a multiple of 8 */
++ addi.d a0, a0, 0x8 /* a0 = first 8-byte boundary strictly above s */
++ st.d a1, t0, 0 /* one store at s (possibly unaligned) covers s .. a0-1 */
++ sub.d t2, t0, a0 /* t2 = s - a0, in the range -8 .. -1 */
++ add.d a2, t2, a2 /* a2 = bytes left from a0 up to s + n */
++ /* Bias a2 by -128 so that its sign says whether a full 128-byte block is left.  */
++ addi.d a2, a2, -0x80
++ blt a2, zero, end_unalign_proc
++
++loop_less:
++ ST_128(0) /* fill 128 bytes at a0 */
++ addi.d a0, a0, 0x80
++ addi.d a2, a2, -0x80
++ bge a2, zero, loop_less /* repeat while at least 128 bytes remain */
++
++end_unalign_proc:
++ addi.d a2, a2, 0x80 /* undo the bias: fewer than 128 bytes remain */
++ /* Jump into the store ladder below so that exactly (a2 / 8) doublewords are written.  */
++ pcaddi t1, 20 /* t1 = this pc + 20 * 4 = address of end_0_8_unalign */
++ andi t5, a2, 0x78 /* t5 = remaining whole doublewords, times 8 */
++ srli.d t5, t5, 1 /* t5 = doublewords times 4: one 4-byte instruction each */
++ sub.d t1, t1, t5 /* back up one st.d per remaining doubleword */
++ jirl zero, t1, 0
++ /* Do not insert or remove instructions between pcaddi and end_0_8_unalign: the 20 above counts them.  */
++end_120_128_unalign:
++ st.d a1, a0, 112
++end_112_120_unalign:
++ st.d a1, a0, 104
++end_104_112_unalign:
++ st.d a1, a0, 96
++end_96_104_unalign:
++ st.d a1, a0, 88
++end_88_96_unalign:
++ st.d a1, a0, 80
++end_80_88_unalign:
++ st.d a1, a0, 72
++end_72_80_unalign:
++ st.d a1, a0, 64
++end_64_72_unalign:
++ st.d a1, a0, 56
++end_56_64_unalign:
++ st.d a1, a0, 48
++end_48_56_unalign:
++ st.d a1, a0, 40
++end_40_48_unalign:
++ st.d a1, a0, 32
++end_32_40_unalign:
++ st.d a1, a0, 24
++end_24_32_unalign:
++ st.d a1, a0, 16
++end_16_24_unalign:
++ st.d a1, a0, 8
++end_8_16_unalign:
++ st.d a1, a0, 0
++end_0_8_unalign:
++ st.d a1, t7, -8 /* last 8 bytes of the buffer; covers the tail shorter than a doubleword */
++
++ move a0, t0 /* return the original s */
++ jr ra
++
++END(memset)
++
++libc_hidden_builtin_def (memset)
+--
+2.33.0
+