diff options
Diffstat (limited to '5_6-LoongArch-Optimize-string-function-strcpy.patch')
-rw-r--r-- | 5_6-LoongArch-Optimize-string-function-strcpy.patch | 195 |
1 files changed, 195 insertions, 0 deletions
diff --git a/5_6-LoongArch-Optimize-string-function-strcpy.patch b/5_6-LoongArch-Optimize-string-function-strcpy.patch new file mode 100644 index 0000000..d2686d5 --- /dev/null +++ b/5_6-LoongArch-Optimize-string-function-strcpy.patch @@ -0,0 +1,195 @@ +From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001 +From: Xue Liu <liuxue@loongson.cn> +Date: Sun, 29 Jan 2023 10:25:18 +0800 +Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy. + +Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be +--- + sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++ + 1 file changed, 175 insertions(+) + create mode 100644 sysdeps/loongarch/lp64/strcpy.S + +diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S +new file mode 100644 +index 00000000..03d9d361 +--- /dev/null ++++ b/sysdeps/loongarch/lp64/strcpy.S +@@ -0,0 +1,175 @@ ++/* Optimized strcpy implementation for LoongArch. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ <https://www.gnu.org/licenses/>. 
*/ ++ ++#include <sys/asm.h> ++ ++/* Parameters and Results */ ++#define dest a0 ++#define src a1 ++#define result v0 ++ ++/* Internal variable */ ++#define data t0 ++#define data1 t1 ++#define has_nul t2 ++#define diff t3 ++#define syndrome t4 ++#define zeroones t5 ++#define sevenf t6 ++#define pos t7 ++#define dest_backup t8 ++#define tmp1 a4 ++#define tmp2 a5 ++#define tmp3 a6 ++#define dest_off a2 ++#define src_off a3 ++#define tmp4 a7 ++ ++/* rd <- if rc then ra else rb ++ tmp3 will be destroyed. */ ++#define CONDITIONSEL(rd, rc, ra, rb)\ ++ masknez tmp3, rb, rc;\ ++ maskeqz rd, ra, rc;\ ++ or rd, rd, tmp3 ++ ++/* char * strcpy (char *dest, const char *src); */ ++LEAF(strcpy) ++ .align 4 ++ ++ move dest_backup, dest ++ lu12i.w zeroones, 0x01010 ++ lu12i.w sevenf, 0x7f7f7 ++ ori zeroones, zeroones, 0x101 ++ ori sevenf, sevenf, 0xf7f ++ bstrins.d zeroones, zeroones, 63, 32 ++ bstrins.d sevenf, sevenf, 63, 32 ++ andi src_off, src, 0x7 ++ beqz src_off, strcpy_loop_aligned_1 ++ b strcpy_mutual_align ++strcpy_loop_aligned: ++ st.d data, dest, 0 ++ addi.d dest, dest, 8 ++strcpy_loop_aligned_1: ++ ld.d data, src, 0 ++ addi.d src, src, 8 ++strcpy_start_realigned: ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_loop_aligned ++ ++strcpy_end: ++ ++ /* 8 4 2 1 */ ++ ctz.d pos, has_nul ++ srli.d pos, pos, 3 ++ addi.d pos, pos, 1 ++ /* Do 8/4/2/1 strcpy based on pos value. ++ pos value is the number of bytes to be copied ++ the bytes include the final \0 so the max length is 8 and the min length is 1. 
*/ ++strcpy_end_8: ++ andi tmp1, pos, 0x8 ++ beqz tmp1, strcpy_end_4 ++ st.d data, dest, 0 ++ move dest, dest_backup ++ jr ra ++strcpy_end_4: ++ andi tmp1, pos, 0x4 ++ beqz tmp1, strcpy_end_2 ++ st.w data, dest, 0 ++ srli.d data, data, 32 ++ addi.d dest, dest, 4 ++strcpy_end_2: ++ andi tmp1, pos, 0x2 ++ beqz tmp1, strcpy_end_1 ++ st.h data, dest, 0 ++ srli.d data, data, 16 ++ addi.d dest, dest, 2 ++strcpy_end_1: ++ andi tmp1, pos, 0x1 ++ beqz tmp1, strcpy_end_ret ++ st.b data, dest, 0 ++strcpy_end_ret: ++ move result, dest_backup ++ jr ra ++ ++ ++strcpy_mutual_align: ++ /* Check if around src page bound. ++ if not go to page cross ok. ++ if it is, do further check. ++ use tmp2 to accelerate. */ ++ ++ li.w tmp2, 0xff8 ++ andi tmp1, src, 0xff8 ++ beq tmp1, tmp2, strcpy_page_cross ++ ++strcpy_page_cross_ok: ++ /* Load a misaligned double word and check if has \0 ++ If no, do a misaligned double word paste. ++ If yes, calculate the number of available bytes, ++ then jump to 4/2/1 end. */ ++ ld.d data, src, 0 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ bnez has_nul, strcpy_end ++strcpy_mutual_align_finish: ++ /* Before jump back to align loop, make dest/src aligned. ++ This will cause a duplicated paste for several bytes between the first double word and the second double word, ++ but should not bring a problem. */ ++ li.w tmp1, 8 ++ st.d data, dest, 0 ++ sub.d tmp1, tmp1, src_off ++ add.d src, src, tmp1 ++ add.d dest, dest, tmp1 ++ ++ b strcpy_loop_aligned_1 ++ ++strcpy_page_cross: ++ /* ++ ld.d from aligned address(src & ~0x7). ++ check if high bytes have \0. ++ if not, go back to page cross ok, ++ since the string is supposed to cross the page bound in such situation. ++ if it is, do a srl for data to make it seem like a direct double word from src, ++ then go to 4/2/1 strcpy end. 
++ ++ tmp4 is 0xffff...ffff mask ++ tmp2 demonstrate the bytes to be masked ++ tmp2 = src_off << 3 ++ data = data >> (src_off * 8) | -1 << (64 - src_off * 8) ++ and ++ -1 << (64 - src_off * 8) -> ~(-1 >> (src_off * 8)) */ ++ ++ li.w tmp1, 0x7 ++ andn tmp3, src, tmp1 ++ ld.d data, tmp3, 0 ++ li.w tmp4, -1 ++ slli.d tmp2, src_off, 3 ++ srl.d tmp4, tmp4, tmp2 ++ srl.d data, data, tmp2 ++ nor tmp4, tmp4, zero ++ or data, data, tmp4 ++ sub.d tmp1, data, zeroones ++ or tmp2, data, sevenf ++ andn has_nul, tmp1, tmp2 ++ beqz has_nul, strcpy_page_cross_ok ++ b strcpy_end ++END(strcpy) ++libc_hidden_builtin_def (strcpy) +-- +2.33.0 + |