Diffstat (limited to '1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch')
 1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch | 693 ++++++++
 1 file changed, 693 insertions(+), 0 deletions(-)
diff --git a/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
new file mode 100644
index 0000000..5413394
--- /dev/null
+++ b/1_6-LoongArch-Optimize-string-functions-memcpy-memmove.patch
@@ -0,0 +1,693 @@
+From 939b5ed88b61d03bae6d20bf97ad0f77f9b110bb Mon Sep 17 00:00:00 2001
+From: Xue Liu <liuxue@loongson.cn>
+Date: Sun, 29 Jan 2023 10:20:26 +0800
+Subject: [PATCH 1/6] LoongArch: Optimize string functions memcpy, memmove.
+
+Change-Id: Ib0e78d062082a657d5bf572403f19bf5bfe0a28d
+---
+ sysdeps/loongarch/lp64/memcpy.S | 259 ++++++++++++++++++++
+ sysdeps/loongarch/lp64/memmove.S | 406 +++++++++++++++++++++++++++++++
+ 2 files changed, 665 insertions(+)
+ create mode 100644 sysdeps/loongarch/lp64/memcpy.S
+ create mode 100644 sysdeps/loongarch/lp64/memmove.S
+
+diff --git a/sysdeps/loongarch/lp64/memcpy.S b/sysdeps/loongarch/lp64/memcpy.S
+new file mode 100644
+index 00000000..5d850123
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memcpy.S
+@@ -0,0 +1,259 @@
++/* Optimized memcpy implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMCPY_NAME
++#define MEMCPY_NAME memcpy
++#endif
++
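++/* LD_64 (resp. ST_64) loads (stores) a 64-byte block at offset n from reg
++   as eight consecutive doublewords staged through t0-t7.  */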
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++LEAF(MEMCPY_NAME)
++/* Arguments:
++     a0 ($r4): void *dst
++     a1 ($r5): const void *src
++     a2 ($r6): size_t len
++   t0-t8 are used as temporaries.  */
++
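++ /* a4 = src + len, a3 = dst + len.  Dispatch on len:
++    <= 16, 17..32 (fall through), 33..64, 65..128, > 128.  */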
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ li.w a6, 16
++ bge a6, a2, less_16bytes
++ li.w a6, 128
++ blt a6, a2, long_bytes
++ li.w a6, 64
++ blt a6, a2, more_64bytes
++ li.w a6, 32
++ blt a6, a2, more_32bytes
++
++ /* 17...32 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++ jr ra
++
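++ /* 65..128 bytes: copy the first 8 bytes, round dst up to an 8-byte
++    boundary (t8) and advance src by the same amount, copy 32 bytes per
++    iteration, then finish with a possibly overlapping copy of the last
++    32 bytes.  */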
++more_64bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ addi.d t8, t8, 0x8
++ sub.d a7, a0, t8
++ ld.d t0, a1, 0
++ sub.d a1, a1, a7
++ st.d t0, a0, 0
++
++ add.d a7, a7, a2
++ addi.d a7, a7, -0x20
++loop_32:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ st.d t0, t8, 0
++ st.d t1, t8, 8
++ st.d t2, t8, 16
++ st.d t3, t8, 24
++
++ addi.d t8, t8, 0x20
++ addi.d a1, a1, 0x20
++ addi.d a7, a7, -0x20
++ blt zero, a7, loop_32
++
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++more_32bytes:
++ /* 33...64 */
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3
++ beqz a6, less_8bytes
++
++ /* 8...16 */
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ /* 4...7 */
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ /* 2...3 */
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++ jr ra
++
++less_1bytes:
++ jr ra
++
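++ /* More than 128 bytes: copy the unaligned head with one possibly
++    overlapping 8-byte store, round dst up to an 8-byte boundary (t8) and
++    advance src accordingly, then copy 128 bytes per iteration while at
++    least 128 bytes remain before the end of dst.  */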
++long_bytes:
++ srli.d t8, a0, 3
++ slli.d t8, t8, 3
++ beq a0, t8, start
++
++ ld.d t0, a1, 0
++ addi.d t8, t8, 0x8
++ st.d t0, a0, 0
++ sub.d a7, a0, t8
++ sub.d a1, a1, a7
++
++start:
++ addi.d a5, a3, -0x80
++ blt a5, t8, align_end_proc
++
++loop_128:
++ LD_64(a1, 0)
++ ST_64(t8, 0)
++ LD_64(a1, 64)
++ addi.d a1, a1, 0x80
++ ST_64(t8, 64)
++ addi.d t8, t8, 0x80
++ bge a5, t8, loop_128
++
++align_end_proc:
++ sub.d a2, a3, t8
++
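++ /* Tail copy in the style of a Duff's device: each ld.d/st.d pair below
++    is 8 bytes of code and copies 8 bytes of data, so jumping to
++    (end of table - (remaining & 0x78)) selects just enough pairs to copy
++    the remaining whole doublewords; the final pair copies the last
++    8 bytes from the end of the buffers to cover any remainder.  */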
++ pcaddi t1, 34
++ andi t2, a2, 0x78
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, 112
++ st.d t0, t8, 112
++end_112_120_unalign:
++ ld.d t0, a1, 104
++ st.d t0, t8, 104
++end_104_112_unalign:
++ ld.d t0, a1, 96
++ st.d t0, t8, 96
++end_96_104_unalign:
++ ld.d t0, a1, 88
++ st.d t0, t8, 88
++end_88_96_unalign:
++ ld.d t0, a1, 80
++ st.d t0, t8, 80
++end_80_88_unalign:
++ ld.d t0, a1, 72
++ st.d t0, t8, 72
++end_72_80_unalign:
++ ld.d t0, a1, 64
++ st.d t0, t8, 64
++end_64_72_unalign:
++ ld.d t0, a1, 56
++ st.d t0, t8, 56
++end_56_64_unalign:
++ ld.d t0, a1, 48
++ st.d t0, t8, 48
++end_48_56_unalign:
++ ld.d t0, a1, 40
++ st.d t0, t8, 40
++end_40_48_unalign:
++ ld.d t0, a1, 32
++ st.d t0, t8, 32
++end_32_40_unalign:
++ ld.d t0, a1, 24
++ st.d t0, t8, 24
++end_24_32_unalign:
++ ld.d t0, a1, 16
++ st.d t0, t8, 16
++end_16_24_unalign:
++ ld.d t0, a1, 8
++ st.d t0, t8, 8
++end_8_16_unalign:
++ ld.d t0, a1, 0
++ st.d t0, t8, 0
++end_0_8_unalign:
++ ld.d t0, a4, -8
++ st.d t0, a3, -8
++
++ jr ra
++
++END(MEMCPY_NAME)
++libc_hidden_builtin_def (MEMCPY_NAME)
+diff --git a/sysdeps/loongarch/lp64/memmove.S b/sysdeps/loongarch/lp64/memmove.S
+new file mode 100644
+index 00000000..edd9cf3d
+--- /dev/null
++++ b/sysdeps/loongarch/lp64/memmove.S
+@@ -0,0 +1,406 @@
++/* Optimized memmove implementation for LoongArch.
++ Copyright (C) 2021 Free Software Foundation, Inc.
++ This file is part of the GNU C Library.
++
++ The GNU C Library is free software; you can redistribute it and/or
++ modify it under the terms of the GNU Lesser General Public
++ License as published by the Free Software Foundation; either
++ version 2.1 of the License, or (at your option) any later version.
++
++ The GNU C Library is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
++ Lesser General Public License for more details.
++
++ You should have received a copy of the GNU Lesser General Public
++ License along with the GNU C Library. If not, see
++ <https://www.gnu.org/licenses/>. */
++
++#include <sysdep.h>
++
++/* Allow the routine to be named something else if desired. */
++#ifndef MEMMOVE_NAME
++#define MEMMOVE_NAME memmove
++#endif
++
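++/* LD_64 (resp. ST_64) loads (stores) a 64-byte block at offset n from reg
++   as eight consecutive doublewords staged through t0-t7.  */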
++#define LD_64(reg, n) \
++ ld.d t0, reg, n; \
++ ld.d t1, reg, n+8; \
++ ld.d t2, reg, n+16; \
++ ld.d t3, reg, n+24; \
++ ld.d t4, reg, n+32; \
++ ld.d t5, reg, n+40; \
++ ld.d t6, reg, n+48; \
++ ld.d t7, reg, n+56;
++
++
++#define ST_64(reg, n) \
++ st.d t0, reg, n; \
++ st.d t1, reg, n+8; \
++ st.d t2, reg, n+16; \
++ st.d t3, reg, n+24; \
++ st.d t4, reg, n+32; \
++ st.d t5, reg, n+40; \
++ st.d t6, reg, n+48; \
++ st.d t7, reg, n+56;
++
++/* void *memmove (void *dst, const void *src, size_t n).  */
++LEAF(MEMMOVE_NAME)
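++ /* a4 = src + len, a3 = dst + len; t8 keeps the original dst, which is
++    also the return value.  */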
++ add.d a4, a1, a2
++ add.d a3, a0, a2
++ beq a1, a0, less_1bytes
++ move t8, a0
++ srai.d a6, a2, 4 # len / 16
++ beqz a6, less_16bytes # len < 16
++ srai.d a6, a2, 6 # len / 64
++ bnez a6, more_64bytes # len >= 64
++ srai.d a6, a2, 5 # len / 32
++ beqz a6, less_32bytes # len < 32
++
++ ld.d t0, a1, 0 # 32 <= len < 64
++ ld.d t1, a1, 8
++ ld.d t2, a1, 16
++ ld.d t3, a1, 24
++ ld.d t4, a4, -32
++ ld.d t5, a4, -24
++ ld.d t6, a4, -16
++ ld.d t7, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a0, 16
++ st.d t3, a0, 24
++ st.d t4, a3, -32
++ st.d t5, a3, -24
++ st.d t6, a3, -16
++ st.d t7, a3, -8
++
++ jr ra
++
++less_32bytes:
++ ld.d t0, a1, 0
++ ld.d t1, a1, 8
++ ld.d t2, a4, -16
++ ld.d t3, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a0, 8
++ st.d t2, a3, -16
++ st.d t3, a3, -8
++
++ jr ra
++
++less_16bytes:
++ srai.d a6, a2, 3 #num/8
++ beqz a6, less_8bytes
++
++ ld.d t0, a1, 0
++ ld.d t1, a4, -8
++ st.d t0, a0, 0
++ st.d t1, a3, -8
++
++ jr ra
++
++less_8bytes:
++ srai.d a6, a2, 2
++ beqz a6, less_4bytes
++
++ ld.w t0, a1, 0
++ ld.w t1, a4, -4
++ st.w t0, a0, 0
++ st.w t1, a3, -4
++
++ jr ra
++
++less_4bytes:
++ srai.d a6, a2, 1
++ beqz a6, less_2bytes
++
++ ld.h t0, a1, 0
++ ld.h t1, a4, -2
++ st.h t0, a0, 0
++ st.h t1, a3, -2
++
++ jr ra
++
++less_2bytes:
++ beqz a2, less_1bytes
++
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++
++ jr ra
++
++less_1bytes:
++ jr ra
++
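++ /* If dst - src, taken as an unsigned value, is smaller than len, dst
++    overlaps the upper part of src and the copy must run backward;
++    otherwise a forward copy is safe.  */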
++more_64bytes:
++ sub.d a7, a0, a1
++ bltu a7, a2, copy_backward
++
++copy_forward:
++ srli.d a0, a0, 3
++ slli.d a0, a0, 3
++ beq a0, t8, all_align
++ addi.d a0, a0, 0x8
++ sub.d a7, t8, a0
++ sub.d a1, a1, a7
++ add.d a2, a7, a2
++
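++ /* Copy the head bytes up to the next 8-byte boundary of dst one byte at
++    a time: each ld.b/st.b pair below is 8 bytes of code, so jumping
++    backwards from start_over by 8 * (number of head bytes) selects just
++    enough pairs.  */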
++start_unalign_proc:
++ pcaddi t1, 18
++ slli.d a6, a7, 3
++ add.d t1, t1, a6
++ jirl zero, t1, 0
++
++start_7_unalign:
++ ld.b t0, a1, -7
++ st.b t0, a0, -7
++start_6_unalign:
++ ld.b t0, a1, -6
++ st.b t0, a0, -6
++start_5_unalign:
++ ld.b t0, a1, -5
++ st.b t0, a0, -5
++start_4_unalign:
++ ld.b t0, a1, -4
++ st.b t0, a0, -4
++start_3_unalign:
++ ld.b t0, a1, -3
++ st.b t0, a0, -3
++start_2_unalign:
++ ld.b t0, a1, -2
++ st.b t0, a0, -2
++start_1_unalign:
++ ld.b t0, a1, -1
++ st.b t0, a0, -1
++start_over:
++
++ addi.d a2, a2, -0x80
++ blt a2, zero, end_unalign_proc
++
++loop_less:
++ LD_64(a1, 0)
++ ST_64(a0, 0)
++ LD_64(a1, 64)
++ ST_64(a0, 64)
++
++ addi.d a0, a0, 0x80
++ addi.d a1, a1, 0x80
++ addi.d a2, a2, -0x80
++ bge a2, zero, loop_less
++
++end_unalign_proc:
++ addi.d a2, a2, 0x80
++
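++ /* Copy the remaining whole doublewords (len & 0x78 bytes): advance src
++    and dst past them and jump backwards into the ld.d/st.d table below,
++    which copies them at negative offsets, one 8-byte instruction pair per
++    8 bytes of data.  */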
++ pcaddi t1, 36
++ andi t2, a2, 0x78
++ add.d a1, a1, t2
++ add.d a0, a0, t2
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++end_120_128_unalign:
++ ld.d t0, a1, -120
++ st.d t0, a0, -120
++end_112_120_unalign:
++ ld.d t0, a1, -112
++ st.d t0, a0, -112
++end_104_112_unalign:
++ ld.d t0, a1, -104
++ st.d t0, a0, -104
++end_96_104_unalign:
++ ld.d t0, a1, -96
++ st.d t0, a0, -96
++end_88_96_unalign:
++ ld.d t0, a1, -88
++ st.d t0, a0, -88
++end_80_88_unalign:
++ ld.d t0, a1, -80
++ st.d t0, a0, -80
++end_72_80_unalign:
++ ld.d t0, a1, -72
++ st.d t0, a0, -72
++end_64_72_unalign:
++ ld.d t0, a1, -64
++ st.d t0, a0, -64
++end_56_64_unalign:
++ ld.d t0, a1, -56
++ st.d t0, a0, -56
++end_48_56_unalign:
++ ld.d t0, a1, -48
++ st.d t0, a0, -48
++end_40_48_unalign:
++ ld.d t0, a1, -40
++ st.d t0, a0, -40
++end_32_40_unalign:
++ ld.d t0, a1, -32
++ st.d t0, a0, -32
++end_24_32_unalign:
++ ld.d t0, a1, -24
++ st.d t0, a0, -24
++end_16_24_unalign:
++ ld.d t0, a1, -16
++ st.d t0, a0, -16
++end_8_16_unalign:
++ ld.d t0, a1, -8
++ st.d t0, a0, -8
++end_0_8_unalign:
++
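++ /* Copy the last len & 7 bytes from the ends of the buffers (a4/a3) via
++    another computed jump into the byte-copy table below.  */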
++ andi a2, a2, 0x7
++ pcaddi t1, 18
++ slli.d a2, a2, 3
++ sub.d t1, t1, a2
++ jirl zero, t1, 0
++
++end_7_unalign:
++ ld.b t0, a4, -7
++ st.b t0, a3, -7
++end_6_unalign:
++ ld.b t0, a4, -6
++ st.b t0, a3, -6
++end_5_unalign:
++ ld.b t0, a4, -5
++ st.b t0, a3, -5
++end_4_unalign:
++ ld.b t0, a4, -4
++ st.b t0, a3, -4
++end_3_unalign:
++ ld.b t0, a4, -3
++ st.b t0, a3, -3
++end_2_unalign:
++ ld.b t0, a4, -2
++ st.b t0, a3, -2
++end_1_unalign:
++ ld.b t0, a4, -1
++ st.b t0, a3, -1
++end:
++
++ move v0, t8
++ jr ra
++
++all_align:
++ addi.d a1, a1, 0x8
++ addi.d a0, a0, 0x8
++ ld.d t0, a1, -8
++ st.d t0, a0, -8
++ addi.d a2, a2, -8
++ b start_over
++
++all_align_back:
++ addi.d a4, a4, -0x8
++ addi.d a3, a3, -0x8
++ ld.d t0, a4, 0
++ st.d t0, a3, 0
++ addi.d a2, a2, -8
++ b start_over_back
++
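++ /* Backward copy: round dst + len down to an 8-byte boundary (a3), copy
++    the tail bytes above it first, then copy 128 bytes per iteration
++    moving downward.  */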
++copy_backward:
++ move a5, a3
++ srli.d a3, a3, 3
++ slli.d a3, a3, 3
++ beq a3, a5, all_align_back
++ sub.d a7, a3, a5
++ add.d a4, a4, a7
++ add.d a2, a7, a2
++
++ pcaddi t1, 18
++ slli.d a6, a7, 3
++ add.d t1, t1, a6
++ jirl zero, t1, 0
++
++ ld.b t0, a4, 6
++ st.b t0, a3, 6
++ ld.b t0, a4, 5
++ st.b t0, a3, 5
++ ld.b t0, a4, 4
++ st.b t0, a3, 4
++ ld.b t0, a4, 3
++ st.b t0, a3, 3
++ ld.b t0, a4, 2
++ st.b t0, a3, 2
++ ld.b t0, a4, 1
++ st.b t0, a3, 1
++ ld.b t0, a4, 0
++ st.b t0, a3, 0
++start_over_back:
++
++ addi.d a2, a2, -0x80
++ blt a2, zero, end_unalign_proc_back
++
++loop_less_back:
++ LD_64(a4, -64)
++ ST_64(a3, -64)
++ LD_64(a4, -128)
++ ST_64(a3, -128)
++
++ addi.d a4, a4, -0x80
++ addi.d a3, a3, -0x80
++ addi.d a2, a2, -0x80
++ bge a2, zero, loop_less_back
++
++end_unalign_proc_back:
++ addi.d a2, a2, 0x80
++
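++ /* Copy the remaining whole doublewords: move the end pointers down by
++    len & 0x78 and jump into the ld.d/st.d table below, which copies them
++    back at positive offsets.  */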
++ pcaddi t1, 36
++ andi t2, a2, 0x78
++ sub.d a4, a4, t2
++ sub.d a3, a3, t2
++ sub.d t1, t1, t2
++ jirl zero, t1, 0
++
++ ld.d t0, a4, 112
++ st.d t0, a3, 112
++ ld.d t0, a4, 104
++ st.d t0, a3, 104
++ ld.d t0, a4, 96
++ st.d t0, a3, 96
++ ld.d t0, a4, 88
++ st.d t0, a3, 88
++ ld.d t0, a4, 80
++ st.d t0, a3, 80
++ ld.d t0, a4, 72
++ st.d t0, a3, 72
++ ld.d t0, a4, 64
++ st.d t0, a3, 64
++ ld.d t0, a4, 56
++ st.d t0, a3, 56
++ ld.d t0, a4, 48
++ st.d t0, a3, 48
++ ld.d t0, a4, 40
++ st.d t0, a3, 40
++ ld.d t0, a4, 32
++ st.d t0, a3, 32
++ ld.d t0, a4, 24
++ st.d t0, a3, 24
++ ld.d t0, a4, 16
++ st.d t0, a3, 16
++ ld.d t0, a4, 8
++ st.d t0, a3, 8
++ ld.d t0, a4, 0
++ st.d t0, a3, 0
++
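++ /* Copy the first len & 7 bytes of the buffers (a1/a0) via a final
++    computed jump into the byte-copy table below.  */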
++ andi a2, a2, 0x7
++ pcaddi t1, 18
++ slli.d a2, a2, 3
++ sub.d t1, t1, a2
++ jirl zero, t1, 0
++
++ ld.b t0, a1, 6
++ st.b t0, a0, 6
++ ld.b t0, a1, 5
++ st.b t0, a0, 5
++ ld.b t0, a1, 4
++ st.b t0, a0, 4
++ ld.b t0, a1, 3
++ st.b t0, a0, 3
++ ld.b t0, a1, 2
++ st.b t0, a0, 2
++ ld.b t0, a1, 1
++ st.b t0, a0, 1
++ ld.b t0, a1, 0
++ st.b t0, a0, 0
++
++ move v0, t8
++ jr ra
++
++END(MEMMOVE_NAME)
++libc_hidden_builtin_def (MEMMOVE_NAME)
+--
+2.33.0
+