summaryrefslogtreecommitdiff
path: root/5_6-LoongArch-Optimize-string-function-strcpy.patch
blob: d2686d5382ad5741cf6df5275dc2767b381dbea3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
From 379b627b88af8d91c1f87b323925119ec313b1b7 Mon Sep 17 00:00:00 2001
From: Xue Liu <liuxue@loongson.cn>
Date: Sun, 29 Jan 2023 10:25:18 +0800
Subject: [PATCH 5/6] LoongArch: Optimize string function strcpy.

Change-Id: Ic105e1f00cceb4937d5fd2127ca03025a18ff4be
---
 sysdeps/loongarch/lp64/strcpy.S | 175 ++++++++++++++++++++++++++++++++
 1 file changed, 175 insertions(+)
 create mode 100644 sysdeps/loongarch/lp64/strcpy.S

diff --git a/sysdeps/loongarch/lp64/strcpy.S b/sysdeps/loongarch/lp64/strcpy.S
new file mode 100644
index 00000000..03d9d361
--- /dev/null
+++ b/sysdeps/loongarch/lp64/strcpy.S
@@ -0,0 +1,175 @@
+/* Optimized strcpy implementation for LoongArch.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sys/asm.h>
+
+/* Parameters and Results */
+#define dest	a0	/* Destination pointer; advanced as bytes are stored.  */
+#define src	a1	/* Source pointer; advanced as bytes are loaded.  */
+#define result	v0	/* Register holding the return value.  */
+
+/* Internal variables */
+#define data		t0	/* Double word most recently loaded from src.  */
+#define data1		t1	/* Not referenced in this file.  */
+#define has_nul		t2	/* Nonzero when data contains a \0 byte.  */
+#define diff		t3	/* Not referenced in this file.  */
+#define syndrome	t4	/* Not referenced in this file.  */
+#define zeroones	t5	/* Constant 0x0101010101010101.  */
+#define sevenf		t6	/* Constant 0x7f7f7f7f7f7f7f7f.  */
+#define pos		t7	/* Tail byte count to store, including the \0.  */
+#define dest_backup	t8	/* Original dest, returned to the caller.  */
+#define tmp1		a4	/* Scratch.  */
+#define tmp2		a5	/* Scratch.  */
+#define tmp3		a6	/* Scratch; also clobbered by CONDITIONSEL.  */
+#define dest_off	a2	/* Not referenced in this file.  */
+#define src_off		a3	/* src & 0x7, the misalignment of src.  */
+#define tmp4		a7	/* Scratch.  */
+
+/* rd <- (rc != 0) ? ra : rb.  tmp3 is written first and is destroyed,
+   so tmp3 must not be passed as rd, rc or ra.  Unused in this file.  */
+#define CONDITIONSEL(rd, rc, ra, rb)\
+	masknez	tmp3, rb, rc;\
+	maskeqz	rd, ra, rc;\
+	or	rd, rd, tmp3
+
+/* char *strcpy (char *dest, const char *src); */
+LEAF(strcpy)
+	.align		4
+
+	move		dest_backup, dest	/* Keep the original dest for the return value.  */
+	lu12i.w		zeroones, 0x01010
+	lu12i.w		sevenf, 0x7f7f7
+	ori		zeroones, zeroones, 0x101	/* Low word = 0x01010101.  */
+	ori		sevenf, sevenf, 0xf7f	/* Low word = 0x7f7f7f7f.  */
+	bstrins.d	zeroones, zeroones, 63, 32	/* Copy the low word into the high word.  */
+	bstrins.d	sevenf, sevenf, 63, 32
+	andi		src_off, src, 0x7	/* Misalignment of src within 8 bytes.  */
+	beqz		src_off, .Lstrcpy_loop_aligned_1
+	b		.Lstrcpy_mutual_align
+.Lstrcpy_loop_aligned:
+	st.d		data, dest, 0	/* No \0 in data: store all 8 bytes.  */
+	addi.d		dest, dest, 8
+.Lstrcpy_loop_aligned_1:
+	ld.d		data, src, 0
+	addi.d		src, src, 8
+.Lstrcpy_start_realigned:
+	sub.d		tmp1, data, zeroones
+	or		tmp2, data, sevenf
+	andn		has_nul, tmp1, tmp2	/* Nonzero iff data holds a \0 byte.  */
+	beqz		has_nul, .Lstrcpy_loop_aligned
+
+.Lstrcpy_end:
+
+	/* pos = byte index of the first \0 in data, plus 1.  */
+	ctz.d		pos, has_nul
+	srli.d		pos, pos, 3
+	addi.d		pos, pos, 1
+	/* Do an 8/4/2/1 store sequence selected by the bits of pos.
+	   pos is the number of bytes to be copied, including the final \0,
+	   so the maximum is 8 and the minimum is 1.  */
+.Lstrcpy_end_8:
+	andi		tmp1, pos, 0x8
+	beqz		tmp1, .Lstrcpy_end_4
+	st.d		data, dest, 0
+	move		result, dest_backup
+	jr		ra
+.Lstrcpy_end_4:
+	andi		tmp1, pos, 0x4
+	beqz		tmp1, .Lstrcpy_end_2
+	st.w		data, dest, 0
+	srli.d		data, data, 32	/* Drop the 4 bytes just stored.  */
+	addi.d		dest, dest, 4
+.Lstrcpy_end_2:
+	andi		tmp1, pos, 0x2
+	beqz		tmp1, .Lstrcpy_end_1
+	st.h		data, dest, 0
+	srli.d		data, data, 16	/* Drop the 2 bytes just stored.  */
+	addi.d		dest, dest, 2
+.Lstrcpy_end_1:
+	andi		tmp1, pos, 0x1
+	beqz		tmp1, .Lstrcpy_end_ret
+	st.b		data, dest, 0
+.Lstrcpy_end_ret:
+	move		result, dest_backup
+	jr		ra
+
+
+.Lstrcpy_mutual_align:
+	/* src is not 8-byte aligned.  If it lies in the last 8 bytes of a
+	   4 KiB block (address bits 3..11 all set), an 8-byte load from src
+	   would run past the end of that block, so take the page-cross path.
+	   Otherwise load directly.  tmp2 holds the pattern to compare with.  */
+
+	li.w		tmp2, 0xff8
+	andi		tmp1, src, 0xff8
+	beq		tmp1, tmp2, .Lstrcpy_page_cross
+
+.Lstrcpy_page_cross_ok:
+	/* Load a misaligned double word and check whether it has a \0.
+	   If not, store the whole double word misaligned.
+	   If yes, go to the tail copy, which derives the number of
+	   available bytes from has_nul and does the 4/2/1 stores.  */
+	ld.d		data, src, 0
+	sub.d		tmp1, data, zeroones
+	or		tmp2, data, sevenf
+	andn		has_nul, tmp1, tmp2
+	bnez		has_nul, .Lstrcpy_end
+.Lstrcpy_mutual_align_finish:
+	/* Before jumping back to the aligned loop, advance src to the next
+	   8-byte boundary; dest moves by the same amount and may stay misaligned.
+	   The next store rewrites src_off bytes already stored here, which is harmless.  */
+	li.w		tmp1, 8
+	st.d		data, dest, 0
+	sub.d		tmp1, tmp1, src_off
+	add.d		src, src, tmp1
+	add.d		dest, dest, tmp1
+
+	b		.Lstrcpy_loop_aligned_1
+
+.Lstrcpy_page_cross:
+	/*
+	   ld.d from the aligned address (src & ~0x7), which does not cross the bound.
+	   Shift the data right so that it looks like a double word loaded
+	   directly from src, and fill the vacated high bytes with 0xff so
+	   that they can never be taken for \0.
+	   If the remaining bytes hold a \0, go to the 4/2/1 strcpy end.
+	   If not, go back to page cross ok, since the string is supposed
+	   to cross the page bound in that situation.
+
+	   tmp4 starts as the 0xffff...ffff mask
+	   tmp2 = src_off << 3, the shift count in bits
+	   data = data >> (src_off * 8) | -1 << (64 - src_off * 8)
+	   where
+	   -1 << (64 - src_off * 8) is computed as ~(-1 >> (src_off * 8)) */
+
+	li.w		tmp1, 0x7
+	andn		tmp3, src, tmp1	/* tmp3 = src & ~0x7.  */
+	ld.d		data, tmp3, 0
+	li.w		tmp4, -1
+	slli.d		tmp2, src_off, 3
+	srl.d		tmp4, tmp4, tmp2
+	srl.d		data, data, tmp2
+	nor		tmp4, tmp4, zero
+	or		data, data, tmp4
+	sub.d		tmp1, data, zeroones
+	or		tmp2, data, sevenf
+	andn		has_nul, tmp1, tmp2
+	beqz		has_nul, .Lstrcpy_page_cross_ok
+	b		.Lstrcpy_end
+END(strcpy)
+libc_hidden_builtin_def (strcpy)
-- 
2.33.0