0176-Backport-SME-aarch64-Add-r-m-and-m-r-alternatives-to.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168

From d8233e19aae2272c4863de5e8d61d49d3147e807 Mon Sep 17 00:00:00 2001
From: Kyrylo Tkachov <kyrylo.tkachov@arm.com>
Date: Thu, 1 Jun 2023 09:37:06 +0100
Subject: [PATCH 077/157] [Backport][SME] aarch64: Add =r,m and =m,r
 alternatives to 64-bit vector move patterns

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=12e71b593ea0c64d919df525cd75ea10b7be8a4b

We can use the X registers to load and store 64-bit vector modes, we just need to add the alternatives
to the mov patterns. This straightforward patch does that and for the pair variants too.
For the testcase in the code we now generate the optimal assembly without any superfluous
GP<->SIMD moves.

Bootstrapped and tested on aarch64-none-linux-gnu and aarch64_be-none-elf.

gcc/ChangeLog:

	* config/aarch64/aarch64-simd.md (*aarch64_simd_mov<VDMOV:mode>):
	Add =r,m and =r,m alternatives.
	(load_pair<DREG:mode><DREG2:mode>): Likewise.
	(vec_store_pair<DREG:mode><DREG2:mode>): Likewise.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/xreg-vec-modes_1.c: New test.
---
 gcc/config/aarch64/aarch64-simd.md            | 40 ++++++++++--------
 .../gcc.target/aarch64/xreg-vec-modes_1.c     | 42 +++++++++++++++++++
 2 files changed, 65 insertions(+), 17 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/xreg-vec-modes_1.c

diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index 2d688edf5..b5c52ba16 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -116,26 +116,28 @@
 
 (define_insn "*aarch64_simd_mov<VDMOV:mode>"
   [(set (match_operand:VDMOV 0 "nonimmediate_operand"
-		"=w, m,  m,  w, ?r, ?w, ?r,  w,  w")
+		"=w, r, m,  m, m,  w, ?r, ?w, ?r,  w,  w")
 	(match_operand:VDMOV 1 "general_operand"
-		"m,  Dz, w,  w,  w,  r,  r, Dn, Dz"))]
+		"m,  m, Dz, w, r,  w,  w,  r,  r, Dn, Dz"))]
   "TARGET_FLOAT
    && (register_operand (operands[0], <MODE>mode)
        || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
   "@
    ldr\t%d0, %1
+   ldr\t%x0, %1
    str\txzr, %0
    str\t%d1, %0
+   str\t%x1, %0
    * return TARGET_SIMD ? \"mov\t%0.<Vbtype>, %1.<Vbtype>\" : \"fmov\t%d0, %d1\";
    * return TARGET_SIMD ? \"umov\t%0, %1.d[0]\" : \"fmov\t%x0, %d1\";
    fmov\t%d0, %1
    mov\t%0, %1
    * return aarch64_output_simd_mov_immediate (operands[1], 64);
    fmov\t%d0, xzr"
-  [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
-		     neon_logic<q>, neon_to_gp<q>, f_mcr,\
+  [(set_attr "type" "neon_load1_1reg<q>, load_8, store_8, neon_store1_1reg<q>,\
+		     store_8, neon_logic<q>, neon_to_gp<q>, f_mcr,\
 		     mov_reg, neon_move<q>, f_mcr")
-   (set_attr "arch" "*,*,*,*,*,*,*,simd,*")]
+   (set_attr "arch" "*,*,*,*,*,*,*,*,*,simd,*")]
 )
 
 (define_insn "*aarch64_simd_mov<VQMOV:mode>"
@@ -177,31 +179,35 @@
 )
 
 (define_insn "load_pair<DREG:mode><DREG2:mode>"
-  [(set (match_operand:DREG 0 "register_operand" "=w")
-	(match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump"))
-   (set (match_operand:DREG2 2 "register_operand" "=w")
-	(match_operand:DREG2 3 "memory_operand" "m"))]
+  [(set (match_operand:DREG 0 "register_operand" "=w,r")
+	(match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump,Ump"))
+   (set (match_operand:DREG2 2 "register_operand" "=w,r")
+	(match_operand:DREG2 3 "memory_operand" "m,m"))]
   "TARGET_FLOAT
    && rtx_equal_p (XEXP (operands[3], 0),
 		   plus_constant (Pmode,
 				  XEXP (operands[1], 0),
 				  GET_MODE_SIZE (<DREG:MODE>mode)))"
-  "ldp\\t%d0, %d2, %z1"
-  [(set_attr "type" "neon_ldp")]
+  "@
+   ldp\t%d0, %d2, %z1
+   ldp\t%x0, %x2, %z1"
+  [(set_attr "type" "neon_ldp,load_16")]
 )
 
 (define_insn "vec_store_pair<DREG:mode><DREG2:mode>"
-  [(set (match_operand:DREG 0 "aarch64_mem_pair_operand" "=Ump")
-	(match_operand:DREG 1 "register_operand" "w"))
-   (set (match_operand:DREG2 2 "memory_operand" "=m")
-	(match_operand:DREG2 3 "register_operand" "w"))]
+  [(set (match_operand:DREG 0 "aarch64_mem_pair_operand" "=Ump,Ump")
+	(match_operand:DREG 1 "register_operand" "w,r"))
+   (set (match_operand:DREG2 2 "memory_operand" "=m,m")
+	(match_operand:DREG2 3 "register_operand" "w,r"))]
   "TARGET_FLOAT
    && rtx_equal_p (XEXP (operands[2], 0),
 		   plus_constant (Pmode,
 				  XEXP (operands[0], 0),
 				  GET_MODE_SIZE (<DREG:MODE>mode)))"
-  "stp\\t%d1, %d3, %z0"
-  [(set_attr "type" "neon_stp")]
+  "@
+   stp\t%d1, %d3, %z0
+   stp\t%x1, %x3, %z0"
+  [(set_attr "type" "neon_stp,store_16")]
 )
 
 (define_insn "load_pair<VQ:mode><VQ2:mode>"
diff --git a/gcc/testsuite/gcc.target/aarch64/xreg-vec-modes_1.c b/gcc/testsuite/gcc.target/aarch64/xreg-vec-modes_1.c
new file mode 100644
index 000000000..fc4dcb1ad
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/xreg-vec-modes_1.c
@@ -0,0 +1,42 @@
+/* { dg-do compile } */
+/* { dg-options "-O2" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+typedef unsigned int v2si  __attribute__((vector_size (8)));
+
+#define force_gp(V1)   asm volatile (""				\
+           : "=r"(V1)                                           \
+           : "r"(V1)                                            \
+           : /* No clobbers */);
+
+/*
+** foo:
+**	ldr	(x[0-9]+), \[x1\]
+**	str	\1, \[x0\]
+**	ret
+*/
+
+void
+foo (v2si *a, v2si *b)
+{
+  v2si tmp = *b;
+  force_gp (tmp);
+  *a = tmp;
+}
+
+/*
+** foo2:
+**	ldp	(x[0-9]+), (x[0-9]+), \[x0\]
+**	stp	\1, \2, \[x1\]
+**	ret
+*/
+void
+foo2 (v2si *a, v2si *b)
+{
+  v2si t1 = *a;
+  v2si t2 = a[1];
+  force_gp (t1);
+  force_gp (t2);
+  *b = t1;
+  b[1] = t2;
+}
-- 
2.33.0