0181-Backport-SME-AArch64-Support-new-tbranch-optab.patch


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250

From da06b276b6ae281efad2ec3b982e09b1f4015917 Mon Sep 17 00:00:00 2001
From: Tamar Christina <tamar.christina@arm.com>
Date: Mon, 12 Dec 2022 15:18:56 +0000
Subject: [PATCH 082/157] [Backport][SME] AArch64: Support new tbranch optab.

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=17ae956c0fa6baac3d22764019d5dd5ebf5c2b11

This implements the new tbranch optab for AArch64.

we cannot emit one big RTL for the final instruction immediately.
The reason that all comparisons in the AArch64 backend expand to separate CC
compares, and separate testing of the operands is for ifcvt.

The separate CC compare is needed so ifcvt can produce csel, cset etc from the
compares.  Unlike say combine, ifcvt can not do recog on a parallel with a
clobber.  Should we emit the instruction directly then ifcvt will not be able
to say, make a csel, because we have no patterns which handle zero_extract and
compare. (unlike combine ifcvt cannot transform the extract into an AND).

While you could provide various patterns for this (and I did try) you end up
with broken patterns because you can't add the clobber to the CC register.  If
you do, ifcvt recog fails.

i.e.

int
f1 (int x)
{
  if (x & 1)
    return 1;
  return x;
}

We lose csel here.

Secondly the reason the compare with an explicit CC mode is needed is so that
ifcvt can transform the operation into a version that doesn't require the flags
to be set.  But it only does so if it know the explicit usage of the CC reg.

For instance

int
foo (int a, int b)
{
  return ((a & (1 << 25)) ? 5 : 4);
}

Doesn't require a comparison, the optimal form is:

foo(int, int):
        ubfx    x0, x0, 25, 1
        add     w0, w0, 4
        ret

and no compare is actually needed.  If you represent the instruction using an
ANDS instead of a zero_extract then you get close, but you end up with an ands
followed by an add, which is a slower operation.

gcc/ChangeLog:

	* config/aarch64/aarch64.md (*tb<optab><mode>1): Rename to...
	(*tb<optab><ALLI:mode><GPI:mode>1): ... this.
	(tbranch_<code><mode>4): New.
	* config/aarch64/iterators.md(ZEROM, zerom): New.

gcc/testsuite/ChangeLog:

	* gcc.target/aarch64/tbz_1.c: New test.
---
 gcc/config/aarch64/aarch64.md            | 33 ++++++--
 gcc/config/aarch64/iterators.md          |  2 +
 gcc/testsuite/gcc.target/aarch64/tbz_1.c | 95 ++++++++++++++++++++++++
 3 files changed, 122 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/aarch64/tbz_1.c

diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 079c8a3f9..2becc888e 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -953,12 +953,29 @@
 		      (const_int 1)))]
 )
 
-(define_insn "*tb<optab><mode>1"
+(define_expand "tbranch_<code><mode>3"
   [(set (pc) (if_then_else
-	      (EQL (zero_extract:DI (match_operand:GPI 0 "register_operand" "r")
-				    (const_int 1)
-				    (match_operand 1
-				      "aarch64_simd_shift_imm_<mode>" "n"))
+              (EQL (match_operand:ALLI 0 "register_operand")
+                   (match_operand 1 "aarch64_simd_shift_imm_<mode>"))
+              (label_ref (match_operand 2 ""))
+              (pc)))]
+  ""
+{
+  rtx bitvalue = gen_reg_rtx (<ZEROM>mode);
+  rtx reg = gen_lowpart (<ZEROM>mode, operands[0]);
+  rtx val = GEN_INT (1UL << UINTVAL (operands[1]));
+  emit_insn (gen_and<zerom>3 (bitvalue, reg, val));
+  operands[1] = const0_rtx;
+  operands[0] = aarch64_gen_compare_reg (<CODE>, bitvalue,
+					 operands[1]);
+})
+
+(define_insn "*tb<optab><ALLI:mode><GPI:mode>1"
+  [(set (pc) (if_then_else
+	      (EQL (zero_extract:GPI (match_operand:ALLI 0 "register_operand" "r")
+				     (const_int 1)
+				     (match_operand 1
+				       "aarch64_simd_shift_imm_<ALLI:mode>" "n"))
 		   (const_int 0))
 	     (label_ref (match_operand 2 "" ""))
 	     (pc)))
@@ -969,15 +986,15 @@
       {
 	if (get_attr_far_branch (insn) == 1)
 	  return aarch64_gen_far_branch (operands, 2, "Ltb",
-					 "<inv_tb>\\t%<w>0, %1, ");
+					 "<inv_tb>\\t%<ALLI:w>0, %1, ");
 	else
 	  {
 	    operands[1] = GEN_INT (HOST_WIDE_INT_1U << UINTVAL (operands[1]));
-	    return "tst\t%<w>0, %1\;<bcond>\t%l2";
+	    return "tst\t%<ALLI:w>0, %1\;<bcond>\t%l2";
 	  }
       }
     else
-      return "<tbz>\t%<w>0, %1, %l2";
+      return "<tbz>\t%<ALLI:w>0, %1, %l2";
   }
   [(set_attr "type" "branch")
    (set (attr "length")
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 226dea48a..b616f5c9a 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1104,6 +1104,8 @@
 
 ;; Give the number of bits in the mode
 (define_mode_attr sizen [(QI "8") (HI "16") (SI "32") (DI "64")])
+(define_mode_attr ZEROM [(QI "SI") (HI "SI") (SI "SI") (DI "DI")])
+(define_mode_attr zerom [(QI "si") (HI "si") (SI "si") (DI "di")])
 
 ;; Give the ordinal of the MSB in the mode
 (define_mode_attr sizem1 [(QI "#7") (HI "#15") (SI "#31") (DI "#63")
diff --git a/gcc/testsuite/gcc.target/aarch64/tbz_1.c b/gcc/testsuite/gcc.target/aarch64/tbz_1.c
new file mode 100644
index 000000000..39deb58e2
--- /dev/null
+++ b/gcc/testsuite/gcc.target/aarch64/tbz_1.c
@@ -0,0 +1,95 @@
+/* { dg-do compile } */
+/* { dg-additional-options "-O2 -std=c99  -fno-unwind-tables -fno-asynchronous-unwind-tables" } */
+/* { dg-final { check-function-bodies "**" "" "" { target { le } } } } */
+
+#include <stdbool.h>
+
+void h(void);
+
+/*
+** g1:
+** 	tbnz	w[0-9]+, #?0, .L([0-9]+)
+** 	ret
+**	...
+*/
+void g1(bool x)
+{
+  if (__builtin_expect (x, 0))
+    h ();
+}
+
+/*
+** g2:
+** 	tbz	w[0-9]+, #?0, .L([0-9]+)
+** 	b	h
+**	...
+*/
+void g2(bool x)
+{
+  if (__builtin_expect (x, 1))
+    h ();
+}
+
+/*
+** g3_ge:
+** 	tbnz	w[0-9]+, #?31, .L[0-9]+
+** 	b	h
+**	...
+*/
+void g3_ge(int x)
+{
+  if (__builtin_expect (x >= 0, 1))
+    h ();
+}
+
+/*
+** g3_gt:
+** 	cmp	w[0-9]+, 0
+** 	ble	.L[0-9]+
+** 	b	h
+**	...
+*/
+void g3_gt(int x)
+{
+  if (__builtin_expect (x > 0, 1))
+    h ();
+}
+
+/*
+** g3_lt:
+** 	tbz	w[0-9]+, #?31, .L[0-9]+
+** 	b	h
+**	...
+*/
+void g3_lt(int x)
+{
+  if (__builtin_expect (x < 0, 1))
+    h ();
+}
+
+/*
+** g3_le:
+** 	cmp	w[0-9]+, 0
+** 	bgt	.L[0-9]+
+** 	b	h
+**	...
+*/
+void g3_le(int x)
+{
+  if (__builtin_expect (x <= 0, 1))
+    h ();
+}
+
+/*
+** g5:
+** 	mov	w[0-9]+, 65279
+** 	tst	w[0-9]+, w[0-9]+
+** 	beq	.L[0-9]+
+** 	b	h
+**	...
+*/ 
+void g5(int x)
+{
+  if (__builtin_expect (x & 0xfeff, 1))
+    h ();
+}
-- 
2.33.0