From f6652dbebf81372884e9fd8b68627fc7a94d8d3b Mon Sep 17 00:00:00 2001
From: Roger Sayle <roger@nextmovesoftware.com>
Date: Fri, 27 May 2022 08:57:46 +0100
Subject: [PATCH 145/157] [Backport][SME] Canonicalize X&-Y as X*Y in match.pd
 when Y is [0,1].

Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=8fb94fc6097c0a934aac0d89c9c5e2038da67655

"For every pessimization, there's an equal and opposite optimization".

In the review of my original patch for PR middle-end/98865, Richard
Biener pointed out that match.pd shouldn't be transforming X*Y into
X&-Y as the former is considered cheaper by tree-ssa's cost model
(operator count).  A corollary of this is that we should instead be
transforming X&-Y into the cheaper X*Y as a preferred canonical form
(especially as RTL expansion now intelligently selects the appropriate
implementation based on the target's costs).
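
As a quick sanity check (illustrative only, not part of the patch;
the helper names below are made up), the identity behind the new
canonical form is easy to exercise: when Y is 0, -Y is 0, so X & -Y
and X * Y are both 0; when Y is 1, -Y is all-ones, so both are X.

#include <assert.h>

static int and_neg (int x, int y) { return x & -y; }
static int mul (int x, int y) { return x * y; }

int
main (void)
{
  for (int x = -4; x <= 4; x++)
    for (int y = 0; y <= 1; y++)
      /* Both select 0 when y == 0 and x when y == 1.  */
      assert (and_neg (x, y) == mul (x, y));
  return 0;
}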

With this patch we now generate identical code for:
int foo(int x, int y) { return -(x&1) & y; }
int bar(int x, int y) { return (x&1) * y; }

specifically, on x86_64-pc-linux-gnu both use and/neg/and with -O2,
but both use and/mul with -Os.

One minor wrinkle/improvement is that this patch includes three
additional optimizations (to account for the change in canonical
form) so that PR92834 and PR94786 continue to be optimized.
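
For concreteness (again illustrative, not part of the patch), one of
the branchless idioms involved looks like this; the removed match.pd
patterns recognized the bit_and form directly, while the three new
ones recognize the multiply form it is now canonicalized to:

/* A "fancy" way of writing a maximum.  */
int
branchless_max (int a, int b)
{
  return a - ((a - b) & -(a < b));  /* canonicalized via a - (a - b) * (a < b) */
}

/* ...which the new transforms still fold to the equivalent of:  */
int
plain_max (int a, int b)
{
  return (a < b) ? b : a;
}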

2022-05-27  Roger Sayle  <roger@nextmovesoftware.com>

gcc/ChangeLog
	* match.pd (match_zero_one_valued_p): New predicate.
	(mult @0 @1): Use zero_one_valued_p for optimization to the
	expression "bit_and @0 @1".
	(bit_and (negate zero_one_valued_p@0) @1): Optimize to MULT_EXPR.
	(plus @0 (mult (minus @1 @0) zero_one_valued_p@2)): New transform.
	(minus @0 (mult (minus @0 @1) zero_one_valued_p@2)): Likewise.
	(bit_xor @0 (mult (bit_xor @0 @1) zero_one_valued_p@2)): Likewise.
	Remove three redundant transforms obsoleted by the three above.

gcc/testsuite/ChangeLog
	* gcc.dg/pr98865.c: New test case.
---
 gcc/match.pd                   | 86 ++++++++++++++++------------------
 gcc/testsuite/gcc.dg/pr98865.c | 14 ++++++
 2 files changed, 55 insertions(+), 45 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/pr98865.c

diff --git a/gcc/match.pd b/gcc/match.pd
index aee58e47b..6d3165bcd 100644
--- a/gcc/match.pd
+++ b/gcc/match.pd
@@ -285,14 +285,6 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
            || !COMPLEX_FLOAT_TYPE_P (type)))
    (negate @0)))
 
-/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 } */
-(simplify
- (mult SSA_NAME@1 SSA_NAME@2)
-  (if (INTEGRAL_TYPE_P (type)
-       && get_nonzero_bits (@1) == 1
-       && get_nonzero_bits (@2) == 1)
-   (bit_and @1 @2)))
-
 /* Transform x * { 0 or 1, 0 or 1, ... } into x & { 0 or -1, 0 or -1, ...},
    unless the target has native support for the former but not the latter.  */
 (simplify
@@ -1790,6 +1782,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (bit_not (bit_not @0))
   @0)
 
+(match zero_one_valued_p
+ @0
+ (if (INTEGRAL_TYPE_P (type) && tree_nonzero_bits (@0) == 1)))
+(match zero_one_valued_p
+ truth_valued_p@0)
+
+/* Transform { 0 or 1 } * { 0 or 1 } into { 0 or 1 } & { 0 or 1 }.  */
+(simplify
+ (mult zero_one_valued_p@0 zero_one_valued_p@1)
+ (if (INTEGRAL_TYPE_P (type))
+  (bit_and @0 @1)))
+
+/* Transform X & -Y into X * Y when Y is { 0 or 1 }.  */
+(simplify
+ (bit_and:c (convert? (negate zero_one_valued_p@0)) @1)
+ (if (INTEGRAL_TYPE_P (type)
+      && INTEGRAL_TYPE_P (TREE_TYPE (@0))
+      && TREE_CODE (TREE_TYPE (@0)) != BOOLEAN_TYPE
+      && !TYPE_UNSIGNED (TREE_TYPE (@0)))
+  (mult (convert @0) @1)))
+
 /* Convert ~ (-A) to A - 1.  */
 (simplify
  (bit_not (convert? (negate @0)))
@@ -3281,44 +3294,27 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT)
   (cmp @0 (minmax:c @0 @1))
   { constant_boolean_node (cmp == GE_EXPR || cmp == LE_EXPR, type); } ))
 
-/* Undo fancy way of writing max/min or other ?: expressions,
-   like a - ((a - b) & -(a < b)), in this case into (a < b) ? b : a.
+/* Undo fancy ways of writing max/min or other ?: expressions, like
+   a - ((a - b) & -(a < b))  and  a - (a - b) * (a < b) into (a < b) ? b : a.
    People normally use ?: and that is what we actually try to optimize.  */
-(for cmp (simple_comparison)
- (simplify
-  (minus @0 (bit_and:c (minus @0 @1)
-		       (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
-  (if (INTEGRAL_TYPE_P (type)
-       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
-       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
-       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
-       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
-	   || !TYPE_UNSIGNED (TREE_TYPE (@4)))
-       && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
-   (cond (cmp @2 @3) @1 @0)))
- (simplify
-  (plus:c @0 (bit_and:c (minus @1 @0)
-			(convert? (negate@4 (convert? (cmp@5 @2 @3))))))
-  (if (INTEGRAL_TYPE_P (type)
-       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
-       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
-       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
-       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
-	   || !TYPE_UNSIGNED (TREE_TYPE (@4)))
-       && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
-   (cond (cmp @2 @3) @1 @0)))
- /* Similarly with ^ instead of - though in that case with :c.  */
- (simplify
-  (bit_xor:c @0 (bit_and:c (bit_xor:c @0 @1)
-			   (convert? (negate@4 (convert? (cmp@5 @2 @3))))))
-  (if (INTEGRAL_TYPE_P (type)
-       && INTEGRAL_TYPE_P (TREE_TYPE (@4))
-       && TREE_CODE (TREE_TYPE (@4)) != BOOLEAN_TYPE
-       && INTEGRAL_TYPE_P (TREE_TYPE (@5))
-       && (TYPE_PRECISION (TREE_TYPE (@4)) >= TYPE_PRECISION (type)
-	   || !TYPE_UNSIGNED (TREE_TYPE (@4)))
-       && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
-   (cond (cmp @2 @3) @1 @0))))
+/* Transform A + (B-A)*cmp into cmp ? B : A.  */
+(simplify
+ (plus:c @0 (mult:c (minus @1 @0) zero_one_valued_p@2))
+ (if (INTEGRAL_TYPE_P (type)
+      && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
+  (cond (convert:boolean_type_node @2) @1 @0)))
+/* Transform A - (A-B)*cmp into cmp ? B : A.  */
+(simplify
+ (minus @0 (mult:c (minus @0 @1) zero_one_valued_p@2))
+ (if (INTEGRAL_TYPE_P (type)
+      && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
+  (cond (convert:boolean_type_node @2) @1 @0)))
+/* Transform A ^ (A^B)*cmp into cmp ? B : A.  */
+(simplify
+ (bit_xor:c @0 (mult:c (bit_xor:c @0 @1) zero_one_valued_p@2))
+ (if (INTEGRAL_TYPE_P (type)
+      && (GIMPLE || !TREE_SIDE_EFFECTS (@1)))
+  (cond (convert:boolean_type_node @2) @1 @0)))
 
 /* Simplifications of shift and rotates.  */
 
diff --git a/gcc/testsuite/gcc.dg/pr98865.c b/gcc/testsuite/gcc.dg/pr98865.c
new file mode 100644
index 000000000..95f727033
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/pr98865.c
@@ -0,0 +1,14 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+
+int foo(int x, int y)
+{
+  return -(x&1) & y;
+}
+
+int bar(int x, int y)
+{
+  return (x&1) * y;
+}
+
+/* { dg-final { scan-tree-dump-times " \\* " 2 "optimized" } } */
-- 
2.33.0