From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
Date: Thu, 22 Feb 2024 11:27:43 +0300
Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
---
gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
gcc/config/aarch64/aarch64.cc | 15 +++++++++
gcc/config/aarch64/aarch64.opt | 4 +++
gcc/config/aarch64/iterators.md | 3 +-
gcc/config/aarch64/predicates.md | 25 +++++++++++++++
gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
6 files changed, 114 insertions(+), 1 deletion(-)
create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index ee7f0b89c..82f73805f 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -6454,6 +6454,54 @@
[(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
)
+;; Use cmlt to replace vector arithmetic operations like this (SImode example):
+;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
+;; TODO: maybe extend to scalar operations or other cm** instructions.
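+;; For an SImode lane holding two 16-bit values, X = (A >> 15) & 0x00010001
+;; extracts the sign bit of each halfword, and (X << 16) - X then widens
+;; each of those bits into 0xffff within its own halfword.  That is exactly
+;; cmlt #0 on the same register reinterpreted with twice as many elements
+;; of half the width (see V2ntype below).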
+
+(define_insn "*aarch64_cmlt_as_arith<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (minus:<V_INT_EQUIV>
+ (ashift:<V_INT_EQUIV>
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
+ (match_operand:VDQHSD 4 "half_size_operand"))
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_dup 1)
+ (match_dup 2))
+ (match_dup 3))))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
+ [(set_attr "type" "neon_compare_zero")]
+)
+
+;; Helper definition that allows the combiner to use the previous pattern.
+
+(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
+ (and:<V_INT_EQUIV>
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand" "w")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ "TARGET_SIMD && flag_cmlt_arith"
+ "#"
+ "&& reload_completed"
+ [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
+ (lshiftrt:<V_INT_EQUIV>
+ (match_operand:VDQHSD 1 "register_operand")
+ (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
+ (set (match_dup 0)
+ (and:<V_INT_EQUIV>
+ (match_dup 0)
+ (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
+ ""
+ [(set_attr "type" "neon_compare_zero")]
+)
+
(define_insn_and_split "aarch64_cm<optab>di"
[(set (match_operand:DI 0 "register_operand" "=w,w,r")
(neg:DI
diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
index a3da4ca30..04072ca25 100644
--- a/gcc/config/aarch64/aarch64.cc
+++ b/gcc/config/aarch64/aarch64.cc
@@ -14064,6 +14064,21 @@ cost_minus:
return true;
}
+  /* Detect the *aarch64_cmlt_as_arith pattern.  Currently it is the only
+     pattern that matches this condition.  The costs of cmlt and sub
+     instructions are comparable, so the cost is not increased here.  */
+  if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
+      && GET_CODE (op1) == AND)
+    {
+      rtx op0_subop0 = XEXP (op0, 0);
+      if (rtx_equal_p (op0_subop0, op1))
+        {
+          rtx lshrt_op = XEXP (op0_subop0, 0);
+          if (GET_CODE (lshrt_op) == LSHIFTRT)
+            return true;
+        }
+    }
+
/* Look for SUB (extended register). */
if (is_a <scalar_int_mode> (mode)
&& aarch64_rtx_arith_op_extract_p (op1))
diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
index a64b927e9..101664c7c 100644
--- a/gcc/config/aarch64/aarch64.opt
+++ b/gcc/config/aarch64/aarch64.opt
@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
This option is for use with fstack-protector-strong and not for use in
user-land code.
+mcmlt-arith
+Target Var(flag_cmlt_arith) Optimization Init(0)
+Use the SIMD cmlt instruction to perform some arithmetic/logic calculations.
+
TargetVariable
long aarch64_stack_protector_guard_offset = 0
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index 26a840d7f..967e6b0b1 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -1485,7 +1485,8 @@
(V2DI "2s")])
;; Register suffix narrowed modes for VQN.
-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
+(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
+ (V8HI "16b") (V4SI "8h")
(V2DI "4s")])
;; Widened modes of vector modes.
diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
index c308015ac..07c14aacb 100644
--- a/gcc/config/aarch64/predicates.md
+++ b/gcc/config/aarch64/predicates.md
@@ -49,6 +49,31 @@
return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
})
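+;; The three predicates below match the constant vectors used by the
+;; *aarch64_cmlt_as_arith pattern.  For V4SI these are a duplicated 15
+;; (half the element size minus one), a duplicated 16 (half the element
+;; size) and a duplicated 0x00010001 (the per-halfword sign-bit mask).
+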
+(define_predicate "half_size_minus_one_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
+})
+
+(define_predicate "half_size_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ return CONST_INT_P (op) && (UINTVAL (op) == size);
+})
+
+(define_predicate "cmlt_arith_mask_operand"
+ (match_code "const_vector")
+{
+ op = unwrap_const_vec_duplicate (op);
+ unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
+ unsigned long long mask = ((unsigned long long) 1 << size) | 1;
+ return CONST_INT_P (op) && (UINTVAL (op) == mask);
+})
+
(define_predicate "subreg_lowpart_operator"
(ior (match_code "truncate")
(and (match_code "subreg")
diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
new file mode 100755
index 000000000..b4c9a37ff
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
@@ -0,0 +1,20 @@
+/* { dg-do compile { target aarch64-*-* } } */
+/* { dg-options "-O3 -mcmlt-arith" } */
+
+/* The test checks that cmlt insns are used for the arithmetic/logic
+   calculations in foo ().  It is inspired by the x264 codec sources.  */
+
+typedef unsigned short int uint16_t;
+typedef unsigned int uint32_t;
+
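+/* s replicates the sign bit of each 16-bit half of a[i] across that
+   half, which is exactly the arithmetic shape the new cmlt pattern
+   matches once the loop is vectorized.  */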
+void foo( uint32_t *a, uint32_t *b)
+{
+ for (unsigned i = 0; i < 4; i++)
+ {
+ uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
+ &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
+ b[i] = (a[i]+s)^s;
+ }
+}
+
+/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
--
2.33.0
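
A quick way to see why the transformation is correct: on one 32-bit lane
the shift/and/subtract sequence produces 0xffff in every 16-bit half
whose sign bit is set, which is what cmlt #0 computes on elements of
half the width.  The stand-alone sketch below (hypothetical, not part of
the patch) checks that identity on a few sample values:

  #include <assert.h>
  #include <stdint.h>

  /* The arithmetic shape matched by *aarch64_cmlt_as_arith, on one
     SImode lane.  */
  static uint32_t cmlt_as_arith (uint32_t a)
  {
    uint32_t x = (a >> 15) & 0x00010001u;  /* sign bit of each halfword */
    return (x << 16) - x;                  /* widen each bit to 0xffff */
  }

  /* Reference semantics of cmlt #0 on the two 16-bit halves of the lane.  */
  static uint32_t cmlt_ref (uint32_t a)
  {
    uint32_t lo = (a & 0x00008000u) ? 0x0000ffffu : 0u;
    uint32_t hi = (a & 0x80000000u) ? 0xffff0000u : 0u;
    return hi | lo;
  }

  int main (void)
  {
    static const uint32_t t[] = { 0x00000000u, 0x00008000u, 0x80000000u,
                                  0x7fff7fffu, 0xffffffffu, 0x12348765u };
    for (unsigned i = 0; i < sizeof t / sizeof t[0]; i++)
      assert (cmlt_as_arith (t[i]) == cmlt_ref (t[i]));
    return 0;
  }

The included testcase can then be exercised in the usual dejagnu way,
e.g. make check-gcc RUNTESTFLAGS="dg.exp=combine-cmlt.c".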