summaryrefslogtreecommitdiff
path: root/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
diff options
context:
space:
mode:
Diffstat (limited to '0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch')
-rw-r--r--0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch239
1 files changed, 239 insertions, 0 deletions
diff --git a/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch b/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
new file mode 100644
index 0000000..a5a786f
--- /dev/null
+++ b/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch
@@ -0,0 +1,239 @@
+From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001
+From: Pronin Alexander 00812787 <pronin.alexander@huawei.com>
+Date: Thu, 22 Feb 2024 17:13:27 +0800
+Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12
+
+---
+ gcc/common.opt | 4 +
+ gcc/match.pd | 104 ++++++++++++++++++++++++
+ gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++
+ gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++
+ gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++--
+ 5 files changed, 151 insertions(+), 5 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c
+ create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c
+
+diff --git a/gcc/common.opt b/gcc/common.opt
+index 6c6fabb31..3a5004271 100644
+--- a/gcc/common.opt
++++ b/gcc/common.opt
+@@ -1846,6 +1846,10 @@ fif-conversion-gimple
+ Common Var(flag_if_conversion_gimple) Optimization
+ Perform conversion of conditional jumps to branchless equivalents during gimple transformations.
+
++fconvert-minmax
++Common Var(flag_convert_minmax) Optimization
++Convert saturating clipping to min max.
++
+ fstack-reuse=
+ Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization
+ -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables.
+diff --git a/gcc/match.pd b/gcc/match.pd
+index 61866cb90..3a19e93b3 100644
+--- a/gcc/match.pd
++++ b/gcc/match.pd
+@@ -8031,3 +8031,107 @@ and,
+ (plus:c@4 (op2:c @0 @1)
+ (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1))))
+ (if (single_use (@4) && single_use (@5)))))
++
++/* MinMax pattern matching helpers. More info on the transformation below. */
++
++/* Match (a & 0b11..100..0) pattern. */
++(match (minmax_cmp_arg @0 @1)
++ (bit_and @0 INTEGER_CST@1)
++ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1)))
++
++/* Match (inverted_sign_bit >> sign_bit_pos) pattern.
++ This statement is blocking for the transformation of unsigned integers.
++ Do type check here to avoid unnecessary duplications. */
++(match (minmax_sat_arg @0)
++ (rshift (negate @0) INTEGER_CST@1)
++ (if (!TYPE_UNSIGNED (TREE_TYPE (@0))
++ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1))))
++
++/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)).
++ The matched pattern can be described as saturated clipping.
++
++ The pattern supports truncation via both casts and bit_and.
++ Also there are patterns for possible inverted conditions. */
++(if (flag_convert_minmax)
++/* Truncation via casts. Unfortunately convert? cannot be applied here
++ because convert and cond take different number of arguments. */
++ (simplify
++ (convert
++ (cond
++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? (minmax_sat_arg @0))
++ (convert? @0)))
++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++ (simplify
++ (cond
++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? (minmax_sat_arg @0))
++ (convert? @0))
++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++
++ (simplify
++ (convert
++ (cond
++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? @0)
++ (convert? (minmax_sat_arg @0))))
++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++ (simplify
++ (cond
++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? @0)
++ (convert? (minmax_sat_arg @0)))
++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type)))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++
++ /* Truncation via bit_and with mask. Same concerns on convert? here. */
++ (simplify
++ (convert
++ (cond
++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
++ (convert? @0)))
++ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++ (simplify
++ (cond
++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))
++ (convert? @0))
++ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++
++ (simplify
++ (convert
++ (cond
++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? @0)
++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2))))
++ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; })))))
++ (simplify
++ (cond
++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop)
++ (convert? @0)
++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))
++ (if (wi::to_widest (@2) == ~wi::to_widest (@1))
++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); }
++ (convert (min (max @0 { integer_zero_node; })
++ { mask; }))))))
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
+new file mode 100644
+index 000000000..859ff7df8
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c
+@@ -0,0 +1,15 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fconvert-minmax" } */
++
++#include <inttypes.h>
++
++__attribute__((noinline))
++void test (int32_t *restrict a, int32_t *restrict x)
++{
++ for (int i = 0; i < 4; i++)
++ a[i] = ((((-x[i]) >> 31) ^ x[i])
++ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31);
++}
++
++/* { dg-final { scan-assembler-not {smax\t} } } */
++/* { dg-final { scan-assembler-not {smin\t} } } */
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
+new file mode 100644
+index 000000000..63d4d85b3
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c
+@@ -0,0 +1,14 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fconvert-minmax" } */
++
++#include <inttypes.h>
++
++__attribute__((noinline))
++void test (int8_t *restrict a, int32_t *restrict x)
++{
++ for (int i = 0; i < 8; i++)
++ a[i] = ((x[i] & ~((1 << 9)-1)) ? (-x[i])>>31 & ((1 << 9)-1) : x[i]);
++}
++
++/* { dg-final { scan-assembler-times {smax\t} 4 } } */
++/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
+index 06bce7029..a984fa560 100755
+--- a/gcc/testsuite/gcc.dg/combine-maxmin.c
++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
+@@ -1,5 +1,5 @@
+ /* { dg-do compile { target aarch64-*-* } } */
+-/* { dg-options "-O3 -fdump-rtl-combine-all" } */
++/* { dg-options "-O3 -fconvert-minmax" } */
+
+ /* The test checks usage of smax/smin insns for clip evaluation and
+ * uzp1/uzp2 insns for vector element narrowing. It's inspired by
+@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ {
+ const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
+ for( int y = 0; y < height; y++ ) {
++ /* This loop is not being vectorized now. */
+ for( int x = -2; x < width+3; x++ ) {
+ int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
+ + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
+ dstv[x] = clip ( (v + 16) >> 5 );
+ buf[x+2] = v + pad;
+ }
++
++ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */
+ for( int x = 0; x < width; x++ )
+ dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
+ + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
+ - 32*pad + 512) >> 10);
++
++	/* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. */
+ for( int x = 0; x < width; x++ )
+ dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
+ + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
+ + 16) >> 5);
++
+ dsth += stride;
+ dstv += stride;
+ dstc += stride;
+@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
+ }
+ }
+
+-/* { dg-final { scan-assembler-times {smax\t} 4 } } */
+-/* { dg-final { scan-assembler-times {smin\t} 4 } } */
+-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */
+-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */
++/* Max is performed on 0 from signed values, match smax exactly. */
++/* { dg-final { scan-assembler-times {smax\t} 6 } } */
++/* Min is performed on signed val>0 and a mask, min sign doesn't matter. */
++/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */
++/* All of the vectorized patterns are expected to be matched. */
++/* { dg-final { scan-assembler-not {cmtst\t} } } */
++/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */
+--
+2.33.0
+