diff options
Diffstat (limited to '0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch')
-rw-r--r-- | 0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch | 239 |
1 files changed, 239 insertions, 0 deletions
diff --git a/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch b/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch new file mode 100644 index 0000000..a5a786f --- /dev/null +++ b/0045-Port-moving-minmask-pattern-to-gimple-to-GCC-12.patch @@ -0,0 +1,239 @@ +From 11da40d18e35219961226d40f11b0702b8649044 Mon Sep 17 00:00:00 2001 +From: Pronin Alexander 00812787 <pronin.alexander@huawei.com> +Date: Thu, 22 Feb 2024 17:13:27 +0800 +Subject: [PATCH 13/18] Port moving minmask pattern to gimple to GCC 12 + +--- + gcc/common.opt | 4 + + gcc/match.pd | 104 ++++++++++++++++++++++++ + gcc/testsuite/gcc.dg/combine-maxmin-1.c | 15 ++++ + gcc/testsuite/gcc.dg/combine-maxmin-2.c | 14 ++++ + gcc/testsuite/gcc.dg/combine-maxmin.c | 19 +++-- + 5 files changed, 151 insertions(+), 5 deletions(-) + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-1.c + create mode 100644 gcc/testsuite/gcc.dg/combine-maxmin-2.c + +diff --git a/gcc/common.opt b/gcc/common.opt +index 6c6fabb31..3a5004271 100644 +--- a/gcc/common.opt ++++ b/gcc/common.opt +@@ -1846,6 +1846,10 @@ fif-conversion-gimple + Common Var(flag_if_conversion_gimple) Optimization + Perform conversion of conditional jumps to branchless equivalents during gimple transformations. + ++fconvert-minmax ++Common Var(flag_convert_minmax) Optimization ++Convert saturating clipping to min max. ++ + fstack-reuse= + Common Joined RejectNegative Enum(stack_reuse_level) Var(flag_stack_reuse) Init(SR_ALL) Optimization + -fstack-reuse=[all|named_vars|none] Set stack reuse level for local variables. +diff --git a/gcc/match.pd b/gcc/match.pd +index 61866cb90..3a19e93b3 100644 +--- a/gcc/match.pd ++++ b/gcc/match.pd +@@ -8031,3 +8031,107 @@ and, + (plus:c@4 (op2:c @0 @1) + (plus:c@5 (double_size_mul_overflow_check_lo @0 @1 @3) (op3:c @0 @1)))) + (if (single_use (@4) && single_use (@5))))) ++ ++/* MinMax pattern matching helpers. More info on the transformation below. */ ++ ++/* Match (a & 0b11..100..0) pattern. 
*/ ++(match (minmax_cmp_arg @0 @1) ++ (bit_and @0 INTEGER_CST@1) ++ (if (wi::popcount (~wi::to_widest (@1) + 1) == 1))) ++ ++/* Match (inversed_sign_bit >> sign_bit_pos) pattern. ++ This statement is blocking for the transformation of unsigned integers. ++ Do type check here to avoid unnecessary duplications. */ ++(match (minmax_sat_arg @0) ++ (rshift (negate @0) INTEGER_CST@1) ++ (if (!TYPE_UNSIGNED (TREE_TYPE (@0)) ++ && wi::eq_p (wi::to_widest (@1), TYPE_PRECISION (TREE_TYPE (@0)) - 1)))) ++ ++/* Transform ((x & ~mask) ? (-x)>>31 & mask : x) to (min (max (x, 0), mask)). ++ The matched pattern can be described as saturated clipping. ++ ++ The pattern supports truncation via both casts and bit_and. ++ Also there are patterns for possible inverted conditions. */ ++(if (flag_convert_minmax) ++/* Truncation via casts. Unfortunately convert? cannot be applied here ++ because convert and cond take different number of arguments. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (minmax_sat_arg @0)) ++ (convert? @0)) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? 
(minmax_sat_arg @0)))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (minmax_sat_arg @0))) ++ (if (wi::geu_p (~wi::to_widest (@1) + 1, TYPE_PRECISION (type))) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ /* Truncation via bit_and with mask. Same concerns on convert? here. */ ++ (simplify ++ (convert ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (ne (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)) ++ (convert? @0)) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ ++ (simplify ++ (convert ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? (bit_and (minmax_sat_arg @0) INTEGER_CST@2)))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; }))))) ++ (simplify ++ (cond ++ (eq (minmax_cmp_arg @0 INTEGER_CST@1) integer_zerop) ++ (convert? @0) ++ (convert? 
(bit_and (minmax_sat_arg @0) INTEGER_CST@2))) ++ (if (wi::to_widest (@2) == ~wi::to_widest (@1)) ++ (with { tree mask = build_int_cst (integer_type_node, ~tree_to_shwi (@1)); } ++ (convert (min (max @0 { integer_zero_node; }) ++ { mask; })))))) +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-1.c b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +new file mode 100644 +index 000000000..859ff7df8 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-1.c +@@ -0,0 +1,15 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include <inttypes.h> ++ ++__attribute__((noinline)) ++void test (int32_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 4; i++) ++ a[i] = ((((-x[i]) >> 31) ^ x[i]) ++ & (-((int32_t)((x[i] & (~((1 << 8)-1))) == 0)))) ^ ((-x[i]) >> 31); ++} ++ ++/* { dg-final { scan-assembler-not {smax\t} } } */ ++/* { dg-final { scan-assembler-not {smin\t} } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin-2.c b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +new file mode 100644 +index 000000000..63d4d85b3 +--- /dev/null ++++ b/gcc/testsuite/gcc.dg/combine-maxmin-2.c +@@ -0,0 +1,14 @@ ++/* { dg-do compile { target aarch64-*-* } } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ ++ ++#include <inttypes.h> ++ ++__attribute__((noinline)) ++void test (int8_t *restrict a, int32_t *restrict x) ++{ ++ for (int i = 0; i < 8; i++) ++ a[i] = ((x[i] & ~((1 << 9)-1)) ? 
(-x[i])>>31 & ((1 << 9)-1) : x[i]); ++} ++ ++/* { dg-final { scan-assembler-times {smax\t} 4 } } */ ++/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c +index 06bce7029..a984fa560 100755 +--- a/gcc/testsuite/gcc.dg/combine-maxmin.c ++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c +@@ -1,5 +1,5 @@ + /* { dg-do compile { target aarch64-*-* } } */ +-/* { dg-options "-O3 -fdump-rtl-combine-all" } */ ++/* { dg-options "-O3 -fconvert-minmax" } */ + + /* The test checks usage of smax/smin insns for clip evaluation and + * uzp1/uzp2 insns for vector element narrowing. It's inspired by +@@ -19,20 +19,26 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + { + const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0; + for( int y = 0; y < height; y++ ) { ++ /* This loop is not being vectorized now. */ + for( int x = -2; x < width+3; x++ ) { + int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride] + + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride])); + dstv[x] = clip ( (v + 16) >> 5 ); + buf[x+2] = v + pad; + } ++ ++ /* Produces two versions of the code: 3xUZP1/2xMAX/2xMIN + 1xUZP1/1xMAX/1xMIN. */ + for( int x = 0; x < width; x++ ) + dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1] + + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1])) + - 32*pad + 512) >> 10); ++ ++ /* Produces two versions of the code: 1xUZP1/2xMAX/2xMIN + 0xUZP1/1xMAX/1xMIN. 
*/ + for( int x = 0; x < width; x++ ) + dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1] + + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1])) + + 16) >> 5); ++ + dsth += stride; + dstv += stride; + dstc += stride; +@@ -40,7 +46,10 @@ void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src, + } + } + +-/* { dg-final { scan-assembler-times {smax\t} 4 } } */ +-/* { dg-final { scan-assembler-times {smin\t} 4 } } */ +-/* { dg-final { scan-assembler-times {cmtst\t} 2 } } */ +-/* { dg-final { scan-assembler-times {uzp1\t} 6 } } */ ++/* Max is performed on 0 from signed values, match smax exactly. */ ++/* { dg-final { scan-assembler-times {smax\t} 6 } } */ ++/* Min is performed on signed val>0 and a mask, min sign doesn't matter. */ ++/* { dg-final { scan-assembler-times {[us]min\t} 6 } } */ ++/* All of the vectorized patterns are expected to be matched. */ ++/* { dg-final { scan-assembler-not {cmtst\t} } } */ ++/* { dg-final { scan-assembler-times {uzp1\t} 5 } } */ +-- +2.33.0 + |