Diffstat (limited to '0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch')
-rw-r--r-- | 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch | 194
1 file changed, 194 insertions, 0 deletions
diff --git a/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch b/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
new file mode 100644
index 0000000..c40886a
--- /dev/null
+++ b/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
@@ -0,0 +1,194 @@
+From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 11:27:43 +0300
+Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
+
+---
+ gcc/config/aarch64/aarch64-simd.md  | 48 +++++++++++++++++++++++++++++
+ gcc/config/aarch64/aarch64.cc       | 15 +++++++++
+ gcc/config/aarch64/aarch64.opt      |  4 +++
+ gcc/config/aarch64/iterators.md     |  3 +-
+ gcc/config/aarch64/predicates.md    | 25 +++++++++++++++
+ gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
+ 6 files changed, 114 insertions(+), 1 deletion(-)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index ee7f0b89c..82f73805f 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -6454,6 +6454,54 @@
+   [(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
+ )
+
++;; Use cmlt to replace vector arithmetic operations like this (SImode example):
++;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
++;; TODO: maybe extend to scalar operations or other cm** instructions.
++
++(define_insn "*aarch64_cmlt_as_arith<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++        (minus:<V_INT_EQUIV>
++          (ashift:<V_INT_EQUIV>
++            (and:<V_INT_EQUIV>
++              (lshiftrt:<V_INT_EQUIV>
++                (match_operand:VDQHSD 1 "register_operand" "w")
++                (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++              (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
++            (match_operand:VDQHSD 4 "half_size_operand"))
++          (and:<V_INT_EQUIV>
++            (lshiftrt:<V_INT_EQUIV>
++              (match_dup 1)
++              (match_dup 2))
++            (match_dup 3))))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
++  [(set_attr "type" "neon_compare_zero")]
++)
++
++;; The helper definition that allows combiner to use the previous pattern.
++
++(define_insn_and_split "*arch64_cmlt_tmp<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++        (and:<V_INT_EQUIV>
++          (lshiftrt:<V_INT_EQUIV>
++            (match_operand:VDQHSD 1 "register_operand" "w")
++            (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++          (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "#"
++  "&& reload_completed"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
++        (lshiftrt:<V_INT_EQUIV>
++          (match_operand:VDQHSD 1 "register_operand")
++          (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
++   (set (match_dup 0)
++        (and:<V_INT_EQUIV>
++          (match_dup 0)
++          (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  ""
++  [(set_attr "type" "neon_compare_zero")]
++)
++
+ (define_insn_and_split "aarch64_cm<optab>di"
+   [(set (match_operand:DI 0 "register_operand" "=w,w,r")
+         (neg:DI
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index a3da4ca30..04072ca25 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -14064,6 +14064,21 @@ cost_minus:
+         return true;
+       }
+
++      /* Detect aarch64_cmlt_as_arith instruction.  Now only this pattern
++         matches the condition.
++         The costs of cmlt and sub instructions are comparable,
++         so we are not increasing the cost here.  */
++      if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
++          && GET_CODE (op1) == AND)
++        {
++          rtx op0_subop0 = XEXP (op0, 0);
++          if (rtx_equal_p (op0_subop0, op1))
++            {
++              rtx lshrt_op = XEXP (op0_subop0, 0);
++              if (GET_CODE (lshrt_op) == LSHIFTRT)
++                return true;
++            }
++        }
++
+     /* Look for SUB (extended register).  */
+     if (is_a <scalar_int_mode> (mode)
+         && aarch64_rtx_arith_op_extract_p (op1))
+diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
+index a64b927e9..101664c7c 100644
+--- a/gcc/config/aarch64/aarch64.opt
++++ b/gcc/config/aarch64/aarch64.opt
+@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
+ This option is for use with fstack-protector-strong and not for use in
+ user-land code.
+
++mcmlt-arith
++Target Var(flag_cmlt_arith) Optimization Init(0)
++Use SIMD cmlt instruction to perform some arithmetic/logic calculations.
++
+ TargetVariable
+ long aarch64_stack_protector_guard_offset = 0
+
+diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
+index 26a840d7f..967e6b0b1 100644
+--- a/gcc/config/aarch64/iterators.md
++++ b/gcc/config/aarch64/iterators.md
+@@ -1485,7 +1485,8 @@
+                          (V2DI "2s")])
+
+ ;; Register suffix narrowed modes for VQN.
+-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
++(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
++                           (V8HI "16b") (V4SI "8h")
+                            (V2DI "4s")])
+
+ ;; Widened modes of vector modes.
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
+index c308015ac..07c14aacb 100644
+--- a/gcc/config/aarch64/predicates.md
++++ b/gcc/config/aarch64/predicates.md
+@@ -49,6 +49,31 @@
+   return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
+ })
+
++(define_predicate "half_size_minus_one_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
++})
++
++(define_predicate "half_size_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size);
++})
++
++(define_predicate "cmlt_arith_mask_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  unsigned long long mask = ((unsigned long long) 1 << size) | 1;
++  return CONST_INT_P (op) && (UINTVAL (op) == mask);
++})
++
+ (define_predicate "subreg_lowpart_operator"
+   (ior (match_code "truncate")
+        (and (match_code "subreg")
+diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
+new file mode 100755
+index 000000000..b4c9a37ff
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -mcmlt-arith" } */
++
++/* The test checks usage of cmlt insns for arithmetic/logic calculations
++ * in foo ().  It's inspired by sources of x264 codec.  */
++
++typedef unsigned short int uint16_t;
++typedef unsigned int uint32_t;
++
++void foo( uint32_t *a, uint32_t *b)
++{
++  for (unsigned i = 0; i < 4; i++)
++    {
++      uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
++                    &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
++      b[i] = (a[i]+s)^s;
++    }
++}
++
++/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
+--
+2.33.0
+
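
Editor's note: the shift/mask/subtract idiom the new pattern matches is easiest to check in scalar form. The program below is illustrative only (it is not part of the patch) and verifies the identity from the aarch64-simd.md comment on one 32-bit lane holding two 16-bit halves: with t = (A >> 15) & 0x00010001, the value (t << 16) - t has all-ones in exactly those halves whose sign bit was set, which is what cmlt against #0 produces per 16-bit lane.

  /* Illustrative sketch, not part of the patch: scalar check of the
     identity behind *aarch64_cmlt_as_arith<mode> for a 32-bit lane
     viewed as two 16-bit half-lanes.  */
  #include <assert.h>
  #include <stdint.h>
  #include <stdio.h>

  /* The arithmetic form matched by the insn pattern (the SImode
     example from the aarch64-simd.md comment).  */
  static uint32_t cmlt_as_arith (uint32_t a)
  {
    uint32_t t = (a >> 15) & 0x00010001u; /* sign bit of each half */
    return (t << 16) - t;                 /* spread it over the half */
  }

  /* What cmlt #0 computes per 16-bit lane: all-ones if the lane is
     negative, zero otherwise.  */
  static uint32_t cmlt_reference (uint32_t a)
  {
    uint32_t lo = (a & 0x00008000u) ? 0x0000ffffu : 0u;
    uint32_t hi = (a & 0x80000000u) ? 0xffff0000u : 0u;
    return hi | lo;
  }

  int main (void)
  {
    /* All four sign-bit combinations plus arbitrary payload bits.  */
    const uint32_t cases[] = { 0x00000000u, 0x00008000u, 0x80000000u,
                               0x80008000u, 0x7fff7fffu, 0xdeadbeefu };
    for (unsigned i = 0; i < sizeof cases / sizeof cases[0]; i++)
      assert (cmlt_as_arith (cases[i]) == cmlt_reference (cases[i]));
    puts ("identity holds");
    return 0;
  }

This also explains the iterators.md hunk: the result of, say, the V4SI pattern is written as a .8h cmlt (each 32-bit lane reinterpreted as two halfwords), so V2ntype needs entries for the 64-bit modes V4HI ("8b") and V2SI ("4h") to cover everything in VDQHSD. With the patch applied, the testsuite file compiles with -O3 -mcmlt-arith and expects exactly one cmlt in foo ().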