Diffstat (limited to '0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch'):
 0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch | 194 ++++++++++++++++++++
 1 file changed, 194 insertions(+), 0 deletions(-)
diff --git a/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch b/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
new file mode 100644
index 0000000..c40886a
--- /dev/null
+++ b/0035-Add-insn-defs-and-correct-costs-for-cmlt-generation.patch
@@ -0,0 +1,194 @@
+From aa39a66f6029fe16a656d7c6339908b953fb1e04 Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia WX1215920 <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 11:27:43 +0300
+Subject: [PATCH 01/18] Add insn defs and correct costs for cmlt generation
+
+---
+ gcc/config/aarch64/aarch64-simd.md | 48 +++++++++++++++++++++++++++++
+ gcc/config/aarch64/aarch64.cc | 15 +++++++++
+ gcc/config/aarch64/aarch64.opt | 4 +++
+ gcc/config/aarch64/iterators.md | 3 +-
+ gcc/config/aarch64/predicates.md | 25 +++++++++++++++
+ gcc/testsuite/gcc.dg/combine-cmlt.c | 20 ++++++++++++
+ 6 files changed, 114 insertions(+), 1 deletion(-)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-cmlt.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index ee7f0b89c..82f73805f 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -6454,6 +6454,54 @@
+ [(set_attr "type" "neon_compare<q>, neon_compare_zero<q>")]
+ )
+
++;; Use cmlt to replace vector arithmetic operations like this (SImode example):
++;; B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001)
++;; TODO: maybe extend to scalar operations or other cm** instructions.
++
++(define_insn "*aarch64_cmlt_as_arith<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++        (minus:<V_INT_EQUIV>
++          (ashift:<V_INT_EQUIV>
++            (and:<V_INT_EQUIV>
++              (lshiftrt:<V_INT_EQUIV>
++                (match_operand:VDQHSD 1 "register_operand" "w")
++                (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++              (match_operand:VDQHSD 3 "cmlt_arith_mask_operand"))
++            (match_operand:VDQHSD 4 "half_size_operand"))
++          (and:<V_INT_EQUIV>
++            (lshiftrt:<V_INT_EQUIV>
++              (match_dup 1)
++              (match_dup 2))
++            (match_dup 3))))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "cmlt\t%<v>0.<V2ntype>, %<v>1.<V2ntype>, #0"
++  [(set_attr "type" "neon_compare_zero")]
++)
++
++;; A helper definition that allows the combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_cmlt_tmp<mode>"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand" "=w")
++        (and:<V_INT_EQUIV>
++          (lshiftrt:<V_INT_EQUIV>
++            (match_operand:VDQHSD 1 "register_operand" "w")
++            (match_operand:VDQHSD 2 "half_size_minus_one_operand"))
++          (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  "TARGET_SIMD && flag_cmlt_arith"
++  "#"
++  "&& reload_completed"
++  [(set (match_operand:<V_INT_EQUIV> 0 "register_operand")
++        (lshiftrt:<V_INT_EQUIV>
++          (match_operand:VDQHSD 1 "register_operand")
++          (match_operand:VDQHSD 2 "half_size_minus_one_operand")))
++   (set (match_dup 0)
++        (and:<V_INT_EQUIV>
++          (match_dup 0)
++          (match_operand:VDQHSD 3 "cmlt_arith_mask_operand")))]
++  ""
++  [(set_attr "type" "neon_compare_zero")]
++)
++
+ (define_insn_and_split "aarch64_cm<optab>di"
+ [(set (match_operand:DI 0 "register_operand" "=w,w,r")
+ (neg:DI
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index a3da4ca30..04072ca25 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -14064,6 +14064,21 @@ cost_minus:
+ return true;
+ }
+
++      /* Detect the aarch64_cmlt_as_arith pattern.  Currently only that
++         pattern matches this condition.  The costs of the cmlt and sub
++         instructions are comparable, so we do not increase the cost here.  */
++      if (flag_cmlt_arith && GET_CODE (op0) == ASHIFT
++          && GET_CODE (op1) == AND)
++        {
++          rtx op0_subop0 = XEXP (op0, 0);
++          if (rtx_equal_p (op0_subop0, op1))
++            {
++              rtx lshrt_op = XEXP (op0_subop0, 0);
++              if (GET_CODE (lshrt_op) == LSHIFTRT)
++                return true;
++            }
++        }
++
+ /* Look for SUB (extended register). */
+ if (is_a <scalar_int_mode> (mode)
+ && aarch64_rtx_arith_op_extract_p (op1))
+diff --git a/gcc/config/aarch64/aarch64.opt b/gcc/config/aarch64/aarch64.opt
+index a64b927e9..101664c7c 100644
+--- a/gcc/config/aarch64/aarch64.opt
++++ b/gcc/config/aarch64/aarch64.opt
+@@ -262,6 +262,10 @@ Use an immediate to offset from the stack protector guard register, sp_el0.
+ This option is for use with fstack-protector-strong and not for use in
+ user-land code.
+
++mcmlt-arith
++Target Var(flag_cmlt_arith) Optimization Init(0)
++Use the SIMD cmlt instruction to perform certain arithmetic/logic calculations.
++
+ TargetVariable
+ long aarch64_stack_protector_guard_offset = 0
+
+diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
+index 26a840d7f..967e6b0b1 100644
+--- a/gcc/config/aarch64/iterators.md
++++ b/gcc/config/aarch64/iterators.md
+@@ -1485,7 +1485,8 @@
+ (V2DI "2s")])
+
+ ;; Register suffix narrowed modes for VQN.
+-(define_mode_attr V2ntype [(V8HI "16b") (V4SI "8h")
++(define_mode_attr V2ntype [(V4HI "8b") (V2SI "4h")
++ (V8HI "16b") (V4SI "8h")
+ (V2DI "4s")])
+
+ ;; Widened modes of vector modes.
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
+index c308015ac..07c14aacb 100644
+--- a/gcc/config/aarch64/predicates.md
++++ b/gcc/config/aarch64/predicates.md
+@@ -49,6 +49,31 @@
+ return CONST_INT_P (op) && IN_RANGE (INTVAL (op), 1, 3);
+ })
+
++(define_predicate "half_size_minus_one_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size - 1);
++})
++
++(define_predicate "half_size_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  return CONST_INT_P (op) && (UINTVAL (op) == size);
++})
++
++(define_predicate "cmlt_arith_mask_operand"
++  (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) / 2;
++  unsigned long long mask = ((unsigned long long) 1 << size) | 1;
++  return CONST_INT_P (op) && (UINTVAL (op) == mask);
++})
++
+ (define_predicate "subreg_lowpart_operator"
+ (ior (match_code "truncate")
+ (and (match_code "subreg")
+diff --git a/gcc/testsuite/gcc.dg/combine-cmlt.c b/gcc/testsuite/gcc.dg/combine-cmlt.c
+new file mode 100755
+index 000000000..b4c9a37ff
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-cmlt.c
+@@ -0,0 +1,20 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -mcmlt-arith" } */
++
++/* The test checks that cmlt instructions are used for the arithmetic/logic
++ * calculations in foo ().  It is inspired by the x264 codec sources.  */
++
++typedef unsigned short int uint16_t;
++typedef unsigned int uint32_t;
++
++void foo (uint32_t *a, uint32_t *b)
++{
++  for (unsigned i = 0; i < 4; i++)
++    {
++      uint32_t s = ((a[i]>>((8 * sizeof(uint16_t))-1))
++                    &(((uint32_t)1<<(8 * sizeof(uint16_t)))+1))*((uint16_t)-1);
++      b[i] = (a[i]+s)^s;
++    }
++}
++
++/* { dg-final { scan-assembler-times {cmlt\t} 1 } } */
+--
+2.33.0
+
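Illustrative note (not part of the patch): the rewrite that the new
*aarch64_cmlt_as_arith<mode> pattern performs can be checked against a plain-C
model. The sketch below is a minimal, hypothetical reference, assuming 32-bit
elements holding two packed 16-bit lanes as in the testcase; the file and
helper names are invented for this sketch. For such an element the predicates
accept shift = 16 - 1 = 15, mask = (1 << 16) | 1 = 0x00010001, and left shift
= 16, and the arithmetic form then sets each 16-bit lane to all-ones exactly
when the lane is negative, which is what "cmlt vd.8h, vn.8h, #0" computes
(V2ntype maps V4SI to "8h").

/* cmlt_model.c: scalar model of the cmlt-as-arithmetic rewrite (sketch).  */
#include <assert.h>
#include <stdint.h>

/* The arithmetic form matched by *aarch64_cmlt_as_arith<mode>:
   B = (((A >> 15) & 0x00010001) << 16) - ((A >> 15) & 0x00010001).  */
static uint32_t cmlt_as_arith (uint32_t a)
{
  uint32_t x = (a >> 15) & 0x00010001u;
  return (x << 16) - x;
}

/* Per-lane semantics of "cmlt ..., #0" on 16-bit lanes: a lane becomes
   all-ones if its sign bit is set, zero otherwise.  */
static uint32_t cmlt_lanewise (uint32_t a)
{
  uint32_t lo = (a & 0x8000u) ? 0xffffu : 0u;
  uint32_t hi = (a & 0x80000000u) ? 0xffffu : 0u;
  return (hi << 16) | lo;
}

int main (void)
{
  /* Spot-check a few representative values.  */
  static const uint32_t tests[] =
    { 0u, 0x8000u, 0x80000000u, 0x80008000u, 0x7fff7fffu, 0xffffffffu };
  for (unsigned i = 0; i < sizeof tests / sizeof tests[0]; i++)
    assert (cmlt_as_arith (tests[i]) == cmlt_lanewise (tests[i]));
  return 0;
}

With -O3 -mcmlt-arith, the testcase's scan-assembler-times directive checks
that foo () compiles down to a single cmlt instruction.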