| field | value | date |
|---|---|---|
| author | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000 |
| committer | CoprDistGit <infra@openeuler.org> | 2025-02-28 10:03:49 +0000 |
| commit | 73127104a245052cd5cf29cdaaca3e5c32c70348 (patch) | |
| tree | 8e28b63e478c43c252f18b49836dff7313affe54 /0044-Port-maxmin-patch-to-GCC-12.patch | |
| parent | 49d3feaf4665cdb07576fc1a2382a4d82a612d35 (diff) | |
automatic import of gcc (openeuler24.03_LTS_SP1)
Diffstat (limited to '0044-Port-maxmin-patch-to-GCC-12.patch')
| -rw-r--r-- | 0044-Port-maxmin-patch-to-GCC-12.patch | 378 |

1 file changed, 378 insertions(+), 0 deletions(-)
diff --git a/0044-Port-maxmin-patch-to-GCC-12.patch b/0044-Port-maxmin-patch-to-GCC-12.patch
new file mode 100644
index 0000000..2423c12
--- /dev/null
+++ b/0044-Port-maxmin-patch-to-GCC-12.patch
@@ -0,0 +1,378 @@
+From a3013c074cd2ab5f71eb98a587a627f38c68656c Mon Sep 17 00:00:00 2001
+From: Diachkov Ilia <diachkov.ilia1@huawei-partners.com>
+Date: Thu, 22 Feb 2024 17:07:24 +0800
+Subject: [PATCH 12/18] Port maxmin patch to GCC 12
+
+---
+ gcc/config/aarch64/aarch64-simd.md    | 256 ++++++++++++++++++++++++++
+ gcc/config/aarch64/predicates.md      |  19 ++
+ gcc/testsuite/gcc.dg/combine-maxmin.c |  46 +++++
+ 3 files changed, 321 insertions(+)
+ create mode 100755 gcc/testsuite/gcc.dg/combine-maxmin.c
+
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index 82f73805f..de92802f5 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -1138,6 +1138,82 @@
+   [(set_attr "type" "neon_compare<q>,neon_shift_imm<q>")]
+ )
+ 
++;; Simplify the extension with following truncation for shift+neg operation.
++
++(define_insn_and_split "*aarch64_sshr_neg_v8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_operand:V8HI 1 "register_operand")
++		    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
++	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (match_dup 1)
++		    (match_operand:V8HI 4 "vect_par_cnst_hi_half"))))
++	      (match_dup 2)))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(ashiftrt:V8HI
++	  (neg:V8HI
++	    (match_operand:V8HI 1 "register_operand" "w"))
++	  (match_operand:V8HI 2 "aarch64_simd_imm_minus_one")))]
++  {
++    /* Reduce the shift amount to smaller mode.  */
++    int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[2], 0))
++	      - (GET_MODE_UNIT_BITSIZE (GET_MODE (operands[2])) / 2);
++    operands[2] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
++  }
++  [(set_attr "type" "multiple")]
++)
++
++;; The helper definition that allows combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_sshr_neg_tmpv8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(match_operand:V4SI 1 "register_operand" "w"))
++	      (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++	  (truncate:V4HI
++	    (ashiftrt:V4SI
++	      (neg:V4SI
++		(match_operand:V4SI 3 "register_operand" "w"))
++	      (match_dup 2)))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V4SI 1 "register_operand" "=w")
++	(ashiftrt:V4SI
++	  (neg:V4SI
++	    (match_dup 1))
++	  (match_operand:V4SI 2 "maxmin_arith_shift_operand")))
++   (set (match_operand:V4SI 3 "register_operand" "=w")
++	(ashiftrt:V4SI
++	  (neg:V4SI
++	    (match_dup 3))
++	  (match_dup 2)))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (match_dup 1))
++	  (truncate:V4HI
++	    (match_dup 3))))]
++  ""
++  [(set_attr "type" "multiple")]
++)
++
+ (define_insn "*aarch64_simd_sra<mode>"
+  [(set (match_operand:VDQ_I 0 "register_operand" "=w")
+ 	(plus:VDQ_I
+@@ -1714,6 +1790,26 @@
+  }
+ )
+ 
++(define_insn "vec_pack_trunc_shifted_<mode>"
++ [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=&w")
++       (vec_concat:<VNARROWQ2>
++	 (truncate:<VNARROWQ>
++	   (ashiftrt:VQN (match_operand:VQN 1 "register_operand" "w")
++	      (match_operand:VQN 2 "half_size_operand" "w")))
++	 (truncate:<VNARROWQ>
++	   (ashiftrt:VQN (match_operand:VQN 3 "register_operand" "w")
++	      (match_operand:VQN 4 "half_size_operand" "w")))))]
++ "TARGET_SIMD"
++ {
++   if (BYTES_BIG_ENDIAN)
++     return "uzp2\\t%0.<V2ntype>, %3.<V2ntype>, %1.<V2ntype>";
++   else
++     return "uzp2\\t%0.<V2ntype>, %1.<V2ntype>, %3.<V2ntype>";
++ }
++  [(set_attr "type" "neon_permute<q>")
++   (set_attr "length" "4")]
++)
++
+ (define_insn "aarch64_shrn<mode>_insn_le"
+   [(set (match_operand:<VNARROWQ2> 0 "register_operand" "=w")
+ 	(vec_concat:<VNARROWQ2>
+@@ -6652,6 +6748,166 @@
+   [(set_attr "type" "neon_tst<q>")]
+ )
+ 
++;; Simplify the extension with following truncation for cmtst-like operation.
++
++(define_insn_and_split "*aarch64_cmtst_arith_v8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (plus:V4HI
++	    (truncate:V4HI
++	      (eq:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (and:V8HI
++		      (match_operand:V8HI 1 "register_operand")
++		      (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		    (match_operand:V8HI 3 "vect_par_cnst_lo_half")))
++		(match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero")))
++	    (match_operand:V4HI 5 "aarch64_simd_imm_minus_one"))
++	  (plus:V4HI
++	    (truncate:V4HI
++	      (eq:V4SI
++		(sign_extend:V4SI
++		  (vec_select:V4HI
++		    (and:V8HI
++		      (match_dup 1)
++		      (match_dup 2))
++		    (match_operand:V8HI 6 "vect_par_cnst_hi_half")))
++		(match_dup 4)))
++	    (match_dup 5))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 6 "register_operand" "=w")
++	(match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(plus:V8HI
++	  (eq:V8HI
++	    (and:V8HI
++	      (match_operand:V8HI 1 "register_operand" "w")
++	      (match_dup 6))
++	    (match_operand:V8HI 4 "aarch64_simd_imm_zero"))
++	  (match_operand:V8HI 5 "aarch64_simd_imm_minus_one")))]
++  {
++    if (can_create_pseudo_p ())
++      {
++	int val = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[4], 0));
++	operands[4] = aarch64_simd_gen_const_vector_dup (V8HImode, val);
++	int val2 = INTVAL (CONST_VECTOR_ENCODED_ELT (operands[5], 0));
++	operands[5] = aarch64_simd_gen_const_vector_dup (V8HImode, val2);
++
++	operands[6] = gen_reg_rtx (V8HImode);
++      }
++    else
++      FAIL;
++  }
++  [(set_attr "type" "neon_tst_q")]
++)
++
++;; Three helper definitions that allow combiner to use the previous pattern.
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmp_lo_v8hi"
++  [(set (match_operand:V4SI 0 "register_operand" "=w")
++	(neg:V4SI
++	  (eq:V4SI
++	    (sign_extend:V4SI
++	      (vec_select:V4HI
++		(and:V8HI
++		  (match_operand:V8HI 1 "register_operand")
++		  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		(match_operand:V8HI 3 "vect_par_cnst_lo_half")))
++	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 5 "register_operand" "=w")
++	(and:V8HI
++	  (match_operand:V8HI 1 "register_operand")
++	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
++   (set (match_operand:V4SI 0 "register_operand" "=w")
++	(sign_extend:V4SI
++	  (vec_select:V4HI
++	    (match_dup 5)
++	    (match_operand:V8HI 3 "vect_par_cnst_lo_half"))))
++   (set (match_dup 0)
++	(neg:V4SI
++	  (eq:V4SI
++	    (match_dup 0)
++	    (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  {
++    if (can_create_pseudo_p ())
++      operands[5] = gen_reg_rtx (V8HImode);
++    else
++      FAIL;
++  }
++  [(set_attr "type" "multiple")]
++)
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmp_hi_v8hi"
++  [(set (match_operand:V4SI 0 "register_operand" "=w")
++	  (neg:V4SI
++	    (eq:V4SI
++	      (sign_extend:V4SI
++		(vec_select:V4HI
++		  (and:V8HI
++		    (match_operand:V8HI 1 "register_operand")
++		    (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin"))
++		  (match_operand:V8HI 3 "vect_par_cnst_hi_half")))
++	      (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  "TARGET_SIMD && !reload_completed"
++  "#"
++  "&& true"
++  [(set (match_operand:V8HI 5 "register_operand" "=w")
++	(and:V8HI
++	  (match_operand:V8HI 1 "register_operand")
++	  (match_operand:V8HI 2 "aarch64_bic_imm_for_maxmin")))
++   (set (match_operand:V4SI 0 "register_operand" "=w")
++	(sign_extend:V4SI
++	  (vec_select:V4HI
++	    (match_dup 5)
++	    (match_operand:V8HI 3 "vect_par_cnst_hi_half"))))
++   (set (match_dup 0)
++	  (neg:V4SI
++	    (eq:V4SI
++	      (match_dup 0)
++	      (match_operand:V4SI 4 "aarch64_simd_or_scalar_imm_zero"))))]
++  {
++    if (can_create_pseudo_p ())
++      operands[5] = gen_reg_rtx (V8HImode);
++    else
++      FAIL;
++  }
++  [(set_attr "type" "multiple")]
++)
++
++(define_insn_and_split "*aarch64_cmtst_arith_tmpv8hi"
++  [(set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (not:V4SI
++	      (match_operand:V4SI 1 "register_operand" "w")))
++	  (truncate:V4HI
++	    (not:V4SI
++	      (match_operand:V4SI 2 "register_operand" "w")))))]
++  "TARGET_SIMD"
++  "#"
++  "&& true"
++  [(set (match_operand:V4SI 1 "register_operand" "=w")
++	(not:V4SI
++	  (match_dup 1)))
++   (set (match_operand:V4SI 2 "register_operand" "=w")
++	(not:V4SI
++	  (match_dup 2)))
++   (set (match_operand:V8HI 0 "register_operand" "=w")
++	(vec_concat:V8HI
++	  (truncate:V4HI
++	    (match_dup 1))
++	  (truncate:V4HI
++	    (match_dup 2))))]
++  ""
++  [(set_attr "type" "multiple")]
++)
++
+ (define_insn_and_split "aarch64_cmtstdi"
+   [(set (match_operand:DI 0 "register_operand" "=w,r")
+ 	(neg:DI
+diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md
+index 07c14aacb..1b8496c07 100644
+--- a/gcc/config/aarch64/predicates.md
++++ b/gcc/config/aarch64/predicates.md
+@@ -118,6 +118,25 @@
+ 	     (match_test "aarch64_simd_valid_immediate (op, NULL,
+ 							AARCH64_CHECK_ORR)"))))
+ 
++(define_predicate "aarch64_bic_imm_for_maxmin"
++   (match_code "const_vector")
++{
++  if (!aarch64_simd_valid_immediate (op, NULL, AARCH64_CHECK_BIC))
++    return false;
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode);
++  return CONST_INT_P (op)
++	 && ((~UINTVAL (op)) < (((long unsigned int) 1 << size) - 1));
++})
++
++(define_predicate "maxmin_arith_shift_operand"
++   (match_code "const_vector")
++{
++  op = unwrap_const_vec_duplicate (op);
++  unsigned int size = GET_MODE_UNIT_BITSIZE (mode) - 1;
++  return CONST_INT_P (op) && (UINTVAL (op) == size);
++})
++
+ (define_predicate "aarch64_reg_or_bic_imm"
+    (ior (match_operand 0 "register_operand")
+ 	(and (match_code "const_vector")
+diff --git a/gcc/testsuite/gcc.dg/combine-maxmin.c b/gcc/testsuite/gcc.dg/combine-maxmin.c
+new file mode 100755
+index 000000000..06bce7029
+--- /dev/null
++++ b/gcc/testsuite/gcc.dg/combine-maxmin.c
+@@ -0,0 +1,46 @@
++/* { dg-do compile { target aarch64-*-* } } */
++/* { dg-options "-O3 -fdump-rtl-combine-all" } */
++
++/* The test checks usage of smax/smin insns for clip evaluation and
++ * uzp1/uzp2 insns for vector element narrowing.  It's inspired by
++ * sources of x264 codec.  */
++
++typedef unsigned char uint8_t;
++typedef long int intptr_t;
++typedef signed short int int16_t;
++
++static __attribute__((always_inline)) inline uint8_t clip (int x )
++{
++    return ( (x & ~((1 << 8)-1)) ? (-x)>>31 & ((1 << 8)-1) : x );
++}
++
++void hf (uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
++	 intptr_t stride, int width, int height, int16_t *buf)
++{
++    const int pad = (8 > 9) ? (-10 * ((1 << 8)-1)) : 0;
++    for( int y = 0; y < height; y++ ) {
++        for( int x = -2; x < width+3; x++ ) {
++            int v = ((src)[x-2*stride] + (src)[x+3*stride] - 5*((src)[x-stride]
++		     + (src)[x+2*stride]) + 20*((src)[x] + (src)[x+stride]));
++            dstv[x] = clip ( (v + 16) >> 5 );
++            buf[x+2] = v + pad;
++        }
++        for( int x = 0; x < width; x++ )
++            dstc[x] = clip ((((buf+2)[x-2*1] + (buf+2)[x+3*1] - 5*((buf+2)[x-1]
++			      + (buf+2)[x+2*1]) + 20*((buf+2)[x] + (buf+2)[x+1]))
++			     - 32*pad + 512) >> 10);
++        for( int x = 0; x < width; x++ )
++            dsth[x] = clip ((((src)[x-2*1] + (src)[x+3*1] - 5*((src)[x-1]
++			      + (src)[x+2*1]) + 20*((src)[x] + (src)[x+1]))
++			     + 16) >> 5);
++        dsth += stride;
++        dstv += stride;
++        dstc += stride;
++        src += stride;
++    }
++}
++
++/* { dg-final { scan-assembler-times {smax\t} 4 } }  */
++/* { dg-final { scan-assembler-times {smin\t} 4 } }  */
++/* { dg-final { scan-assembler-times {cmtst\t} 2 } }  */
++/* { dg-final { scan-assembler-times {uzp1\t} 6 } }  */
+-- 
+2.33.0
+
