diff --git a/0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch b/0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch
new file mode 100644
index 0000000..b2257ea
--- /dev/null
+++ b/0157-Backport-SME-aarch64-Vector-move-fixes-for-nosimd.patch
@@ -0,0 +1,1824 @@
+From 737d2a5f1c5e725b7e5a20075270016ebf56b44c Mon Sep 17 00:00:00 2001
+From: Richard Sandiford <richard.sandiford@arm.com>
+Date: Tue, 13 Sep 2022 09:28:49 +0100
+Subject: [PATCH 058/157] [Backport][SME] aarch64: Vector move fixes for
+ +nosimd
+
+Reference: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=721c0fb3aca31d3bf8ad6e929eab32e29a427e60
+
+This patch fixes various issues around the handling of vectors
+and (particularly) vector structures with +nosimd. Previously,
+passing and returning structures would trigger an ICE (a minimal
+reproducer is sketched after these points), since:
+
+* we didn't allow the structure modes to be stored in FPRs
+
+* we didn't provide +nosimd move patterns
+
+* splitting the moves into word-sized pieces (the default
+ strategy without move patterns) doesn't work because the
+ registers are doubleword sized.
+
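+As a minimal reproducer (modelled on the new movv2x8qi_1.c test,
+compiled at -O), a function like the following previously ICEd:
+
+  #pragma GCC aarch64 "arm_neon.h"
+  #pragma GCC target "+nosimd+fp"
+
+  /* b arrives in d2/d3 and the result leaves in d0/d1, so the
+     tuple mode must be storable in FPRs even without the vector
+     ISA, and the move cannot be split into word-sized pieces.  */
+  int8x8x2_t
+  mov_int8x8x2_t (int8x8x2_t a, int8x8x2_t b)
+  {
+    return b;
+  }
+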
+The patch is a bit of a hodge-podge since a lot of the handling of
+moves, register costs, and register legitimacy is so interconnected.
+It didn't seem feasible to split things further.
+
+Some notes:
+
+* The patch recognises vector and tuple modes based on TARGET_FLOAT
+ rather than TARGET_SIMD, and instead adds TARGET_SIMD to places
+ that really do need the vector ISA. This is necessary for the
+ modes to be handled correctly in register arguments and returns.
+
+* The 64-bit (DREG) STP peephole required TARGET_SIMD but the
+ LDP peephole didn't. I think the LDP one is right, since
+ DREG moves could involve GPRs as well as FPRs.
+
+* The patch keeps the existing choices of instructions for
+ TARGET_SIMD, just in case they happen to be better than FMOV
+ on some uarches.
+
+* Before the patch, +nosimd Q<->Q moves of 128-bit scalars went via
+ a GPR, thanks to a secondary reload pattern. This approach might
+ not be ideal, but there's no reason that 128-bit vectors should
+ behave differently from 128-bit scalars. The patch therefore
+ extends the current scalar approach to vectors.
+
+* Multi-vector LD1 and ST1 require TARGET_SIMD, so the TARGET_FLOAT
+ structure moves need to use LDP/STP and LDR/STR combinations
+ instead. That's also what we do for big-endian even with
+ TARGET_SIMD, so most of the code was already there. The patterns
+ for structures of 64-bit vectors are identical, but the patterns
+ for structures of 128-bit vectors need to cope with the lack of
+ 128-bit Q<->Q moves.
+
+ It isn't feasible to move multi-vector tuples via GPRs, so the
+ patch moves them via memory instead. This contaminates the port
+ with its first secondary memory reload (sketched below).
+
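+As a sketch of that fallback (based on the expected output of the
+new movv2x16qi_1.c test), a +nosimd+fp copy of a pair of 128-bit
+vectors now round-trips through the stack, where STP/LDP can still
+be used:
+
+  #pragma GCC aarch64 "arm_neon.h"
+  #pragma GCC target "+nosimd+fp"
+
+  int8x16x2_t
+  mov_int8x16x2_t (int8x16x2_t a, int8x16x2_t b)
+  {
+    /* Expected: stp q2, q3, [sp]; ldp q0, q1, [sp].  */
+    return b;
+  }
+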
+gcc/
+
+ * config/aarch64/aarch64.cc (aarch64_classify_vector_mode): Use
+ TARGET_FLOAT instead of TARGET_SIMD.
+ (aarch64_vectorize_related_mode): Restrict ADVSIMD handling to
+ TARGET_SIMD.
+ (aarch64_hard_regno_mode_ok): Don't allow tuples of 2 64-bit vectors
+ in GPRs.
+ (aarch64_classify_address): Treat little-endian structure moves
+ like big-endian for TARGET_FLOAT && !TARGET_SIMD.
+ (aarch64_secondary_memory_needed): New function.
+ (aarch64_secondary_reload): Handle 128-bit Advanced SIMD vectors
+ in the same way as TF, TI and TD.
+ (aarch64_rtx_mult_cost): Restrict ADVSIMD handling to TARGET_SIMD.
+ (aarch64_rtx_costs): Likewise.
+ (aarch64_register_move_cost): Treat a pair of 64-bit vectors
+ separately from a single 128-bit vector. Handle the cost implied
+ by aarch64_secondary_memory_needed.
+ (aarch64_simd_valid_immediate): Restrict ADVSIMD handling to
+ TARGET_SIMD.
+ (aarch64_expand_vec_perm_const_1): Likewise.
+ (TARGET_SECONDARY_MEMORY_NEEDED): New macro.
+ * config/aarch64/iterators.md (VTX): New iterator.
+ * config/aarch64/aarch64.md (arches): Add fp_q as a synonym of simd.
+ (arch_enabled): Adjust accordingly.
+ (@aarch64_reload_mov<TX:mode>): Extend to...
+ (@aarch64_reload_mov<VTX:mode>): ...this.
+ * config/aarch64/aarch64-simd.md (mov<mode>): Require TARGET_FLOAT
+ rather than TARGET_SIMD.
+ (movmisalign<mode>): Likewise.
+ (load_pair<DREG:mode><DREG2:mode>): Likewise.
+ (vec_store_pair<DREG:mode><DREG2:mode>): Likewise.
+ (load_pair<VQ:mode><VQ2:mode>): Likewise.
+ (vec_store_pair<VQ:mode><VQ2:mode>): Likewise.
+ (@aarch64_split_simd_mov<mode>): Likewise.
+ (aarch64_get_low<mode>): Likewise.
+ (aarch64_get_high<mode>): Likewise.
+ (aarch64_get_half<mode>): Likewise. Canonicalize to a move for
+ lowpart extracts.
+ (*aarch64_simd_mov<VDMOV:mode>): Require TARGET_FLOAT rather than
+ TARGET_SIMD. Use different w<-w and r<-w instructions for
+ !TARGET_SIMD. Disable immediate moves for !TARGET_SIMD but
+ add an alternative specifically for w<-Z.
+ (*aarch64_simd_mov<VQMOV:mode>): Require TARGET_FLOAT rather than
+ TARGET_SIMD. Likewise for the associated define_splits. Disable
+ FPR moves and immediate moves for !TARGET_SIMD but add an alternative
+ specifically for w<-Z.
+ (aarch64_simd_mov_from_<mode>high): Require TARGET_FLOAT rather than
+ TARGET_SIMD. Restrict the existing alternatives to TARGET_SIMD
+ but add a new r<-w one for !TARGET_SIMD.
+ (*aarch64_get_high<mode>): New pattern.
+ (load_pair_lanes<mode>): Require TARGET_FLOAT rather than TARGET_SIMD.
+ (store_pair_lanes<mode>): Likewise.
+ (*aarch64_combine_internal<mode>): Likewise. Restrict existing
+ w<-w, w<-r and w<-m alternatives to TARGET_SIMD but add a new w<-r
+ alternative for !TARGET_SIMD.
+ (*aarch64_combine_internal_be<mode>): Likewise.
+ (aarch64_combinez<mode>): Require TARGET_FLOAT rather than TARGET_SIMD.
+ Remove bogus arch attribute.
+ (*aarch64_combinez_be<mode>): Likewise.
+ (@aarch64_vec_concat<mode>): Require TARGET_FLOAT rather than
+ TARGET_SIMD.
+ (aarch64_combine<mode>): Likewise.
+ (aarch64_rev_reglist<mode>): Likewise.
+ (mov<mode>): Likewise.
+ (*aarch64_be_mov<VSTRUCT_2D:mode>): Extend to TARGET_FLOAT &&
+ !TARGET_SIMD, regardless of endianness. Extend associated
+ define_splits in the same way, both for this pattern and the
+ ones below.
+	(*aarch64_be_mov<VSTRUCT_2Q:mode>): Likewise. Restrict w<-w
+ alternative to TARGET_SIMD.
+ (*aarch64_be_movoi): Likewise.
+ (*aarch64_be_movci): Likewise.
+ (*aarch64_be_movxi): Likewise.
+	(*aarch64_be_mov<VSTRUCT_3QD:mode>): Extend to TARGET_FLOAT
+ && !TARGET_SIMD, regardless of endianness. Restrict w<-w alternative
+ to TARGET_SIMD for tuples of 128-bit vectors.
+ (*aarch64_be_mov<VSTRUCT_4QD:mode>): Likewise.
+ * config/aarch64/aarch64-ldpstp.md: Remove TARGET_SIMD condition
+ from DREG STP peephole. Change TARGET_SIMD to TARGET_FLOAT in
+ the VQ and VP_2E LDP and STP peepholes.
+
+gcc/testsuite/
+ * gcc.target/aarch64/ldp_stp_20.c: New test.
+ * gcc.target/aarch64/ldp_stp_21.c: Likewise.
+ * gcc.target/aarch64/ldp_stp_22.c: Likewise.
+ * gcc.target/aarch64/ldp_stp_23.c: Likewise.
+ * gcc.target/aarch64/ldp_stp_24.c: Likewise.
+ * gcc.target/aarch64/movv16qi_1.c (gpr_to_gpr): New function.
+ * gcc.target/aarch64/movv8qi_1.c (gpr_to_gpr): Likewise.
+ * gcc.target/aarch64/movv16qi_2.c: New test.
+ * gcc.target/aarch64/movv16qi_3.c: Likewise.
+ * gcc.target/aarch64/movv2di_1.c: Likewise.
+ * gcc.target/aarch64/movv2x16qi_1.c: Likewise.
+ * gcc.target/aarch64/movv2x8qi_1.c: Likewise.
+ * gcc.target/aarch64/movv3x16qi_1.c: Likewise.
+ * gcc.target/aarch64/movv3x8qi_1.c: Likewise.
+ * gcc.target/aarch64/movv4x16qi_1.c: Likewise.
+ * gcc.target/aarch64/movv4x8qi_1.c: Likewise.
+ * gcc.target/aarch64/movv8qi_2.c: Likewise.
+ * gcc.target/aarch64/movv8qi_3.c: Likewise.
+ * gcc.target/aarch64/vect_unary_2.c: Likewise.
+---
+ gcc/config/aarch64/aarch64-ldpstp.md | 11 +-
+ gcc/config/aarch64/aarch64-simd.md | 199 +++++++++++-------
+ gcc/config/aarch64/aarch64.cc | 94 ++++++---
+ gcc/config/aarch64/aarch64.md | 11 +-
+ gcc/config/aarch64/iterators.md | 2 +
+ gcc/testsuite/gcc.target/aarch64/ldp_stp_20.c | 7 +
+ gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c | 7 +
+ gcc/testsuite/gcc.target/aarch64/ldp_stp_22.c | 13 ++
+ gcc/testsuite/gcc.target/aarch64/ldp_stp_23.c | 16 ++
+ gcc/testsuite/gcc.target/aarch64/ldp_stp_24.c | 16 ++
+ gcc/testsuite/gcc.target/aarch64/movv16qi_1.c | 21 ++
+ gcc/testsuite/gcc.target/aarch64/movv16qi_2.c | 27 +++
+ gcc/testsuite/gcc.target/aarch64/movv16qi_3.c | 30 +++
+ gcc/testsuite/gcc.target/aarch64/movv2di_1.c | 103 +++++++++
+ .../gcc.target/aarch64/movv2x16qi_1.c | 40 ++++
+ .../gcc.target/aarch64/movv2x8qi_1.c | 38 ++++
+ .../gcc.target/aarch64/movv3x16qi_1.c | 44 ++++
+ .../gcc.target/aarch64/movv3x8qi_1.c | 41 ++++
+ .../gcc.target/aarch64/movv4x16qi_1.c | 44 ++++
+ .../gcc.target/aarch64/movv4x8qi_1.c | 42 ++++
+ gcc/testsuite/gcc.target/aarch64/movv8qi_1.c | 15 ++
+ gcc/testsuite/gcc.target/aarch64/movv8qi_2.c | 27 +++
+ gcc/testsuite/gcc.target/aarch64/movv8qi_3.c | 30 +++
+ .../gcc.target/aarch64/vect_unary_2.c | 5 +
+ 24 files changed, 774 insertions(+), 109 deletions(-)
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_stp_20.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_stp_22.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_stp_23.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/ldp_stp_24.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv16qi_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv16qi_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv2di_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv2x16qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv2x8qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv3x16qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv3x8qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv4x16qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv4x8qi_1.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv8qi_2.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/movv8qi_3.c
+ create mode 100644 gcc/testsuite/gcc.target/aarch64/vect_unary_2.c
+
+diff --git a/gcc/config/aarch64/aarch64-ldpstp.md b/gcc/config/aarch64/aarch64-ldpstp.md
+index ba76a1b78..f8446e212 100644
+--- a/gcc/config/aarch64/aarch64-ldpstp.md
++++ b/gcc/config/aarch64/aarch64-ldpstp.md
+@@ -83,8 +83,7 @@
+ (match_operand:DREG 1 "register_operand" ""))
+ (set (match_operand:DREG2 2 "memory_operand" "")
+ (match_operand:DREG2 3 "register_operand" ""))]
+- "TARGET_SIMD
+- && aarch64_operands_ok_for_ldpstp (operands, false, <DREG:MODE>mode)"
++ "aarch64_operands_ok_for_ldpstp (operands, false, <DREG:MODE>mode)"
+ [(parallel [(set (match_dup 0) (match_dup 1))
+ (set (match_dup 2) (match_dup 3))])]
+ {
+@@ -96,7 +95,7 @@
+ (match_operand:VQ 1 "memory_operand" ""))
+ (set (match_operand:VQ2 2 "register_operand" "")
+ (match_operand:VQ2 3 "memory_operand" ""))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && aarch64_operands_ok_for_ldpstp (operands, true, <VQ:MODE>mode)
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+@@ -111,7 +110,7 @@
+ (match_operand:VQ 1 "register_operand" ""))
+ (set (match_operand:VQ2 2 "memory_operand" "")
+ (match_operand:VQ2 3 "register_operand" ""))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && aarch64_operands_ok_for_ldpstp (operands, false, <VQ:MODE>mode)
+ && (aarch64_tune_params.extra_tuning_flags
+ & AARCH64_EXTRA_TUNE_NO_LDP_STP_QREGS) == 0"
+@@ -306,7 +305,7 @@
+ (set (match_operand:VP_2E 6 "memory_operand" "")
+ (match_operand:VP_2E 7 "aarch64_reg_or_zero" ""))
+ (match_dup 8)]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && aarch64_operands_adjust_ok_for_ldpstp (operands, false, <MODE>mode)"
+ [(const_int 0)]
+ {
+@@ -327,7 +326,7 @@
+ (set (match_operand:VP_2E 6 "register_operand" "")
+ (match_operand:VP_2E 7 "memory_operand" ""))
+ (match_dup 8)]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && aarch64_operands_adjust_ok_for_ldpstp (operands, true, <MODE>mode)"
+ [(const_int 0)]
+ {
+diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
+index a47b39281..ef7fc4ecb 100644
+--- a/gcc/config/aarch64/aarch64-simd.md
++++ b/gcc/config/aarch64/aarch64-simd.md
+@@ -21,7 +21,7 @@
+ (define_expand "mov<mode>"
+ [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
+ (match_operand:VALL_F16 1 "general_operand"))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ "
+ /* Force the operand into a register if it is not an
+ immediate whose use can be replaced with xzr.
+@@ -52,7 +52,7 @@
+ (define_expand "movmisalign<mode>"
+ [(set (match_operand:VALL_F16 0 "nonimmediate_operand")
+ (match_operand:VALL_F16 1 "general_operand"))]
+- "TARGET_SIMD && !STRICT_ALIGNMENT"
++ "TARGET_FLOAT && !STRICT_ALIGNMENT"
+ {
+ /* This pattern is not permitted to fail during expansion: if both arguments
+ are non-registers (e.g. memory := constant, which can be created by the
+@@ -116,10 +116,10 @@
+
+ (define_insn "*aarch64_simd_mov<VDMOV:mode>"
+ [(set (match_operand:VDMOV 0 "nonimmediate_operand"
+- "=w, m, m, w, ?r, ?w, ?r, w")
++ "=w, m, m, w, ?r, ?w, ?r, w, w")
+ (match_operand:VDMOV 1 "general_operand"
+- "m, Dz, w, w, w, r, r, Dn"))]
+- "TARGET_SIMD
++ "m, Dz, w, w, w, r, r, Dn, Dz"))]
++ "TARGET_FLOAT
+ && (register_operand (operands[0], <MODE>mode)
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
+ {
+@@ -128,26 +128,34 @@
+ case 0: return "ldr\t%d0, %1";
+ case 1: return "str\txzr, %0";
+ case 2: return "str\t%d1, %0";
+- case 3: return "mov\t%0.<Vbtype>, %1.<Vbtype>";
+- case 4: return "umov\t%0, %1.d[0]";
++ case 3:
++ if (TARGET_SIMD)
++ return "mov\t%0.<Vbtype>, %1.<Vbtype>";
++ return "fmov\t%d0, %d1";
++ case 4:
++ if (TARGET_SIMD)
++ return "umov\t%0, %1.d[0]";
++ return "fmov\t%x0, %d1";
+ case 5: return "fmov\t%d0, %1";
+ case 6: return "mov\t%0, %1";
+ case 7:
+ return aarch64_output_simd_mov_immediate (operands[1], 64);
++ case 8: return "fmov\t%d0, xzr";
+ default: gcc_unreachable ();
+ }
+ }
+ [(set_attr "type" "neon_load1_1reg<q>, store_8, neon_store1_1reg<q>,\
+ neon_logic<q>, neon_to_gp<q>, f_mcr,\
+- mov_reg, neon_move<q>")]
++ mov_reg, neon_move<q>, f_mcr")
++ (set_attr "arch" "*,*,*,*,*,*,*,simd,*")]
+ )
+
+ (define_insn "*aarch64_simd_mov<VQMOV:mode>"
+ [(set (match_operand:VQMOV 0 "nonimmediate_operand"
+- "=w, Umn, m, w, ?r, ?w, ?r, w")
++ "=w, Umn, m, w, ?r, ?w, ?r, w, w")
+ (match_operand:VQMOV 1 "general_operand"
+- "m, Dz, w, w, w, r, r, Dn"))]
+- "TARGET_SIMD
++ "m, Dz, w, w, w, r, r, Dn, Dz"))]
++ "TARGET_FLOAT
+ && (register_operand (operands[0], <MODE>mode)
+ || aarch64_simd_reg_or_zero (operands[1], <MODE>mode))"
+ {
+@@ -167,14 +175,17 @@
+ return "#";
+ case 7:
+ return aarch64_output_simd_mov_immediate (operands[1], 128);
++ case 8:
++ return "fmov\t%d0, xzr";
+ default:
+ gcc_unreachable ();
+ }
+ }
+ [(set_attr "type" "neon_load1_1reg<q>, store_16, neon_store1_1reg<q>,\
+ neon_logic<q>, multiple, multiple,\
+- multiple, neon_move<q>")
+- (set_attr "length" "4,4,4,4,8,8,8,4")]
++ multiple, neon_move<q>, fmov")
++ (set_attr "length" "4,4,4,4,8,8,8,4,4")
++ (set_attr "arch" "*,*,*,simd,*,*,*,simd,*")]
+ )
+
+ ;; When storing lane zero we can use the normal STR and its more permissive
+@@ -195,7 +206,7 @@
+ (match_operand:DREG 1 "aarch64_mem_pair_operand" "Ump"))
+ (set (match_operand:DREG2 2 "register_operand" "=w")
+ (match_operand:DREG2 3 "memory_operand" "m"))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+@@ -209,7 +220,7 @@
+ (match_operand:DREG 1 "register_operand" "w"))
+ (set (match_operand:DREG2 2 "memory_operand" "=m")
+ (match_operand:DREG2 3 "register_operand" "w"))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && rtx_equal_p (XEXP (operands[2], 0),
+ plus_constant (Pmode,
+ XEXP (operands[0], 0),
+@@ -223,7 +234,7 @@
+ (match_operand:VQ 1 "aarch64_mem_pair_operand" "Ump"))
+ (set (match_operand:VQ2 2 "register_operand" "=w")
+ (match_operand:VQ2 3 "memory_operand" "m"))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && rtx_equal_p (XEXP (operands[3], 0),
+ plus_constant (Pmode,
+ XEXP (operands[1], 0),
+@@ -237,10 +248,11 @@
+ (match_operand:VQ 1 "register_operand" "w"))
+ (set (match_operand:VQ2 2 "memory_operand" "=m")
+ (match_operand:VQ2 3 "register_operand" "w"))]
+- "TARGET_SIMD && rtx_equal_p (XEXP (operands[2], 0),
+- plus_constant (Pmode,
+- XEXP (operands[0], 0),
+- GET_MODE_SIZE (<VQ:MODE>mode)))"
++ "TARGET_FLOAT
++ && rtx_equal_p (XEXP (operands[2], 0),
++ plus_constant (Pmode,
++ XEXP (operands[0], 0),
++ GET_MODE_SIZE (<VQ:MODE>mode)))"
+ "stp\\t%q1, %q3, %z0"
+ [(set_attr "type" "neon_stp_q")]
+ )
+@@ -248,8 +260,9 @@
+
+ (define_split
+ [(set (match_operand:VQMOV 0 "register_operand" "")
+- (match_operand:VQMOV 1 "register_operand" ""))]
+- "TARGET_SIMD && reload_completed
++ (match_operand:VQMOV 1 "register_operand" ""))]
++ "TARGET_FLOAT
++ && reload_completed
+ && GP_REGNUM_P (REGNO (operands[0]))
+ && GP_REGNUM_P (REGNO (operands[1]))"
+ [(const_int 0)]
+@@ -261,7 +274,8 @@
+ (define_split
+ [(set (match_operand:VQMOV 0 "register_operand" "")
+ (match_operand:VQMOV 1 "register_operand" ""))]
+- "TARGET_SIMD && reload_completed
++ "TARGET_FLOAT
++ && reload_completed
+ && ((FP_REGNUM_P (REGNO (operands[0])) && GP_REGNUM_P (REGNO (operands[1])))
+ || (GP_REGNUM_P (REGNO (operands[0])) && FP_REGNUM_P (REGNO (operands[1]))))"
+ [(const_int 0)]
+@@ -273,7 +287,7 @@
+ (define_expand "@aarch64_split_simd_mov<mode>"
+ [(set (match_operand:VQMOV 0)
+ (match_operand:VQMOV 1))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ rtx dst = operands[0];
+ rtx src = operands[1];
+@@ -306,13 +320,20 @@
+ (vec_select:<VHALF>
+ (match_operand:VQMOV 1 "register_operand")
+ (match_operand 2 "ascending_int_parallel")))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
++ {
++ if (vect_par_cnst_lo_half (operands[2], <MODE>mode))
++ {
++ emit_move_insn (operands[0], gen_lowpart (<VHALF>mode, operands[1]));
++ DONE;
++ }
++ }
+ )
+
+ (define_expand "aarch64_get_low<mode>"
+ [(match_operand:<VHALF> 0 "register_operand")
+ (match_operand:VQMOV 1 "register_operand")]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ rtx lo = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, false);
+ emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], lo));
+@@ -323,7 +344,7 @@
+ (define_expand "aarch64_get_high<mode>"
+ [(match_operand:<VHALF> 0 "register_operand")
+ (match_operand:VQMOV 1 "register_operand")]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ rtx hi = aarch64_simd_vect_par_cnst_half (<MODE>mode, <nunits>, true);
+ emit_insn (gen_aarch64_get_half<mode> (operands[0], operands[1], hi));
+@@ -350,15 +371,17 @@
+ )
+
+ (define_insn "aarch64_simd_mov_from_<mode>high"
+- [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r")
++ [(set (match_operand:<VHALF> 0 "register_operand" "=w,?r,?r")
+ (vec_select:<VHALF>
+- (match_operand:VQMOV_NO2E 1 "register_operand" "w,w")
++ (match_operand:VQMOV_NO2E 1 "register_operand" "w,w,w")
+ (match_operand:VQMOV_NO2E 2 "vect_par_cnst_hi_half" "")))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ "@
+- dup\\t%d0, %1.d[1]
+- umov\t%0, %1.d[1]"
+- [(set_attr "type" "neon_dup<q>,neon_to_gp<q>")
++ dup\t%d0, %1.d[1]
++ umov\t%0, %1.d[1]
++ fmov\t%0, %1.d[1]"
++ [(set_attr "type" "neon_dup<q>,neon_to_gp<q>,f_mrc")
++ (set_attr "arch" "simd,simd,*")
+ (set_attr "length" "4")]
+ )
+
+@@ -4322,12 +4345,22 @@
+ [(set_attr "type" "neon_to_gp<q>, neon_dup<q>, neon_store1_one_lane<q>")]
+ )
+
++(define_insn "*aarch64_get_high<mode>"
++ [(set (match_operand:<VEL> 0 "aarch64_simd_nonimmediate_operand" "=r")
++ (vec_select:<VEL>
++ (match_operand:VQ_2E 1 "register_operand" "w")
++ (parallel [(match_operand:SI 2 "immediate_operand")])))]
++ "TARGET_FLOAT && ENDIAN_LANE_N (<nunits>, INTVAL (operands[2])) == 1"
++ "fmov\t%0, %1.d[1]"
++ [(set_attr "type" "f_mrc")]
++)
++
+ (define_insn "load_pair_lanes<mode>"
+ [(set (match_operand:<VDBL> 0 "register_operand" "=w")
+ (vec_concat:<VDBL>
+ (match_operand:VDCSIF 1 "memory_operand" "Utq")
+ (match_operand:VDCSIF 2 "memory_operand" "m")))]
+- "TARGET_SIMD
++ "TARGET_FLOAT
+ && aarch64_mergeable_load_pair_p (<VDBL>mode, operands[1], operands[2])"
+ "ldr\\t%<single_dtype>0, %1"
+ [(set_attr "type" "neon_load1_1reg<dblq>")]
+@@ -4357,7 +4390,7 @@
+ (vec_concat:<VDBL>
+ (match_operand:VDCSIF 1 "register_operand" "w, r")
+ (match_operand:VDCSIF 2 "register_operand" "w, r")))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ "@
+ stp\t%<single_type>1, %<single_type>2, %y0
+ stp\t%<single_wx>1, %<single_wx>2, %y0"
+@@ -4372,39 +4405,44 @@
+ ;; the register alternatives either don't accept or themselves disparage.
+
+ (define_insn "*aarch64_combine_internal<mode>"
+- [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
++ [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, w, Umn, Umn")
+ (vec_concat:<VDBL>
+- (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")
+- (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, w, ?r")))]
+- "TARGET_SIMD
++ (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, 0, ?w, ?r")
++ (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, ?r, Utv, w, ?r")))]
++ "TARGET_FLOAT
+ && !BYTES_BIG_ENDIAN
+ && (register_operand (operands[0], <VDBL>mode)
+ || register_operand (operands[2], <MODE>mode))"
+ "@
+ ins\t%0.<single_type>[1], %2.<single_type>[0]
+ ins\t%0.<single_type>[1], %<single_wx>2
++ fmov\t%0.d[1], %2
+ ld1\t{%0.<single_type>}[1], %2
+ stp\t%<single_type>1, %<single_type>2, %y0
+ stp\t%<single_wx>1, %<single_wx>2, %y0"
+- [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, neon_load1_one_lane<dblq>, neon_stp, store_16")]
++ [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, f_mcr,
++ neon_load1_one_lane<dblq>, neon_stp, store_16")
++ (set_attr "arch" "simd,simd,*,simd,*,*")]
+ )
+
+ (define_insn "*aarch64_combine_internal_be<mode>"
+- [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, Umn, Umn")
++ [(set (match_operand:<VDBL> 0 "aarch64_reg_or_mem_pair_operand" "=w, w, w, w, Umn, Umn")
+ (vec_concat:<VDBL>
+- (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, Utv, ?w, ?r")
+- (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, ?w, ?r")))]
+- "TARGET_SIMD
++ (match_operand:VDCSIF 2 "aarch64_simd_nonimmediate_operand" "w, ?r, ?r, Utv, ?w, ?r")
++ (match_operand:VDCSIF 1 "register_operand" "0, 0, 0, 0, ?w, ?r")))]
++ "TARGET_FLOAT
+ && BYTES_BIG_ENDIAN
+ && (register_operand (operands[0], <VDBL>mode)
+ || register_operand (operands[2], <MODE>mode))"
+ "@
+ ins\t%0.<single_type>[1], %2.<single_type>[0]
+ ins\t%0.<single_type>[1], %<single_wx>2
++ fmov\t%0.d[1], %2
+ ld1\t{%0.<single_type>}[1], %2
+ stp\t%<single_type>2, %<single_type>1, %y0
+ stp\t%<single_wx>2, %<single_wx>1, %y0"
+- [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, neon_load1_one_lane<dblq>, neon_stp, store_16")]
++ [(set_attr "type" "neon_ins<dblq>, neon_from_gp<dblq>, f_mcr, neon_load1_one_lane<dblq>, neon_stp, store_16")
++ (set_attr "arch" "simd,simd,*,simd,*,*")]
+ )
+
+ ;; In this insn, operand 1 should be low, and operand 2 the high part of the
+@@ -4415,13 +4453,12 @@
+ (vec_concat:<VDBL>
+ (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")
+ (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")))]
+- "TARGET_SIMD && !BYTES_BIG_ENDIAN"
++ "TARGET_FLOAT && !BYTES_BIG_ENDIAN"
+ "@
+ fmov\\t%<single_type>0, %<single_type>1
+ fmov\t%<single_type>0, %<single_wx>1
+ ldr\\t%<single_type>0, %1"
+- [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
+- (set_attr "arch" "simd,fp,simd")]
++ [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")]
+ )
+
+ (define_insn "*aarch64_combinez_be<mode>"
+@@ -4429,13 +4466,12 @@
+ (vec_concat:<VDBL>
+ (match_operand:VDCSIF 2 "aarch64_simd_or_scalar_imm_zero")
+ (match_operand:VDCSIF 1 "nonimmediate_operand" "w,?r,m")))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN"
++ "TARGET_FLOAT && BYTES_BIG_ENDIAN"
+ "@
+ fmov\\t%<single_type>0, %<single_type>1
+ fmov\t%<single_type>0, %<single_wx>1
+ ldr\\t%<single_type>0, %1"
+- [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")
+- (set_attr "arch" "simd,fp,simd")]
++ [(set_attr "type" "neon_move<q>, neon_from_gp, neon_load1_1reg")]
+ )
+
+ ;; Form a vector whose first half (in array order) comes from operand 1
+@@ -4446,7 +4482,7 @@
+ (vec_concat:<VDBL>
+ (match_operand:VDCSIF 1 "general_operand")
+ (match_operand:VDCSIF 2 "general_operand")))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ int lo = BYTES_BIG_ENDIAN ? 2 : 1;
+ int hi = BYTES_BIG_ENDIAN ? 1 : 2;
+@@ -4464,7 +4500,7 @@
+ }
+ else
+ {
+- /* Use *aarch64_combine_general<mode>. */
++ /* Use *aarch64_combine_internal<mode>. */
+ operands[lo] = force_reg (<MODE>mode, operands[lo]);
+ if (!aarch64_simd_nonimmediate_operand (operands[hi], <MODE>mode))
+ {
+@@ -4486,7 +4522,7 @@
+ [(match_operand:<VDBL> 0 "register_operand")
+ (match_operand:VDC 1 "general_operand")
+ (match_operand:VDC 2 "general_operand")]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ if (BYTES_BIG_ENDIAN)
+ std::swap (operands[1], operands[2]);
+@@ -7367,7 +7403,7 @@
+ (define_expand "mov<mode>"
+ [(set (match_operand:VSTRUCT_QD 0 "nonimmediate_operand")
+ (match_operand:VSTRUCT_QD 1 "general_operand"))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ if (can_create_pseudo_p ())
+ {
+@@ -7379,7 +7415,7 @@
+ (define_expand "mov<mode>"
+ [(set (match_operand:VSTRUCT 0 "nonimmediate_operand")
+ (match_operand:VSTRUCT 1 "general_operand"))]
+- "TARGET_SIMD"
++ "TARGET_FLOAT"
+ {
+ if (can_create_pseudo_p ())
+ {
+@@ -7559,7 +7595,8 @@
+ (define_insn "*aarch64_be_mov<mode>"
+ [(set (match_operand:VSTRUCT_2D 0 "nonimmediate_operand" "=w,m,w")
+ (match_operand:VSTRUCT_2D 1 "general_operand" " w,w,m"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+ "@
+@@ -7573,7 +7610,8 @@
+ (define_insn "*aarch64_be_mov<mode>"
+ [(set (match_operand:VSTRUCT_2Q 0 "nonimmediate_operand" "=w,m,w")
+ (match_operand:VSTRUCT_2Q 1 "general_operand" " w,w,m"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+ "@
+@@ -7581,13 +7619,15 @@
+ stp\\t%q1, %R1, %0
+ ldp\\t%q0, %R0, %1"
+ [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q")
++ (set_attr "arch" "simd,*,*")
+ (set_attr "length" "8,4,4")]
+ )
+
+ (define_insn "*aarch64_be_movoi"
+ [(set (match_operand:OI 0 "nonimmediate_operand" "=w,m,w")
+ (match_operand:OI 1 "general_operand" " w,w,m"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], OImode)
+ || register_operand (operands[1], OImode))"
+ "@
+@@ -7595,57 +7635,66 @@
+ stp\\t%q1, %R1, %0
+ ldp\\t%q0, %R0, %1"
+ [(set_attr "type" "multiple,neon_stp_q,neon_ldp_q")
++ (set_attr "arch" "simd,*,*")
+ (set_attr "length" "8,4,4")]
+ )
+
+ (define_insn "*aarch64_be_mov<mode>"
+ [(set (match_operand:VSTRUCT_3QD 0 "nonimmediate_operand" "=w,o,w")
+ (match_operand:VSTRUCT_3QD 1 "general_operand" " w,w,o"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+ "#"
+ [(set_attr "type" "multiple")
++ (set_attr "arch" "fp<q>,*,*")
+ (set_attr "length" "12,8,8")]
+ )
+
+ (define_insn "*aarch64_be_movci"
+ [(set (match_operand:CI 0 "nonimmediate_operand" "=w,o,w")
+ (match_operand:CI 1 "general_operand" " w,w,o"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], CImode)
+ || register_operand (operands[1], CImode))"
+ "#"
+ [(set_attr "type" "multiple")
+- (set_attr "length" "12,4,4")]
++ (set_attr "arch" "simd,*,*")
++ (set_attr "length" "12,8,8")]
+ )
+
+ (define_insn "*aarch64_be_mov<mode>"
+ [(set (match_operand:VSTRUCT_4QD 0 "nonimmediate_operand" "=w,o,w")
+ (match_operand:VSTRUCT_4QD 1 "general_operand" " w,w,o"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], <MODE>mode)
+ || register_operand (operands[1], <MODE>mode))"
+ "#"
+ [(set_attr "type" "multiple")
++ (set_attr "arch" "fp<q>,*,*")
+ (set_attr "length" "16,8,8")]
+ )
+
+ (define_insn "*aarch64_be_movxi"
+ [(set (match_operand:XI 0 "nonimmediate_operand" "=w,o,w")
+ (match_operand:XI 1 "general_operand" " w,w,o"))]
+- "TARGET_SIMD && BYTES_BIG_ENDIAN
++ "TARGET_FLOAT
++ && (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ && (register_operand (operands[0], XImode)
+ || register_operand (operands[1], XImode))"
+ "#"
+ [(set_attr "type" "multiple")
+- (set_attr "length" "16,4,4")]
++ (set_attr "arch" "simd,*,*")
++ (set_attr "length" "16,8,8")]
+ )
+
+ (define_split
+ [(set (match_operand:VSTRUCT_2QD 0 "register_operand")
+ (match_operand:VSTRUCT_2QD 1 "register_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ aarch64_simd_emit_reg_reg_move (operands, <VSTRUCT_ELT>mode, 2);
+@@ -7655,7 +7704,7 @@
+ (define_split
+ [(set (match_operand:OI 0 "register_operand")
+ (match_operand:OI 1 "register_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ aarch64_simd_emit_reg_reg_move (operands, TImode, 2);
+@@ -7665,7 +7714,7 @@
+ (define_split
+ [(set (match_operand:VSTRUCT_3QD 0 "nonimmediate_operand")
+ (match_operand:VSTRUCT_3QD 1 "general_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ if (register_operand (operands[0], <MODE>mode)
+@@ -7674,7 +7723,7 @@
+ aarch64_simd_emit_reg_reg_move (operands, <VSTRUCT_ELT>mode, 3);
+ DONE;
+ }
+- else if (BYTES_BIG_ENDIAN)
++ else if (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ {
+ int elt_size = GET_MODE_SIZE (<MODE>mode).to_constant () / <nregs>;
+ machine_mode pair_mode = elt_size == 16 ? V2x16QImode : V2x8QImode;
+@@ -7701,7 +7750,7 @@
+ (define_split
+ [(set (match_operand:CI 0 "nonimmediate_operand")
+ (match_operand:CI 1 "general_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ if (register_operand (operands[0], CImode)
+@@ -7710,7 +7759,7 @@
+ aarch64_simd_emit_reg_reg_move (operands, TImode, 3);
+ DONE;
+ }
+- else if (BYTES_BIG_ENDIAN)
++ else if (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ {
+ emit_move_insn (simplify_gen_subreg (OImode, operands[0], CImode, 0),
+ simplify_gen_subreg (OImode, operands[1], CImode, 0));
+@@ -7729,7 +7778,7 @@
+ (define_split
+ [(set (match_operand:VSTRUCT_4QD 0 "nonimmediate_operand")
+ (match_operand:VSTRUCT_4QD 1 "general_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ if (register_operand (operands[0], <MODE>mode)
+@@ -7738,7 +7787,7 @@
+ aarch64_simd_emit_reg_reg_move (operands, <VSTRUCT_ELT>mode, 4);
+ DONE;
+ }
+- else if (BYTES_BIG_ENDIAN)
++ else if (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ {
+ int elt_size = GET_MODE_SIZE (<MODE>mode).to_constant () / <nregs>;
+ machine_mode pair_mode = elt_size == 16 ? V2x16QImode : V2x8QImode;
+@@ -7759,7 +7808,7 @@
+ (define_split
+ [(set (match_operand:XI 0 "nonimmediate_operand")
+ (match_operand:XI 1 "general_operand"))]
+- "TARGET_SIMD && reload_completed"
++ "TARGET_FLOAT && reload_completed"
+ [(const_int 0)]
+ {
+ if (register_operand (operands[0], XImode)
+@@ -7768,7 +7817,7 @@
+ aarch64_simd_emit_reg_reg_move (operands, TImode, 4);
+ DONE;
+ }
+- else if (BYTES_BIG_ENDIAN)
++ else if (!TARGET_SIMD || BYTES_BIG_ENDIAN)
+ {
+ emit_move_insn (simplify_gen_subreg (OImode, operands[0], XImode, 0),
+ simplify_gen_subreg (OImode, operands[1], XImode, 0));
+diff --git a/gcc/config/aarch64/aarch64.cc b/gcc/config/aarch64/aarch64.cc
+index 02210ed13..b4b646fa0 100644
+--- a/gcc/config/aarch64/aarch64.cc
++++ b/gcc/config/aarch64/aarch64.cc
+@@ -3748,7 +3748,7 @@ aarch64_classify_vector_mode (machine_mode mode)
+ case E_OImode:
+ case E_CImode:
+ case E_XImode:
+- return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
++ return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
+
+ /* Structures of 64-bit Advanced SIMD vectors. */
+ case E_V2x8QImode:
+@@ -3775,7 +3775,7 @@ aarch64_classify_vector_mode (machine_mode mode)
+ case E_V4x4HFmode:
+ case E_V4x2SFmode:
+ case E_V4x1DFmode:
+- return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
++ return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL : 0;
+
+ /* Structures of 128-bit Advanced SIMD vectors. */
+ case E_V2x16QImode:
+@@ -3802,7 +3802,7 @@ aarch64_classify_vector_mode (machine_mode mode)
+ case E_V4x8HFmode:
+ case E_V4x4SFmode:
+ case E_V4x2DFmode:
+- return TARGET_SIMD ? VEC_ADVSIMD | VEC_STRUCT : 0;
++ return TARGET_FLOAT ? VEC_ADVSIMD | VEC_STRUCT : 0;
+
+ /* 64-bit Advanced SIMD vectors. */
+ case E_V8QImode:
+@@ -3822,7 +3822,7 @@ aarch64_classify_vector_mode (machine_mode mode)
+ case E_V8BFmode:
+ case E_V4SFmode:
+ case E_V2DFmode:
+- return TARGET_SIMD ? VEC_ADVSIMD : 0;
++ return TARGET_FLOAT ? VEC_ADVSIMD : 0;
+
+ default:
+ return 0;
+@@ -4110,7 +4110,8 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
+ }
+
+ /* Prefer to use 1 128-bit vector instead of 2 64-bit vectors. */
+- if ((vec_flags & VEC_ADVSIMD)
++ if (TARGET_SIMD
++ && (vec_flags & VEC_ADVSIMD)
+ && known_eq (nunits, 0U)
+ && known_eq (GET_MODE_BITSIZE (vector_mode), 64U)
+ && maybe_ge (GET_MODE_BITSIZE (element_mode)
+@@ -4208,7 +4209,7 @@ aarch64_hard_regno_mode_ok (unsigned regno, machine_mode mode)
+
+ if (GP_REGNUM_P (regno))
+ {
+- if (vec_flags & VEC_ANY_SVE)
++ if (vec_flags & (VEC_ANY_SVE | VEC_STRUCT))
+ return false;
+ if (known_le (GET_MODE_SIZE (mode), 8))
+ return true;
+@@ -10884,7 +10885,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
+ || mode == TImode
+ || mode == TFmode
+ || mode == TDmode
+- || (BYTES_BIG_ENDIAN && advsimd_struct_p));
++ || ((!TARGET_SIMD || BYTES_BIG_ENDIAN)
++ && advsimd_struct_p));
+ /* If we are dealing with ADDR_QUERY_LDP_STP_N that means the incoming mode
+ corresponds to the actual size of the memory being loaded/stored and the
+ mode of the corresponding addressing mode is half of that. */
+@@ -10914,6 +10916,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
+ /* On LE, for AdvSIMD, don't support anything other than POST_INC or
+ REG addressing. */
+ if (advsimd_struct_p
++ && TARGET_SIMD
+ && !BYTES_BIG_ENDIAN
+ && (code != POST_INC && code != REG))
+ return false;
+@@ -10976,7 +10979,7 @@ aarch64_classify_address (struct aarch64_address_info *info,
+ && aarch64_offset_7bit_signed_scaled_p (DImode, offset + 48));
+
+ /* A 7bit offset check because OImode will emit a ldp/stp
+- instruction (only big endian will get here).
++ instruction (only !TARGET_SIMD or big endian will get here).
+ For ldp/stp instructions, the offset is scaled for the size of a
+ single element of the pair. */
+ if (aarch64_advsimd_partial_struct_mode_p (mode)
+@@ -10987,7 +10990,8 @@ aarch64_classify_address (struct aarch64_address_info *info,
+ return aarch64_offset_7bit_signed_scaled_p (TImode, offset);
+
+ /* Three 9/12 bit offsets checks because CImode will emit three
+- ldr/str instructions (only big endian will get here). */
++ ldr/str instructions (only !TARGET_SIMD or big endian will
++ get here). */
+ if (aarch64_advsimd_partial_struct_mode_p (mode)
+ && known_eq (GET_MODE_SIZE (mode), 24))
+ return (aarch64_offset_7bit_signed_scaled_p (DImode, offset)
+@@ -12716,18 +12720,16 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
+ /* Use aarch64_sve_reload_mem for SVE memory reloads that cannot use
+ LDR and STR. See the comment at the head of aarch64-sve.md for
+ more details about the big-endian handling. */
++ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+ if (reg_class_subset_p (rclass, FP_REGS)
+ && !((REG_P (x) && HARD_REGISTER_P (x))
+ || aarch64_simd_valid_immediate (x, NULL))
+- && mode != VNx16QImode)
++ && mode != VNx16QImode
++ && (vec_flags & VEC_SVE_DATA)
++ && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+ {
+- unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+- if ((vec_flags & VEC_SVE_DATA)
+- && ((vec_flags & VEC_PARTIAL) || BYTES_BIG_ENDIAN))
+- {
+- sri->icode = CODE_FOR_aarch64_sve_reload_mem;
+- return NO_REGS;
+- }
++ sri->icode = CODE_FOR_aarch64_sve_reload_mem;
++ return NO_REGS;
+ }
+
+ /* If we have to disable direct literal pool loads and stores because the
+@@ -12744,9 +12746,13 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
+ /* Without the TARGET_SIMD instructions we cannot move a Q register
+ to a Q register directly. We need a scratch. */
+ if (REG_P (x)
+- && (mode == TFmode || mode == TImode || mode == TDmode)
++ && (mode == TFmode
++ || mode == TImode
++ || mode == TDmode
++ || (vec_flags == VEC_ADVSIMD && known_eq (GET_MODE_SIZE (mode), 16)))
+ && mode == GET_MODE (x)
+- && FP_REGNUM_P (REGNO (x)) && !TARGET_SIMD
++ && !TARGET_SIMD
++ && FP_REGNUM_P (REGNO (x))
+ && reg_class_subset_p (rclass, FP_REGS))
+ {
+ sri->icode = code_for_aarch64_reload_mov (mode);
+@@ -12768,6 +12774,28 @@ aarch64_secondary_reload (bool in_p ATTRIBUTE_UNUSED, rtx x,
+ return NO_REGS;
+ }
+
++/* Implement TARGET_SECONDARY_MEMORY_NEEDED. */
++
++static bool
++aarch64_secondary_memory_needed (machine_mode mode, reg_class_t class1,
++ reg_class_t class2)
++{
++ if (!TARGET_SIMD
++ && reg_classes_intersect_p (class1, FP_REGS)
++ && reg_classes_intersect_p (class2, FP_REGS))
++ {
++ /* We can't do a 128-bit FPR-to-FPR move without TARGET_SIMD,
++ so we can't easily split a move involving tuples of 128-bit
++ vectors. Force the copy through memory instead.
++
++ (Tuples of 64-bit vectors are fine.) */
++ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
++ if (vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
++ return true;
++ }
++ return false;
++}
++
+ static bool
+ aarch64_can_eliminate (const int from ATTRIBUTE_UNUSED, const int to)
+ {
+@@ -13311,7 +13339,7 @@ aarch64_rtx_mult_cost (rtx x, enum rtx_code code, int outer, bool speed)
+ if (VECTOR_MODE_P (mode))
+ {
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+- if (vec_flags & VEC_ADVSIMD)
++ if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
+ {
+ /* The select-operand-high-half versions of the instruction have the
+ same cost as the three vector version - don't add the costs of the
+@@ -14257,7 +14285,7 @@ cost_minus:
+ {
+ /* SUBL2 and SUBW2. */
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+- if (vec_flags & VEC_ADVSIMD)
++ if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
+ {
+ /* The select-operand-high-half versions of the sub instruction
+ have the same cost as the regular three vector version -
+@@ -14359,7 +14387,7 @@ cost_plus:
+ {
+ /* ADDL2 and ADDW2. */
+ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
+- if (vec_flags & VEC_ADVSIMD)
++ if (TARGET_SIMD && (vec_flags & VEC_ADVSIMD))
+ {
+ /* The select-operand-high-half versions of the add instruction
+ have the same cost as the regular three vector version -
+@@ -15284,7 +15312,9 @@ aarch64_register_move_cost (machine_mode mode,
+ return aarch64_register_move_cost (mode, from, GENERAL_REGS)
+ + aarch64_register_move_cost (mode, GENERAL_REGS, to);
+
+- if (known_eq (GET_MODE_SIZE (mode), 16))
++ unsigned int vec_flags = aarch64_classify_vector_mode (mode);
++ if (vec_flags != (VEC_ADVSIMD | VEC_STRUCT | VEC_PARTIAL)
++ && known_eq (GET_MODE_SIZE (mode), 16))
+ {
+ /* 128-bit operations on general registers require 2 instructions. */
+ if (from == GENERAL_REGS && to == GENERAL_REGS)
+@@ -15312,6 +15342,16 @@ aarch64_register_move_cost (machine_mode mode,
+ else if (to == GENERAL_REGS)
+ return regmove_cost->FP2GP;
+
++ if (!TARGET_SIMD && vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
++ {
++ /* Needs a round-trip through memory, which can use LDP/STP for pairs.
++ The cost must be greater than 2 units to indicate that direct
++ moves aren't possible. */
++ auto per_vector = (aarch64_tune_params.memmov_cost.load_fp
++ + aarch64_tune_params.memmov_cost.store_fp);
++ return MIN (CEIL (per_vector, 2), 4);
++ }
++
+ return regmove_cost->FP2FP;
+ }
+
+@@ -21504,6 +21544,9 @@ aarch64_simd_valid_immediate (rtx op, simd_immediate_info *info,
+ if (vec_flags == 0 || vec_flags == (VEC_ADVSIMD | VEC_STRUCT))
+ return false;
+
++ if ((vec_flags & VEC_ADVSIMD) && !TARGET_SIMD)
++ return false;
++
+ if (vec_flags & VEC_SVE_PRED)
+ return aarch64_sve_pred_valid_immediate (op, info);
+
+@@ -24430,7 +24473,7 @@ aarch64_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
+ std::swap (d->op0, d->op1);
+ }
+
+- if ((d->vec_flags == VEC_ADVSIMD
++ if (((d->vec_flags == VEC_ADVSIMD && TARGET_SIMD)
+ || d->vec_flags == VEC_SVE_DATA
+ || d->vec_flags == (VEC_SVE_DATA | VEC_PARTIAL)
+ || d->vec_flags == VEC_SVE_PRED)
+@@ -27977,6 +28020,9 @@ aarch64_libgcc_floating_mode_supported_p
+ #undef TARGET_SECONDARY_RELOAD
+ #define TARGET_SECONDARY_RELOAD aarch64_secondary_reload
+
++#undef TARGET_SECONDARY_MEMORY_NEEDED
++#define TARGET_SECONDARY_MEMORY_NEEDED aarch64_secondary_memory_needed
++
+ #undef TARGET_SHIFT_TRUNCATION_MASK
+ #define TARGET_SHIFT_TRUNCATION_MASK aarch64_shift_truncation_mask
+
+diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
+index 8757a962f..c0cc91756 100644
+--- a/gcc/config/aarch64/aarch64.md
++++ b/gcc/config/aarch64/aarch64.md
+@@ -374,8 +374,11 @@
+ ;; Attributes of the architecture required to support the instruction (or
+ ;; alternative). This attribute is used to compute attribute "enabled", use type
+ ;; "any" to enable an alternative in all cases.
++;;
++;; As a convenience, "fp_q" means "fp" + the ability to move between
++;; Q registers and is equivalent to "simd".
+
+-(define_enum "arches" [ any rcpc8_4 fp simd sve fp16])
++(define_enum "arches" [ any rcpc8_4 fp fp_q simd sve fp16])
+
+ (define_enum_attr "arch" "arches" (const_string "any"))
+
+@@ -403,7 +406,7 @@
+ (and (eq_attr "arch" "fp")
+ (match_test "TARGET_FLOAT"))
+
+- (and (eq_attr "arch" "simd")
++ (and (eq_attr "arch" "fp_q, simd")
+ (match_test "TARGET_SIMD"))
+
+ (and (eq_attr "arch" "fp16")
+@@ -6768,8 +6771,8 @@
+ )
+
+ (define_expand "@aarch64_reload_mov<mode>"
+- [(set (match_operand:TX 0 "register_operand" "=w")
+- (match_operand:TX 1 "register_operand" "w"))
++ [(set (match_operand:VTX 0 "register_operand" "=w")
++ (match_operand:VTX 1 "register_operand" "w"))
+ (clobber (match_operand:DI 2 "register_operand" "=&r"))
+ ]
+ "TARGET_FLOAT"
+diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
+index d0cd1b788..a8a39b65a 100644
+--- a/gcc/config/aarch64/iterators.md
++++ b/gcc/config/aarch64/iterators.md
+@@ -313,6 +313,8 @@
+
+ (define_mode_iterator TX [TI TF TD])
+
++(define_mode_iterator VTX [TI TF TD V16QI V8HI V4SI V2DI V8HF V4SF V2DF V8BF])
++
+ ;; Advanced SIMD opaque structure modes.
+ (define_mode_iterator VSTRUCT [OI CI XI])
+
+diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_20.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_20.c
+new file mode 100644
+index 000000000..7e705e119
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_20.c
+@@ -0,0 +1,7 @@
++/* { dg-options "-O2" } */
++
++#pragma GCC target "+nosimd+fp"
++
++#include "ldp_stp_6.c"
++
++/* { dg-final { scan-assembler "stp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
+new file mode 100644
+index 000000000..462e3c9aa
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_21.c
+@@ -0,0 +1,7 @@
++/* { dg-options "-O2" } */
++
++#pragma GCC target "+nosimd+fp"
++
++#include "ldp_stp_8.c"
++
++/* { dg-final { scan-assembler-times "ldp\td\[0-9\]+, d\[0-9\]+, \\\[x\[0-9\]+\\\]" 2 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_22.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_22.c
+new file mode 100644
+index 000000000..283c56dd2
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_22.c
+@@ -0,0 +1,13 @@
++/* { dg-options "-O2" } */
++
++#pragma GCC target "+nosimd+fp"
++
++void
++foo (__Float32x4_t *ptr)
++{
++ ptr[0] = ptr[2];
++ ptr[1] = ptr[3];
++}
++
++/* { dg-final { scan-assembler {\tldp\tq[0-9]+, q[0-9]+} } } */
++/* { dg-final { scan-assembler {\tstp\tq[0-9]+, q[0-9]+} } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_23.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_23.c
+new file mode 100644
+index 000000000..b14976cfe
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_23.c
+@@ -0,0 +1,16 @@
++/* { dg-options "-O2" } */
++
++#pragma GCC target "+nosimd+fp"
++
++void
++foo (char *char_ptr)
++{
++ __Float64x2_t *ptr = (__Float64x2_t *)(char_ptr + 1);
++ asm volatile ("" ::
++ "w" (ptr[1]),
++ "w" (ptr[2]),
++ "w" (ptr[3]),
++ "w" (ptr[4]));
++}
++
++/* { dg-final { scan-assembler-times {\tldp\tq[0-9]+, q[0-9]+} 2 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/ldp_stp_24.c b/gcc/testsuite/gcc.target/aarch64/ldp_stp_24.c
+new file mode 100644
+index 000000000..a99426eb2
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/ldp_stp_24.c
+@@ -0,0 +1,16 @@
++/* { dg-options "-O2" } */
++
++#pragma GCC target "+nosimd+fp"
++
++void
++foo (char *char_ptr)
++{
++ __Float64x2_t *ptr = (__Float64x2_t *)(char_ptr + 1);
++ asm volatile ("" :
++ "=w" (ptr[1]),
++ "=w" (ptr[2]),
++ "=w" (ptr[3]),
++ "=w" (ptr[4]));
++}
++
++/* { dg-final { scan-assembler-times {\tstp\tq[0-9]+, q[0-9]+} 2 } } */
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv16qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv16qi_1.c
+index 8a6afb13b..cac4241b0 100644
+--- a/gcc/testsuite/gcc.target/aarch64/movv16qi_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/movv16qi_1.c
+@@ -80,3 +80,24 @@ fpr_to_gpr (v16qi q0)
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+ }
++
++/*
++** gpr_to_gpr:
++** (
++** mov x0, x2
++** mov x1, x3
++** |
++** mov x1, x3
++** mov x0, x2
++** )
++** ret
++*/
++void
++gpr_to_gpr ()
++{
++ register v16qi x0 asm ("x0");
++ register v16qi x2 asm ("x2");
++ asm volatile ("" : "=r" (x2));
++ x0 = x2;
++ asm volatile ("" :: "r" (x0));
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv16qi_2.c b/gcc/testsuite/gcc.target/aarch64/movv16qi_2.c
+new file mode 100644
+index 000000000..08a0a19b5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv16qi_2.c
+@@ -0,0 +1,27 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_GENERAL(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE zero_##TYPE () { return (TYPE) {}; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_GENERAL (__Int8x16_t)
++TEST_GENERAL (__Int16x8_t)
++TEST_GENERAL (__Int32x4_t)
++TEST_GENERAL (__Int64x2_t)
++TEST_GENERAL (__Bfloat16x8_t)
++TEST_GENERAL (__Float16x8_t)
++TEST_GENERAL (__Float32x4_t)
++TEST_GENERAL (__Float64x2_t)
++
++__Int8x16_t const_s8x8 () { return (__Int8x16_t) { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; }
++__Int16x8_t const_s16x4 () { return (__Int16x8_t) { 1, 0, 1, 0, 1, 0, 1, 0 }; }
++__Int32x4_t const_s32x2 () { return (__Int32x4_t) { 1, 2, 3, 4 }; }
++__Int64x2_t const_s64x1 () { return (__Int64x2_t) { 100, 100 }; }
++__Float16x8_t const_f16x4 () { return (__Float16x8_t) { 2, 2, 2, 2, 2, 2, 2, 2 }; }
++__Float32x4_t const_f32x2 () { return (__Float32x4_t) { 1, 2, 1, 2 }; }
++__Float64x2_t const_f64x1 () { return (__Float64x2_t) { 32, 32 }; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv16qi_3.c b/gcc/testsuite/gcc.target/aarch64/movv16qi_3.c
+new file mode 100644
+index 000000000..d43b994c1
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv16qi_3.c
+@@ -0,0 +1,30 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE \
++ test_##TYPE (void) \
++ { \
++ typedef TYPE v __attribute__((aligned(1))); \
++ register v *ptr asm ("x0"); \
++ asm volatile ("" : "=r" (ptr)); \
++ return *ptr; \
++ }
++
++TEST_VECTOR (__Int8x16_t)
++TEST_VECTOR (__Int16x8_t)
++TEST_VECTOR (__Int32x4_t)
++TEST_VECTOR (__Int64x2_t)
++TEST_VECTOR (__Bfloat16x8_t)
++TEST_VECTOR (__Float16x8_t)
++TEST_VECTOR (__Float32x4_t)
++TEST_VECTOR (__Float64x2_t)
++
++/*
++** test___Int8x16_t:
++** ldr q0, \[x0\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv2di_1.c b/gcc/testsuite/gcc.target/aarch64/movv2di_1.c
+new file mode 100644
+index 000000000..e3b55fd52
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv2di_1.c
+@@ -0,0 +1,103 @@
++/* { dg-do assemble } */
++/* { dg-options "-O -mtune=neoverse-v1 --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC target "+nothing+nosimd+fp"
++
++typedef long long v2di __attribute__((vector_size(16)));
++
++/*
++** fpr_to_fpr:
++** sub sp, sp, #16
++** str q1, \[sp\]
++** ldr q0, \[sp\]
++** add sp, sp, #?16
++** ret
++*/
++v2di
++fpr_to_fpr (v2di q0, v2di q1)
++{
++ return q1;
++}
++
++/*
++** gpr_to_fpr: { target aarch64_little_endian }
++** fmov d0, x0
++** fmov v0.d\[1\], x1
++** ret
++*/
++/*
++** gpr_to_fpr: { target aarch64_big_endian }
++** fmov d0, x1
++** fmov v0.d\[1\], x0
++** ret
++*/
++v2di
++gpr_to_fpr ()
++{
++ register v2di x0 asm ("x0");
++ asm volatile ("" : "=r" (x0));
++ return x0;
++}
++
++/*
++** zero_to_fpr:
++** fmov d0, xzr
++** ret
++*/
++v2di
++zero_to_fpr ()
++{
++ return (v2di) {};
++}
++
++/*
++** fpr_to_gpr: { target aarch64_little_endian }
++** (
++** fmov x0, d0
++** fmov x1, v0.d\[1\]
++** |
++** fmov x1, v0.d\[1\]
++** fmov x0, d0
++** )
++** ret
++*/
++/*
++** fpr_to_gpr: { target aarch64_big_endian }
++** (
++** fmov x1, d0
++** fmov x0, v0.d\[1\]
++** |
++** fmov x0, v0.d\[1\]
++** fmov x1, d0
++** )
++** ret
++*/
++void
++fpr_to_gpr (v2di q0)
++{
++ register v2di x0 asm ("x0");
++ x0 = q0;
++ asm volatile ("" :: "r" (x0));
++}
++
++/*
++** gpr_to_gpr:
++** (
++** mov x0, x2
++** mov x1, x3
++** |
++** mov x1, x3
++** mov x0, x2
++** )
++** ret
++*/
++void
++gpr_to_gpr ()
++{
++ register v2di x0 asm ("x0");
++ register v2di x2 asm ("x2");
++ asm volatile ("" : "=r" (x2));
++ x0 = x2;
++ asm volatile ("" :: "r" (x0));
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv2x16qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv2x16qi_1.c
+new file mode 100644
+index 000000000..90e3b426d
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv2x16qi_1.c
+@@ -0,0 +1,40 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x16x2_t)
++TEST_VECTOR (int16x8x2_t)
++TEST_VECTOR (int32x4x2_t)
++TEST_VECTOR (int64x2x2_t)
++TEST_VECTOR (float16x8x2_t)
++TEST_VECTOR (bfloat16x8x2_t)
++TEST_VECTOR (float32x4x2_t)
++TEST_VECTOR (float64x2x2_t)
++
++/*
++** mov_int8x16x2_t:
++** sub sp, sp, #32
++** stp q2, q3, \[sp\]
++** ldp q0, q1, \[sp\]
++** add sp, sp, #?32
++** ret
++*/
++/*
++** load_int8x16x2_t:
++** ldp q0, q1, \[x0\]
++** ret
++*/
++/*
++** store_int8x16x2_t: { xfail *-*-* }
++** stp q0, q1, \[x0\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv2x8qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv2x8qi_1.c
+new file mode 100644
+index 000000000..883a0ea71
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv2x8qi_1.c
+@@ -0,0 +1,38 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x8x2_t)
++TEST_VECTOR (int16x4x2_t)
++TEST_VECTOR (int32x2x2_t)
++TEST_VECTOR (int64x1x2_t)
++TEST_VECTOR (float16x4x2_t)
++TEST_VECTOR (bfloat16x4x2_t)
++TEST_VECTOR (float32x2x2_t)
++TEST_VECTOR (float64x1x2_t)
++
++/*
++** mov_int8x8x2_t:
++** fmov d0, d2
++** fmov d1, d3
++** ret
++*/
++/*
++** load_int8x8x2_t:
++** ldp d0, d1, \[x0\]
++** ret
++*/
++/*
++** store_int8x8x2_t:
++** stp d0, d1, \[x0\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv3x16qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv3x16qi_1.c
+new file mode 100644
+index 000000000..070a596bf
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv3x16qi_1.c
+@@ -0,0 +1,44 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x16x3_t)
++TEST_VECTOR (int16x8x3_t)
++TEST_VECTOR (int32x4x3_t)
++TEST_VECTOR (int64x2x3_t)
++TEST_VECTOR (float16x8x3_t)
++TEST_VECTOR (bfloat16x8x3_t)
++TEST_VECTOR (float32x4x3_t)
++TEST_VECTOR (float64x2x3_t)
++
++/*
++** mov_int8x16x3_t:
++** sub sp, sp, #48
++** stp q3, q4, \[sp\]
++** str q5, \[sp, #?32\]
++** ldp q0, q1, \[sp\]
++** ldr q2, \[sp, #?32\]
++** add sp, sp, #?48
++** ret
++*/
++/*
++** load_int8x16x3_t:
++** ldp q0, q1, \[x0\]
++** ldr q2, \[x0, #?32\]
++** ret
++*/
++/*
++** store_int8x16x3_t: { xfail *-*-* }
++** stp q0, q1, \[x0\]
++** str	q2, \[x0, #?32\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv3x8qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv3x8qi_1.c
+new file mode 100644
+index 000000000..4b873d749
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv3x8qi_1.c
+@@ -0,0 +1,41 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x8x3_t)
++TEST_VECTOR (int16x4x3_t)
++TEST_VECTOR (int32x2x3_t)
++TEST_VECTOR (int64x1x3_t)
++TEST_VECTOR (float16x4x3_t)
++TEST_VECTOR (bfloat16x4x3_t)
++TEST_VECTOR (float32x2x3_t)
++TEST_VECTOR (float64x1x3_t)
++
++/*
++** mov_int8x8x3_t:
++** fmov d0, d3
++** fmov d1, d4
++** fmov d2, d5
++** ret
++*/
++/*
++** load_int8x8x3_t:
++** ldp d0, d1, \[x0\]
++** ldr d2, \[x0, #?16\]
++** ret
++*/
++/*
++** store_int8x8x3_t:
++** stp d0, d1, \[x0\]
++** str d2, \[x0, #?16\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv4x16qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv4x16qi_1.c
+new file mode 100644
+index 000000000..6a517b4fe
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv4x16qi_1.c
+@@ -0,0 +1,44 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x16x4_t)
++TEST_VECTOR (int16x8x4_t)
++TEST_VECTOR (int32x4x4_t)
++TEST_VECTOR (int64x2x4_t)
++TEST_VECTOR (float16x8x4_t)
++TEST_VECTOR (bfloat16x8x4_t)
++TEST_VECTOR (float32x4x4_t)
++TEST_VECTOR (float64x2x4_t)
++
++/*
++** mov_int8x16x4_t:
++** sub sp, sp, #64
++** stp q4, q5, \[sp\]
++** stp q6, q7, \[sp, #?32\]
++** ldp q0, q1, \[sp\]
++** ldp q2, q3, \[sp, #?32\]
++** add sp, sp, #?64
++** ret
++*/
++/*
++** load_int8x16x4_t:
++** ldp q0, q1, \[x0\]
++** ldp q2, q3, \[x0, #?32\]
++** ret
++*/
++/*
++** store_int8x16x4_t: { xfail *-*-* }
++** stp q0, q1, \[x0\]
++** stp q2, q3, \[x0, #?32\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv4x8qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv4x8qi_1.c
+new file mode 100644
+index 000000000..f096be4a5
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv4x8qi_1.c
+@@ -0,0 +1,42 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC aarch64 "arm_neon.h"
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_VECTOR (int8x8x4_t)
++TEST_VECTOR (int16x4x4_t)
++TEST_VECTOR (int32x2x4_t)
++TEST_VECTOR (int64x1x4_t)
++TEST_VECTOR (float16x4x4_t)
++TEST_VECTOR (bfloat16x4x4_t)
++TEST_VECTOR (float32x2x4_t)
++TEST_VECTOR (float64x1x4_t)
++
++/*
++** mov_int8x8x4_t:
++** fmov d0, d4
++** fmov d1, d5
++** fmov d2, d6
++** fmov d3, d7
++** ret
++*/
++/*
++** load_int8x8x4_t:
++** ldp d0, d1, \[x0\]
++** ldp d2, d3, \[x0, #?16\]
++** ret
++*/
++/*
++** store_int8x8x4_t:
++** stp d0, d1, \[x0\]
++** stp d2, d3, \[x0, #?16\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv8qi_1.c b/gcc/testsuite/gcc.target/aarch64/movv8qi_1.c
+index 4c97e6fbc..d2b5d8025 100644
+--- a/gcc/testsuite/gcc.target/aarch64/movv8qi_1.c
++++ b/gcc/testsuite/gcc.target/aarch64/movv8qi_1.c
+@@ -53,3 +53,18 @@ fpr_to_gpr (v8qi q0)
+ x0 = q0;
+ asm volatile ("" :: "r" (x0));
+ }
++
++/*
++** gpr_to_gpr:
++** mov x0, x1
++** ret
++*/
++void
++gpr_to_gpr ()
++{
++ register v8qi x0 asm ("x0");
++ register v8qi x1 asm ("x1");
++ asm volatile ("" : "=r" (x1));
++ x0 = x1;
++ asm volatile ("" :: "r" (x0));
++}
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv8qi_2.c b/gcc/testsuite/gcc.target/aarch64/movv8qi_2.c
+new file mode 100644
+index 000000000..0d8576ffe
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv8qi_2.c
+@@ -0,0 +1,27 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_GENERAL(TYPE) \
++ TYPE mov_##TYPE (TYPE a, TYPE b) { return b; } \
++ TYPE zero_##TYPE () { return (TYPE) {}; } \
++ TYPE load_##TYPE (TYPE *ptr) { return *ptr; } \
++ void store_##TYPE (TYPE *ptr, TYPE a) { *ptr = a; }
++
++TEST_GENERAL (__Int8x8_t)
++TEST_GENERAL (__Int16x4_t)
++TEST_GENERAL (__Int32x2_t)
++TEST_GENERAL (__Int64x1_t)
++TEST_GENERAL (__Bfloat16x4_t)
++TEST_GENERAL (__Float16x4_t)
++TEST_GENERAL (__Float32x2_t)
++TEST_GENERAL (__Float64x1_t)
++
++__Int8x8_t const_s8x8 () { return (__Int8x8_t) { 1, 1, 1, 1, 1, 1, 1, 1 }; }
++__Int16x4_t const_s16x4 () { return (__Int16x4_t) { 1, 0, 1, 0 }; }
++__Int32x2_t const_s32x2 () { return (__Int32x2_t) { 1, 2 }; }
++__Int64x1_t const_s64x1 () { return (__Int64x1_t) { 100 }; }
++__Float16x4_t const_f16x4 () { return (__Float16x4_t) { 2, 2, 2, 2 }; }
++__Float32x2_t const_f32x2 () { return (__Float32x2_t) { 1, 2 }; }
++__Float64x1_t const_f64x1 () { return (__Float64x1_t) { 32 }; }
+diff --git a/gcc/testsuite/gcc.target/aarch64/movv8qi_3.c b/gcc/testsuite/gcc.target/aarch64/movv8qi_3.c
+new file mode 100644
+index 000000000..1caa1a788
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/movv8qi_3.c
+@@ -0,0 +1,30 @@
++/* { dg-do assemble } */
++/* { dg-options "-O --save-temps" } */
++/* { dg-final { check-function-bodies "**" "" "" } } */
++
++#pragma GCC target "+nosimd+fp"
++
++#define TEST_VECTOR(TYPE) \
++ TYPE \
++ test_##TYPE (void) \
++ { \
++ typedef TYPE v __attribute__((aligned(1))); \
++ register v *ptr asm ("x0"); \
++ asm volatile ("" : "=r" (ptr)); \
++ return *ptr; \
++ }
++
++TEST_VECTOR (__Int8x8_t)
++TEST_VECTOR (__Int16x4_t)
++TEST_VECTOR (__Int32x2_t)
++TEST_VECTOR (__Int64x1_t)
++TEST_VECTOR (__Bfloat16x4_t)
++TEST_VECTOR (__Float16x4_t)
++TEST_VECTOR (__Float32x2_t)
++TEST_VECTOR (__Float64x1_t)
++
++/*
++** test___Int8x8_t:
++** ldr d0, \[x0\]
++** ret
++*/
+diff --git a/gcc/testsuite/gcc.target/aarch64/vect_unary_2.c b/gcc/testsuite/gcc.target/aarch64/vect_unary_2.c
+new file mode 100644
+index 000000000..454ac2771
+--- /dev/null
++++ b/gcc/testsuite/gcc.target/aarch64/vect_unary_2.c
+@@ -0,0 +1,5 @@
++/* { dg-options "-O3 -fno-math-errno --save-temps" } */
++
++#pragma GCC target "+nosimd+fp"
++
++#include "vect_unary_1.c"
+--
+2.33.0
+