summaryrefslogtreecommitdiff
path: root/gcc/config/aarch64
diff options
context:
space:
mode:
Diffstat (limited to 'gcc/config/aarch64')
-rw-r--r--gcc/config/aarch64/aarch64-builtins.c5
-rw-r--r--gcc/config/aarch64/aarch64-protos.h2
-rw-r--r--gcc/config/aarch64/aarch64-simd-builtins.def4
-rw-r--r--gcc/config/aarch64/aarch64-simd.md84
-rw-r--r--gcc/config/aarch64/aarch64.c33
-rw-r--r--gcc/config/aarch64/aarch64.h3
-rw-r--r--gcc/config/aarch64/aarch64.md22
-rw-r--r--gcc/config/aarch64/arm_neon.h2136
-rw-r--r--gcc/config/aarch64/iterators.md26
9 files changed, 1264 insertions, 1051 deletions
diff --git a/gcc/config/aarch64/aarch64-builtins.c b/gcc/config/aarch64/aarch64-builtins.c
index 4616ad24c07..a3019828a93 100644
--- a/gcc/config/aarch64/aarch64-builtins.c
+++ b/gcc/config/aarch64/aarch64-builtins.c
@@ -246,6 +246,11 @@ aarch64_types_store1_qualifiers[SIMD_MAX_BUILTIN_ARGS]
= { qualifier_void, qualifier_pointer_map_mode, qualifier_none };
#define TYPES_STORE1 (aarch64_types_store1_qualifiers)
#define TYPES_STORESTRUCT (aarch64_types_store1_qualifiers)
+static enum aarch64_type_qualifiers
+aarch64_types_storestruct_lane_qualifiers[SIMD_MAX_BUILTIN_ARGS]
+ = { qualifier_void, qualifier_pointer_map_mode,
+ qualifier_none, qualifier_none };
+#define TYPES_STORESTRUCT_LANE (aarch64_types_storestruct_lane_qualifiers)
#define CF0(N, X) CODE_FOR_aarch64_##N##X
#define CF1(N, X) CODE_FOR_##N##X##1
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 5542f023b33..04cbc780da2 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -175,6 +175,8 @@ bool aarch64_is_extend_from_extract (enum machine_mode, rtx, rtx);
bool aarch64_is_long_call_p (rtx);
bool aarch64_label_mentioned_p (rtx);
bool aarch64_legitimate_pic_operand_p (rtx);
+bool aarch64_modes_tieable_p (enum machine_mode mode1,
+ enum machine_mode mode2);
bool aarch64_move_imm (HOST_WIDE_INT, enum machine_mode);
bool aarch64_mov_operand_p (rtx, enum aarch64_symbol_context,
enum machine_mode);
diff --git a/gcc/config/aarch64/aarch64-simd-builtins.def b/gcc/config/aarch64/aarch64-simd-builtins.def
index fa332ae5948..339e8f86a4b 100644
--- a/gcc/config/aarch64/aarch64-simd-builtins.def
+++ b/gcc/config/aarch64/aarch64-simd-builtins.def
@@ -118,6 +118,10 @@
BUILTIN_VQ (STORESTRUCT, st3, 0)
BUILTIN_VQ (STORESTRUCT, st4, 0)
+ BUILTIN_VQ (STORESTRUCT_LANE, st2_lane, 0)
+ BUILTIN_VQ (STORESTRUCT_LANE, st3_lane, 0)
+ BUILTIN_VQ (STORESTRUCT_LANE, st4_lane, 0)
+
BUILTIN_VQW (BINOP, saddl2, 0)
BUILTIN_VQW (BINOP, uaddl2, 0)
BUILTIN_VQW (BINOP, ssubl2, 0)
diff --git a/gcc/config/aarch64/aarch64-simd.md b/gcc/config/aarch64/aarch64-simd.md
index c05767b2045..108bc8d8893 100644
--- a/gcc/config/aarch64/aarch64-simd.md
+++ b/gcc/config/aarch64/aarch64-simd.md
@@ -3686,6 +3686,17 @@
[(set_attr "type" "neon_store2_2reg<q>")]
)
+(define_insn "vec_store_lanesoi_lane<mode>"
+ [(set (match_operand:<V_TWO_ELEM> 0 "aarch64_simd_struct_operand" "=Utv")
+ (unspec:<V_TWO_ELEM> [(match_operand:OI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_ST2_LANE))]
+ "TARGET_SIMD"
+ "st2\\t{%S1.<Vetype> - %T1.<Vetype>}[%2], %0"
+ [(set_attr "type" "neon_store3_one_lane<q>")]
+)
+
(define_insn "vec_load_lanesci<mode>"
[(set (match_operand:CI 0 "register_operand" "=w")
(unspec:CI [(match_operand:CI 1 "aarch64_simd_struct_operand" "Utv")
@@ -3706,6 +3717,17 @@
[(set_attr "type" "neon_store3_3reg<q>")]
)
+(define_insn "vec_store_lanesci_lane<mode>"
+ [(set (match_operand:<V_THREE_ELEM> 0 "aarch64_simd_struct_operand" "=Utv")
+ (unspec:<V_THREE_ELEM> [(match_operand:CI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_ST3_LANE))]
+ "TARGET_SIMD"
+ "st3\\t{%S1.<Vetype> - %U1.<Vetype>}[%2], %0"
+ [(set_attr "type" "neon_store3_one_lane<q>")]
+)
+
(define_insn "vec_load_lanesxi<mode>"
[(set (match_operand:XI 0 "register_operand" "=w")
(unspec:XI [(match_operand:XI 1 "aarch64_simd_struct_operand" "Utv")
@@ -3726,6 +3748,17 @@
[(set_attr "type" "neon_store4_4reg<q>")]
)
+(define_insn "vec_store_lanesxi_lane<mode>"
+ [(set (match_operand:<V_FOUR_ELEM> 0 "aarch64_simd_struct_operand" "=Utv")
+ (unspec:<V_FOUR_ELEM> [(match_operand:XI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand" "i")]
+ UNSPEC_ST4_LANE))]
+ "TARGET_SIMD"
+ "st4\\t{%S1.<Vetype> - %V1.<Vetype>}[%2], %0"
+ [(set_attr "type" "neon_store4_one_lane<q>")]
+)
+
;; Reload patterns for AdvSIMD register list operands.
(define_expand "mov<mode>"
@@ -4220,6 +4253,57 @@
DONE;
})
+(define_expand "aarch64_st2_lane<VQ:mode>"
+ [(match_operand:DI 0 "register_operand" "r")
+ (match_operand:OI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand")]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_TWO_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[0]);
+ operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+
+ emit_insn (gen_vec_store_lanesoi_lane<VQ:mode> (mem,
+ operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_expand "aarch64_st3_lane<VQ:mode>"
+ [(match_operand:DI 0 "register_operand" "r")
+ (match_operand:CI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand")]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_THREE_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[0]);
+ operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+
+ emit_insn (gen_vec_store_lanesci_lane<VQ:mode> (mem,
+ operands[1],
+ operands[2]));
+ DONE;
+})
+
+(define_expand "aarch64_st4_lane<VQ:mode>"
+ [(match_operand:DI 0 "register_operand" "r")
+ (match_operand:XI 1 "register_operand" "w")
+ (unspec:VQ [(const_int 0)] UNSPEC_VSTRUCTDUMMY)
+ (match_operand:SI 2 "immediate_operand")]
+ "TARGET_SIMD"
+{
+ enum machine_mode mode = <V_FOUR_ELEM>mode;
+ rtx mem = gen_rtx_MEM (mode, operands[0]);
+ operands[2] = GEN_INT (ENDIAN_LANE_N (<MODE>mode, INTVAL (operands[2])));
+
+ emit_insn (gen_vec_store_lanesxi_lane<VQ:mode> (mem,
+ operands[1],
+ operands[2]));
+ DONE;
+})
+
(define_expand "aarch64_st1<VALL:mode>"
[(match_operand:DI 0 "register_operand")
(match_operand:VALL 1 "register_operand")]
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 1d48108516d..d3d7d1e60d6 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -8316,7 +8316,8 @@ aarch64_cannot_change_mode_class (enum machine_mode from,
/* Limited combinations of subregs are safe on FPREGs. Particularly,
1. Vector Mode to Scalar mode where 1 unit of the vector is accessed.
2. Scalar to Scalar for integer modes or same size float modes.
- 3. Vector to Vector modes. */
+ 3. Vector to Vector modes.
+ 4. On little-endian only, Vector-Structure to Vector modes. */
if (GET_MODE_SIZE (from) > GET_MODE_SIZE (to))
{
if (aarch64_vector_mode_supported_p (from)
@@ -8332,11 +8333,41 @@ aarch64_cannot_change_mode_class (enum machine_mode from,
if (aarch64_vector_mode_supported_p (from)
&& aarch64_vector_mode_supported_p (to))
return false;
+
+ /* Within an vector structure straddling multiple vector registers
+ we are in a mixed-endian representation. As such, we can't
+ easily change modes for BYTES_BIG_ENDIAN. Otherwise, we can
+ switch between vectors and vector structures cheaply. */
+ if (!BYTES_BIG_ENDIAN)
+ if ((aarch64_vector_mode_supported_p (from)
+ && aarch64_vect_struct_mode_p (to))
+ || (aarch64_vector_mode_supported_p (to)
+ && aarch64_vect_struct_mode_p (from)))
+ return false;
}
return true;
}
+/* Implement MODES_TIEABLE_P. */
+
+bool
+aarch64_modes_tieable_p (enum machine_mode mode1, enum machine_mode mode2)
+{
+ if (GET_MODE_CLASS (mode1) == GET_MODE_CLASS (mode2))
+ return true;
+
+ /* We specifically want to allow elements of "structure" modes to
+ be tieable to the structure. This more general condition allows
+ other rarer situations too. */
+ if (TARGET_SIMD
+ && aarch64_vector_mode_p (mode1)
+ && aarch64_vector_mode_p (mode2))
+ return true;
+
+ return false;
+}
+
#undef TARGET_ADDRESS_COST
#define TARGET_ADDRESS_COST aarch64_address_cost
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index e2b6c8e2908..c9b30d01865 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -365,8 +365,7 @@ extern unsigned long aarch64_tune_flags;
#define HARD_REGNO_MODE_OK(REGNO, MODE) aarch64_hard_regno_mode_ok (REGNO, MODE)
-#define MODES_TIEABLE_P(MODE1, MODE2) \
- (GET_MODE_CLASS (MODE1) == GET_MODE_CLASS (MODE2))
+#define MODES_TIEABLE_P(MODE1, MODE2) aarch64_modes_tieable_p (MODE1, MODE2)
#define DWARF2_UNWIND_INFO 1
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index 7965db4c9c7..266d7873a5a 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -98,6 +98,9 @@
UNSPEC_ST2
UNSPEC_ST3
UNSPEC_ST4
+ UNSPEC_ST2_LANE
+ UNSPEC_ST3_LANE
+ UNSPEC_ST4_LANE
UNSPEC_TLS
UNSPEC_TLSDESC
UNSPEC_USHL_2S
@@ -2426,6 +2429,25 @@
}
)
+(define_expand "mov<mode>cc"
+ [(set (match_operand:GPF 0 "register_operand" "")
+ (if_then_else:GPF (match_operand 1 "aarch64_comparison_operator" "")
+ (match_operand:GPF 2 "register_operand" "")
+ (match_operand:GPF 3 "register_operand" "")))]
+ ""
+ {
+ rtx ccreg;
+ enum rtx_code code = GET_CODE (operands[1]);
+
+ if (code == UNEQ || code == LTGT)
+ FAIL;
+
+ ccreg = aarch64_gen_compare_reg (code, XEXP (operands[1], 0),
+ XEXP (operands[1], 1));
+ operands[1] = gen_rtx_fmt_ee (code, VOIDmode, ccreg, const0_rtx);
+ }
+)
+
(define_insn "*csinc2<mode>_insn"
[(set (match_operand:GPI 0 "register_operand" "=r")
(plus:GPI (match_operator:GPI 2 "aarch64_comparison_operator"
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h
index 9f1fa98e6fb..f6213ce2aea 100644
--- a/gcc/config/aarch64/arm_neon.h
+++ b/gcc/config/aarch64/arm_neon.h
@@ -13199,929 +13199,6 @@ vtstq_p16 (poly16x8_t a, poly16x8_t b)
: /* No clobbers */);
return result;
}
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vuzp1_f32 (float32x2_t a, float32x2_t b)
-{
- float32x2_t result;
- __asm__ ("uzp1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vuzp1_p8 (poly8x8_t a, poly8x8_t b)
-{
- poly8x8_t result;
- __asm__ ("uzp1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vuzp1_p16 (poly16x4_t a, poly16x4_t b)
-{
- poly16x4_t result;
- __asm__ ("uzp1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vuzp1_s8 (int8x8_t a, int8x8_t b)
-{
- int8x8_t result;
- __asm__ ("uzp1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vuzp1_s16 (int16x4_t a, int16x4_t b)
-{
- int16x4_t result;
- __asm__ ("uzp1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vuzp1_s32 (int32x2_t a, int32x2_t b)
-{
- int32x2_t result;
- __asm__ ("uzp1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vuzp1_u8 (uint8x8_t a, uint8x8_t b)
-{
- uint8x8_t result;
- __asm__ ("uzp1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vuzp1_u16 (uint16x4_t a, uint16x4_t b)
-{
- uint16x4_t result;
- __asm__ ("uzp1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vuzp1_u32 (uint32x2_t a, uint32x2_t b)
-{
- uint32x2_t result;
- __asm__ ("uzp1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vuzp1q_f32 (float32x4_t a, float32x4_t b)
-{
- float32x4_t result;
- __asm__ ("uzp1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vuzp1q_f64 (float64x2_t a, float64x2_t b)
-{
- float64x2_t result;
- __asm__ ("uzp1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vuzp1q_p8 (poly8x16_t a, poly8x16_t b)
-{
- poly8x16_t result;
- __asm__ ("uzp1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vuzp1q_p16 (poly16x8_t a, poly16x8_t b)
-{
- poly16x8_t result;
- __asm__ ("uzp1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vuzp1q_s8 (int8x16_t a, int8x16_t b)
-{
- int8x16_t result;
- __asm__ ("uzp1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vuzp1q_s16 (int16x8_t a, int16x8_t b)
-{
- int16x8_t result;
- __asm__ ("uzp1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vuzp1q_s32 (int32x4_t a, int32x4_t b)
-{
- int32x4_t result;
- __asm__ ("uzp1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vuzp1q_s64 (int64x2_t a, int64x2_t b)
-{
- int64x2_t result;
- __asm__ ("uzp1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vuzp1q_u8 (uint8x16_t a, uint8x16_t b)
-{
- uint8x16_t result;
- __asm__ ("uzp1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vuzp1q_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint16x8_t result;
- __asm__ ("uzp1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vuzp1q_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint32x4_t result;
- __asm__ ("uzp1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vuzp1q_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint64x2_t result;
- __asm__ ("uzp1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vuzp2_f32 (float32x2_t a, float32x2_t b)
-{
- float32x2_t result;
- __asm__ ("uzp2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vuzp2_p8 (poly8x8_t a, poly8x8_t b)
-{
- poly8x8_t result;
- __asm__ ("uzp2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vuzp2_p16 (poly16x4_t a, poly16x4_t b)
-{
- poly16x4_t result;
- __asm__ ("uzp2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vuzp2_s8 (int8x8_t a, int8x8_t b)
-{
- int8x8_t result;
- __asm__ ("uzp2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vuzp2_s16 (int16x4_t a, int16x4_t b)
-{
- int16x4_t result;
- __asm__ ("uzp2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vuzp2_s32 (int32x2_t a, int32x2_t b)
-{
- int32x2_t result;
- __asm__ ("uzp2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vuzp2_u8 (uint8x8_t a, uint8x8_t b)
-{
- uint8x8_t result;
- __asm__ ("uzp2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vuzp2_u16 (uint16x4_t a, uint16x4_t b)
-{
- uint16x4_t result;
- __asm__ ("uzp2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vuzp2_u32 (uint32x2_t a, uint32x2_t b)
-{
- uint32x2_t result;
- __asm__ ("uzp2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vuzp2q_f32 (float32x4_t a, float32x4_t b)
-{
- float32x4_t result;
- __asm__ ("uzp2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vuzp2q_f64 (float64x2_t a, float64x2_t b)
-{
- float64x2_t result;
- __asm__ ("uzp2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vuzp2q_p8 (poly8x16_t a, poly8x16_t b)
-{
- poly8x16_t result;
- __asm__ ("uzp2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vuzp2q_p16 (poly16x8_t a, poly16x8_t b)
-{
- poly16x8_t result;
- __asm__ ("uzp2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vuzp2q_s8 (int8x16_t a, int8x16_t b)
-{
- int8x16_t result;
- __asm__ ("uzp2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vuzp2q_s16 (int16x8_t a, int16x8_t b)
-{
- int16x8_t result;
- __asm__ ("uzp2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vuzp2q_s32 (int32x4_t a, int32x4_t b)
-{
- int32x4_t result;
- __asm__ ("uzp2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vuzp2q_s64 (int64x2_t a, int64x2_t b)
-{
- int64x2_t result;
- __asm__ ("uzp2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vuzp2q_u8 (uint8x16_t a, uint8x16_t b)
-{
- uint8x16_t result;
- __asm__ ("uzp2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vuzp2q_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint16x8_t result;
- __asm__ ("uzp2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vuzp2q_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint32x4_t result;
- __asm__ ("uzp2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vuzp2q_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint64x2_t result;
- __asm__ ("uzp2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vzip1_f32 (float32x2_t a, float32x2_t b)
-{
- float32x2_t result;
- __asm__ ("zip1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vzip1_p8 (poly8x8_t a, poly8x8_t b)
-{
- poly8x8_t result;
- __asm__ ("zip1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vzip1_p16 (poly16x4_t a, poly16x4_t b)
-{
- poly16x4_t result;
- __asm__ ("zip1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vzip1_s8 (int8x8_t a, int8x8_t b)
-{
- int8x8_t result;
- __asm__ ("zip1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vzip1_s16 (int16x4_t a, int16x4_t b)
-{
- int16x4_t result;
- __asm__ ("zip1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vzip1_s32 (int32x2_t a, int32x2_t b)
-{
- int32x2_t result;
- __asm__ ("zip1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vzip1_u8 (uint8x8_t a, uint8x8_t b)
-{
- uint8x8_t result;
- __asm__ ("zip1 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vzip1_u16 (uint16x4_t a, uint16x4_t b)
-{
- uint16x4_t result;
- __asm__ ("zip1 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vzip1_u32 (uint32x2_t a, uint32x2_t b)
-{
- uint32x2_t result;
- __asm__ ("zip1 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vzip1q_f32 (float32x4_t a, float32x4_t b)
-{
- float32x4_t result;
- __asm__ ("zip1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vzip1q_f64 (float64x2_t a, float64x2_t b)
-{
- float64x2_t result;
- __asm__ ("zip1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vzip1q_p8 (poly8x16_t a, poly8x16_t b)
-{
- poly8x16_t result;
- __asm__ ("zip1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vzip1q_p16 (poly16x8_t a, poly16x8_t b)
-{
- poly16x8_t result;
- __asm__ ("zip1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vzip1q_s8 (int8x16_t a, int8x16_t b)
-{
- int8x16_t result;
- __asm__ ("zip1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vzip1q_s16 (int16x8_t a, int16x8_t b)
-{
- int16x8_t result;
- __asm__ ("zip1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vzip1q_s32 (int32x4_t a, int32x4_t b)
-{
- int32x4_t result;
- __asm__ ("zip1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vzip1q_s64 (int64x2_t a, int64x2_t b)
-{
- int64x2_t result;
- __asm__ ("zip1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vzip1q_u8 (uint8x16_t a, uint8x16_t b)
-{
- uint8x16_t result;
- __asm__ ("zip1 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vzip1q_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint16x8_t result;
- __asm__ ("zip1 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vzip1q_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint32x4_t result;
- __asm__ ("zip1 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vzip1q_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint64x2_t result;
- __asm__ ("zip1 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
-vzip2_f32 (float32x2_t a, float32x2_t b)
-{
- float32x2_t result;
- __asm__ ("zip2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
-vzip2_p8 (poly8x8_t a, poly8x8_t b)
-{
- poly8x8_t result;
- __asm__ ("zip2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
-vzip2_p16 (poly16x4_t a, poly16x4_t b)
-{
- poly16x4_t result;
- __asm__ ("zip2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
-vzip2_s8 (int8x8_t a, int8x8_t b)
-{
- int8x8_t result;
- __asm__ ("zip2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
-vzip2_s16 (int16x4_t a, int16x4_t b)
-{
- int16x4_t result;
- __asm__ ("zip2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
-vzip2_s32 (int32x2_t a, int32x2_t b)
-{
- int32x2_t result;
- __asm__ ("zip2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
-vzip2_u8 (uint8x8_t a, uint8x8_t b)
-{
- uint8x8_t result;
- __asm__ ("zip2 %0.8b,%1.8b,%2.8b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
-vzip2_u16 (uint16x4_t a, uint16x4_t b)
-{
- uint16x4_t result;
- __asm__ ("zip2 %0.4h,%1.4h,%2.4h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
-vzip2_u32 (uint32x2_t a, uint32x2_t b)
-{
- uint32x2_t result;
- __asm__ ("zip2 %0.2s,%1.2s,%2.2s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
-vzip2q_f32 (float32x4_t a, float32x4_t b)
-{
- float32x4_t result;
- __asm__ ("zip2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
-vzip2q_f64 (float64x2_t a, float64x2_t b)
-{
- float64x2_t result;
- __asm__ ("zip2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
-vzip2q_p8 (poly8x16_t a, poly8x16_t b)
-{
- poly8x16_t result;
- __asm__ ("zip2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
-vzip2q_p16 (poly16x8_t a, poly16x8_t b)
-{
- poly16x8_t result;
- __asm__ ("zip2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
-vzip2q_s8 (int8x16_t a, int8x16_t b)
-{
- int8x16_t result;
- __asm__ ("zip2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
-vzip2q_s16 (int16x8_t a, int16x8_t b)
-{
- int16x8_t result;
- __asm__ ("zip2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
-vzip2q_s32 (int32x4_t a, int32x4_t b)
-{
- int32x4_t result;
- __asm__ ("zip2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
-vzip2q_s64 (int64x2_t a, int64x2_t b)
-{
- int64x2_t result;
- __asm__ ("zip2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
-vzip2q_u8 (uint8x16_t a, uint8x16_t b)
-{
- uint8x16_t result;
- __asm__ ("zip2 %0.16b,%1.16b,%2.16b"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
-vzip2q_u16 (uint16x8_t a, uint16x8_t b)
-{
- uint16x8_t result;
- __asm__ ("zip2 %0.8h,%1.8h,%2.8h"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
-vzip2q_u32 (uint32x4_t a, uint32x4_t b)
-{
- uint32x4_t result;
- __asm__ ("zip2 %0.4s,%1.4s,%2.4s"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
-
-__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
-vzip2q_u64 (uint64x2_t a, uint64x2_t b)
-{
- uint64x2_t result;
- __asm__ ("zip2 %0.2d,%1.2d,%2.2d"
- : "=w"(result)
- : "w"(a), "w"(b)
- : /* No clobbers */);
- return result;
-}
/* End of temporary inline asm implementations. */
@@ -14452,131 +13529,224 @@ __LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
-#define __ST2_LANE_FUNC(intype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- typedef struct { ptrtype __x[2]; } __ST2_LANE_STRUCTURE_##intype; \
- __extension__ static __inline void \
- __attribute__ ((__always_inline__)) \
- vst2 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \
- intype b, const int c) \
- { \
- __ST2_LANE_STRUCTURE_##intype *__p = \
- (__ST2_LANE_STRUCTURE_##intype *)ptr; \
- __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \
- "st2 {v16." #lnsuffix ", v17." #lnsuffix "}[%2], %0\n\t" \
- : "=Q"(*__p) \
- : "Q"(b), "i"(c) \
- : "v16", "v17"); \
- }
-
-__ST2_LANE_FUNC (int8x8x2_t, int8_t, 8b, b, s8,)
-__ST2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,)
-__ST2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,)
-__ST2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,)
-__ST2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,)
-__ST2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,)
-__ST2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,)
-__ST2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,)
-__ST2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,)
-__ST2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,)
-__ST2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,)
-__ST2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,)
-__ST2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q)
-__ST2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q)
-__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q)
-__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q)
-__ST2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q)
-__ST2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q)
-__ST2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q)
-__ST2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q)
-__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q)
-__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q)
-__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q)
-__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q)
-
-#define __ST3_LANE_FUNC(intype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- typedef struct { ptrtype __x[3]; } __ST3_LANE_STRUCTURE_##intype; \
- __extension__ static __inline void \
- __attribute__ ((__always_inline__)) \
- vst3 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \
- intype b, const int c) \
- { \
- __ST3_LANE_STRUCTURE_##intype *__p = \
- (__ST3_LANE_STRUCTURE_##intype *)ptr; \
- __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \
- "st3 {v16." #lnsuffix " - v18." #lnsuffix "}[%2], %0\n\t" \
- : "=Q"(*__p) \
- : "Q"(b), "i"(c) \
- : "v16", "v17", "v18"); \
- }
-
-__ST3_LANE_FUNC (int8x8x3_t, int8_t, 8b, b, s8,)
-__ST3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,)
-__ST3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,)
-__ST3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,)
-__ST3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,)
-__ST3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,)
-__ST3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,)
-__ST3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,)
-__ST3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,)
-__ST3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,)
-__ST3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,)
-__ST3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,)
-__ST3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q)
-__ST3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q)
-__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q)
-__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q)
-__ST3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q)
-__ST3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q)
-__ST3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q)
-__ST3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q)
-__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q)
-__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q)
-__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q)
-__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q)
-
-#define __ST4_LANE_FUNC(intype, ptrtype, regsuffix, \
- lnsuffix, funcsuffix, Q) \
- typedef struct { ptrtype __x[4]; } __ST4_LANE_STRUCTURE_##intype; \
- __extension__ static __inline void \
- __attribute__ ((__always_inline__)) \
- vst4 ## Q ## _lane_ ## funcsuffix (ptrtype *ptr, \
- intype b, const int c) \
- { \
- __ST4_LANE_STRUCTURE_##intype *__p = \
- (__ST4_LANE_STRUCTURE_##intype *)ptr; \
- __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \
- "st4 {v16." #lnsuffix " - v19." #lnsuffix "}[%2], %0\n\t" \
- : "=Q"(*__p) \
- : "Q"(b), "i"(c) \
- : "v16", "v17", "v18", "v19"); \
- }
-
-__ST4_LANE_FUNC (int8x8x4_t, int8_t, 8b, b, s8,)
-__ST4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,)
-__ST4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,)
-__ST4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,)
-__ST4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,)
-__ST4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,)
-__ST4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,)
-__ST4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,)
-__ST4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,)
-__ST4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,)
-__ST4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,)
-__ST4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,)
-__ST4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q)
-__ST4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q)
-__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q)
-__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q)
-__ST4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q)
-__ST4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q)
-__ST4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q)
-__ST4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q)
-__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q)
-__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q)
-__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q)
-__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q)
+#define __ST2_LANE_FUNC(intype, largetype, ptrtype, \
+ mode, ptr_mode, funcsuffix, signedtype) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst2_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_oi __o; \
+ largetype __temp; \
+ __temp.val[0] \
+ = vcombine_##funcsuffix (__b.val[0], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[1] \
+ = vcombine_##funcsuffix (__b.val[1], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[0], 0); \
+ __o = __builtin_aarch64_set_qregoi##mode (__o, \
+ (signedtype) __temp.val[1], 1); \
+ __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __o, __c); \
+}
+
+__ST2_LANE_FUNC (float32x2x2_t, float32x4x2_t, float32_t, v4sf, sf, f32,
+ float32x4_t)
+__ST2_LANE_FUNC (float64x1x2_t, float64x2x2_t, float64_t, v2df, df, f64,
+ float64x2_t)
+__ST2_LANE_FUNC (poly8x8x2_t, poly8x16x2_t, poly8_t, v16qi, qi, p8, int8x16_t)
+__ST2_LANE_FUNC (poly16x4x2_t, poly16x8x2_t, poly16_t, v8hi, hi, p16,
+ int16x8_t)
+__ST2_LANE_FUNC (int8x8x2_t, int8x16x2_t, int8_t, v16qi, qi, s8, int8x16_t)
+__ST2_LANE_FUNC (int16x4x2_t, int16x8x2_t, int16_t, v8hi, hi, s16, int16x8_t)
+__ST2_LANE_FUNC (int32x2x2_t, int32x4x2_t, int32_t, v4si, si, s32, int32x4_t)
+__ST2_LANE_FUNC (int64x1x2_t, int64x2x2_t, int64_t, v2di, di, s64, int64x2_t)
+__ST2_LANE_FUNC (uint8x8x2_t, uint8x16x2_t, uint8_t, v16qi, qi, u8, int8x16_t)
+__ST2_LANE_FUNC (uint16x4x2_t, uint16x8x2_t, uint16_t, v8hi, hi, u16,
+ int16x8_t)
+__ST2_LANE_FUNC (uint32x2x2_t, uint32x4x2_t, uint32_t, v4si, si, u32,
+ int32x4_t)
+__ST2_LANE_FUNC (uint64x1x2_t, uint64x2x2_t, uint64_t, v2di, di, u64,
+ int64x2_t)
+
+#undef __ST2_LANE_FUNC
+#define __ST2_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst2q_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_oi __o; } __temp = { __b }; \
+ __builtin_aarch64_st2_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __temp.__o, __c); \
+}
+
+__ST2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32)
+__ST2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64)
+__ST2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8)
+__ST2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16)
+__ST2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8)
+__ST2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16)
+__ST2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32)
+__ST2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64)
+__ST2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8)
+__ST2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16)
+__ST2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32)
+__ST2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64)
+
+#define __ST3_LANE_FUNC(intype, largetype, ptrtype, \
+ mode, ptr_mode, funcsuffix, signedtype) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst3_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_ci __o; \
+ largetype __temp; \
+ __temp.val[0] \
+ = vcombine_##funcsuffix (__b.val[0], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[1] \
+ = vcombine_##funcsuffix (__b.val[1], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[2] \
+ = vcombine_##funcsuffix (__b.val[2], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[0], 0); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[1], 1); \
+ __o = __builtin_aarch64_set_qregci##mode (__o, \
+ (signedtype) __temp.val[2], 2); \
+ __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __o, __c); \
+}
+
+__ST3_LANE_FUNC (float32x2x3_t, float32x4x3_t, float32_t, v4sf, sf, f32,
+ float32x4_t)
+__ST3_LANE_FUNC (float64x1x3_t, float64x2x3_t, float64_t, v2df, df, f64,
+ float64x2_t)
+__ST3_LANE_FUNC (poly8x8x3_t, poly8x16x3_t, poly8_t, v16qi, qi, p8, int8x16_t)
+__ST3_LANE_FUNC (poly16x4x3_t, poly16x8x3_t, poly16_t, v8hi, hi, p16,
+ int16x8_t)
+__ST3_LANE_FUNC (int8x8x3_t, int8x16x3_t, int8_t, v16qi, qi, s8, int8x16_t)
+__ST3_LANE_FUNC (int16x4x3_t, int16x8x3_t, int16_t, v8hi, hi, s16, int16x8_t)
+__ST3_LANE_FUNC (int32x2x3_t, int32x4x3_t, int32_t, v4si, si, s32, int32x4_t)
+__ST3_LANE_FUNC (int64x1x3_t, int64x2x3_t, int64_t, v2di, di, s64, int64x2_t)
+__ST3_LANE_FUNC (uint8x8x3_t, uint8x16x3_t, uint8_t, v16qi, qi, u8, int8x16_t)
+__ST3_LANE_FUNC (uint16x4x3_t, uint16x8x3_t, uint16_t, v8hi, hi, u16,
+ int16x8_t)
+__ST3_LANE_FUNC (uint32x2x3_t, uint32x4x3_t, uint32_t, v4si, si, u32,
+ int32x4_t)
+__ST3_LANE_FUNC (uint64x1x3_t, uint64x2x3_t, uint64_t, v2di, di, u64,
+ int64x2_t)
+
+#undef __ST3_LANE_FUNC
+#define __ST3_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst3q_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_ci __o; } __temp = { __b }; \
+ __builtin_aarch64_st3_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __temp.__o, __c); \
+}
+
+__ST3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32)
+__ST3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64)
+__ST3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8)
+__ST3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16)
+__ST3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8)
+__ST3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16)
+__ST3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32)
+__ST3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64)
+__ST3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8)
+__ST3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16)
+__ST3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32)
+__ST3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64)
+
+#define __ST4_LANE_FUNC(intype, largetype, ptrtype, \
+ mode, ptr_mode, funcsuffix, signedtype) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst4_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ __builtin_aarch64_simd_xi __o; \
+ largetype __temp; \
+ __temp.val[0] \
+ = vcombine_##funcsuffix (__b.val[0], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[1] \
+ = vcombine_##funcsuffix (__b.val[1], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[2] \
+ = vcombine_##funcsuffix (__b.val[2], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __temp.val[3] \
+ = vcombine_##funcsuffix (__b.val[3], \
+ vcreate_##funcsuffix (__AARCH64_UINT64_C (0))); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[0], 0); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[1], 1); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[2], 2); \
+ __o = __builtin_aarch64_set_qregxi##mode (__o, \
+ (signedtype) __temp.val[3], 3); \
+ __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __o, __c); \
+}
+
+__ST4_LANE_FUNC (float32x2x4_t, float32x4x4_t, float32_t, v4sf, sf, f32,
+ float32x4_t)
+__ST4_LANE_FUNC (float64x1x4_t, float64x2x4_t, float64_t, v2df, df, f64,
+ float64x2_t)
+__ST4_LANE_FUNC (poly8x8x4_t, poly8x16x4_t, poly8_t, v16qi, qi, p8, int8x16_t)
+__ST4_LANE_FUNC (poly16x4x4_t, poly16x8x4_t, poly16_t, v8hi, hi, p16,
+ int16x8_t)
+__ST4_LANE_FUNC (int8x8x4_t, int8x16x4_t, int8_t, v16qi, qi, s8, int8x16_t)
+__ST4_LANE_FUNC (int16x4x4_t, int16x8x4_t, int16_t, v8hi, hi, s16, int16x8_t)
+__ST4_LANE_FUNC (int32x2x4_t, int32x4x4_t, int32_t, v4si, si, s32, int32x4_t)
+__ST4_LANE_FUNC (int64x1x4_t, int64x2x4_t, int64_t, v2di, di, s64, int64x2_t)
+__ST4_LANE_FUNC (uint8x8x4_t, uint8x16x4_t, uint8_t, v16qi, qi, u8, int8x16_t)
+__ST4_LANE_FUNC (uint16x4x4_t, uint16x8x4_t, uint16_t, v8hi, hi, u16,
+ int16x8_t)
+__ST4_LANE_FUNC (uint32x2x4_t, uint32x4x4_t, uint32_t, v4si, si, u32,
+ int32x4_t)
+__ST4_LANE_FUNC (uint64x1x4_t, uint64x2x4_t, uint64_t, v2di, di, u64,
+ int64x2_t)
+
+#undef __ST4_LANE_FUNC
+#define __ST4_LANE_FUNC(intype, ptrtype, mode, ptr_mode, funcsuffix) \
+__extension__ static __inline void \
+__attribute__ ((__always_inline__)) \
+vst4q_lane_ ## funcsuffix (ptrtype *__ptr, \
+ intype __b, const int __c) \
+{ \
+ union { intype __i; \
+ __builtin_aarch64_simd_xi __o; } __temp = { __b }; \
+ __builtin_aarch64_st4_lane##mode ((__builtin_aarch64_simd_ ## ptr_mode *) \
+ __ptr, __temp.__o, __c); \
+}
+
+__ST4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32)
+__ST4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64)
+__ST4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8)
+__ST4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16)
+__ST4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8)
+__ST4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16)
+__ST4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32)
+__ST4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64)
+__ST4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8)
+__ST4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16)
+__ST4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32)
+__ST4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64)
__extension__ static __inline int64_t __attribute__ ((__always_inline__))
vaddlv_s32 (int32x2_t a)
@@ -25614,10 +24784,880 @@ vuqaddd_s64 (int64x1_t __a, uint64x1_t __b)
/* vuzp */
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vuzp1_f32 (float32x2_t __a, float32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vuzp1_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vuzp1_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vuzp1_s8 (int8x8_t __a, int8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vuzp1_s16 (int16x4_t __a, int16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vuzp1_s32 (int32x2_t __a, int32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vuzp1_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vuzp1_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vuzp1_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vuzp1q_f32 (float32x4_t __a, float32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vuzp1q_f64 (float64x2_t __a, float64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vuzp1q_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+#endif
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vuzp1q_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vuzp1q_s8 (int8x16_t __a, int8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+#endif
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vuzp1q_s16 (int16x8_t __a, int16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vuzp1q_s32 (int32x4_t __a, int32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vuzp1q_s64 (int64x2_t __a, int64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vuzp1q_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {17, 19, 21, 23, 25, 27, 29, 31, 1, 3, 5, 7, 9, 11, 13, 15});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+#endif
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vuzp1q_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {9, 11, 13, 15, 1, 3, 5, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 2, 4, 6, 8, 10, 12, 14});
+#endif
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vuzp1q_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {5, 7, 1, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 2, 4, 6});
+#endif
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vuzp1q_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vuzp2_f32 (float32x2_t __a, float32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vuzp2_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vuzp2_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vuzp2_s8 (int8x8_t __a, int8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vuzp2_s16 (int16x4_t __a, int16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vuzp2_s32 (int32x2_t __a, int32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vuzp2_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vuzp2_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vuzp2_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vuzp2q_f32 (float32x4_t __a, float32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vuzp2q_f64 (float64x2_t __a, float64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vuzp2q_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+#endif
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vuzp2q_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vuzp2q_s8 (int8x16_t __a, int8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
+#else
+ return __builtin_shuffle (__a, __b,
+ (uint8x16_t) {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+#endif
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vuzp2q_s16 (int16x8_t __a, int16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vuzp2q_s32 (int32x4_t __a, int32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vuzp2q_s64 (int64x2_t __a, int64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vuzp2q_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {16, 18, 20, 22, 24, 26, 28, 30, 0, 2, 4, 6, 8, 10, 12, 14});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31});
+#endif
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vuzp2q_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 10, 12, 14, 0, 2, 4, 6});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {1, 3, 5, 7, 9, 11, 13, 15});
+#endif
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vuzp2q_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 6, 0, 2});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {1, 3, 5, 7});
+#endif
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vuzp2q_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
__INTERLEAVE_LIST (uzp)
/* vzip */
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vzip1_f32 (float32x2_t __a, float32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vzip1_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vzip1_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vzip1_s8 (int8x8_t __a, int8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vzip1_s16 (int16x4_t __a, int16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vzip1_s32 (int32x2_t __a, int32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vzip1_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vzip1_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vzip1_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vzip1q_f32 (float32x4_t __a, float32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vzip1q_f64 (float64x2_t __a, float64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vzip1q_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
+#endif
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vzip1q_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vzip1q_s8 (int8x16_t __a, int8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
+#endif
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vzip1q_s16 (int16x8_t __a, int16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vzip1q_s32 (int32x4_t __a, int32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vzip1q_s64 (int64x2_t __a, int64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vzip1q_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {24, 8, 25, 9, 26, 10, 27, 11, 28, 12, 29, 13, 30, 14, 31, 15});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23});
+#endif
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vzip1q_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {12, 4, 13, 5, 14, 6, 15, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {0, 8, 1, 9, 2, 10, 3, 11});
+#endif
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vzip1q_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {6, 2, 7, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {0, 4, 1, 5});
+#endif
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vzip1q_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {3, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {0, 2});
+#endif
+}
+
+__extension__ static __inline float32x2_t __attribute__ ((__always_inline__))
+vzip2_f32 (float32x2_t __a, float32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline poly8x8_t __attribute__ ((__always_inline__))
+vzip2_p8 (poly8x8_t __a, poly8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline poly16x4_t __attribute__ ((__always_inline__))
+vzip2_p16 (poly16x4_t __a, poly16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline int8x8_t __attribute__ ((__always_inline__))
+vzip2_s8 (int8x8_t __a, int8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline int16x4_t __attribute__ ((__always_inline__))
+vzip2_s16 (int16x4_t __a, int16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline int32x2_t __attribute__ ((__always_inline__))
+vzip2_s32 (int32x2_t __a, int32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline uint8x8_t __attribute__ ((__always_inline__))
+vzip2_u8 (uint8x8_t __a, uint8x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x8_t) {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline uint16x4_t __attribute__ ((__always_inline__))
+vzip2_u16 (uint16x4_t __a, uint16x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline uint32x2_t __attribute__ ((__always_inline__))
+vzip2_u32 (uint32x2_t __a, uint32x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline float32x4_t __attribute__ ((__always_inline__))
+vzip2q_f32 (float32x4_t __a, float32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline float64x2_t __attribute__ ((__always_inline__))
+vzip2q_f64 (float64x2_t __a, float64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline poly8x16_t __attribute__ ((__always_inline__))
+vzip2q_p8 (poly8x16_t __a, poly8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
+#endif
+}
+
+__extension__ static __inline poly16x8_t __attribute__ ((__always_inline__))
+vzip2q_p16 (poly16x8_t __a, poly16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline int8x16_t __attribute__ ((__always_inline__))
+vzip2q_s8 (int8x16_t __a, int8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
+#endif
+}
+
+__extension__ static __inline int16x8_t __attribute__ ((__always_inline__))
+vzip2q_s16 (int16x8_t __a, int16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline int32x4_t __attribute__ ((__always_inline__))
+vzip2q_s32 (int32x4_t __a, int32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline int64x2_t __attribute__ ((__always_inline__))
+vzip2q_s64 (int64x2_t __a, int64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
+__extension__ static __inline uint8x16_t __attribute__ ((__always_inline__))
+vzip2q_u8 (uint8x16_t __a, uint8x16_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {16, 0, 17, 1, 18, 2, 19, 3, 20, 4, 21, 5, 22, 6, 23, 7});
+#else
+ return __builtin_shuffle (__a, __b, (uint8x16_t)
+ {8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31});
+#endif
+}
+
+__extension__ static __inline uint16x8_t __attribute__ ((__always_inline__))
+vzip2q_u16 (uint16x8_t __a, uint16x8_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint16x8_t) {8, 0, 9, 1, 10, 2, 11, 3});
+#else
+ return __builtin_shuffle (__a, __b, (uint16x8_t)
+ {4, 12, 5, 13, 6, 14, 7, 15});
+#endif
+}
+
+__extension__ static __inline uint32x4_t __attribute__ ((__always_inline__))
+vzip2q_u32 (uint32x4_t __a, uint32x4_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {4, 0, 5, 1});
+#else
+ return __builtin_shuffle (__a, __b, (uint32x4_t) {2, 6, 3, 7});
+#endif
+}
+
+__extension__ static __inline uint64x2_t __attribute__ ((__always_inline__))
+vzip2q_u64 (uint64x2_t __a, uint64x2_t __b)
+{
+#ifdef __AARCH64EB__
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {2, 0});
+#else
+ return __builtin_shuffle (__a, __b, (uint64x2_t) {1, 3});
+#endif
+}
+
__INTERLEAVE_LIST (zip)
#undef __INTERLEAVE_LIST
diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md
index fd1eb482f0f..c537c3780ee 100644
--- a/gcc/config/aarch64/iterators.md
+++ b/gcc/config/aarch64/iterators.md
@@ -564,6 +564,32 @@
(define_mode_attr VSTRUCT_DREG [(OI "TI") (CI "EI") (XI "OI")])
+;; Mode of pair of elements for each vector mode, to define transfer
+;; size for structure lane/dup loads and stores.
+(define_mode_attr V_TWO_ELEM [(V8QI "HI") (V16QI "HI")
+ (V4HI "SI") (V8HI "SI")
+ (V2SI "V2SI") (V4SI "V2SI")
+ (DI "V2DI") (V2DI "V2DI")
+ (V2SF "V2SF") (V4SF "V2SF")
+ (DF "V2DI") (V2DF "V2DI")])
+
+;; Similar, for three elements.
+(define_mode_attr V_THREE_ELEM [(V8QI "BLK") (V16QI "BLK")
+ (V4HI "BLK") (V8HI "BLK")
+ (V2SI "BLK") (V4SI "BLK")
+ (DI "EI") (V2DI "EI")
+ (V2SF "BLK") (V4SF "BLK")
+ (DF "EI") (V2DF "EI")])
+
+;; Similar, for four elements.
+(define_mode_attr V_FOUR_ELEM [(V8QI "SI") (V16QI "SI")
+ (V4HI "V4HI") (V8HI "V4HI")
+ (V2SI "V4SI") (V4SI "V4SI")
+ (DI "OI") (V2DI "OI")
+ (V2SF "V4SF") (V4SF "V4SF")
+ (DF "OI") (V2DF "OI")])
+
+
;; Mode for atomic operation suffixes
(define_mode_attr atomic_sfx
[(QI "b") (HI "h") (SI "") (DI "")])