summaryrefslogtreecommitdiff
path: root/gcc/config/i386/emmintrin.h
diff options
context:
space:
mode:
authorrth <rth@138bc75d-0d04-0410-961f-82ee72b054a4>2005-01-11 21:33:14 +0000
committerrth <rth@138bc75d-0d04-0410-961f-82ee72b054a4>2005-01-11 21:33:14 +0000
commitad2c46cf58ec0f1ec0328005b016ba8159c34530 (patch)
tree41dfb70d2dcb0969d9a1eb218ace89a7d8b7331b /gcc/config/i386/emmintrin.h
parenta633f77e84acdcbfdd803d817eaf744012080639 (diff)
downloadgcc-ad2c46cf58ec0f1ec0328005b016ba8159c34530.tar.gz
PR target/13366
* config/i386/i386.h (enum ix86_builtins): Move ... * config/i386/i386.c: ... here. (IX86_BUILTIN_MOVDDUP, IX86_BUILTIN_MMX_ZERO, IX86_BUILTIN_PEXTRW, IX86_BUILTIN_PINSRW, IX86_BUILTIN_LOADAPS, IX86_BUILTIN_LOADSS, IX86_BUILTIN_STORESS, IX86_BUILTIN_SSE_ZERO, IX86_BUILTIN_PEXTRW128, IX86_BUILTIN_PINSRW128, IX86_BUILTIN_LOADAPD, IX86_BUILTIN_LOADSD, IX86_BUILTIN_STOREAPD, IX86_BUILTIN_STORESD, IX86_BUILTIN_STOREHPD, IX86_BUILTIN_STORELPD, IX86_BUILTIN_SETPD1, IX86_BUILTIN_SETPD, IX86_BUILTIN_CLRPD, IX86_BUILTIN_LOADPD1, IX86_BUILTIN_LOADRPD, IX86_BUILTIN_STOREPD1, IX86_BUILTIN_STORERPD, IX86_BUILTIN_LOADDQA, IX86_BUILTIN_STOREDQA, IX86_BUILTIN_CLRTI, IX86_BUILTIN_LOADDDUP): Remove. (IX86_BUILTIN_VEC_INIT_V2SI, IX86_BUILTIN_VEC_INIT_V4HI, IX86_BUILTIN_VEC_INIT_V8QI, IX86_BUILTIN_VEC_EXT_V2DF, IX86_BUILTIN_VEC_EXT_V2DI, IX86_BUILTIN_VEC_EXT_V4SF, IX86_BUILTIN_VEC_EXT_V8HI, IX86_BUILTIN_VEC_EXT_V4HI, IX86_BUILTIN_VEC_SET_V8HI, IX86_BUILTIN_VEC_SET_V4HI): New. (ix86_init_builtins): Make static. (ix86_init_mmx_sse_builtins): Update for changed builtins. (ix86_expand_binop_builtin): Only use ix86_fixup_binary_operands if all the modes match. Otherwise, fake it. (get_element_number, ix86_expand_vec_init_builtin, ix86_expand_vec_ext_builtin, ix86_expand_vec_set_builtin): New. (ix86_expand_builtin): Make static. Update for changed builtins. (ix86_expand_vector_move_misalign): Use sse2_loadlpd with zero operand instead of sse2_loadsd. Cast sse1 fallback to V4SFmode. (ix86_expand_vector_init_duplicate): New. (ix86_expand_vector_init_low_nonzero): New. (ix86_expand_vector_init_one_var, ix86_expand_vector_init_general): Split out from ix86_expand_vector_init; handle integer modes. (ix86_expand_vector_init): Use them. (ix86_expand_vector_set, ix86_expand_vector_extract): New. * config/i386/i386-protos.h: Update. * config/i386/predicates.md (reg_or_0_operand): New. * config/i386/mmx.md (mov<MMXMODEI>_internal): Add 'r' variants. (movv2sf_internal): Likewise. 
And a splitter to match them all. (vec_dupv2sf, mmx_concatv2sf, vec_setv2sf, vec_extractv2sf, vec_initv2sf, vec_dupv4hi, vec_dupv2si, mmx_concatv2si, vec_setv2si, vec_extractv2si, vec_initv2si, vec_setv4hi, vec_extractv4hi, vec_initv4hi, vec_setv8qi, vec_extractv8qi, vec_initv8qi): New. (mmx_pinsrw): Fix operand ordering. * config/i386/sse.md (movv4sf splitter): Use direct pattern, rather than sse_loadss expander. (movv2df splitter): Similarly. (sse_loadss, sse_loadlss): Remove. (vec_dupv4sf, sse_concatv2sf, sse_concatv4sf, vec_extractv4sf_0): New. (vec_setv4sf, vec_setv2df): Use ix86_expand_vector_set. (vec_extractv4sf, vec_extractv2df): Use ix86_expand_vector_extract. (sse3_movddup): Rename with '*'. (sse3_movddup splitter): Use gen_rtx_REG instead of gen_lowpart. (sse2_loadsd): Remove. (vec_dupv2df_sse3): Rename from sse3_loadddup. (vec_dupv2df, vec_concatv2df_sse3, vec_concatv2df): New. (sse2_pinsrw): Fix argument ordering. (sse2_loadld, sse2_loadq): Add sse1 alternatives. (sse2_stored): Remove 'r' destination. (vec_dupv4si, vec_dupv2di, sse2_concatv2si, sse1_concatv2si, vec_concatv4si_1, vec_concatv2di, vec_setv2di, vec_extractv2di, vec_initv2di, vec_setv4si, vec_extractv4si, vec_initv4si, vec_setv8hi, vec_extractv8hi, vec_initv8hi, vec_setv16qi, vec_extractv16qi, vec_initv16qi): New. * config/i386/emmintrin.h (__m128i, __m128d): Use typedef, not define. (_mm_set_sd, _mm_set1_pd, _mm_setzero_pd, _mm_set_epi64x, _mm_set_epi32, _mm_set_epi16, _mm_set_epi8, _mm_setzero_si128): Use constructor form. (_mm_load_pd, _mm_store_pd): Use plain dereference. (_mm_load_si128, _mm_store_si128): Likewise. (_mm_load1_pd): Use _mm_set1_pd. (_mm_load_sd): Use _mm_set_sd. (_mm_store_sd, _mm_storeh_pd): Use __builtin_ia32_vec_ext_v2df. (_mm_store1_pd, _mm_storer_pd): Use _mm_store_pd. (_mm_set_epi64): Use _mm_set_epi64x. 
(_mm_set1_epi64x, _mm_set1_epi64, _mm_set1_epi32, _mm_set_epi16, _mm_set1_epi8, _mm_setr_epi64, _mm_setr_epi32, _mm_setr_epi16, _mm_setr_epi8): Use _mm_set_foo form. (_mm_loadl_epi64, _mm_movpi64_epi64, _mm_move_epi64): Use _mm_set_epi64. (_mm_storel_epi64, _mm_movepi64_pi64): Use __builtin_ia32_vec_ext_v2di. (_mm_extract_epi16): Use __builtin_ia32_vec_ext_v8hi. (_mm_insert_epi16): Use __builtin_ia32_vec_set_v8hi. * config/i386/mmintrin.h (_mm_setzero_si64): Use plain cast. (_mm_set_pi32): Use __builtin_ia32_vec_init_v2si. (_mm_set_pi16): Use __builtin_ia32_vec_init_v4hi. (_mm_set_pi8): Use __builtin_ia32_vec_init_v8qi. (_mm_set1_pi16, _mm_set1_pi8): Use _mm_set_piN variant. * config/i386/pmmintrin.h (_mm_loaddup_pd): Use _mm_load1_pd. (_mm_movedup_pd): Use _mm_shuffle_pd. * config/i386/xmmintrin.h (_mm_setzero_ps, _mm_set_ss, _mm_set1_ps, _mm_set_ps, _mm_setr_ps): Use constructor form. (_mm_cvtpi16_ps, _mm_cvtpu16_ps, _mm_cvtpi8_ps, _mm_cvtpu8_ps, _mm_cvtps_pi8, _mm_cvtpi32x2_ps): Avoid __builtin_ia32_mmx_zero; Use _mm_setzero_ps. (_mm_load_ss, _mm_load1_ps): Use _mm_set* form. (_mm_load_ps, _mm_loadr_ps): Use raw dereference. (_mm_store_ss): Use __builtin_ia32_vec_ext_v4sf. (_mm_store_ps): Use raw dereference. (_mm_store1_ps): Use _mm_storeu_ps. (_mm_storer_ps): Use _mm_store_ps. (_mm_extract_pi16): Use __builtin_ia32_vec_ext_v4hi. (_mm_insert_pi16): Use __builtin_ia32_vec_set_v4hi. git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@93199 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/config/i386/emmintrin.h')
-rw-r--r--gcc/config/i386/emmintrin.h439
1 files changed, 169 insertions, 270 deletions
diff --git a/gcc/config/i386/emmintrin.h b/gcc/config/i386/emmintrin.h
index 2d2b710d734..aa7b25e7504 100644
--- a/gcc/config/i386/emmintrin.h
+++ b/gcc/config/i386/emmintrin.h
@@ -40,141 +40,156 @@ typedef int __v4si __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__ ((__vector_size__ (16)));
typedef char __v16qi __attribute__ ((__vector_size__ (16)));
+typedef __v2di __m128i;
+typedef __v2df __m128d;
+
/* Create a selector for use with the SHUFPD instruction. */
#define _MM_SHUFFLE2(fp1,fp0) \
(((fp1) << 1) | (fp0))
-#define __m128i __v2di
-#define __m128d __v2df
+/* Create a vector with element 0 as F and the rest zero. */
+static __inline __m128d
+_mm_set_sd (double __F)
+{
+ return (__m128d){ __F, 0 };
+}
-/* Create a vector with element 0 as *P and the rest zero. */
+/* Create a vector with both elements equal to F. */
static __inline __m128d
-_mm_load_sd (double const *__P)
+_mm_set1_pd (double __F)
{
- return (__m128d) __builtin_ia32_loadsd (__P);
+ return (__m128d){ __F, __F };
}
-/* Create a vector with all two elements equal to *P. */
static __inline __m128d
-_mm_load1_pd (double const *__P)
+_mm_set_pd1 (double __F)
{
- __v2df __tmp = __builtin_ia32_loadsd (__P);
- return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
+ return _mm_set1_pd (__F);
}
+/* Create a vector with the lower value X and upper value W. */
static __inline __m128d
-_mm_load_pd1 (double const *__P)
+_mm_set_pd (double __W, double __X)
{
- return _mm_load1_pd (__P);
+ return (__m128d){ __X, __W };
}
-/* Load two DPFP values from P. The address must be 16-byte aligned. */
+/* Create a vector with the lower value W and upper value X. */
static __inline __m128d
-_mm_load_pd (double const *__P)
+_mm_setr_pd (double __W, double __X)
{
- return (__m128d) __builtin_ia32_loadapd (__P);
+ return (__m128d){ __W, __X };
}
-/* Load two DPFP values from P. The address need not be 16-byte aligned. */
+/* Create a vector of zeros. */
static __inline __m128d
-_mm_loadu_pd (double const *__P)
+_mm_setzero_pd (void)
{
- return (__m128d) __builtin_ia32_loadupd (__P);
+ return (__m128d){ 0.0, 0.0 };
}
-/* Load two DPFP values in reverse order. The address must be aligned. */
+/* Sets the low DPFP value of A from the low value of B. */
static __inline __m128d
-_mm_loadr_pd (double const *__P)
+_mm_move_sd (__m128d __A, __m128d __B)
{
- __v2df __tmp = __builtin_ia32_loadapd (__P);
- return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
+ return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
}
-/* Create a vector with element 0 as F and the rest zero. */
+/* Load two DPFP values from P. The address must be 16-byte aligned. */
static __inline __m128d
-_mm_set_sd (double __F)
+_mm_load_pd (double const *__P)
{
- return (__m128d) __builtin_ia32_loadsd (&__F);
+ return *(__m128d *)__P;
}
-/* Create a vector with all two elements equal to F. */
+/* Load two DPFP values from P. The address need not be 16-byte aligned. */
static __inline __m128d
-_mm_set1_pd (double __F)
+_mm_loadu_pd (double const *__P)
{
- __v2df __tmp = __builtin_ia32_loadsd (&__F);
- return (__m128d) __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,0));
+ return __builtin_ia32_loadupd (__P);
}
+/* Create a vector with all two elements equal to *P. */
static __inline __m128d
-_mm_set_pd1 (double __F)
+_mm_load1_pd (double const *__P)
{
- return _mm_set1_pd (__F);
+ return _mm_set1_pd (*__P);
}
-/* Create the vector [Z Y]. */
+/* Create a vector with element 0 as *P and the rest zero. */
static __inline __m128d
-_mm_set_pd (double __Z, double __Y)
+_mm_load_sd (double const *__P)
{
- return (__v2df) {__Y, __Z};
+ return _mm_set_sd (*__P);
}
-/* Create the vector [Y Z]. */
static __inline __m128d
-_mm_setr_pd (double __Z, double __Y)
+_mm_load_pd1 (double const *__P)
{
- return _mm_set_pd (__Y, __Z);
+ return _mm_load1_pd (__P);
}
-/* Create a vector of zeros. */
+/* Load two DPFP values in reverse order. The address must be aligned. */
static __inline __m128d
-_mm_setzero_pd (void)
+_mm_loadr_pd (double const *__P)
{
- return (__m128d) __builtin_ia32_setzeropd ();
+ __m128d __tmp = _mm_load_pd (__P);
+ return __builtin_ia32_shufpd (__tmp, __tmp, _MM_SHUFFLE2 (0,1));
+}
+
+/* Store two DPFP values. The address must be 16-byte aligned. */
+static __inline void
+_mm_store_pd (double *__P, __m128d __A)
+{
+ *(__m128d *)__P = __A;
+}
+
+/* Store two DPFP values. The address need not be 16-byte aligned. */
+static __inline void
+_mm_storeu_pd (double *__P, __m128d __A)
+{
+ __builtin_ia32_storeupd (__P, __A);
}
/* Stores the lower DPFP value. */
static __inline void
_mm_store_sd (double *__P, __m128d __A)
{
- __builtin_ia32_storesd (__P, (__v2df)__A);
+ *__P = __builtin_ia32_vec_ext_v2df (__A, 0);
}
-/* Store the lower DPFP value across two words. */
static __inline void
-_mm_store1_pd (double *__P, __m128d __A)
+_mm_storel_pd (double *__P, __m128d __A)
{
- __v2df __va = (__v2df)__A;
- __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,0));
- __builtin_ia32_storeapd (__P, __tmp);
+ _mm_store_sd (__P, __A);
}
+/* Stores the upper DPFP value. */
static __inline void
-_mm_store_pd1 (double *__P, __m128d __A)
+_mm_storeh_pd (double *__P, __m128d __A)
{
- _mm_store1_pd (__P, __A);
+ *__P = __builtin_ia32_vec_ext_v2df (__A, 1);
}
-/* Store two DPFP values. The address must be 16-byte aligned. */
+/* Store the lower DPFP value across two words.
+ The address must be 16-byte aligned. */
static __inline void
-_mm_store_pd (double *__P, __m128d __A)
+_mm_store1_pd (double *__P, __m128d __A)
{
- __builtin_ia32_storeapd (__P, (__v2df)__A);
+ _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,0)));
}
-/* Store two DPFP values. The address need not be 16-byte aligned. */
static __inline void
-_mm_storeu_pd (double *__P, __m128d __A)
+_mm_store_pd1 (double *__P, __m128d __A)
{
- __builtin_ia32_storeupd (__P, (__v2df)__A);
+ _mm_store1_pd (__P, __A);
}
/* Store two DPFP values in reverse order. The address must be aligned. */
static __inline void
_mm_storer_pd (double *__P, __m128d __A)
{
- __v2df __va = (__v2df)__A;
- __v2df __tmp = __builtin_ia32_shufpd (__va, __va, _MM_SHUFFLE2 (0,1));
- __builtin_ia32_storeapd (__P, __tmp);
+ _mm_store_pd (__P, __builtin_ia32_shufpd (__A, __A, _MM_SHUFFLE2 (0,1)));
}
static __inline int
@@ -193,13 +208,6 @@ _mm_cvtsi128_si64x (__m128i __A)
}
#endif
-/* Sets the low DPFP value of A from the low value of B. */
-static __inline __m128d
-_mm_move_sd (__m128d __A, __m128d __B)
-{
- return (__m128d) __builtin_ia32_movsd ((__v2df)__A, (__v2df)__B);
-}
-
static __inline __m128d
_mm_add_pd (__m128d __A, __m128d __B)
@@ -543,277 +551,171 @@ _mm_ucomineq_sd (__m128d __A, __m128d __B)
return __builtin_ia32_ucomisdneq ((__v2df)__A, (__v2df)__B);
}
-/* Create a vector with element 0 as *P and the rest zero. */
+/* Create a vector of Qi, where i is the element number. */
static __inline __m128i
-_mm_load_si128 (__m128i const *__P)
+_mm_set_epi64x (long long __q1, long long __q0)
{
- return (__m128i) __builtin_ia32_loaddqa ((char const *)__P);
+ return (__m128i)(__v2di){ __q0, __q1 };
}
static __inline __m128i
-_mm_loadu_si128 (__m128i const *__P)
+_mm_set_epi64 (__m64 __q1, __m64 __q0)
{
- return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
+ return _mm_set_epi64x ((long long)__q1, (long long)__q0);
}
static __inline __m128i
-_mm_loadl_epi64 (__m128i const *__P)
+_mm_set_epi32 (int __q3, int __q2, int __q1, int __q0)
{
- return (__m128i) __builtin_ia32_movq2dq (*(unsigned long long *)__P);
+ return (__m128i)(__v4si){ __q0, __q1, __q2, __q3 };
}
-static __inline void
-_mm_store_si128 (__m128i *__P, __m128i __B)
+static __inline __m128i
+_mm_set_epi16 (short __q7, short __q6, short __q5, short __q4,
+ short __q3, short __q2, short __q1, short __q0)
{
- __builtin_ia32_storedqa ((char *)__P, (__v16qi)__B);
+ return (__m128i)(__v8hi){ __q0, __q1, __q2, __q3, __q4, __q5, __q6, __q7 };
}
-static __inline void
-_mm_storeu_si128 (__m128i *__P, __m128i __B)
+static __inline __m128i
+_mm_set_epi8 (char __q15, char __q14, char __q13, char __q12,
+ char __q11, char __q10, char __q09, char __q08,
+ char __q07, char __q06, char __q05, char __q04,
+ char __q03, char __q02, char __q01, char __q00)
{
- __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
+ return (__m128i)(__v16qi){
+ __q00, __q01, __q02, __q03, __q04, __q05, __q06, __q07,
+ __q08, __q09, __q10, __q11, __q12, __q13, __q14, __q15
+ };
}
-static __inline void
-_mm_storel_epi64 (__m128i *__P, __m128i __B)
-{
- *(long long *)__P = __builtin_ia32_movdq2q ((__v2di)__B);
-}
+/* Set all of the elements of the vector to A. */
-static __inline __m64
-_mm_movepi64_pi64 (__m128i __B)
+static __inline __m128i
+_mm_set1_epi64x (long long __A)
{
- return (__m64) __builtin_ia32_movdq2q ((__v2di)__B);
+ return _mm_set_epi64x (__A, __A);
}
static __inline __m128i
-_mm_move_epi64 (__m128i __A)
+_mm_set1_epi64 (__m64 __A)
{
- return (__m128i) __builtin_ia32_movq ((__v2di)__A);
+ return _mm_set_epi64 (__A, __A);
}
-/* Create a vector of zeros. */
static __inline __m128i
-_mm_setzero_si128 (void)
+_mm_set1_epi32 (int __A)
{
- return (__m128i) __builtin_ia32_setzero128 ();
+ return _mm_set_epi32 (__A, __A, __A, __A);
}
static __inline __m128i
-_mm_set_epi64 (__m64 __A, __m64 __B)
+_mm_set1_epi16 (short __A)
{
- __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
- __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
- return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp2, __tmp);
+ return _mm_set_epi16 (__A, __A, __A, __A, __A, __A, __A, __A);
}
-/* Create the vector [Z Y X W]. */
static __inline __m128i
-_mm_set_epi32 (int __Z, int __Y, int __X, int __W)
+_mm_set1_epi8 (char __A)
{
- union {
- int __a[4];
- __m128i __v;
- } __u;
+ return _mm_set_epi8 (__A, __A, __A, __A, __A, __A, __A, __A,
+ __A, __A, __A, __A, __A, __A, __A, __A);
+}
- __u.__a[0] = __W;
- __u.__a[1] = __X;
- __u.__a[2] = __Y;
- __u.__a[3] = __Z;
+/* Create a vector of Qi, where i is the element number.
+ The parameter order is reversed from the _mm_set_epi* functions. */
- return __u.__v;
+static __inline __m128i
+_mm_setr_epi64 (__m64 __q0, __m64 __q1)
+{
+ return _mm_set_epi64 (__q1, __q0);
}
-#ifdef __x86_64__
-/* Create the vector [Z Y]. */
static __inline __m128i
-_mm_set_epi64x (long long __Z, long long __Y)
+_mm_setr_epi32 (int __q0, int __q1, int __q2, int __q3)
{
- union {
- long __a[2];
- __m128i __v;
- } __u;
-
- __u.__a[0] = __Y;
- __u.__a[1] = __Z;
-
- return __u.__v;
+ return _mm_set_epi32 (__q3, __q2, __q1, __q0);
}
-#endif
-/* Create the vector [S T U V Z Y X W]. */
static __inline __m128i
-_mm_set_epi16 (short __Z, short __Y, short __X, short __W,
- short __V, short __U, short __T, short __S)
+_mm_setr_epi16 (short __q0, short __q1, short __q2, short __q3,
+ short __q4, short __q5, short __q6, short __q7)
{
- union {
- short __a[8];
- __m128i __v;
- } __u;
-
- __u.__a[0] = __S;
- __u.__a[1] = __T;
- __u.__a[2] = __U;
- __u.__a[3] = __V;
- __u.__a[4] = __W;
- __u.__a[5] = __X;
- __u.__a[6] = __Y;
- __u.__a[7] = __Z;
-
- return __u.__v;
+ return _mm_set_epi16 (__q7, __q6, __q5, __q4, __q3, __q2, __q1, __q0);
}
-/* Create the vector [S T U V Z Y X W]. */
static __inline __m128i
-_mm_set_epi8 (char __Z, char __Y, char __X, char __W,
- char __V, char __U, char __T, char __S,
- char __Z1, char __Y1, char __X1, char __W1,
- char __V1, char __U1, char __T1, char __S1)
+_mm_setr_epi8 (char __q00, char __q01, char __q02, char __q03,
+ char __q04, char __q05, char __q06, char __q07,
+ char __q08, char __q09, char __q10, char __q11,
+ char __q12, char __q13, char __q14, char __q15)
{
- union {
- char __a[16];
- __m128i __v;
- } __u;
+ return _mm_set_epi8 (__q15, __q14, __q13, __q12, __q11, __q10, __q09, __q08,
+ __q07, __q06, __q05, __q04, __q03, __q02, __q01, __q00);
+}
- __u.__a[0] = __S1;
- __u.__a[1] = __T1;
- __u.__a[2] = __U1;
- __u.__a[3] = __V1;
- __u.__a[4] = __W1;
- __u.__a[5] = __X1;
- __u.__a[6] = __Y1;
- __u.__a[7] = __Z1;
- __u.__a[8] = __S;
- __u.__a[9] = __T;
- __u.__a[10] = __U;
- __u.__a[11] = __V;
- __u.__a[12] = __W;
- __u.__a[13] = __X;
- __u.__a[14] = __Y;
- __u.__a[15] = __Z;
+/* Create a vector with element 0 as *P and the rest zero. */
- return __u.__v;
+static __inline __m128i
+_mm_load_si128 (__m128i const *__P)
+{
+ return *__P;
}
static __inline __m128i
-_mm_set1_epi64 (__m64 __A)
+_mm_loadu_si128 (__m128i const *__P)
{
- __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
- return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp);
+ return (__m128i) __builtin_ia32_loaddqu ((char const *)__P);
}
static __inline __m128i
-_mm_set1_epi32 (int __A)
+_mm_loadl_epi64 (__m128i const *__P)
{
- __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__A);
- return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
+ return _mm_set_epi64 ((__m64)0LL, *(__m64 *)__P);
}
-#ifdef __x86_64__
-static __inline __m128i
-_mm_set1_epi64x (long long __A)
+static __inline void
+_mm_store_si128 (__m128i *__P, __m128i __B)
{
- __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
- return (__m128i) __builtin_ia32_shufpd ((__v2df)__tmp, (__v2df)__tmp, _MM_SHUFFLE2 (0,0));
+ *__P = __B;
}
-#endif
-static __inline __m128i
-_mm_set1_epi16 (short __A)
+static __inline void
+_mm_storeu_si128 (__m128i *__P, __m128i __B)
{
- int __Acopy = (unsigned short)__A;
- __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
- __tmp = (__v4si)__builtin_ia32_punpcklwd128 ((__v8hi)__tmp, (__v8hi)__tmp);
- return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
+ __builtin_ia32_storedqu ((char *)__P, (__v16qi)__B);
}
-static __inline __m128i
-_mm_set1_epi8 (char __A)
+static __inline void
+_mm_storel_epi64 (__m128i *__P, __m128i __B)
{
- int __Acopy = (unsigned char)__A;
- __v4si __tmp = (__v4si)__builtin_ia32_loadd (&__Acopy);
- __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
- __tmp = (__v4si)__builtin_ia32_punpcklbw128 ((__v16qi)__tmp, (__v16qi)__tmp);
- return (__m128i) __builtin_ia32_pshufd ((__v4si)__tmp, _MM_SHUFFLE (0,0,0,0));
+ *(long long *)__P = __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}
-static __inline __m128i
-_mm_setr_epi64 (__m64 __A, __m64 __B)
+static __inline __m64
+_mm_movepi64_pi64 (__m128i __B)
{
- __v2di __tmp = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__A);
- __v2di __tmp2 = (__v2di)__builtin_ia32_movq2dq ((unsigned long long)__B);
- return (__m128i)__builtin_ia32_punpcklqdq128 (__tmp, __tmp2);
+ return (__m64) __builtin_ia32_vec_ext_v2di ((__v2di)__B, 0);
}
-/* Create the vector [Z Y X W]. */
static __inline __m128i
-_mm_setr_epi32 (int __W, int __X, int __Y, int __Z)
+_mm_movpi64_epi64 (__m64 __A)
{
- union {
- int __a[4];
- __m128i __v;
- } __u;
-
- __u.__a[0] = __W;
- __u.__a[1] = __X;
- __u.__a[2] = __Y;
- __u.__a[3] = __Z;
-
- return __u.__v;
+ return _mm_set_epi64 ((__m64)0LL, __A);
}
-/* Create the vector [S T U V Z Y X W]. */
+
static __inline __m128i
-_mm_setr_epi16 (short __S, short __T, short __U, short __V,
- short __W, short __X, short __Y, short __Z)
+_mm_move_epi64 (__m128i __A)
{
- union {
- short __a[8];
- __m128i __v;
- } __u;
-
- __u.__a[0] = __S;
- __u.__a[1] = __T;
- __u.__a[2] = __U;
- __u.__a[3] = __V;
- __u.__a[4] = __W;
- __u.__a[5] = __X;
- __u.__a[6] = __Y;
- __u.__a[7] = __Z;
-
- return __u.__v;
+ return _mm_set_epi64 ((__m64)0LL, _mm_movepi64_pi64 (__A));
}
-/* Create the vector [S T U V Z Y X W]. */
+/* Create a vector of zeros. */
static __inline __m128i
-_mm_setr_epi8 (char __S1, char __T1, char __U1, char __V1,
- char __W1, char __X1, char __Y1, char __Z1,
- char __S, char __T, char __U, char __V,
- char __W, char __X, char __Y, char __Z)
+_mm_setzero_si128 (void)
{
- union {
- char __a[16];
- __m128i __v;
- } __u;
-
- __u.__a[0] = __S1;
- __u.__a[1] = __T1;
- __u.__a[2] = __U1;
- __u.__a[3] = __V1;
- __u.__a[4] = __W1;
- __u.__a[5] = __X1;
- __u.__a[6] = __Y1;
- __u.__a[7] = __Z1;
- __u.__a[8] = __S;
- __u.__a[9] = __T;
- __u.__a[10] = __U;
- __u.__a[11] = __V;
- __u.__a[12] = __W;
- __u.__a[13] = __X;
- __u.__a[14] = __Y;
- __u.__a[15] = __Z;
-
- return __u.__v;
+ return (__m128i)(__v4si){ 0, 0, 0, 0 };
}
static __inline __m128d
@@ -956,24 +858,12 @@ _mm_loadh_pd (__m128d __A, double const *__B)
return (__m128d)__builtin_ia32_loadhpd ((__v2df)__A, __B);
}
-static __inline void
-_mm_storeh_pd (double *__A, __m128d __B)
-{
- __builtin_ia32_storehpd (__A, (__v2df)__B);
-}
-
static __inline __m128d
_mm_loadl_pd (__m128d __A, double const *__B)
{
return (__m128d)__builtin_ia32_loadlpd ((__v2df)__A, __B);
}
-static __inline void
-_mm_storel_pd (double *__A, __m128d __B)
-{
- __builtin_ia32_storelpd (__A, (__v2df)__B);
-}
-
static __inline int
_mm_movemask_pd (__m128d __A)
{
@@ -1365,9 +1255,24 @@ _mm_cmpgt_epi32 (__m128i __A, __m128i __B)
return (__m128i)__builtin_ia32_pcmpgtd128 ((__v4si)__A, (__v4si)__B);
}
-#define _mm_extract_epi16(__A, __B) __builtin_ia32_pextrw128 ((__v8hi)__A, __B)
+#if 0
+static __inline int __attribute__((__always_inline__))
+_mm_extract_epi16 (__m128i const __A, int const __N)
+{
+ return __builtin_ia32_vec_ext_v8hi ((__v8hi)__A, __N);
+}
-#define _mm_insert_epi16(__A, __B, __C) ((__m128i)__builtin_ia32_pinsrw128 ((__v8hi)__A, __B, __C))
+static __inline __m128i __attribute__((__always_inline__))
+_mm_insert_epi16 (__m128i const __A, int const __D, int const __N)
+{
+ return (__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)__A, __D, __N);
+}
+#else
+#define _mm_extract_epi16(A, N) \
+ ((int) __builtin_ia32_vec_ext_v8hi ((__v8hi)(A), (N)))
+#define _mm_insert_epi16(A, D, N) \
+ ((__m128i) __builtin_ia32_vec_set_v8hi ((__v8hi)(A), (D), (N)))
+#endif
static __inline __m128i
_mm_max_epi16 (__m128i __A, __m128i __B)
@@ -1451,12 +1356,6 @@ _mm_stream_pd (double *__A, __m128d __B)
__builtin_ia32_movntpd (__A, (__v2df)__B);
}
-static __inline __m128i
-_mm_movpi64_epi64 (__m64 __A)
-{
- return (__m128i)__builtin_ia32_movq2dq ((unsigned long long)__A);
-}
-
static __inline void
_mm_clflush (void const *__A)
{