summaryrefslogtreecommitdiff
path: root/gcc/testsuite/gcc.target/i386
diff options
context:
space:
mode:
authormeissner <meissner@138bc75d-0d04-0410-961f-82ee72b054a4>2007-09-13 02:17:51 +0000
committermeissner <meissner@138bc75d-0d04-0410-961f-82ee72b054a4>2007-09-13 02:17:51 +0000
commit448e99f5e858735f9de2ab5cf614dd2b171cee5d (patch)
tree2550bf2be428ffb45e9bcb30a6c3186b44ebdc0d /gcc/testsuite/gcc.target/i386
parent3e66c14e753471249291f2403f35ebcbc436d353 (diff)
downloadgcc-448e99f5e858735f9de2ab5cf614dd2b171cee5d.tar.gz
Add AMD SSE5 support; Add iterator over function arguments; Add stdarg_p, prototype_p, function_args_count functions
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@128455 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/testsuite/gcc.target/i386')
-rw-r--r--gcc/testsuite/gcc.target/i386/i386.exp15
-rw-r--r--gcc/testsuite/gcc.target/i386/sse-12.c6
-rw-r--r--gcc/testsuite/gcc.target/i386/sse-13.c14
-rw-r--r--gcc/testsuite/gcc.target/i386/sse-14.c6
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-check.h20
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-fma-vector.c92
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-fma.c81
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-haddX.c208
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-hadduX.c207
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-hsubX.c128
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-ima-vector.c33
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-maccXX.c140
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-msubXX.c139
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-nmaccXX.c139
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-nmsubXX.c139
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-pcmov.c22
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-pcmov2.c22
-rw-r--r--gcc/testsuite/gcc.target/i386/sse5-permpX.c120
18 files changed, 1521 insertions, 10 deletions
diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp
index de6a9bcc33c..7cf13ab7443 100644
--- a/gcc/testsuite/gcc.target/i386/i386.exp
+++ b/gcc/testsuite/gcc.target/i386/i386.exp
@@ -64,6 +64,21 @@ proc check_effective_target_sse4a { } {
} "-O2 -msse4a" ]
}
+# Return 1 if sse5 instructions can be compiled.
+proc check_effective_target_sse5 { } {
+ return [check_no_compiler_messages sse5 object {
+ typedef long long __m128i __attribute__ ((__vector_size__ (16)));
+ typedef long long __v2di __attribute__ ((__vector_size__ (16)));
+
+ __m128i _mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C)
+ {
+ return (__m128i) __builtin_ia32_pmacssww ((__v2di)__A,
+ (__v2di)__B,
+ (__v2di)__C);
+ }
+ } "-O2 -msse5" ]
+}
+
# If a testcase doesn't have special options, use these.
global DEFAULT_CFLAGS
if ![info exists DEFAULT_CFLAGS] then {
diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c
index 48c4e7d43f6..395cdf7ed71 100644
--- a/gcc/testsuite/gcc.target/i386/sse-12.c
+++ b/gcc/testsuite/gcc.target/i386/sse-12.c
@@ -1,9 +1,9 @@
/* Test that {,x,e,p,t,s,a}mmintrin.h, mm3dnow.h and mm_malloc.h are
usable with -O -std=c89 -pedantic-errors. */
-/* { dg-do compile } */
-/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -msse4.1 -msse4a" } */
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -msse4.1 -msse5" } */
-#include <ammintrin.h>
+#include <bmmintrin.h>
#include <smmintrin.h>
#include <mm3dnow.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c
index 139d927cf3b..b6c34e2447c 100644
--- a/gcc/testsuite/gcc.target/i386/sse-13.c
+++ b/gcc/testsuite/gcc.target/i386/sse-13.c
@@ -1,8 +1,8 @@
-/* { dg-do compile } */
-/* { dg-options "-O2 -msse4.1 -msse4a" } */
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O2 -msse4.1 -msse5 " } */
/* Test that the intrinsics compile with optimization. All of them are
- defined as inline functions in {,x,e,p,t,s,a}mmintrin.h that reference
+ defined as inline functions in {,x,e,p,t,s,a,b}mmintrin.h that reference
the proper builtin functions. Defining away "static" and "__inline"
results in all of them being compiled as proper functions. */
@@ -66,5 +66,11 @@
#define __builtin_ia32_vec_ext_v4hi(A, N) __builtin_ia32_vec_ext_v4hi(A, 0)
#define __builtin_ia32_shufps(A, B, N) __builtin_ia32_shufps(A, B, 0)
-#include <ammintrin.h>
+/* bmmintrin.h */
+#define __builtin_ia32_protbi(A, B) __builtin_ia32_protbi(A,1)
+#define __builtin_ia32_protwi(A, B) __builtin_ia32_protwi(A,1)
+#define __builtin_ia32_protdi(A, B) __builtin_ia32_protdi(A,1)
+#define __builtin_ia32_protqi(A, B) __builtin_ia32_protqi(A,1)
+
+#include <bmmintrin.h>
#include <smmintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c
index 8c3a2d2e636..bb51c20ddcc 100644
--- a/gcc/testsuite/gcc.target/i386/sse-14.c
+++ b/gcc/testsuite/gcc.target/i386/sse-14.c
@@ -1,5 +1,5 @@
-/* { dg-do compile } */
-/* { dg-options "-O0 -msse4.1 -msse4a" } */
+/* { dg-do compile { target i?86-*-* x86_64-*-* } } */
+/* { dg-options "-O0 -msse4.1 -msse5" } */
/* Test that the intrinsics compile without optimization. All of them are
defined as inline functions in {,x,e,p,t,s,a}mmintrin.h that reference
@@ -9,5 +9,5 @@
#define static
#define __inline
-#include <ammintrin.h>
+#include <bmmintrin.h>
#include <smmintrin.h>
diff --git a/gcc/testsuite/gcc.target/i386/sse5-check.h b/gcc/testsuite/gcc.target/i386/sse5-check.h
new file mode 100644
index 00000000000..e133ed884fa
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-check.h
@@ -0,0 +1,20 @@
+#include <stdlib.h>
+
+#include "cpuid.h"
+
+static void sse5_test (void);
+
+int
+main ()
+{
+ unsigned int eax, ebx, ecx, edx;
+
+ if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx))
+ return 0;
+
+ /* Run SSE5 test only if host has SSE5 support. */
+ if (ecx & bit_SSE5)
+ sse5_test ();
+
+ exit (0);
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-fma-vector.c b/gcc/testsuite/gcc.target/i386/sse5-fma-vector.c
new file mode 100644
index 00000000000..59dc7651568
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-fma-vector.c
@@ -0,0 +1,92 @@
+/* Test that the compiler properly optimizes floating point multiply and add
+ instructions vector into fmaddps on SSE5 systems. */
+
+/* { dg-do compile { target x86_64-*-*} } */
+/* { dg-options "-O2 -msse5 -mfused-madd -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__));
+typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128 f_align;
+ __m128d d_align;
+ float f[SIZE];
+ double d[SIZE];
+} a, b, c, d;
+
+void
+flt_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) + d.f[i];
+}
+
+void
+dbl_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) + d.d[i];
+}
+
+void
+flt_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (b.f[i] * c.f[i]) - d.f[i];
+}
+
+void
+dbl_mul_sub (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (b.d[i] * c.d[i]) - d.d[i];
+}
+
+void
+flt_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i];
+}
+
+void
+dbl_neg_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i];
+}
+
+int main ()
+{
+ flt_mul_add ();
+ flt_mul_sub ();
+ flt_neg_mul_add ();
+
+ dbl_mul_add ();
+ dbl_mul_sub ();
+ dbl_neg_mul_add ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "fmaddps" } } */
+/* { dg-final { scan-assembler "fmaddpd" } } */
+/* { dg-final { scan-assembler "fmsubps" } } */
+/* { dg-final { scan-assembler "fmsubpd" } } */
+/* { dg-final { scan-assembler "fnmaddps" } } */
+/* { dg-final { scan-assembler "fnmaddpd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse5-fma.c b/gcc/testsuite/gcc.target/i386/sse5-fma.c
new file mode 100644
index 00000000000..598cda03c04
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-fma.c
@@ -0,0 +1,81 @@
+/* Test that the compiler properly optimizes floating point multiply and add
+ instructions into fmaddss, fmsubss, fnmaddss, fnmsubss on SSE5 systems. */
+
+/* { dg-do compile { target x86_64-*-*} } */
+/* { dg-options "-O2 -msse5 -mfused-madd" } */
+
+extern void exit (int);
+
+float
+flt_mul_add (float a, float b, float c)
+{
+ return (a * b) + c;
+}
+
+double
+dbl_mul_add (double a, double b, double c)
+{
+ return (a * b) + c;
+}
+
+float
+flt_mul_sub (float a, float b, float c)
+{
+ return (a * b) - c;
+}
+
+double
+dbl_mul_sub (double a, double b, double c)
+{
+ return (a * b) - c;
+}
+
+float
+flt_neg_mul_add (float a, float b, float c)
+{
+ return (-(a * b)) + c;
+}
+
+double
+dbl_neg_mul_add (double a, double b, double c)
+{
+ return (-(a * b)) + c;
+}
+
+float
+flt_neg_mul_sub (float a, float b, float c)
+{
+ return (-(a * b)) - c;
+}
+
+double
+dbl_neg_mul_sub (double a, double b, double c)
+{
+ return (-(a * b)) - c;
+}
+
+float f[10] = { 2, 3, 4 };
+double d[10] = { 2, 3, 4 };
+
+int main ()
+{
+ f[3] = flt_mul_add (f[0], f[1], f[2]);
+ f[4] = flt_mul_sub (f[0], f[1], f[2]);
+ f[5] = flt_neg_mul_add (f[0], f[1], f[2]);
+ f[6] = flt_neg_mul_sub (f[0], f[1], f[2]);
+
+ d[3] = dbl_mul_add (d[0], d[1], d[2]);
+ d[4] = dbl_mul_sub (d[0], d[1], d[2]);
+ d[5] = dbl_neg_mul_add (d[0], d[1], d[2]);
+ d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]);
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "fmaddss" } } */
+/* { dg-final { scan-assembler "fmaddsd" } } */
+/* { dg-final { scan-assembler "fmsubss" } } */
+/* { dg-final { scan-assembler "fmsubsd" } } */
+/* { dg-final { scan-assembler "fnmaddss" } } */
+/* { dg-final { scan-assembler "fnmaddsd" } } */
+/* { dg-final { scan-assembler "fnmsubss" } } */
+/* { dg-final { scan-assembler "fnmsubsd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse5-haddX.c b/gcc/testsuite/gcc.target/i386/sse5-haddX.c
new file mode 100644
index 00000000000..e605e070c12
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-haddX.c
@@ -0,0 +1,208 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 10
+
+union
+{
+ __m128i x[NUM];
+ int8_t ssi[NUM * 16];
+ int16_t si[NUM * 8];
+ int32_t li[NUM * 4];
+ int64_t lli[NUM * 2];
+} dst, res, src1;
+
+static void
+init_sbyte ()
+{
+ int i;
+ for (i=0; i < NUM * 16; i++)
+ src1.ssi[i] = i;
+}
+
+static void
+init_sword ()
+{
+ int i;
+ for (i=0; i < NUM * 8; i++)
+ src1.si[i] = i;
+}
+
+
+static void
+init_sdword ()
+{
+ int i;
+ for (i=0; i < NUM * 4; i++)
+ src1.li[i] = i;
+}
+
+static int
+check_sbyte2word ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.si[s] = src1.ssi[t] + src1.ssi[t + 1] ;
+ if (res.si[s] != dst.si[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_sbyte2dword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ t = i + (4 * j);
+ s = (i / 4) + j;
+ res.li[s] = (src1.ssi[t] + src1.ssi[t + 1]) + (src1.ssi[t + 2]
+ + src1.ssi[t + 3]);
+ if (res.li[s] != dst.li[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_sbyte2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (8 * j);
+ s = (i / 8) + j;
+ res.lli[s] = ((src1.ssi[t] + src1.ssi[t + 1]) + (src1.ssi[t + 2]
+ + src1.ssi[t + 3])) + ((src1.ssi[t + 4] + src1.ssi[t +5])
+ + (src1.ssi[t + 6] + src1.ssi[t + 7]));
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_sword2dword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 8); i = i + 8)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.li[s] = src1.si[t] + src1.si[t + 1] ;
+ if (res.li[s] != dst.li[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_sword2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (4 * j);
+ s = (i / 4) + j;
+ res.lli[s] = (src1.si[t] + src1.si[t + 1]) + (src1.si[t + 2]
+ + src1.si[t + 3]);
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_dword2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 4); i = i + 4)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.lli[s] = src1.li[t] + src1.li[t + 1] ;
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check haddbw */
+ init_sbyte ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddw_epi8 (src1.x[i]);
+
+ if (check_sbyte2word())
+ abort ();
+
+ /* Check haddbd */
+ for (i = 0; i < (NUM ); i++)
+ dst.x[i] = _mm_haddd_epi8 (src1.x[i]);
+
+ if (check_sbyte2dword())
+ abort ();
+
+ /* Check haddbq */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epi8 (src1.x[i]);
+
+ if (check_sbyte2qword())
+ abort ();
+
+ /* Check haddwd */
+ init_sword ();
+
+ for (i = 0; i < (NUM ); i++)
+ dst.x[i] = _mm_haddd_epi16 (src1.x[i]);
+
+ if (check_sword2dword())
+ abort ();
+
+ /* Check haddbwq */
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epi16 (src1.x[i]);
+
+ if (check_sword2qword())
+ abort ();
+
+ /* Check haddq */
+ init_sdword ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epi32 (src1.x[i]);
+
+ if (check_dword2qword())
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-hadduX.c b/gcc/testsuite/gcc.target/i386/sse5-hadduX.c
new file mode 100644
index 00000000000..a55fb8a527d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-hadduX.c
@@ -0,0 +1,207 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 10
+
+union
+{
+ __m128i x[NUM];
+ unsigned char ssi[NUM * 16];
+ unsigned short si[NUM * 8];
+ unsigned int li[NUM * 4];
+ unsigned long long lli[NUM * 2];
+} dst, res, src1;
+
+static void
+init_byte ()
+{
+ int i;
+ for (i=0; i < NUM * 16; i++)
+ src1.ssi[i] = i;
+}
+
+static void
+init_word ()
+{
+ int i;
+ for (i=0; i < NUM * 8; i++)
+ src1.si[i] = i;
+}
+
+
+static void
+init_dword ()
+{
+ int i;
+ for (i=0; i < NUM * 4; i++)
+ src1.li[i] = i;
+}
+
+static int
+check_byte2word ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.si[s] = src1.ssi[t] + src1.ssi[t + 1] ;
+ if (res.si[s] != dst.si[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_byte2dword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ t = i + (4 * j);
+ s = (i / 4) + j;
+ res.li[s] = (src1.ssi[t] + src1.ssi[t + 1]) + (src1.ssi[t + 2]
+ + src1.ssi[t + 3]);
+ if (res.li[s] != dst.li[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_byte2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (8 * j);
+ s = (i / 8) + j;
+ res.lli[s] = ((src1.ssi[t] + src1.ssi[t + 1]) + (src1.ssi[t + 2]
+ + src1.ssi[t + 3])) + ((src1.ssi[t + 4] + src1.ssi[t +5])
+ + (src1.ssi[t + 6] + src1.ssi[t + 7]));
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_word2dword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 8); i = i + 8)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.li[s] = src1.si[t] + src1.si[t + 1] ;
+ if (res.li[s] != dst.li[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_word2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 8; i = i + 8)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (4 * j);
+ s = (i / 4) + j;
+ res.lli[s] = (src1.si[t] + src1.si[t + 1]) + (src1.si[t + 2]
+ + src1.si[t + 3]);
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+ return check_fails++;
+}
+
+static int
+check_dword2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 4); i = i + 4)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.lli[s] = src1.li[t] + src1.li[t + 1] ;
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check haddubw */
+ init_byte ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddw_epu8 (src1.x[i]);
+
+ if (check_byte2word())
+ abort ();
+
+ /* Check haddubd */
+ for (i = 0; i < (NUM ); i++)
+ dst.x[i] = _mm_haddd_epu8 (src1.x[i]);
+
+ if (check_byte2dword())
+ abort ();
+
+ /* Check haddubq */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epu8 (src1.x[i]);
+
+ if (check_byte2qword())
+ abort ();
+
+ /* Check hadduwd */
+ init_word ();
+
+ for (i = 0; i < (NUM ); i++)
+ dst.x[i] = _mm_haddd_epu16 (src1.x[i]);
+
+ if (check_word2dword())
+ abort ();
+
+ /* Check haddbuwq */
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epu16 (src1.x[i]);
+
+ if (check_word2qword())
+ abort ();
+
+ /* Check hadudq */
+ init_dword ();
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_haddq_epu32 (src1.x[i]);
+
+ if (check_dword2qword())
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-hsubX.c b/gcc/testsuite/gcc.target/i386/sse5-hsubX.c
new file mode 100644
index 00000000000..03c7f79084e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-hsubX.c
@@ -0,0 +1,128 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 10
+
+union
+{
+ __m128i x[NUM];
+ int8_t ssi[NUM * 16];
+ int16_t si[NUM * 8];
+ int32_t li[NUM * 4];
+ int64_t lli[NUM * 2];
+} dst, res, src1;
+
+static void
+init_sbyte ()
+{
+ int i;
+ for (i=0; i < NUM * 16; i++)
+ src1.ssi[i] = i;
+}
+
+static void
+init_sword ()
+{
+ int i;
+ for (i=0; i < NUM * 8; i++)
+ src1.si[i] = i;
+}
+
+
+static void
+init_sdword ()
+{
+ int i;
+ for (i=0; i < NUM * 4; i++)
+ src1.li[i] = i;
+}
+
+static int
+check_sbyte2word ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < NUM * 16; i = i + 16)
+ {
+ for (j = 0; j < 8; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.si[s] = src1.ssi[t] - src1.ssi[t + 1] ;
+ if (res.si[s] != dst.si[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_sword2dword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 8); i = i + 8)
+ {
+ for (j = 0; j < 4; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.li[s] = src1.si[t] - src1.si[t + 1] ;
+ if (res.li[s] != dst.li[s])
+ check_fails++;
+ }
+ }
+}
+
+static int
+check_dword2qword ()
+{
+ int i, j, s, t, check_fails = 0;
+ for (i = 0; i < (NUM * 4); i = i + 4)
+ {
+ for (j = 0; j < 2; j++)
+ {
+ t = i + (2 * j);
+ s = (i / 2) + j;
+ res.lli[s] = src1.li[t] - src1.li[t + 1] ;
+ if (res.lli[s] != dst.lli[s])
+ check_fails++;
+ }
+ }
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check hsubbw */
+ init_sbyte ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_hsubw_epi8 (src1.x[i]);
+
+ if (check_sbyte2word())
+ abort ();
+
+
+ /* Check hsubwd */
+ init_sword ();
+
+ for (i = 0; i < (NUM ); i++)
+ dst.x[i] = _mm_hsubd_epi16 (src1.x[i]);
+
+ if (check_sword2dword())
+ abort ();
+
+ /* Check hsubdq */
+ init_sdword ();
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_hsubq_epi32 (src1.x[i]);
+
+ if (check_dword2qword())
+ abort ();
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-ima-vector.c b/gcc/testsuite/gcc.target/i386/sse5-ima-vector.c
new file mode 100644
index 00000000000..260291d2985
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-ima-vector.c
@@ -0,0 +1,33 @@
+/* Test that the compiler properly optimizes vector 32-bit integer point
+ multiply and add instructions vector into pmacsdd on SSE5 systems. */
+
+/* { dg-do compile { target x86_64-*-*} } */
+/* { dg-options "-O2 -msse5 -ftree-vectorize" } */
+
+extern void exit (int);
+
+typedef long long __m128i __attribute__ ((__vector_size__ (16), __may_alias__));
+
+#define SIZE 10240
+
+union {
+ __m128i align;
+ int i[SIZE];
+} a, b, c, d;
+
+void
+int_mul_add (void)
+{
+ int i;
+
+ for (i = 0; i < SIZE; i++)
+ a.i[i] = (b.i[i] * c.i[i]) + d.i[i];
+}
+
+int main ()
+{
+ int_mul_add ();
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pmacsdd" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse5-maccXX.c b/gcc/testsuite/gcc.target/i386/sse5-maccXX.c
new file mode 100644
index 00000000000..9603d53ea68
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-maccXX.c
@@ -0,0 +1,140 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+
+/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate
+ product is not rounded, only the addition is rounded. */
+
+static void
+init_maccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_maccpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_maccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_maccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_maccss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i= i + 4)
+ {
+ res.f[i] = (src1.f[i] * src2.f[i]) + src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_maccsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = (src1.d[i] * src2.d[i]) + src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check maccps */
+ init_maccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_macc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_maccps ())
+ abort ();
+
+ /* check maccss */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_macc_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_maccss ())
+ abort ();
+
+ /* Check maccpd */
+ init_maccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_macc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_maccpd ())
+ abort ();
+
+ /* Check maccps */
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_macc_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_maccsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-msubXX.c b/gcc/testsuite/gcc.target/i386/sse5-msubXX.c
new file mode 100644
index 00000000000..151e8c6e51f
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-msubXX.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate
+ product is not rounded, only the addition is rounded. */
+
+static void
+init_msubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_msubpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_msubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_msubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_msubss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = (src1.f[i] * src2.f[i]) - src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_msubsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = (src1.d[i] * src2.d[i]) - src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check msubps */
+ init_msubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_msub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_msubps ())
+ abort ();
+
+ /* check msubss */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_msub_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_msubss ())
+ abort ();
+
+ /* Check msubpd */
+ init_msubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_msub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_msubpd ())
+ abort ();
+
+ /* Check msubps */
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_msub_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_msubsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-nmaccXX.c b/gcc/testsuite/gcc.target/i386/sse5-nmaccXX.c
new file mode 100644
index 00000000000..c5ca2bf7d1e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-nmaccXX.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate
+ product is not rounded, only the addition is rounded. */
+
+static void
+init_nmaccps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmaccpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmaccps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmaccpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_nmaccss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmaccsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check nmaccps */
+ init_nmaccps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmaccps ())
+ abort ();
+
+ /* check nmaccss */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmaccss ())
+ abort ();
+
+ /* Check nmaccpd */
+ init_nmaccpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmaccpd ())
+ abort ();
+
+ /* Check nmaccps */
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmaccsd ())
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-nmsubXX.c b/gcc/testsuite/gcc.target/i386/sse5-nmsubXX.c
new file mode 100644
index 00000000000..acf19f9742d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-nmsubXX.c
@@ -0,0 +1,139 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+#define NUM 20
+
+union
+{
+ __m128 x[NUM];
+ float f[NUM * 4];
+ __m128d y[NUM];
+ double d[NUM * 2];
+} dst, res, src1, src2, src3;
+
+/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate
+ product is not rounded, only the addition is rounded. */
+
+static void
+init_nmsubps ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 10;
+ src3.f[i] = i + 20;
+ }
+}
+
+static void
+init_nmsubpd ()
+{
+ int i;
+ for (i = 0; i < NUM * 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 10;
+ src3.d[i] = i + 20;
+ }
+}
+
+static int
+check_nmsubps ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ for (j = 0; j < 4; j++)
+ {
+ res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j];
+ if (dst.f[i + j] != res.f[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmsubpd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ for (j = 0; j < 2; j++)
+ {
+ res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j];
+ if (dst.d[i + j] != res.d[i + j])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+
+static int
+check_nmsubss ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 4; i = i + 4)
+ {
+ res.f[i] = - (src1.f[i] * src2.f[i]) - src3.f[i];
+ if (dst.f[i] != res.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_nmsubsd ()
+{
+ int i, j, check_fails = 0;
+ for (i = 0; i < NUM * 2; i = i + 2)
+ {
+ res.d[i] = - (src1.d[i] * src2.d[i]) - src3.d[i];
+ if (dst.d[i] != res.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+
+ /* Check nmsubps */
+ init_nmsubps ();
+
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4]))
+ abort ();
+
+ /* check nmsubss */
+ for (i = 0; i < NUM; i++)
+ dst.x[i] = _mm_nmsub_ss (src1.x[i], src2.x[i], src3.x[i]);
+
+ if (check_nmsubss (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4]))
+ abort ();
+
+ /* Check nmsubpd */
+ init_nmsubpd ();
+
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2]))
+ abort ();
+
+ /* Check nmsubps */
+ for (i = 0; i < NUM; i++)
+ dst.y[i] = _mm_nmsub_sd (src1.y[i], src2.y[i], src3.y[i]);
+
+ if (check_nmsubsd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2]))
+ abort ();
+
+}
diff --git a/gcc/testsuite/gcc.target/i386/sse5-pcmov.c b/gcc/testsuite/gcc.target/i386/sse5-pcmov.c
new file mode 100644
index 00000000000..3bc2e5dbbda
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-pcmov.c
@@ -0,0 +1,22 @@
+/* Test that the compiler properly optimizes conditional floating point moves
+ into the pcmov instruction on SSE5 systems. */
+
+/* { dg-do compile { target x86_64-*-*} } */
+/* { dg-options "-O2 -msse5" } */
+
+extern void exit (int);
+
+double dbl_test (double a, double b, double c, double d)
+{
+ return (a > b) ? c : d;
+}
+
+double dbl_a = 1, dbl_b = 2, dbl_c = 3, dbl_d = 4, dbl_e;
+
+int main()
+{
+ dbl_e = dbl_test (dbl_a, dbl_b, dbl_c, dbl_d);
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pcmov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse5-pcmov2.c b/gcc/testsuite/gcc.target/i386/sse5-pcmov2.c
new file mode 100644
index 00000000000..0bb366ce0c9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-pcmov2.c
@@ -0,0 +1,22 @@
+/* Test that the compiler properly optimizes conditional floating point moves
+ into the pcmov instruction on SSE5 systems. */
+
+/* { dg-do compile { target x86_64-*-*} } */
+/* { dg-options "-O2 -msse5" } */
+
+extern void exit (int);
+
+float flt_test (float a, float b, float c, float d)
+{
+ return (a > b) ? c : d;
+}
+
+float flt_a = 1, flt_b = 2, flt_c = 3, flt_d = 4, flt_e;
+
+int main()
+{
+ flt_e = flt_test (flt_a, flt_b, flt_c, flt_d);
+ exit (0);
+}
+
+/* { dg-final { scan-assembler "pcmov" } } */
diff --git a/gcc/testsuite/gcc.target/i386/sse5-permpX.c b/gcc/testsuite/gcc.target/i386/sse5-permpX.c
new file mode 100644
index 00000000000..d83aa58338e
--- /dev/null
+++ b/gcc/testsuite/gcc.target/i386/sse5-permpX.c
@@ -0,0 +1,120 @@
+/* { dg-do run { target i?86-*-* x86_64-*-* } } */
+/* { dg-require-effective-target sse5 } */
+/* { dg-options "-O2 -msse5" } */
+
+#include "sse5-check.h"
+
+#include <bmmintrin.h>
+#include <string.h>
+
+union
+{
+ __m128 x[2];
+ __m128d y[2];
+ __m128i z[2];
+ float f[8];
+ double d[4];
+ int i[8];
+ long li[4];
+} dst, res, src1, src2, src3;
+
+
+static void
+init_ddata ()
+{
+ int i;
+ for (i = 0; i < 4; i++)
+ {
+ src1.d[i] = i;
+ src2.d[i] = i + 2;
+ }
+
+ src3.li[0] = 3;
+ src3.li[1] = 0;
+ src3.li[2] = 1;
+ src3.li[3] = 2;
+
+ res.d[0] = 3.0;
+ res.d[1] = 0.0;
+ res.d[2] = 3.0;
+ res.d[3] = 4.0;
+}
+
+
+static void
+init_fdata ()
+{
+ int i;
+ for (i = 0; i < 8; i++)
+ {
+ src1.f[i] = i;
+ src2.f[i] = i + 2;
+ }
+
+ src3.i[0] = 7;
+ src3.i[1] = 5;
+ src3.i[2] = 1;
+ src3.i[3] = 2;
+ src3.i[4] = 0;
+ src3.i[5] = 4;
+ src3.i[6] = 3;
+ src3.i[7] = 6;
+
+ res.f[0] = 5.0;
+ res.f[1] = 3.0;
+ res.f[2] = 1.0;
+ res.f[3] = 2.0;
+ res.f[4] = 4.0;
+ res.f[5] = 6.0;
+ res.f[6] = 7.0;
+ res.f[7] = 8.0;
+}
+
+static int
+check_permpd ()
+{
+ int i, check_fails = 0;
+
+ for (i = 0; i < 4; i++)
+ {
+ if (res.d[i] != dst.d[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static int
+check_permps ()
+{
+ int i, check_fails = 0;
+
+ for (i = 0; i < 8; i++)
+ {
+ if (res.f[i] != dst.f[i])
+ check_fails++;
+ }
+ return check_fails++;
+}
+
+static void
+sse5_test (void)
+{
+ int i;
+ init_ddata();
+
+ for (i = 0; i < 2; i++)
+ dst.y[i] = _mm_perm_pd (src1.y[i], src2.y[i], src3.z[i]);
+
+ if (check_permpd ())
+ abort ();
+
+ init_fdata();
+
+ for (i = 0; i < 2; i++)
+ dst.x[i] = _mm_perm_ps (src1.x[i], src2.x[i], src3.z[i]);
+
+ if (check_permps ())
+ abort ();
+}
+
+