diff options
author | hjagasia <hjagasia@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-09-30 00:00:45 +0000 |
---|---|---|
committer | hjagasia <hjagasia@138bc75d-0d04-0410-961f-82ee72b054a4> | 2009-09-30 00:00:45 +0000 |
commit | 2f212aae96e5a5b2c09d87fa91592b5a2a573e7e (patch) | |
tree | a6738f5a8818475b5816bfd870277e6deb35e967 /gcc/testsuite/gcc.target/i386 | |
parent | e9d5f86f47e11658e51f16ff38fd87729cbe688d (diff) | |
download | gcc-2f212aae96e5a5b2c09d87fa91592b5a2a573e7e.tar.gz |
2009-09-29 Harsha Jagasia <harsha.jagasia@amd.com>
* config.gcc (i[34567]86-*-*): Include fma4intrin.h.
(x86_64-*-*): Ditto.
* config/i386/fma4intrin.h: New file, provide common x86 compiler
intrinsics for FMA4.
* config/i386/cpuid.h (bit_FMA4): Define FMA4 bit.
* config/i386/x86intrin.h: Fix typo to SSE4A instead of SSE4a.
Add FMA4 check and fma4intrin.h.
* config/i386/i386-c.c(ix86_target_macros_internal): Check
ISA_FLAG for FMA4.
* config/i386/i386.h(TARGET_FMA4): New macro for FMA4.
* config/i386/i386.md (UNSPEC_FMA4_INTRINSIC): Add new UNSPEC
constant for FMA4 support.
(UNSPEC_FMA4_FMADDSUB): Ditto.
(UNSPEC_FMA4_FMSUBADD): Ditto.
* config/i386/i386.opt (-mfma4): New switch for FMA4 support.
* config/i386/i386-protos.h (ix86_fma4_valid_op_p): Add
declaration.
(ix86_expand_fma4_multiple_memory): Ditto.
* config/i386/i386.c (OPTION_MASK_ISA_FMA4_SET): New.
(OPTION_MASK_ISA_FMA4_UNSET): New.
(OPTION_MASK_ISA_SSE4A_UNSET): Change definition to
depend on FMA4.
(OPTION_MASK_ISA_AVX_UNSET): Change definition to
depend on FMA4.
(ix86_handle_option): Handle -mfma4.
(isa_opts): Handle -mfma4.
(enum pta_flags): Add PTA_FMA4.
(override_options): Add FMA4 support.
(IX86_BUILTIN_VFMADDSS): New for FMA4 intrinsic.
(IX86_BUILTIN_VFMADDSD): Ditto.
(IX86_BUILTIN_VFMADDPS): Ditto.
(IX86_BUILTIN_VFMADDPD): Ditto.
(IX86_BUILTIN_VFMSUBSS): Ditto.
(IX86_BUILTIN_VFMSUBSD): Ditto.
(IX86_BUILTIN_VFMSUBPS): Ditto.
(IX86_BUILTIN_VFMSUBPD): Ditto.
(IX86_BUILTIN_VFMADDSUBPS): Ditto.
(IX86_BUILTIN_VFMADDSUBPD): Ditto.
(IX86_BUILTIN_VFMSUBADDPS): Ditto.
(IX86_BUILTIN_VFMSUBADDPD): Ditto.
(IX86_BUILTIN_VFNMADDSS): Ditto.
(IX86_BUILTIN_VFNMADDSD): Ditto.
(IX86_BUILTIN_VFNMADDPS): Ditto.
(IX86_BUILTIN_VFNMADDPD): Ditto.
(IX86_BUILTIN_VFNMSUBSS): Ditto.
(IX86_BUILTIN_VFNMSUBSD): Ditto.
(IX86_BUILTIN_VFNMSUBPS): Ditto.
(IX86_BUILTIN_VFNMSUBPD): Ditto.
(IX86_BUILTIN_VFMADDPS256): Ditto.
(IX86_BUILTIN_VFMADDPD256): Ditto.
(IX86_BUILTIN_VFMSUBPS256): Ditto.
(IX86_BUILTIN_VFMSUBPD256): Ditto.
(IX86_BUILTIN_VFMADDSUBPS256): Ditto.
(IX86_BUILTIN_VFMADDSUBPD256): Ditto.
(IX86_BUILTIN_VFMSUBADDPS256): Ditto.
(IX86_BUILTIN_VFMSUBADDPD256): Ditto.
(IX86_BUILTIN_VFNMADDPS256): Ditto.
(IX86_BUILTIN_VFNMADDPD256): Ditto.
(IX86_BUILTIN_VFNMSUBPS256): Ditto.
(IX86_BUILTIN_VFNMSUBPD256): Ditto.
(enum multi_arg_type): New enum for describing the various FMA4
intrinsic argument types.
(bdesc_multi_arg): New table for FMA4 intrinsics.
(ix86_init_mmx_sse_builtins): Add FMA4 intrinsic support.
(ix86_expand_multi_arg_builtin): New function for creating FMA4
intrinsics.
(ix86_expand_builtin): Add FMA4 intrinsic support.
(ix86_fma4_valid_op_p): New function to validate FMA4 3 and 4
operand instructions.
(ix86_expand_fma4_multiple_memory): New function to split the
second memory reference from FMA4 instructions.
* config/i386/sse.md (ssemodesuffixf4): New mode attribute for FMA4.
(ssemodesuffixf2s): Ditto.
(fma4_fmadd<mode>4): Add FMA4 floating point multiply/add
instructions.
(fma4_fmsub<mode>4): Ditto.
(fma4_fnmadd<mode>4): Ditto.
(fma4_fnmsub<mode>4): Ditto.
(fma4_vmfmadd<mode>4): Ditto.
(fma4_vmfmsub<mode>4): Ditto.
(fma4_vmfnmadd<mode>4): Ditto.
(fma4_vmfnmsub<mode>4): Ditto.
(fma4_fmadd<mode>4256): Ditto.
(fma4_fmsub<mode>4256): Ditto.
(fma4_fnmadd<mode>4256): Ditto.
(fma4_fnmsub<mode>4256): Ditto.
(fma4_fmaddsubv8sf4): Ditto.
(fma4_fmaddsubv4sf4): Ditto.
(fma4_fmaddsubv4df4): Ditto.
(fma4_fmaddsubv2df4): Ditto.
(fma4_fmsubaddv8sf4): Ditto.
(fma4_fmsubaddv4sf4): Ditto.
(fma4_fmsubaddv4df4): Ditto.
(fma4_fmsubaddv2df4): Ditto.
(fma4i_fmadd<mode>4): Add FMA4 floating point multiply/add
instructions for intrinsics.
(fma4i_fmsub<mode>4): Ditto.
(fma4i_fnmadd<mode>4): Ditto.
(fma4i_fnmsub<mode>4): Ditto.
(fma4i_vmfmadd<mode>4): Ditto.
(fma4i_vmfmsub<mode>4): Ditto.
(fma4i_vmfnmadd<mode>4): Ditto.
(fma4i_vmfnmsub<mode>4): Ditto.
(fma4i_fmadd<mode>4256): Ditto.
(fma4i_fmsub<mode>4256): Ditto.
(fma4i_fnmadd<mode>4256): Ditto.
(fma4i_fnmsub<mode>4256): Ditto.
(fma4i_fmaddsubv8sf4): Ditto.
(fma4i_fmaddsubv4sf4): Ditto.
(fma4i_fmaddsubv4df4): Ditto.
(fma4i_fmaddsubv2df4): Ditto.
(fma4i_fmsubaddv8sf4): Ditto.
(fma4i_fmsubaddv4sf4): Ditto.
(fma4i_fmsubaddv4df4): Ditto.
(fma4i_fmsubaddv2df4): Ditto.
* doc/invoke.texi (-mfma4): Add documentation.
* doc/extend.texi (x86 intrinsics): Add FMA4 intrinsics.
* gcc.target/i386/fma4-check.h
* gcc.target/i386/fma4-fma.c
* gcc.target/i386/fma4-maccXX.c
* gcc.target/i386/fma4-msubXX.c
* gcc.target/i386/fma4-nmaccXX.c
* gcc.target/i386/fma4-nmsubXX.c
* gcc.target/i386/fma4-vector.c
* gcc.target/i386/fma4-256-maccXX.c
* gcc.target/i386/fma4-256-msubXX.c
* gcc.target/i386/fma4-256-nmaccXX.c
* gcc.target/i386/fma4-256-nmsubXX.c
* gcc.target/i386/fma4-256-vector.c
* gcc.target/i386/funcspec-2.c: New file.
* gcc.target/i386/funcspec-4.c: Test error conditions
related to FMA4.
* gcc.target/i386/funcspec-5.c
* gcc.target/i386/funcspec-6.c
* gcc.target/i386/funcspec-8.c: Add FMA4.
* gcc.target/i386/funcspec-9.c: New file.
* gcc.target/i386/i386.exp: Add check_effective_target_fma4.
* gcc.target/i386/isa-10.c
* gcc.target/i386/isa-11.c
* gcc.target/i386/isa-12.c
* gcc.target/i386/isa-13.c
* gcc.target/i386/isa-2.c
* gcc.target/i386/isa-3.c
* gcc.target/i386/isa-4.c
* gcc.target/i386/isa-7.c
* gcc.target/i386/isa-8.c
* gcc.target/i386/isa-9.c: New file.
* gcc.target/i386/isa-14.c
* gcc.target/i386/isa-1.c
* gcc.target/i386/isa-5.c
* gcc.target/i386/isa-6.c: Add FMA4.
* gcc.target/i386/sse-12.c
* gcc.target/i386/sse-13.c
* gcc.target/i386/sse-14.c
* gcc.target/i386/sse-22.c: New file.
* g++.dg/other/i386-2.C
* g++.dg/other/i386-3.C
* g++.dg/other/i386-5.C
* g++.dg/other/i386-6.C: Add -mfma4 in dg-options.
git-svn-id: svn+ssh://gcc.gnu.org/svn/gcc/trunk@152311 138bc75d-0d04-0410-961f-82ee72b054a4
Diffstat (limited to 'gcc/testsuite/gcc.target/i386')
37 files changed, 2219 insertions, 1 deletion
diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c new file mode 100644 index 00000000000..134200af72a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-maccXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_maccps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_maccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_maccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_maccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_macc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccps ()) + abort (); + + init_maccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_macc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccpd ()) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c 
b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c new file mode 100644 index 00000000000..d6cafb4d542 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-msubXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_msubps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_msubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_msubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_msubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_msub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubps ()) + abort (); + + init_msubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_msub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubpd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c new file mode 100644 index 
00000000000..261f302f2f7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-nmaccXX.c @@ -0,0 +1,96 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_nmaccps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmaccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmaccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmaccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccps ()) + abort (); + + init_nmaccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccpd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c new file mode 100644 index 00000000000..3205715efec --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/fma4-256-nmsubXX.c @@ -0,0 +1,95 @@ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O2 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m256 x[NUM]; + float f[NUM * 8]; + __m256d y[NUM]; + double d[NUM * 4]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_nmsubps () +{ + int i; + for (i = 0; i < NUM * 8; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmsubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmsubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 8; i = i + 8) + for (j = 0; j < 8; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmsubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm256_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + init_nmsubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm256_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-256-vector.c b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c new file mode 100644 index 00000000000..714b743186d --- 
/dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-256-vector.c @@ -0,0 +1,93 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into vfmaddps on FMA4 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ + +extern void exit (int); + +typedef float __m256 __attribute__ ((__vector_size__ (32), __may_alias__)); +typedef double __m256d __attribute__ ((__vector_size__ (32), __may_alias__)); + +#define SIZE 10240 + +union { + __m256 f_align; + __m256d d_align; + float f[SIZE]; + double d[SIZE]; +} a, b, c, d; + +void +flt_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) + d.f[i]; +} + +void +dbl_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) + d.d[i]; +} + +void +flt_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) - d.f[i]; +} + +void +dbl_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) - d.d[i]; +} + +void +flt_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i]; +} + +void +dbl_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i]; +} + +int main () +{ + flt_mul_add (); + flt_mul_sub (); + flt_neg_mul_add (); + + dbl_mul_add (); + dbl_mul_sub (); + dbl_neg_mul_add (); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddps" } } */ +/* { dg-final { scan-assembler "vfmaddpd" } } */ +/* { dg-final { scan-assembler "vfmsubps" } } */ +/* { dg-final { scan-assembler "vfmsubpd" } } */ +/* { dg-final { scan-assembler "vfnmaddps" } } */ +/* { dg-final { scan-assembler "vfnmaddpd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/fma4-check.h b/gcc/testsuite/gcc.target/i386/fma4-check.h new file mode 100644 index 00000000000..76fcdef99b6 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/fma4-check.h @@ -0,0 +1,20 @@ +#include <stdlib.h> + +#include "cpuid.h" + +static void fma4_test (void); + +int +main () +{ + unsigned int eax, ebx, ecx, edx; + + if (!__get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx)) + return 0; + + /* Run FMA4 test only if host has FMA4 support. */ + if (ecx & bit_FMA4) + fma4_test (); + + exit (0); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-fma.c b/gcc/testsuite/gcc.target/i386/fma4-fma.c new file mode 100644 index 00000000000..cb906916117 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-fma.c @@ -0,0 +1,83 @@ +/* Test that the compiler properly optimizes floating point multiply + and add instructions into vfmaddss, vfmsubss, vfnmaddss, + vfnmsubss on FMA4 systems. */ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4" } */ + +extern void exit (int); + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +float +flt_mul_sub (float a, float b, float c) +{ + return (a * b) - c; +} + +double +dbl_mul_sub (double a, double b, double c) +{ + return (a * b) - c; +} + +float +flt_neg_mul_add (float a, float b, float c) +{ + return (-(a * b)) + c; +} + +double +dbl_neg_mul_add (double a, double b, double c) +{ + return (-(a * b)) + c; +} + +float +flt_neg_mul_sub (float a, float b, float c) +{ + return (-(a * b)) - c; +} + +double +dbl_neg_mul_sub (double a, double b, double c) +{ + return (-(a * b)) - c; +} + +float f[10] = { 2, 3, 4 }; +double d[10] = { 2, 3, 4 }; + +int main () +{ + f[3] = flt_mul_add (f[0], f[1], f[2]); + f[4] = flt_mul_sub (f[0], f[1], f[2]); + f[5] = flt_neg_mul_add (f[0], f[1], f[2]); + f[6] = flt_neg_mul_sub (f[0], f[1], f[2]); + + d[3] = dbl_mul_add (d[0], d[1], d[2]); + d[4] = dbl_mul_sub (d[0], d[1], d[2]); + d[5] = dbl_neg_mul_add (d[0], d[1], d[2]); + d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]); + exit (0); +} 
+ +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "vfmaddsd" } } */ +/* { dg-final { scan-assembler "vfmsubss" } } */ +/* { dg-final { scan-assembler "vfmsubsd" } } */ +/* { dg-final { scan-assembler "vfnmaddss" } } */ +/* { dg-final { scan-assembler "vfnmaddsd" } } */ +/* { dg-final { scan-assembler "vfnmsubss" } } */ +/* { dg-final { scan-assembler "vfnmsubsd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/fma4-maccXX.c b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c new file mode 100644 index 00000000000..4b4c00596a7 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-maccXX.c @@ -0,0 +1,136 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_maccps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_maccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_maccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_maccss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i= i + 4) + { + res.f[i] = (src1.f[i] * src2.f[i]) + src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_maccsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = (src1.d[i] * src2.d[i]) + src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_maccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_macc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccps ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_macc_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_maccss ()) + abort (); + + init_maccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_macc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccpd ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_macc_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_maccsd ()) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-msubXX.c 
b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c new file mode 100644 index 00000000000..eed75580e8d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-msubXX.c @@ -0,0 +1,134 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_msubps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_msubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_msubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_msubss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = (src1.f[i] * src2.f[i]) - src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_msubsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = (src1.d[i] * src2.d[i]) - src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + 
+ init_msubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_msub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubps ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_msub_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_msubss ()) + abort (); + + init_msubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_msub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubpd ()) + abort (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_msub_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_msubsd ()) + abort (); +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c new file mode 100644 index 00000000000..9abf7460477 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-nmaccXX.c @@ -0,0 +1,137 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. 
*/ + +static void +init_nmaccps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmaccpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmaccps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) + src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) + src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_nmaccss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = - (src1.f[i] * src2.f[i]) + src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmaccsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = - (src1.d[i] * src2.d[i]) + src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmaccps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmacc_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccps ()) + abort (); + + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmacc_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmaccss ()) + abort (); + + init_nmaccpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmacc_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccpd ()) + abort (); + + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmacc_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmaccsd ()) + abort (); + +} diff --git 
a/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c new file mode 100644 index 00000000000..85fbecddb3d --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-nmsubXX.c @@ -0,0 +1,137 @@ +/* { dg-do run } */ +/* { dg-require-effective-target fma4 } */ +/* { dg-options "-O0 -mfma4" } */ + +#include "fma4-check.h" + +#include <x86intrin.h> +#include <string.h> + +#define NUM 20 + +union +{ + __m128 x[NUM]; + float f[NUM * 4]; + __m128d y[NUM]; + double d[NUM * 2]; +} dst, res, src1, src2, src3; + +/* Note that in macc*,msub*,mnmacc* and mnsub* instructions, the intermdediate + product is not rounded, only the addition is rounded. */ + +static void +init_nmsubps () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.f[i] = i; + src2.f[i] = i + 10; + src3.f[i] = i + 20; + } +} + +static void +init_nmsubpd () +{ + int i; + for (i = 0; i < NUM * 4; i++) + { + src1.d[i] = i; + src2.d[i] = i + 10; + src3.d[i] = i + 20; + } +} + +static int +check_nmsubps () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + for (j = 0; j < 4; j++) + { + res.f[i + j] = - (src1.f[i + j] * src2.f[i + j]) - src3.f[i + j]; + if (dst.f[i + j] != res.f[i + j]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubpd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + for (j = 0; j < 2; j++) + { + res.d[i + j] = - (src1.d[i + j] * src2.d[i + j]) - src3.d[i + j]; + if (dst.d[i + j] != res.d[i + j]) + check_fails++; + } + return check_fails++; +} + + +static int +check_nmsubss () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 4; i = i + 4) + { + res.f[i] = - (src1.f[i] * src2.f[i]) - src3.f[i]; + if (dst.f[i] != res.f[i]) + check_fails++; + } + return check_fails++; +} + +static int +check_nmsubsd () +{ + int i, j, check_fails = 0; + for (i = 0; i < NUM * 2; i = i + 2) + { + res.d[i] = - (src1.d[i] * src2.d[i]) - src3.d[i]; + if (dst.d[i] != res.d[i]) + check_fails++; + } + return 
check_fails++; +} + +static void +fma4_test (void) +{ + int i; + + init_nmsubps (); + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmsub_ps (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubps (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + + for (i = 0; i < NUM; i++) + dst.x[i] = _mm_nmsub_ss (src1.x[i], src2.x[i], src3.x[i]); + + if (check_nmsubss (&dst.x[i], &src1.f[i * 4], &src2.f[i * 4], &src3.f[i * 4])) + abort (); + + init_nmsubpd (); + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmsub_pd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubpd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + + + for (i = 0; i < NUM; i++) + dst.y[i] = _mm_nmsub_sd (src1.y[i], src2.y[i], src3.y[i]); + + if (check_nmsubsd (&dst.y[i], &src1.d[i * 2], &src2.d[i * 2], &src3.d[i * 2])) + abort (); + +} diff --git a/gcc/testsuite/gcc.target/i386/fma4-vector.c b/gcc/testsuite/gcc.target/i386/fma4-vector.c new file mode 100644 index 00000000000..df8463ea32a --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/fma4-vector.c @@ -0,0 +1,93 @@ +/* Test that the compiler properly optimizes floating point multiply and add + instructions vector into vfmaddps on FMA4 systems. 
*/ + +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -mfma4 -ftree-vectorize" } */ + +extern void exit (int); + +typedef float __m128 __attribute__ ((__vector_size__ (16), __may_alias__)); +typedef double __m128d __attribute__ ((__vector_size__ (16), __may_alias__)); + +#define SIZE 10240 + +union { + __m128 f_align; + __m128d d_align; + float f[SIZE]; + double d[SIZE]; +} a, b, c, d; + +void +flt_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) + d.f[i]; +} + +void +dbl_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) + d.d[i]; +} + +void +flt_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (b.f[i] * c.f[i]) - d.f[i]; +} + +void +dbl_mul_sub (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (b.d[i] * c.d[i]) - d.d[i]; +} + +void +flt_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.f[i] = (-(b.f[i] * c.f[i])) + d.f[i]; +} + +void +dbl_neg_mul_add (void) +{ + int i; + + for (i = 0; i < SIZE; i++) + a.d[i] = (-(b.d[i] * c.d[i])) + d.d[i]; +} + +int main () +{ + flt_mul_add (); + flt_mul_sub (); + flt_neg_mul_add (); + + dbl_mul_add (); + dbl_mul_sub (); + dbl_neg_mul_add (); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddps" } } */ +/* { dg-final { scan-assembler "vfmaddpd" } } */ +/* { dg-final { scan-assembler "vfmsubps" } } */ +/* { dg-final { scan-assembler "vfmsubpd" } } */ +/* { dg-final { scan-assembler "vfnmaddps" } } */ +/* { dg-final { scan-assembler "vfnmaddpd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-2.c b/gcc/testsuite/gcc.target/i386/funcspec-2.c new file mode 100644 index 00000000000..c132fc9a965 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/funcspec-2.c @@ -0,0 +1,99 @@ +/* Test whether using target specific options, we can generate FMA4 code. 
*/ +/* { dg-do compile } */ +/* { dg-require-effective-target lp64 } */ +/* { dg-options "-O2 -march=k8" } */ + +extern void exit (int); + +#define FMA4_ATTR __attribute__((__target__("fma4"))) +extern float flt_mul_add (float a, float b, float c) FMA4_ATTR; +extern float flt_mul_sub (float a, float b, float c) FMA4_ATTR; +extern float flt_neg_mul_add (float a, float b, float c) FMA4_ATTR; +extern float flt_neg_mul_sub (float a, float b, float c) FMA4_ATTR; + +extern double dbl_mul_add (double a, double b, double c) FMA4_ATTR; +extern double dbl_mul_sub (double a, double b, double c) FMA4_ATTR; +extern double dbl_neg_mul_add (double a, double b, double c) FMA4_ATTR; +extern double dbl_neg_mul_sub (double a, double b, double c) FMA4_ATTR; + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +float +flt_mul_sub (float a, float b, float c) +{ + return (a * b) - c; +} + +double +dbl_mul_sub (double a, double b, double c) +{ + return (a * b) - c; +} + +float +flt_neg_mul_add (float a, float b, float c) +{ + return (-(a * b)) + c; +} + +double +dbl_neg_mul_add (double a, double b, double c) +{ + return (-(a * b)) + c; +} + +float +flt_neg_mul_sub (float a, float b, float c) +{ + return (-(a * b)) - c; +} + +double +dbl_neg_mul_sub (double a, double b, double c) +{ + return (-(a * b)) - c; +} + +float f[10] = { 2, 3, 4 }; +double d[10] = { 2, 3, 4 }; + +int main () +{ + f[3] = flt_mul_add (f[0], f[1], f[2]); + f[4] = flt_mul_sub (f[0], f[1], f[2]); + f[5] = flt_neg_mul_add (f[0], f[1], f[2]); + f[6] = flt_neg_mul_sub (f[0], f[1], f[2]); + + d[3] = dbl_mul_add (d[0], d[1], d[2]); + d[4] = dbl_mul_sub (d[0], d[1], d[2]); + d[5] = dbl_neg_mul_add (d[0], d[1], d[2]); + d[6] = dbl_neg_mul_sub (d[0], d[1], d[2]); + exit (0); +} + +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "vfmaddsd" } } */ +/* { dg-final { scan-assembler "vfmsubss" 
} } */ +/* { dg-final { scan-assembler "vfmsubsd" } } */ +/* { dg-final { scan-assembler "vfnmaddss" } } */ +/* { dg-final { scan-assembler "vfnmaddsd" } } */ +/* { dg-final { scan-assembler "vfnmsubss" } } */ +/* { dg-final { scan-assembler "vfnmsubsd" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)flt_neg_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_mul_sub" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_add" } } */ +/* { dg-final { scan-assembler "call\t(.*)dbl_neg_mul_sub" } } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-4.c b/gcc/testsuite/gcc.target/i386/funcspec-4.c index e2eef418bc2..025b97dff8e 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-4.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-4.c @@ -1,6 +1,9 @@ /* Test some error conditions with function specific options. 
*/ /* { dg-do compile } */ +/* no fma400 switch */ +extern void error1 (void) __attribute__((__target__("fma400"))); /* { dg-error "unknown" } */ + /* Multiple arch switches */ extern void error2 (void) __attribute__((__target__("arch=core2,arch=k8"))); /* { dg-error "already specified" } */ diff --git a/gcc/testsuite/gcc.target/i386/funcspec-5.c b/gcc/testsuite/gcc.target/i386/funcspec-5.c index cbecfaad56d..34da51ceb9f 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-5.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-5.c @@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4"))); extern void test_sse4_1 (void) __attribute__((__target__("sse4.1"))); extern void test_sse4_2 (void) __attribute__((__target__("sse4.2"))); extern void test_sse4a (void) __attribute__((__target__("sse4a"))); +extern void test_fma4 (void) __attribute__((__target__("fma4"))); extern void test_ssse3 (void) __attribute__((__target__("ssse3"))); extern void test_no_abm (void) __attribute__((__target__("no-abm"))); @@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4"))); extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1"))); extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2"))); extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a"))); +extern void test_no_fma4 (void) __attribute__((__target__("no-fma4"))); extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3"))); extern void test_arch_i386 (void) __attribute__((__target__("arch=i386"))); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-6.c b/gcc/testsuite/gcc.target/i386/funcspec-6.c index 7f46ad0c1b9..575be9bbbdd 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-6.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-6.c @@ -16,6 +16,7 @@ extern void test_sse4 (void) __attribute__((__target__("sse4"))); extern void test_sse4_1 (void) __attribute__((__target__("sse4.1"))); extern void test_sse4_2 (void) 
__attribute__((__target__("sse4.2"))); extern void test_sse4a (void) __attribute__((__target__("sse4a"))); +extern void test_fma4 (void) __attribute__((__target__("fma4"))); extern void test_ssse3 (void) __attribute__((__target__("ssse3"))); extern void test_no_abm (void) __attribute__((__target__("no-abm"))); @@ -31,6 +32,7 @@ extern void test_no_sse4 (void) __attribute__((__target__("no-sse4"))); extern void test_no_sse4_1 (void) __attribute__((__target__("no-sse4.1"))); extern void test_no_sse4_2 (void) __attribute__((__target__("no-sse4.2"))); extern void test_no_sse4a (void) __attribute__((__target__("no-sse4a"))); +extern void test_no_fma4 (void) __attribute__((__target__("no-fma4"))); extern void test_no_ssse3 (void) __attribute__((__target__("no-ssse3"))); extern void test_arch_nocona (void) __attribute__((__target__("arch=nocona"))); diff --git a/gcc/testsuite/gcc.target/i386/funcspec-8.c b/gcc/testsuite/gcc.target/i386/funcspec-8.c index c370733a01b..ba4b7f22743 100644 --- a/gcc/testsuite/gcc.target/i386/funcspec-8.c +++ b/gcc/testsuite/gcc.target/i386/funcspec-8.c @@ -104,6 +104,25 @@ generic_insertq (__m128i a, __m128i b) return __builtin_ia32_insertq (a, b); /* { dg-error "needs isa option" } */ } +#ifdef __FMA4__ +#error "-mfma4 should not be set for this test" +#endif + +__m128d fma4_fmaddpd (__m128d a, __m128d b, __m128d c) __attribute__((__target__("fma4"))); +__m128d generic_fmaddpd (__m128d a, __m128d b, __m128d c); + +__m128d +fma4_fmaddpd (__m128d a, __m128d b, __m128d c) +{ + return __builtin_ia32_vfmaddpd (a, b, c); +} + +__m128d +generic_fmaddpd (__m128d a, __m128d b, __m128d c) +{ + return __builtin_ia32_vfmaddpd (a, b, c); /* { dg-error "needs isa option" } */ +} + #ifdef __AES__ #error "-maes should not be set for this test" #endif diff --git a/gcc/testsuite/gcc.target/i386/funcspec-9.c b/gcc/testsuite/gcc.target/i386/funcspec-9.c new file mode 100644 index 00000000000..78714e12417 --- /dev/null +++ 
b/gcc/testsuite/gcc.target/i386/funcspec-9.c @@ -0,0 +1,36 @@ +/* Test whether using target specific options, we can generate FMA4 code. */ +/* { dg-do compile } */ +/* { dg-options "-O2 -march=k8 -mfpmath=sse -msse2" } */ + +extern void exit (int); + +#ifdef __FMA4__ +#warning "__FMA4__ should not be defined before #pragma GCC target." +#endif + +#pragma GCC push_options +#pragma GCC target ("fma4") + +#ifndef __FMA4__ +#warning "__FMA4__ should have be defined after #pragma GCC target." +#endif + +float +flt_mul_add (float a, float b, float c) +{ + return (a * b) + c; +} + +#pragma GCC pop_options +#ifdef __FMA4__ +#warning "__FMA4__ should not be defined after #pragma GCC pop target." +#endif + +double +dbl_mul_add (double a, double b, double c) +{ + return (a * b) + c; +} + +/* { dg-final { scan-assembler "vfmaddss" } } */ +/* { dg-final { scan-assembler "addsd" } } */ diff --git a/gcc/testsuite/gcc.target/i386/i386.exp b/gcc/testsuite/gcc.target/i386/i386.exp index 242c40fb373..c7c6e12d1f4 100644 --- a/gcc/testsuite/gcc.target/i386/i386.exp +++ b/gcc/testsuite/gcc.target/i386/i386.exp @@ -120,6 +120,20 @@ proc check_effective_target_sse4a { } { } "-O2 -msse4a" ] } +# Return 1 if fma4 instructions can be compiled. +proc check_effective_target_fma4 { } { + return [check_no_compiler_messages fma4 object { + typedef float __m128 __attribute__ ((__vector_size__ (16))); + typedef float __v4sf __attribute__ ((__vector_size__ (16))); + __m128 _mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) + { + return (__m128) __builtin_ia32_vfmaddps ((__v4sf)__A, + (__v4sf)__B, + (__v4sf)__C); + } + } "-O2 -mfma4" ] +} + # If a testcase doesn't have special options, use these. 
global DEFAULT_CFLAGS if ![info exists DEFAULT_CFLAGS] then { diff --git a/gcc/testsuite/gcc.target/i386/isa-1.c b/gcc/testsuite/gcc.target/i386/isa-1.c index b2040b3e3f6..d98c14ffb19 100644 --- a/gcc/testsuite/gcc.target/i386/isa-1.c +++ b/gcc/testsuite/gcc.target/i386/isa-1.c @@ -27,5 +27,11 @@ main () #if defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-10.c b/gcc/testsuite/gcc.target/i386/isa-10.c new file mode 100644 index 00000000000..5f57be91390 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-10.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-11.c b/gcc/testsuite/gcc.target/i386/isa-11.c new file mode 100644 index 00000000000..64755b099cc --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-11.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-ssse3" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-12.c b/gcc/testsuite/gcc.target/i386/isa-12.c new file mode 100644 index 00000000000..fde84a21aef 
--- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-12.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse3" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-13.c b/gcc/testsuite/gcc.target/i386/isa-13.c new file mode 100644 index 00000000000..74e37d92d12 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-13.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -mfma4 -mno-sse2" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if defined __SSE2__ + abort (); +#endif +#if defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-14.c b/gcc/testsuite/gcc.target/i386/isa-14.c index 09dda6d062c..5d49e6e77fe 100644 --- a/gcc/testsuite/gcc.target/i386/isa-14.c +++ b/gcc/testsuite/gcc.target/i386/isa-14.c @@ -1,5 +1,5 @@ /* { dg-do run } */ -/* { dg-options "-march=x86-64 -msse4a -mno-sse" } */ +/* { dg-options "-march=x86-64 -msse4a -mfma4 -mno-sse" } */ extern void abort (void); @@ -27,5 +27,8 @@ main () #if defined __SSE4A__ abort (); #endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-2.c b/gcc/testsuite/gcc.target/i386/isa-2.c new file mode 100644 index 00000000000..aa8958c9364 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-2.c @@ -0,0 +1,37 @@ 
+/* { dg-do run } */ +/* { dg-options "-march=x86-64 -msse4 -mfma4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if !defined __AVX__ + abort (); +#endif +#if !defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-3.c b/gcc/testsuite/gcc.target/i386/isa-3.c new file mode 100644 index 00000000000..a4d93f4bd99 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-3.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=x86-64 -msse4 -mfma4 -msse4a" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if !defined __AVX__ + abort (); +#endif +#if !defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-4.c b/gcc/testsuite/gcc.target/i386/isa-4.c new file mode 100644 index 00000000000..0137257c8d1 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-4.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=core2 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); 
+#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-5.c b/gcc/testsuite/gcc.target/i386/isa-5.c index 2157499aefa..39d065e68ee 100644 --- a/gcc/testsuite/gcc.target/i386/isa-5.c +++ b/gcc/testsuite/gcc.target/i386/isa-5.c @@ -27,5 +27,11 @@ main () #if !defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-6.c b/gcc/testsuite/gcc.target/i386/isa-6.c index 389621b475b..a9a0ddb14c1 100644 --- a/gcc/testsuite/gcc.target/i386/isa-6.c +++ b/gcc/testsuite/gcc.target/i386/isa-6.c @@ -28,5 +28,11 @@ main () #if !defined __SSE4A__ abort (); #endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif return 0; } diff --git a/gcc/testsuite/gcc.target/i386/isa-7.c b/gcc/testsuite/gcc.target/i386/isa-7.c new file mode 100644 index 00000000000..8dd628e9241 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-7.c @@ -0,0 +1,37 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if !defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __AVX__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-8.c b/gcc/testsuite/gcc.target/i386/isa-8.c new file mode 100644 index 00000000000..2ffd80fbad2 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-8.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mfma4 -mno-sse4a" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif 
+#if !defined __SSSE3__ + abort (); +#endif +#if !defined __SSE4_1__ + abort (); +#endif +#if !defined __SSE4_2__ + abort (); +#endif +#if defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/isa-9.c b/gcc/testsuite/gcc.target/i386/isa-9.c new file mode 100644 index 00000000000..64cbdbd6654 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/isa-9.c @@ -0,0 +1,34 @@ +/* { dg-do run } */ +/* { dg-options "-march=amdfam10 -mno-fma4" } */ + +extern void abort (void); + +int +main () +{ +#if !defined __SSE__ + abort (); +#endif +#if !defined __SSE2__ + abort (); +#endif +#if !defined __SSE3__ + abort (); +#endif +#if defined __SSSE3__ + abort (); +#endif +#if defined __SSE4_1__ + abort (); +#endif +#if defined __SSE4_2__ + abort (); +#endif +#if !defined __SSE4A__ + abort (); +#endif +#if defined __FMA4__ + abort (); +#endif + return 0; +} diff --git a/gcc/testsuite/gcc.target/i386/sse-12.c b/gcc/testsuite/gcc.target/i386/sse-12.c new file mode 100644 index 00000000000..85c36c8be31 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-12.c @@ -0,0 +1,8 @@ +/* Test that {,x,e,p,t,s,w,a,b,i}mmintrin.h, mm3dnow.h and mm_malloc.h are + usable with -O -std=c89 -pedantic-errors. */ +/* { dg-do compile } */ +/* { dg-options "-O -std=c89 -pedantic-errors -march=k8 -m3dnow -mavx -mfma4 -maes -mpclmul" } */ + +#include <x86intrin.h> + +int dummy; diff --git a/gcc/testsuite/gcc.target/i386/sse-13.c b/gcc/testsuite/gcc.target/i386/sse-13.c new file mode 100644 index 00000000000..1ce9d960884 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-13.c @@ -0,0 +1,128 @@ +/* { dg-do compile } */ +/* { dg-options "-O2 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -maes -mpclmul" } */ + +#include <mm_malloc.h> + +/* Test that the intrinsics compile with optimization. 
All of them are + defined as inline functions in {,x,e,p,t,s,w,a,b,i}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. */ + +#define extern +#define __inline + +/* Following intrinsics require immediate arguments. */ + +/* ammintrin.h */ +#define __builtin_ia32_extrqi(X, I, L) __builtin_ia32_extrqi(X, 1, 1) +#define __builtin_ia32_insertqi(X, Y, I, L) __builtin_ia32_insertqi(X, Y, 1, 1) + +/* immintrin.h */ +#define __builtin_ia32_blendpd256(X, Y, M) __builtin_ia32_blendpd256(X, Y, 1) +#define __builtin_ia32_blendps256(X, Y, M) __builtin_ia32_blendps256(X, Y, 1) +#define __builtin_ia32_dpps256(X, Y, M) __builtin_ia32_dpps256(X, Y, 1) +#define __builtin_ia32_shufpd256(X, Y, M) __builtin_ia32_shufpd256(X, Y, 1) +#define __builtin_ia32_shufps256(X, Y, M) __builtin_ia32_shufps256(X, Y, 1) +#define __builtin_ia32_cmpsd(X, Y, O) __builtin_ia32_cmpsd(X, Y, 1) +#define __builtin_ia32_cmpss(X, Y, O) __builtin_ia32_cmpss(X, Y, 1) +#define __builtin_ia32_cmppd(X, Y, O) __builtin_ia32_cmppd(X, Y, 1) +#define __builtin_ia32_cmpps(X, Y, O) __builtin_ia32_cmpps(X, Y, 1) +#define __builtin_ia32_cmppd256(X, Y, O) __builtin_ia32_cmppd256(X, Y, 1) +#define __builtin_ia32_cmpps256(X, Y, O) __builtin_ia32_cmpps256(X, Y, 1) +#define __builtin_ia32_vextractf128_pd256(X, N) __builtin_ia32_vextractf128_pd256(X, 1) +#define __builtin_ia32_vextractf128_ps256(X, N) __builtin_ia32_vextractf128_ps256(X, 1) +#define __builtin_ia32_vextractf128_si256(X, N) __builtin_ia32_vextractf128_si256(X, 1) +#define __builtin_ia32_vpermilpd(X, N) __builtin_ia32_vpermilpd(X, 1) +#define __builtin_ia32_vpermilpd256(X, N) __builtin_ia32_vpermilpd256(X, 1) +#define __builtin_ia32_vpermilps(X, N) __builtin_ia32_vpermilps(X, 1) +#define __builtin_ia32_vpermilps256(X, N) __builtin_ia32_vpermilps256(X, 1) +#define __builtin_ia32_vpermil2pd(X, Y, C, I) __builtin_ia32_vpermil2pd(X, Y, C, 1) +#define 
__builtin_ia32_vpermil2pd256(X, Y, C, I) __builtin_ia32_vpermil2pd256(X, Y, C, 1) +#define __builtin_ia32_vpermil2ps(X, Y, C, I) __builtin_ia32_vpermil2ps(X, Y, C, 1) +#define __builtin_ia32_vpermil2ps256(X, Y, C, I) __builtin_ia32_vpermil2ps256(X, Y, C, 1) +#define __builtin_ia32_vperm2f128_pd256(X, Y, C) __builtin_ia32_vperm2f128_pd256(X, Y, 1) +#define __builtin_ia32_vperm2f128_ps256(X, Y, C) __builtin_ia32_vperm2f128_ps256(X, Y, 1) +#define __builtin_ia32_vperm2f128_si256(X, Y, C) __builtin_ia32_vperm2f128_si256(X, Y, 1) +#define __builtin_ia32_vinsertf128_pd256(X, Y, C) __builtin_ia32_vinsertf128_pd256(X, Y, 1) +#define __builtin_ia32_vinsertf128_ps256(X, Y, C) __builtin_ia32_vinsertf128_ps256(X, Y, 1) +#define __builtin_ia32_vinsertf128_si256(X, Y, C) __builtin_ia32_vinsertf128_si256(X, Y, 1) +#define __builtin_ia32_roundpd256(V, M) __builtin_ia32_roundpd256(V, 1) +#define __builtin_ia32_roundps256(V, M) __builtin_ia32_roundps256(V, 1) + +/* wmmintrin.h */ +#define __builtin_ia32_aeskeygenassist128(X, C) __builtin_ia32_aeskeygenassist128(X, 1) +#define __builtin_ia32_pclmulqdq128(X, Y, I) __builtin_ia32_pclmulqdq128(X, Y, 1) + +/* smmintrin.h */ +#define __builtin_ia32_roundpd(V, M) __builtin_ia32_roundpd(V, 1) +#define __builtin_ia32_roundsd(D, V, M) __builtin_ia32_roundsd(D, V, 1) +#define __builtin_ia32_roundps(V, M) __builtin_ia32_roundps(V, 1) +#define __builtin_ia32_roundss(D, V, M) __builtin_ia32_roundss(D, V, 1) + +#define __builtin_ia32_pblendw128(X, Y, M) __builtin_ia32_pblendw128 (X, Y, 1) +#define __builtin_ia32_blendps(X, Y, M) __builtin_ia32_blendps(X, Y, 1) +#define __builtin_ia32_blendpd(X, Y, M) __builtin_ia32_blendpd(X, Y, 1) +#define __builtin_ia32_dpps(X, Y, M) __builtin_ia32_dpps(X, Y, 1) +#define __builtin_ia32_dppd(X, Y, M) __builtin_ia32_dppd(X, Y, 1) +#define __builtin_ia32_insertps128(D, S, N) __builtin_ia32_insertps128(D, S, 1) +#define __builtin_ia32_vec_ext_v4sf(X, N) __builtin_ia32_vec_ext_v4sf(X, 1) +#define 
__builtin_ia32_vec_set_v16qi(D, S, N) __builtin_ia32_vec_set_v16qi(D, S, 1) +#define __builtin_ia32_vec_set_v4si(D, S, N) __builtin_ia32_vec_set_v4si(D, S, 1) +#define __builtin_ia32_vec_set_v2di(D, S, N) __builtin_ia32_vec_set_v2di(D, S, 1) +#define __builtin_ia32_vec_ext_v16qi(X, N) __builtin_ia32_vec_ext_v16qi(X, 1) +#define __builtin_ia32_vec_ext_v4si(X, N) __builtin_ia32_vec_ext_v4si(X, 1) +#define __builtin_ia32_vec_ext_v2di(X, N) __builtin_ia32_vec_ext_v2di(X, 1) +#define __builtin_ia32_mpsadbw128(X, Y, M) __builtin_ia32_mpsadbw128(X, Y, 1) +#define __builtin_ia32_pcmpistrm128(X, Y, M) \ + __builtin_ia32_pcmpistrm128(X, Y, 1) +#define __builtin_ia32_pcmpistri128(X, Y, M) \ + __builtin_ia32_pcmpistri128(X, Y, 1) +#define __builtin_ia32_pcmpestrm128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestrm128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestri128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestri128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpistria128(X, Y, M) \ + __builtin_ia32_pcmpistria128(X, Y, 1) +#define __builtin_ia32_pcmpistric128(X, Y, M) \ + __builtin_ia32_pcmpistric128(X, Y, 1) +#define __builtin_ia32_pcmpistrio128(X, Y, M) \ + __builtin_ia32_pcmpistrio128(X, Y, 1) +#define __builtin_ia32_pcmpistris128(X, Y, M) \ + __builtin_ia32_pcmpistris128(X, Y, 1) +#define __builtin_ia32_pcmpistriz128(X, Y, M) \ + __builtin_ia32_pcmpistriz128(X, Y, 1) +#define __builtin_ia32_pcmpestria128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestria128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestric128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestric128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestrio128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestrio128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestris128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestris128(X, LX, Y, LY, 1) +#define __builtin_ia32_pcmpestriz128(X, LX, Y, LY, M) \ + __builtin_ia32_pcmpestriz128(X, LX, Y, LY, 1) + +/* tmmintrin.h */ +#define __builtin_ia32_palignr128(X, Y, N) __builtin_ia32_palignr128(X, 
Y, 8) +#define __builtin_ia32_palignr(X, Y, N) __builtin_ia32_palignr(X, Y, 8) + +/* emmintrin.h */ +#define __builtin_ia32_psrldqi128(A, B) __builtin_ia32_psrldqi128(A, 8) +#define __builtin_ia32_pslldqi128(A, B) __builtin_ia32_pslldqi128(A, 8) +#define __builtin_ia32_pshufhw(A, N) __builtin_ia32_pshufhw(A, 0) +#define __builtin_ia32_pshuflw(A, N) __builtin_ia32_pshuflw(A, 0) +#define __builtin_ia32_pshufd(A, N) __builtin_ia32_pshufd(A, 0) +#define __builtin_ia32_vec_set_v8hi(A, D, N) \ + __builtin_ia32_vec_set_v8hi(A, D, 0) +#define __builtin_ia32_vec_ext_v8hi(A, N) __builtin_ia32_vec_ext_v8hi(A, 0) +#define __builtin_ia32_shufpd(A, B, N) __builtin_ia32_shufpd(A, B, 0) + +/* xmmintrin.h */ +#define __builtin_prefetch(P, A, I) __builtin_prefetch(P, A, _MM_HINT_NTA) +#define __builtin_ia32_pshufw(A, N) __builtin_ia32_pshufw(A, 0) +#define __builtin_ia32_vec_set_v4hi(A, D, N) \ + __builtin_ia32_vec_set_v4hi(A, D, 0) +#define __builtin_ia32_vec_ext_v4hi(A, N) __builtin_ia32_vec_ext_v4hi(A, 0) +#define __builtin_ia32_shufps(A, B, N) __builtin_ia32_shufps(A, B, 0) + +#include <x86intrin.h> diff --git a/gcc/testsuite/gcc.target/i386/sse-14.c b/gcc/testsuite/gcc.target/i386/sse-14.c new file mode 100644 index 00000000000..c1ddb96e5c3 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-14.c @@ -0,0 +1,157 @@ +/* { dg-do compile } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration -march=k8 -m3dnow -mavx -msse4a -maes -mpclmul" } */ + +#include <mm_malloc.h> + +/* Test that the intrinsics compile without optimization. All of them are + defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. 
*/ + +#define extern +#define __inline + +#include <x86intrin.h> + +#define _CONCAT(x,y) x ## y + +#define test_1(func, type, op1_type, imm) \ + type _CONCAT(_,func) (op1_type A, int const I) \ + { return func (A, imm); } + +#define test_1x(func, type, op1_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, int const I, int const L) \ + { return func (A, imm1, imm2); } + +#define test_2(func, type, op1_type, op2_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \ + { return func (A, B, imm); } + +#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \ + { return func (A, B, imm1, imm2); } + +#define test_3(func, type, op1_type, op2_type, op3_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, int const I) \ + { return func (A, B, C, imm); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, op4_type D, int const I) \ + { return func (A, B, C, D, imm); } + + +/* Following intrinsics require immediate arguments. They + are defined as macros for non-optimized compilations. 
*/ + +/* ammintrin.h */ +test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1) +test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1) + +/* immintrin.h */ +test_2 (_mm256_blend_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_blend_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_dp_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_shuffle_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_shuffle_ps, __m256, __m256, __m256, 1) +test_2 (_mm_cmp_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_cmp_ss, __m128, __m128, __m128, 1) +test_2 (_mm_cmp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_cmp_ps, __m128, __m128, __m128, 1) +test_2 (_mm256_cmp_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_cmp_ps, __m256, __m256, __m256, 1) +test_1 (_mm256_extractf128_pd, __m128d, __m256d, 1) +test_1 (_mm256_extractf128_ps, __m128, __m256, 1) +test_1 (_mm256_extractf128_si256, __m128i, __m256i, 1) +test_1 (_mm256_extract_epi8, int, __m256i, 20) +test_1 (_mm256_extract_epi16, int, __m256i, 13) +test_1 (_mm256_extract_epi32, int, __m256i, 6) +#ifdef __x86_64__ +test_1 (_mm256_extract_epi64, long long, __m256i, 2) +#endif +test_1 (_mm_permute_pd, __m128d, __m128d, 1) +test_1 (_mm256_permute_pd, __m256d, __m256d, 1) +test_1 (_mm_permute_ps, __m128, __m128, 1) +test_1 (_mm256_permute_ps, __m256, __m256, 1) +test_2 (_mm256_permute2f128_pd, __m256d, __m256d, __m256d, 1) +test_2 (_mm256_permute2f128_ps, __m256, __m256, __m256, 1) +test_2 (_mm256_permute2f128_si256, __m256i, __m256i, __m256i, 1) +test_2 (_mm256_insertf128_pd, __m256d, __m256d, __m128d, 1) +test_2 (_mm256_insertf128_ps, __m256, __m256, __m128, 1) +test_2 (_mm256_insertf128_si256, __m256i, __m256i, __m128i, 1) +test_2 (_mm256_insert_epi8, __m256i, __m256i, int, 30) +test_2 (_mm256_insert_epi16, __m256i, __m256i, int, 7) +test_2 (_mm256_insert_epi32, __m256i, __m256i, int, 3) +#ifdef __x86_64__ +test_2 (_mm256_insert_epi64, __m256i, __m256i, long long, 1) +#endif +test_1 (_mm256_round_pd, __m256d, __m256d, 1) +test_1 
(_mm256_round_ps, __m256, __m256, 1) + +/* wmmintrin.h */ +test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1) +test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1) + +/* smmintrin.h */ +test_1 (_mm_round_pd, __m128d, __m128d, 1) +test_1 (_mm_round_ps, __m128, __m128, 1) +test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_round_ss, __m128, __m128, __m128, 1) + +test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1) +test_2 (_mm_blend_ps, __m128, __m128, __m128, 1) +test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_dp_ps, __m128, __m128, __m128, 1) +test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_insert_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_ps, int, __m128, 1) +test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1) +test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1) +#ifdef __x86_64__ +test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1) +#endif +test_1 (_mm_extract_epi8, int, __m128i, 1) +test_1 (_mm_extract_epi32, int, __m128i, 1) +#ifdef __x86_64__ +test_1 (_mm_extract_epi64, long long, __m128i, 1) +#endif +test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistri, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1) +test_2 (_mm_cmpistra, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistro, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) + +/* tmmintrin.h */ +test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1) +test_2 
(_mm_alignr_pi8, __m64, __m64, __m64, 1) + +/* emmintrin.h */ +test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1) +test_1 (_mm_srli_si128, __m128i, __m128i, 1) +test_1 (_mm_slli_si128, __m128i, __m128i, 1) +test_1 (_mm_extract_epi16, int, __m128i, 1) +test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1) +test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1) +test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1) +test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1) + +/* xmmintrin.h */ +test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_pi16, int, __m64, 1) +test_1 (_m_pextrw, int, __m64, 1) +test_2 (_mm_insert_pi16, __m64, __m64, int, 1) +test_2 (_m_pinsrw, __m64, __m64, int, 1) +test_1 (_mm_shuffle_pi16, __m64, __m64, 1) +test_1 (_m_pshufw, __m64, __m64, 1) +test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA) diff --git a/gcc/testsuite/gcc.target/i386/sse-22.c b/gcc/testsuite/gcc.target/i386/sse-22.c new file mode 100644 index 00000000000..eeae0fcab75 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/sse-22.c @@ -0,0 +1,161 @@ +/* Same as sse-14, except converted to use #pragma GCC option. */ +/* { dg-do compile } */ +/* { dg-options "-O0 -Werror-implicit-function-declaration" } */ + +#include <mm_malloc.h> + +/* Test that the intrinsics compile without optimization. All of them are + defined as inline functions in {,x,e,p,t,s,w,a}mmintrin.h and mm3dnow.h + that reference the proper builtin functions. Defining away "extern" and + "__inline" results in all of them being compiled as proper functions. 
*/ + +#define extern +#define __inline + +#define _CONCAT(x,y) x ## y + +#define test_1(func, type, op1_type, imm) \ + type _CONCAT(_,func) (op1_type A, int const I) \ + { return func (A, imm); } + +#define test_1x(func, type, op1_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, int const I, int const L) \ + { return func (A, imm1, imm2); } + +#define test_2(func, type, op1_type, op2_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I) \ + { return func (A, B, imm); } + +#define test_2x(func, type, op1_type, op2_type, imm1, imm2) \ + type _CONCAT(_,func) (op1_type A, op2_type B, int const I, int const L) \ + { return func (A, B, imm1, imm2); } + +#define test_4(func, type, op1_type, op2_type, op3_type, op4_type, imm) \ + type _CONCAT(_,func) (op1_type A, op2_type B, \ + op3_type C, op4_type D, int const I) \ + { return func (A, B, C, D, imm); } + + +#ifndef DIFFERENT_PRAGMAS +#pragma GCC target ("mmx,3dnow,sse,sse2,sse3,ssse3,sse4.1,sse4.2,sse4a,aes,pclmul") +#endif + +/* Following intrinsics require immediate arguments. They + are defined as macros for non-optimized compilations. */ + +/* mmintrin.h (MMX). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("mmx") +#endif +#include <mmintrin.h> + +/* mm3dnow.h (3DNOW). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("3dnow") +#endif +#include <mm3dnow.h> + +/* xmmintrin.h (SSE). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse") +#endif +#include <xmmintrin.h> +test_2 (_mm_shuffle_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_pi16, int, __m64, 1) +test_1 (_m_pextrw, int, __m64, 1) +test_2 (_mm_insert_pi16, __m64, __m64, int, 1) +test_2 (_m_pinsrw, __m64, __m64, int, 1) +test_1 (_mm_shuffle_pi16, __m64, __m64, 1) +test_1 (_m_pshufw, __m64, __m64, 1) +test_1 (_mm_prefetch, void, void *, _MM_HINT_NTA) + +/* emmintrin.h (SSE2). 
*/ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse2") +#endif +#include <emmintrin.h> +test_2 (_mm_shuffle_pd, __m128d, __m128d, __m128d, 1) +test_1 (_mm_srli_si128, __m128i, __m128i, 1) +test_1 (_mm_slli_si128, __m128i, __m128i, 1) +test_1 (_mm_extract_epi16, int, __m128i, 1) +test_2 (_mm_insert_epi16, __m128i, __m128i, int, 1) +test_1 (_mm_shufflehi_epi16, __m128i, __m128i, 1) +test_1 (_mm_shufflelo_epi16, __m128i, __m128i, 1) +test_1 (_mm_shuffle_epi32, __m128i, __m128i, 1) + +/* pmmintrin.h (SSE3). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse3") +#endif +#include <pmmintrin.h> + +/* tmmintrin.h (SSSE3). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("ssse3") +#endif +#include <tmmintrin.h> +test_2 (_mm_alignr_epi8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_alignr_pi8, __m64, __m64, __m64, 1) + +/* ammintrin.h (SSE4A). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse4a") +#endif +#include <ammintrin.h> +test_1x (_mm_extracti_si64, __m128i, __m128i, 1, 1) +test_2x (_mm_inserti_si64, __m128i, __m128i, __m128i, 1, 1) + +/* smmintrin.h (SSE4.1). */ +/* nmmintrin.h (SSE4.2). */ +/* Note, nmmintrin.h includes smmintrin.h, and smmintrin.h checks for the + #ifdef. So just set the option to SSE4.2. 
*/ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("sse4.2") +#endif +#include <nmmintrin.h> +test_2 (_mm_blend_epi16, __m128i, __m128i, __m128i, 1) +test_2 (_mm_blend_ps, __m128, __m128, __m128, 1) +test_2 (_mm_blend_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_dp_ps, __m128, __m128, __m128, 1) +test_2 (_mm_dp_pd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_insert_ps, __m128, __m128, __m128, 1) +test_1 (_mm_extract_ps, int, __m128, 1) +test_2 (_mm_insert_epi8, __m128i, __m128i, int, 1) +test_2 (_mm_insert_epi32, __m128i, __m128i, int, 1) +#ifdef __x86_64__ +test_2 (_mm_insert_epi64, __m128i, __m128i, long long, 1) +#endif +test_1 (_mm_extract_epi8, int, __m128i, 1) +test_1 (_mm_extract_epi32, int, __m128i, 1) +#ifdef __x86_64__ +test_1 (_mm_extract_epi64, long long, __m128i, 1) +#endif +test_2 (_mm_mpsadbw_epu8, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistrm, __m128i, __m128i, __m128i, 1) +test_2 (_mm_cmpistri, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestrm, __m128i, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestri, int, __m128i, int, __m128i, int, 1) +test_2 (_mm_cmpistra, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrc, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistro, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrs, int, __m128i, __m128i, 1) +test_2 (_mm_cmpistrz, int, __m128i, __m128i, 1) +test_4 (_mm_cmpestra, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrc, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestro, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrs, int, __m128i, int, __m128i, int, 1) +test_4 (_mm_cmpestrz, int, __m128i, int, __m128i, int, 1) + +/* wmmintrin.h (AES/PCLMUL). */ +#ifdef DIFFERENT_PRAGMAS +#pragma GCC target ("aes,pclmul") +#endif +#include <wmmintrin.h> +test_1 (_mm_aeskeygenassist_si128, __m128i, __m128i, 1) +test_2 (_mm_clmulepi64_si128, __m128i, __m128i, __m128i, 1) + +/* smmintrin.h (SSE4.1). 
*/ +test_1 (_mm_round_pd, __m128d, __m128d, 1) +test_1 (_mm_round_ps, __m128, __m128, 1) +test_2 (_mm_round_sd, __m128d, __m128d, __m128d, 1) +test_2 (_mm_round_ss, __m128, __m128, __m128, 1) |